@Article{kotz:prefetch, author = {David F. Kotz and Carla Schlatter Ellis}, title = {Prefetching in File Systems for {MIMD} Multiprocessors}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1990}, month = {April}, volume = {1}, number = {2}, pages = {218--230}, publisher = {IEEE Computer Society Press}, earlier = {ellis:prefetch}, later = {kotz:thesis}, keyword = {dfk, parallel file system, prefetching, MIMD, disk caching, parallel I/O, pario-bib}, abstract = {The problem of providing file I/O to parallel programs has been largely neglected in the development of multiprocessor systems. There are two essential elements of any file system design intended for a highly parallel environment: parallel I/O and effective caching schemes. This paper concentrates on the second aspect of file system design and specifically, on the question of whether prefetching blocks of the file into the block cache can effectively reduce overall execution time of a parallel computation, even under favorable assumptions. \par Experiments have been conducted with an interleaved file system testbed on the Butterfly Plus multiprocessor. Results of these experiments suggest that 1) the hit ratio, the accepted measure in traditional caching studies, may not be an adequate measure of performance when the workload consists of parallel computations and parallel file access patterns, 2) caching with prefetching can significantly improve the hit ratio and the average time to perform an I/O operation, and 3) an improvement in overall execution time has been observed in most cases. In spite of these gains, prefetching sometimes results in increased execution times (a negative result, given the optimistic nature of the study). \par We explore why is it not trivial to translate savings on individual I/O requests into consistently better overall performance and identify the key problems that need to be addressed in order to improve the potential of prefetching techniques in this environment.} } @PhdThesis{kotz:thesis, author = {David Kotz}, title = {Prefetching and Caching Techniques in File Systems for {MIMD} Multiprocessors}, year = {1991}, month = {April}, school = {Duke University}, note = {Available as technical report CS-1991-016}, URL = {http://www.cs.dartmouth.edu/~dfk/papers/thesis_note.html}, keyword = {dfk, parallel file system, prefetching, MIMD, disk caching, parallel I/O, pario-bib}, abstract = {The increasing speed of the most powerful computers, especially multiprocessors, makes it difficult to provide sufficient I/O bandwidth to keep them running at full speed for the largest problems. Trends show that the difference in the speed of disk hardware and the speed of processors is increasing, with I/O severely limiting the performance of otherwise fast machines. This widening access-time gap is known as the ``I/O bottleneck crisis.'' One solution to the crisis, suggested by many researchers, is to use many disks in parallel to increase the overall bandwidth. \par This dissertation studies some of the file system issues needed to get high performance from parallel disk systems, since parallel hardware alone cannot guarantee good performance. The target systems are large MIMD multiprocessors used for scientific applications, with large files spread over multiple disks attached in parallel. The focus is on automatic caching and prefetching techniques. We show that caching and prefetching can transparently provide the power of parallel disk hardware to both sequential and parallel applications using a conventional file system interface. We also propose a new file system interface (compatible with the conventional interface) that could make it easier to use parallel disks effectively. \par Our methodology is a mixture of implementation and simulation, using a software testbed that we built to run on a BBN GP1000 multiprocessor. The testbed simulates the disks and fully implements the caching and prefetching policies. Using a synthetic workload as input, we use the testbed in an extensive set of experiments. The results show that prefetching and caching improved the performance of parallel file systems, often dramatically.}, comment = {Published as kotz:prefetch, kotz:jwriteback, kotz:jpractical, kotz:fsint2.} } @TechReport{kotz:throughput, author = {David Kotz}, title = {Throughput of Existing Multiprocessor File Systems}, year = {1993}, month = {May}, number = {PCS-TR93-190}, institution = {Dept. of Math and Computer Science, Dartmouth College}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR93-190.ps.Z}, keyword = {parallel I/O, multiprocessor file system, performance, survey, dfk, pario-bib}, comment = {A brief note on the reported performance of existing file systems (Intel CFS, nCUBE, CM-2, CM-5, and Cray). Many have disappointingly low absolute throughput, in MB/s.} } @TechReport{kotz:tuning, author = {David Kotz}, title = {Tuning {STARFISH}}, year = {1996}, month = {October}, number = {PCS-TR96-296}, institution = {Dept. of Computer Science, Dartmouth College}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR96-296.ps.Z}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {STARFISH is a parallel file-system simulator we built for our research into the concept of disk-directed I/O. In this report, we detail steps taken to tune the file systems supported by STARFISH, which include a traditional parallel file system (with caching) and a disk-directed I/O system. In particular, we now support two-phase I/O, use smarter disk scheduling, increased the maximum number of outstanding requests that a compute processor may make to each disk, and added gather/scatter block transfer. We also present results of the experiments driving the tuning effort.}, comment = {Reports on some new changes to the STARFISH simulator that implements traditional caching and disk-directed I/O. This is meant mainly as a companion to kotz:jdiskdir. See also kotz:jdiskdir, kotz:diskdir, kotz:expand.} } @TechReport{kotz:workload-tr, author = {David Kotz and Nils Nieuwejaar}, title = {Dynamic File-Access Characteristics of a Production Parallel Scientific Workload}, year = {1994}, month = {April}, number = {PCS-TR94-211}, institution = {Dept. of Math and Computer Science, Dartmouth College}, note = {Revised May 11, 1994}, later = {kotz:workload}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR94-211.ps.Z}, keyword = {parallel file system, file access pattern, multiprocessor file system workload, parallel I/O, pario-bib, dfk}, abstract = {Multiprocessors have permitted astounding increases in computational performance, but many cannot meet the intense I/O requirements of some scientific applications. An important component of any solution to this I/O bottleneck is a parallel file system that can provide high-bandwidth access to tremendous amounts of data {\em in parallel\/} to hundreds or thousands of processors. \par Most successful systems are based on a solid understanding of the characteristics of the expected workload, but until now there have been no comprehensive workload characterizations of multiprocessor file systems. We began the CHARISMA project in an attempt to fill that gap. We instrumented the common node library on the iPSC/860 at NASA Ames to record all file-related activity over a two-week period. Our instrumentation is different from previous efforts in that it collects information about every read and write request and about the {\em mix\/} of jobs running in the machine (rather than from selected applications). \par The trace analysis in this paper leads to many recommendations for designers of multiprocessor file systems. First, the file system should support simultaneous access to many different files by many jobs. Second, it should expect to see many small requests, predominantly sequential and regular access patterns (although of a different form than in uniprocessors), little or no concurrent file-sharing between jobs, significant byte- and block-sharing between processes within jobs, and strong interprocess locality. Third, our trace-driven simulations showed that these characteristics led to great success in caching, both at the compute nodes and at the I/O~nodes. Finally, we recommend supporting strided I/O requests in the file-system interface, to reduce overhead and allow more performance optimization by the file system.} } @InProceedings{kotz:workload, author = {David Kotz and Nils Nieuwejaar}, title = {Dynamic File-Access Characteristics of a Production Parallel Scientific Workload}, booktitle = {Proceedings of Supercomputing '94}, year = {1994}, month = {November}, pages = {640--649}, publisher = {IEEE Computer Society Press}, address = {Washington, DC}, earlier = {kotz:workload-tr}, later = {kotz:jworkload}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/kotz:workload.ps.Z}, keyword = {parallel file system, file access pattern, multiprocessor file system workload, parallel I/O, pario-bib, dfk}, abstract = {Multiprocessors have permitted astounding increases in computational performance, but many cannot meet the intense I/O requirements of some scientific applications. An important component of any solution to this I/O bottleneck is a parallel file system that can provide high-bandwidth access to tremendous amounts of data {\em in parallel\/} to hundreds or thousands of processors. \par Most successful systems are based on a solid understanding of the characteristics of the expected workload, but until now there have been no comprehensive workload characterizations of multiprocessor file systems. We began the CHARISMA project in an attempt to fill that gap. We instrumented the common node library on the iPSC/860 at NASA Ames to record all file-related activity over a two-week period. Our instrumentation is different from previous efforts in that it collects information about every read and write request and about the {\em mix\/} of jobs running in the machine (rather than from selected applications). \par The trace analysis in this paper leads to many recommendations for designers of multiprocessor file systems. First, the file system should support simultaneous access to many different files by many jobs. Second, it should expect to see many small requests, predominantly sequential and regular access patterns (although of a different form than in uniprocessors), little or no concurrent file-sharing between jobs, significant byte- and block-sharing between processes within jobs, and strong interprocess locality. Third, our trace-driven simulations showed that these characteristics led to great success in caching, both at the compute nodes and at the I/O~nodes. Finally, we recommend supporting strided I/O requests in the file-system interface, to reduce overhead and allow more performance optimization by the file system.} } @InProceedings{kotz:writeback, author = {David Kotz and Carla Schlatter Ellis}, title = {Caching and Writeback Policies in Parallel File Systems}, booktitle = {1991 IEEE Symposium on Parallel and Distributed Processing}, year = {1991}, month = {December}, pages = {60--67}, earlier = {kotz:thesis}, later = {kotz:jwriteback}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/kotz:writeback.ps.Z}, keyword = {dfk, parallel file system, disk caching, parallel I/O, MIMD, pario-bib}, abstract = {Improvements in the processing speed of multiprocessors are outpacing improvements in the speed of disk hardware. Parallel disk I/O subsystems have been proposed as one way to close the gap between processor and disk speeds. Such parallel disk systems require parallel file system software to avoid performance-limiting bottlenecks. We discuss cache management techniques that can be used in a parallel file system implementation. We examine several writeback policies, and give results of experiments that test their performance.}, comment = {See also kotz:jpractical, kotz:fsint2, cormen:integrate.} } @TechReport{krieger:asf-tr, author = {Orran Krieger and Michael Stumm and Ronald Unrau}, title = {The {Alloc Stream Facility}: A Redesign of Application-level Stream {I/O}}, year = {1992}, month = {October}, number = {CSRI-275}, institution = {Computer Systems Research Institute, University of Toronto}, address = {Toronto, Canada, M5S 1A1}, later = {krieger:asf}, URL = {ftp://ftp.csri.utoronto.edu/csri-technical-reports/275/275.ps.Z}, keyword = {memory-mapped file, file system, parallel I/O, pario-bib}, abstract = {This paper describes the design and implementation of a new application level I/O facility, called the Alloc Stream Facility. The Alloc Stream Facility has several key advantages. First, performance is substantially improved as a result of a) the structure of the facility that allows it to take advantage of system specific features like mapped files, and b) a reduction in data copying and the number of I/O system calls. Second, the facility is designed for multi-threaded applications running on multiprocessors and allows for a high degree of concurrency. Finally, the facility can support a variety of I/O interfaces, including stdio, emulated Unix I/O, ASI, and C++ streams, in a way that allows applications to freely intermix calls to the different interfaces, resulting in improved code reusability. \par We show that on several Unix workstation platforms the performance of Unix applications using the Alloc Stream Facility can be substantially better that when the applications use the original I/O facilities.}, comment = {See also krieger:mapped. ``This is an extended version of the paper with the same title in the March, 1994 edition of IEEE Computer.'' A 3-level interface structure: interface, backplane, and stream-specific modules. Different interfaces available: unix, stdio, ASI (theirs), C++. Common backplane. Stream-specific implementations that export operations like salloc and sfree, which return pointers to data buffers. ASI exports that interface to the user, for maximum efficiency. Performance is best when using mapped files as underlying implementation. Many stdio or unix apps are faster only after relinking. ASI is even faster. In addition to better performance, also get multithreading support, multiple interfaces, and extensibility.} } @Article{krieger:asf, author = {Orran Krieger and Michael Stumm and Ronald Unrau}, title = {The {Alloc Stream Facility}: A Redesign of Application-level Stream {I/O}}, journal = {IEEE Computer}, year = {1994}, month = {March}, volume = {27}, number = {3}, pages = {75--82}, publisher = {IEEE Computer Society Press}, earlier = {krieger:asf-tr}, keyword = {memory-mapped file, file system, parallel I/O, pario-bib} } @InProceedings{krieger:hfs, author = {Orran Krieger and Michael Stumm}, title = {{HFS:} A Flexible File System for large-scale Multiprocessors}, booktitle = {Proceedings of the 1993 DAGS/PC Symposium}, year = {1993}, month = {June}, pages = {6--14}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, later = {krieger:hfs2}, URL = {ftp://ftp.cs.toronto.edu/pub/parallel/Krieger_Stumm_DAGS93.ps.Z}, keyword = {multiprocessor file system, parallel I/O, operating system, shared memory, pario-bib}, abstract = {The {H{\sc urricane}} File System (HFS) is a new file system being developed for large-scale shared memory multiprocessors with distributed disks. The main goal of this file system is scalability; that is, the file system is designed to handle demands that are expected to grow linearly with the number of processors in the system. To achieve this goal, HFS is designed using a new structuring technique called Hierarchical Clustering. HFS is also designed to be flexible in supporting a variety of policies for managing file data and for managing file system state. This flexibility is necessary to support in a scalable fashion the diverse workloads we expect for a multiprocessor file system.}, comment = {This paper is now out of date; see krieger:thesis. Designed for scalability on the hierarchical clustering model (see unrau:cluster), the Hurricane File System for NUMA shared-memory MIMD machines. Each cluster has its own full file system, which communicates with those in other clusters. Pieces are name server, open-file server, and block-file server. On first access, the file is mapped into the application space. VM system calls BFS to arrange transfers. Open questions: policies for file state management, block distribution, caching, and prefetching. Object-oriented approach used to allow for flexibility and extendability. Local disk file systems are log-structured.} } @InProceedings{krieger:hfs2, author = {Orran Krieger and Michael Stumm}, title = {{HFS}: A Performance-Oriented Flexible File System Based on Building-Block Compositions}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {95--108}, publisher = {ACM Press}, address = {Philadelphia}, earlier = {krieger:hfs}, later = {krieger:hfs3}, keyword = {parallel I/O, parallel file system, object-oriented, pario-bib}, abstract = {The Hurricane File System (HFS) is designed for (potentially large-scale) shared memory multiprocessors. Its architecture is based on the principle that, in order to maximize performance for applications with diverse requirements, a file system must support a wide variety of file structures, file system policies and I/O interfaces. Files in HFS are implemented using simple building blocks composed in potentially complex ways. This approach yields great flexibility, allowing an application to customize the structure and policies of a file to exactly meet its requirements. For example, a file's structure can be optimized for concurrent random-access write-only operations by ten processes. Similarly, the prefetching, locking, and file cache management policies can all be chosen to match an application's access pattern. In contrast, most existing parallel file systems support a single file structure and a small set of policies. \par We have implemented HFS as part of the Hurricane operating system running on the Hector shared memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead. We also show that for a number of file access patterns HFS is able to deliver to the applications the full I/O bandwidth of the disks on our system.}, comment = {A published form of krieger:hfs and the thesis krieger:thesis. Their main point is that the file system is constructed from building-block objects. When you create a file you choose a few building blocks, for example, a replication block that mirrors the file, and some distribution blocks that distribute each replica across a set of disks. When you open the file you plug in some more building blocks, e.g., to do prefetching or to provide the kind of interface that you want to use. They point out that this flexibility is critical to be able to get good performance, because different file-access patterns need different structures and policies. They found that mapped files minimize copying costs and improve performance. They were able to obtain full disk bandwidth. Great paper.} } @Article{krieger:hfs3, author = {Orran Krieger and Michael Stumm}, title = {{HFS}: A Performance-Oriented Flexible File System Based on Building-Block Compositions}, journal = {ACM Transactions on Computer Systems}, year = {1997}, month = {August}, volume = {15}, number = {3}, pages = {286--321}, earlier = {krieger:hfs2}, URL = {http://www.acm.org/pubs/citations/journals/tocs/1997-15-3/p286-krieger/}, keyword = {parallel I/O, parallel file system, object-oriented, pario-bib}, abstract = {The Hurricane File System (HFS) is designed for (potentially large-scale) shared-memory multiprocessors. Its architecture is based on the principle that, in order to maximize performance for applications with diverse requirements, a file system must support a wide variety of file structures, file system policies, and I/O interfaces. Files in HFS are implemented using simple building blocks composed in potentially complex ways. This approach yields great flexibility, allowing an application to customize the structure and policies of a file to exactly meet its requirements. As an extreme example, HFS allows a file's structure to be optimized for concurrent random-access write-only operations by 10 threads, something no other file system can do. Similarly, the prefetching, locking, and file cache management policies can all be chosen to match an application's access pattern. In contrast, most parallel file systems support a single file structure and a small set of policies. We have implemented HFS as part of the Hurricane operating system running on the Hector shared-memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead. We also show that for a number of file access patterns, HFS is able to deliver to the applications the full I/O bandwidth of the disks on our system.} } @PhdThesis{krieger:thesis, author = {Orran Krieger}, title = {{HFS}: A flexible file system for shared-memory multiprocessors}, year = {1994}, month = {October}, school = {University of Toronto}, URL = {ftp://ftp.cs.toronto.edu/pub/parallel/Okrieg_PhD.ps.Z}, keyword = {parallel I/O, multiprocesor file system, shared memory, memory-mapped I/O, pario-bib}, abstract = {The Hurricane File System (HFS) is designed for large-scale, shared-memory multiprocessors. Its architecture is based on the principle that a file system must support a wide variety of file structures, file system policies and I/O interfaces to maximize performance for a wide variety of applications. HFS uses a novel, object-oriented building-block approach to provide the flexibility needed to support this variety of file structures, policies, and I/O interfaces. File structures can be defined in HFS that optimize for sequential or random access, read-only, write-only or read/write access, sparse or dense data, large or small file sizes, and different degrees of application concurrency. Policies that can be defined on a per-file or per-open instance basis include locking policies, prefetching policies, compression/decompression policies and file cache management policies. In contrast, most existing file systems have been designed to support a single file structure and a small set of policies. \par We have implemented large portions of HFS as part of the Hurricane operating system running on the Hector shared-memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead. Also, we show that HFS is able to deliver the full I/O bandwidth of the disks on our system to the applications.}, comment = {Excellent work. HFS uses an object-oriented building-block approach to provide flexible, scalable high performance. Indeed, HFS appears to be one of the most flexible parallel file systems available, allowing users to independently control (or redefine) policies for prefetching, caching, redundancy and fault tolerance, and declustering.} } @TechReport{krystynak:datavault, author = {John Krystynak}, title = {{I/O} Performance on the {Connection Machine DataVault} System}, year = {1992}, month = {May}, number = {RND-92-011}, institution = {NAS Systems Division, NASA Ames}, later = {krystynak:pario}, URL = {http://www.nas.nasa.gov/NAS/TechReports/RNDreports/RND-92-011/RND-92-011.html}, keyword = {parallel I/O, parallel file system, parallel I/O, performance measurement, pario-bib}, comment = {Short measurements of CM-2 Datavault. Faster if you access through Paris. Can get nearly full 32 MB/s bandwidth. Problem in its ability to use multiple CMIO busses.} } @InProceedings{krystynak:pario, author = {John Krystynak and Bill Nitzberg}, title = {Performance Characteristics of the {iPSC/860} and {CM-2} {I/O} Systems}, booktitle = {Proceedings of the Seventh International Parallel Processing Symposium}, year = {1993}, pages = {837--841}, publisher = {IEEE Computer Society Press}, address = {Newport Beach, CA}, earlier = {krystynak:datavault}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {Essentially a (short) combination of krystynak:datavault and nitzberg:cfs.} } @InProceedings{kucera:libc, author = {Julie Kucera}, title = {Making {\em libc}\/ Suitable for Use by Parallel Programs}, booktitle = {Proceedings of the USENIX Distributed and Multiprocessor Systems Workshop}, year = {1989}, pages = {145--152}, keyword = {parallel file system interface, pario-bib}, comment = {Experience making libc reentrant, adding semaphores, etc., on a Convex. Some problems with I/O. Added semaphores and private memory to make libc calls reentrant, i.e., callable in parallel by multiple threads.} } @InProceedings{kwan:cm5io, author = {Thomas T. Kwan and Daniel A. Reed}, title = {Performance of the {CM-5} Scalable File System}, booktitle = {Proceedings of the 8th ACM International Conference on Supercomputing}, year = {1994}, month = {July}, pages = {156--165}, publisher = {ACM Press}, address = {Manchester, UK}, keyword = {parallel I/O, parallel architecture, multiprocessor file system, pario-bib}, comment = {They measure the performance of the CM-5 Scalable File System using synthetic benchmarks. They compare CM-Fortran with CMMD. The hardware-dependent (``physical'') modes were much faster than the generic-format modes, which have to reorder data between the processor distribution and the disk distribution. The network turned out to be a bottleneck for the performance when reordering was needed. They conclude that more user control over the I/O would be very helpful.} } @PhdThesis{kwan:sort, author = {Sai Choi Kwan}, title = {External Sorting: {I/O} Analysis and Parallel Processing Techniques}, year = {1986}, month = {January}, school = {University of Washington}, note = {Available as technical report 86--01--01}, keyword = {parallel I/O, sorting, pario-bib}, comment = {Examines external sorting techniques such as merge sort, tag sort, multi-pass distribution sort, and one-pass distribution sort. The model is one where I/O complexity is included, assuming a linear seek time distribution and a cost of 1/2 rotation for each seek. Parallel I/O or computing are not considered until the distribution sorts. Architectural model on page 58.} } @InProceedings{kwong:distribution, author = {Peter Kwong and Shikaresh Majumdar}, title = {Study of Data Distribution Strategies for Parallel {I/O} Management}, booktitle = {Proceedings of the Third International Conference of the Austrian Center for Parallel Computation (ACPC)}, year = {1996}, month = {September}, series = {Lecture Notes in Computer Science}, volume = {1127}, pages = {12--23}, publisher = {Springer-Verlag}, keyword = {parallel I/O, pario-bib}, abstract = {Recent studies have demonstrated that a significant number of I/O operations are performed by a number of classes of different parallel applications. Appropriate I/O management strategies are required however for harnessing the power of parallel I/O. This paper focuses on two I/O management issues that affect system performance in multiprogrammed parallel environments. Characterization of I/O behavior of parallel applications in terms of four different models is discussed first, followed by an investigation of the performance of a number of different data distribution strategies. Using computer simulations this research shows that I/O characteristics of applications and data distribution have an important effect on system performance. Applications that can simultaneously do computation and I/O, plus strategies that can incorporate centralized I/O management are found to be beneficial for a multiprogrammed parallel environment.}, comment = {See majumdar:management.} } @InProceedings{lake:pario, author = {Brian Lake and Chris Gray}, title = {Parallel {I/O} for {MIMD} Machines}, booktitle = {Proceedings of SS'93: High Performance Computing}, year = {1993}, month = {June}, pages = {301--308}, address = {Calgary}, keyword = {parallel I/O, MIMD, multiprocessor file system, pario-bib}, comment = {They describe the I/O system for the Myrias SPS-3 parallel computer. The SPS is a no-remote-access (NORMA) machine with a software shared memory abstraction. They provide a standard C/FORTRAN I/O interface, with a few extensions. The user's parallel program is considered a client, and an I/O processor (IOP) is the server. No striping across IOPs, which makes it relatively simple for them to have the server manage the shared file pointer. Their extensions allow atomic, file-pointer update, returning the actual position where I/O occurred, and atomic access to fixed- and variable-length records. They have three protocols, for different transfer sizes; small using simple request/response; medium using sliding window; and large using scatter/gather and special hardware double buffering at the IOP. They use scatter/gather DMA, and page-table fiddling, for messaging. Performance is 89--96\% of hardware peak, limited by IOP's VME backplane.} } @Misc{large-scale-memories, key = {Algorithmica}, title = {Special issue on Large-Scale Memories}, year = {1994}, volume = {12}, number = {2}, howpublished = {Algorithmica} } @Article{latifi:network, author = {S. Latifi and M. Moraes de Azevedo and N. Bagherzadeh}, title = {A star-based {I/O}-bounded network for massively parallel systems}, journal = {IEE Proceedings--- Computers and Digital Techniques}, year = {1995}, month = {January}, volume = {42}, number = {1}, pages = {5--14}, keyword = {verify authors, parallel I/O, parallel computer architecture, pario-bib}, abstract = {The paper describes a new interconnection network for massively parallel systems, referred to as star-connected cycles (SCC). The SCC graph presents an I/O-bounded structure that results in several advantages over variable degree graphs like the star and the hypercube. The description of the SCC graph includes issues such as labelling of nodes, degree, diameter and symmetry. The paper also presents an optimal routeing algorithm for the SCC and efficient broadcasting algorithms with O(n) running time, with n being the dimensionality of the graph. A comparison with the cube-connected cycles (CCC) and other interconnection networks is included, indicating that, for even n, an n-SCC and a CCC of similar sizes have about the same diameter. In addition, it is shown that one-port broadcasting in an n-SCC graph can be accomplished with a running time better than or equal to that required by an n-star containing (n-1) times fewer nodes.} } @InProceedings{lautenbach:pfs, author = {Berin F. Lautenbach and Bradley M. Broom}, title = {A Parallel File System for the {AP1000}}, booktitle = {Proceedings of the Third Fujitsu-ANU CAP Workshop}, year = {1992}, month = {November}, keyword = {distributed file system, multiprocessor file system, pario-bib}, comment = {See also broom:acacia, broom:impl, mutisya:cache, and broom:cap. The Acacia file system has file access modes that are much like those in Intel CFS and TMC CMMD. By default all processes have their own file pointer, but they can switch to another mode either all together or in row- or column-subsets. The other modes include a replicated mode (where all read or write the same data), and a variety of shared modes, with arbitrary, fixed, or unspecified ordering among processors, and with fixed or variable-sized records. They also have a parallel-open operation, support for logical records, control over the striping width (number of disks) and height (block size), and control over of redundancy. A prototype is running.} } @Article{lawlor:parity, author = {F.~D. Lawlor}, title = {Efficient mass storage parity recovery mechanism}, journal = {IBM Technical Disclosure Bulletin}, year = {1981}, month = {July}, volume = {24}, number = {2}, pages = {986--987}, keyword = {parallel I/O, disk array, RAID, pario-bib}, comment = {An early paper, perhaps the earliest, that describes the techniques that later became RAID. Lawlor notes how to use parity to recover data lost due to disk crash, as in RAID3, addresses the read-before-write problem by caching the old data block as well as the new data block, and shows how two-dimensional parity can protect against two or more failures.} } @InProceedings{lee:external, author = {Jang Sun Lee and Sunghoon Ko and Sanjay Ranka and Byung Eui Min}, title = {High-Performance External Computations Using User-Controllable {I/O}}, booktitle = {Proceedings of the Joint International Parallel Processing Symposium and IEEE Symposium on Parallel and Distributed Processing}, year = {1998}, month = {March}, publisher = {IEEE Computer Society Press}, note = {To appear}, keyword = {verify pages, parallel I/O, pario-bib} } @TechReport{lee:impl, author = {Edward K. Lee}, title = {Software and Performance Issues in the Implementation of a {RAID} Prototype}, year = {1990}, month = {May}, number = {UCB/CSD 90/573}, institution = {EECS, Univ. California at Berkeley}, URL = {http://cs-tr.cs.berkeley.edu/TR/UCB:CSD-90-573}, keyword = {parallel I/O, disk striping, performance, pario-bib}, comment = {Details of their prototype. Defines terms like stripe unit. Explores ways to lay out parity. Does performance simulations. Describes ops needed in device driver. Good to read if you plan to implement a RAID. Results: small R+W, or high loads, don't care about parity placement; in low load, there are different best cases for large R+W. Best all-around is left-symmetric. See also lee:parity.} } @Article{lee:jparity, author = {Edward K. Lee and Randy H. Katz}, title = {The Performance of Parity Placements in Disk Arrays}, journal = {IEEE Transactions on Computers}, year = {1993}, month = {June}, volume = {42}, number = {6}, pages = {651--664}, publisher = {IEEE Computer Society Press}, earlier = {lee:parity}, keyword = {RAID, reliability, parallel I/O, disk striping, pario-bib}, comment = {Journal version of lee:parity.} } @InProceedings{lee:logical-disks, author = {Jang Sun Lee and Jungmin Kim and P. Bruce Berra and Sanjay Ranka}, title = {Logical Disks: User-Controllable {I/O} For Scientific Applications}, booktitle = {Proceedings of the 1996 IEEE Symposium on Parallel and Distributed Processing}, year = {1996}, month = {October}, pages = {340--347}, publisher = {IEEE Computer Society Press}, keyword = {logical disks, parallel I/O, pario-bib}, abstract = {In this paper we propose user-controllable I/O operations and explore the effects of them with some synthetic access patterns. The operations allow users to determine a file structure matching the access patterns, control the layout and distribution of data blocks on physical disks, and present various access patterns with a minimum number of I/O operations. The operations do not use a file pointer to access data as in typical file systems, which eliminates the overhead of managing the offset of the file, making it easy to share data and reducing the number of I/O operations.} } @InProceedings{lee:pario, author = {K-K. Lee and P. Varman}, title = {Prefetching and {I/O} Parallelism in Multiple Disk Systems}, booktitle = {Proceedings of the 1995 International Conference on Parallel Processing}, year = {1995}, month = {August}, pages = {III:160--163}, publisher = {CRC Press}, address = {St. Charles, IL}, keyword = {parallel I/O, prefetching, disk array, pario-bib} } @InProceedings{lee:parity, author = {Edward K. Lee and Randy H. Katz}, title = {Performance Consequences of Parity Placement in Disk Arrays}, booktitle = {Proceedings of the Fourth International Conference on Architectural Support for Programming Languages and Operating Systems}, year = {1991}, pages = {190--199}, later = {lee:jparity}, keyword = {RAID, reliability, parallel I/O, pario-bib}, comment = {Interesting comparison of several parity placement schemes. Boils down to two basic choices, depending on whether read performance or write performance is more important to you.} } @InProceedings{lee:petal, author = {Edward K. Lee and Chandramohan A. Thekkath}, title = {Petal: Distributed Virtual Disks}, booktitle = {Proceedings of the Seventh International Conference on Architectural Support for Programming Languages and Operating Systems}, year = {1996}, month = {October}, pages = {84--92}, address = {Cambridge, MA}, URL = {http://www.research.digital.com/SRC/personal/Chandu_Thekkath/Papers/petal-asplos96.ps}, keyword = {parallel I/O, distributed file system, declustering, reliability, pario-bib}, comment = {They are trying to build a file server that is easier to manage than most of today's distributed file systems, because disks are cheap but management is expensive. They describe a distributed file server that spreads blocks of all files across many disks and many servers. They use chained declustering so that they can survive loss of server or disk. They dynamically balance load. They dynamically reconfigure when new virtual disks are created or new physical disks are added. They've built it all and are now going to look at possible file systems that can take advantage of the features of Petal.} } @InProceedings{lee:raidmodel, author = {Edward K. Lee and Randy H. Katz}, title = {An Analytic Performance Model of Disk Arrays}, booktitle = {Proceedings of the 1993 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems}, year = {1993}, pages = {98--109}, keyword = {disk array, parallel I/O, RAID, analytic model, pario-bib} } @TechReport{lee:redist, author = {Jang Sun Lee and Sanjay Ranka and Ravi V. Shankar}, title = {Communication-Efficient and Memory-Bounded External Redistribution}, year = {1995}, institution = {Syracuse University}, URL = {ftp://top.cis.syr.edu/users/ranka/ParallelComputing/ExternalRedistribution.ps.Z}, keyword = {parallel I/O algorithm, out-of-core, pario-bib}, abstract = {This paper presents communication-efficient algorithms for the external data redistribution problem. Deterministic lower bounds and upper bounds are presented for the number of I/O operations, communication time and the memory requirements of external redistribution. Our algorithms differ from most other algorithms presented for out-of-core applications in that it is optimal (within a small constant factor) not only in the number of I/O operations, but also in the time taken for communication. A coarse-grained MIMD architecture with I/O subsystems attached to each processor is assumed, but the results are expected to be applicable over a wider variety of architectures.}, comment = {See shankar:transport for the underlying communication primitives.} } @InProceedings{lee:support, author = {Jenq Kuen Lee and Ing-Kuen Tsaur and San-Yih Huang}, title = {Language and Environmental Support for Parallel Object {I/O} on Distributed Memory Environments}, booktitle = {Proceedings of the Seventh SIAM Conference on Parallel Processing for Scientific Computing}, year = {1995}, month = {February}, pages = {756--761}, publisher = {SIAM}, keyword = {parallel I/O, object oriented, distributed memory, pario-bib}, abstract = {The paper describes a parallel file object environment to support distributed array store on shared nothing distributed computing environments. Our environment enables programmers to extend the concept of array distribution from memory levels to file levels. It allows parallel I/O according to the distribution of objects in an application. When objects are read and/or written by multiple applications using different distributions, we present a novel scheme to help programmers to select the best data distribution pattern according to minimum amount of remote data movements for the store of array objects on distributed file systems.} } @InProceedings{lee:userio, author = {Jang Sun Lee and Sang-Gue Oh and Bruce P. Berra and Sanjay Ranka}, title = {User-Controllable {I/O} for Parallel Computers}, booktitle = {International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA~'96)}, year = {1996}, month = {August}, pages = {442--453}, keyword = {parallel I/O, pario-bib}, abstract = {This paper presents the design of UPIO, a software for user-controllable parallel input and output. UPIO is designed to maximize I/O performance for scientific applications on MIMD multicomputers. The most important features of UPIO are: It supports a domain-specific file model and a variety of application interfaces to present numerous access patterns. UPIO provides user-contollerable I/O operations that allow users to control data access, file structure, and data distribution. The domain-specific file model and user controllability give low I/O overhead and allow programmers to exploit the aggregate bandwidth of parallel disks.}, comment = {They describe an interface that seems to allow easier access for programmers that want to map matrices onto parallel files. The concepts are not well explained, so it's hard to really understand what is new and different. They make no explicit comparison with other advanced interfaces like that in Vesta or Galley. No performance results.} } @Article{li:bfxm, author = {Qun Li and Jie Jing and Li Xie}, title = {{BFXM}: A Parallel File System Model Based on the Mechanism of Distributed Shared Memory}, journal = {ACM Operating Systems Review}, year = {1997}, month = {October}, volume = {31}, number = {4}, pages = {30--40}, keyword = {parallel I/O, multiprocessor file system, pario-bib} } @Article{li:jmodels, author = {Zhiyong Li and Peter H. Mills and John H. Reif}, title = {Models and Resource Metrics for Parallel and Distributed Computation}, journal = {Parallel Algorithms and Applications}, year = {1996}, volume = {8}, pages = {35--59}, earlier = {li:models}, keyword = {parallel I/O algorithm, pario-bib} } @InProceedings{li:models, author = {Zhiyong Li and Peter H. Mills and John H. Reif}, title = {Models and Resource Metrics for Parallel and Distributed Computation}, booktitle = {Proceedings of the Twenty-Eighth Annual Hawaii International Conference on System Sciences}, year = {1995}, month = {January}, pages = {51--60}, address = {Hawaii}, later = {li:jmodels}, URL = {file://ftp.cs.unc.edu/pub/projects/proteus/reports/models_hicss95.ps.gz"}, keyword = {parallel I/O algorithm, pario-bib}, abstract = {This paper presents a framework of using {\em resource metrics} to characterize the various models of parallel computation. Our framework reflects the approach of recent models to abstract architectural details into several generic parameters, which we call resource metrics. We examine the different resource metrics chosen by different parallel models, categorizing the models into four classes: the basic synchronous models, and extensions of the basic models which more accurately reflect practical machines by incorporating notions of asynchrony, communication cost and memory hierarchy. We then present a new parallel computation model, the LogP-HMM model, as an illustration of design principles based on the framework of resource metrics. The LogP-HMM model extends an existing parameterized network model (LogP) with a sequential hierarchical memory model (HMM) characterizing each processor. The result accurately captures both network communication costs and the effects of multileveled memory such as local cache and I/O. We examine the potential utility of our model in the design of near optimal sorting and FFT algorithms.} } @TechReport{li:recursive-tr, author = {Zhiyong Li and John H. Reif and Sandeep K. S. Gupta}, title = {Synthesizing Efficient Out-of-Core Programs for Block Recursive Algorithms using Block-Cyclic Data Distributions}, year = {1996}, month = {March}, number = {96-04}, institution = {Dept. of Computer Science, Duke University}, later = {li:recursive}, URL = {ftp://ftp.cs.duke.edu/pub/zli/papers/TR-96-04.ps.gz}, keyword = {parallel I/O, out-of-core algorithm, pario-bib}, abstract = {In this paper, we present a framework for synthesizing I/O efficient out-of-core programs for block recursive algorithms, such as the fast Fourier transform (FFT) and block matrix transposition algorithms. Our framework uses an algebraic representation which is based on tensor products and other matrix operations. The programs are optimized for the striped Vitter and Shriver's two-level memory model in which data can be distributed using various cyclic(B) distributions in contrast to the normally used {\it physical track} distribution cyclic(B_d), where B_d is the physical disk block size. \par We first introduce tensor bases to capture the semantics of block-cyclic data distributions of out-of-core data and also data access patterns to out-of-core data. We then present program generation techniques for tensor products and matrix transposition. We accurately represent the number of parallel I/O operations required for the synthesized programs for tensor products and matrix transposition as a function of tensor bases and data distributions. We introduce an algorithm to determine the data distribution which optimizes the performance of the synthesized programs. Further, we formalize the procedure of synthesizing efficient out-of-core programs for tensor product formulas with various block-cyclic distributions as a dynamic programming problem. \par We demonstrate the effectiveness of our approach through several examples. We show that the choice of an appropriate data distribution can reduce the number of passes to access out-of-core data by as large as eight times for a tensor product, and the dynamic programming approach can largely reduce the number of passes to access out-of-core data for the overall tensor product formulas.} } @TechReport{li:synthesizing-tr, author = {Zhiyong Li and John H. Reif and Sandeep K. S. Gupta}, title = {Synthesizing Efficient Out-of-Core Programs for Block Recursive Algorithms using Block-Cyclic Data Distributions}, year = {1996}, month = {March}, number = {TR-96-04}, institution = {Dept. of Computer Science, Duke University}, later = {li:synthesizing}, URL = {ftp://ftp.cs.duke.edu/pub/zli/papers/TR-96-04.ps.gz}, keyword = {parallel I/O algorithm, pario-bib}, abstract = {In this paper, we present a framework for synthesizing I/O efficient out-of-core programs for block recursive algorithms, such as the fast Fourier transform (FFT) and block matrix transposition algorithms. Our framework uses an algebraic representation which is based on tensor products and other matrix operations. The programs are optimized for the striped Vitter and Shriver's two-level memory model in which data can be distributed using various cyclic(B) distributions in contrast to the normally used {\it physical track} distribution cyclic(B_d), where B_d is the physical disk block size. \par We first introduce tensor bases to capture the semantics of block-cyclic data distributions of out-of-core data and also data access patterns to out-of-core data. We then present program generation techniques for tensor products and matrix transposition. We accurately represent the number of parallel I/O operations required for the synthesized programs for tensor products and matrix transposition as a function of tensor bases and data distributions. We introduce an algorithm to determine the data distribution which optimizes the performance of the synthesized programs. Further, we formalize the procedure of synthesizing efficient out-of-core programs for tensor product formulas with various block-cyclic distributions as a dynamic programming problem. \par We demonstrate the effectiveness of our approach through several examples. We show that the choice of an appropriate data distribution can reduce the number of passes to access out-of-core data by as large as eight times for a tensor product, and the dynamic programming approach can largely reduce the number of passes to access out-of-core data for the overall tensor product formulas.} } @InProceedings{li:synthesizing, author = {Zhiyong Li and John H. Reif and Sandeep K. S. Gupta}, title = {Synthesizing Efficient Out-of-Core Programs for Block Recursive Algorithms using Block-Cyclic Data Distributions}, booktitle = {Proceedings of the 1996 International Conference on Parallel Processing}, year = {1996}, month = {August}, pages = {II:142--149}, publisher = {IEEE Computer Society Press}, address = {St. Charles, IL}, earlier = {li:synthesizing-tr}, keyword = {parallel I/O algorithm, pario-bib}, abstract = {This paper presents a framework for synthesizing I/O-efficient out-of-core programs for block recursive algorithms, such as the fast Fourier transform and matrix transpositions. the programs are synthesized from tensor (Kronecker) product representations of algorithms. These programs are optimized for a striped two-level memory model where in the out-of-core data can have block-cyclic distributions on multiple disks.} } @InProceedings{ligon:pfs, author = {W. B. Ligon and R. B. Ross}, title = {Implementation and Performance of a Parallel File System for High Performance Distributed Applications}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {471--480}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, cluster computing, parallel file system, pario-bib}, abstract = {Dedicated cluster parallel computers (DCPCs) are emerging as low-cost high performance environments for many important applications in science and engineering. A significant class of applications that perform well on a DCPC are coarse-grain applications that involve large amounts of file I/O. Current research in parallel file systems for distributed systems is providing a mechanism for adapting these applications to the DCPC environment. We present the Parallel Virtual File System (PVFS), a system that provides disk striping across multiple nodes in a distributed parallel computer and file partitioning among tasks in a parallel program. PVFS is unique among similar systems in that it uses a stream-based approach that represents each file access with a single set of request parameters and decouples the number of network messages from details of the file striping and partitioning. PVFS also provides support for efficient collective file accesses and allows overlapping file partitions. We present results of early performance experiments that show PVFS achieves excellent speedups in accessing moderately sized file segments.} } @InProceedings{lin:clusterio, author = {Zheng Lin and Songnian Zhou}, title = {Parallelizing {I/O} Intensive Applications for a Workstation Cluster: a Case Study}, booktitle = {Proceedings of the IPPS~'93 Workshop on Input/Output in Parallel Computer Systems}, year = {1993}, pages = {17--36}, address = {Newport Beach, CA}, note = {Also published in Computer Architecture News 21(5), December 1993, pages 15--22}, keyword = {parallel I/O, workstation cluster, text retrieval, pario-bib}, comment = {They implement a parallel text retrieval application on a cluster of DEC~5000 workstations.} } @InProceedings{livny:stripe, author = {M. Livny and S. Khoshafian and H. Boral}, title = {Multi-Disk Management Algorithms}, booktitle = {Proceedings of the 1987 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems}, year = {1987}, month = {May}, pages = {69--77}, keyword = {parallel I/O, disk striping, disk array, pario-bib} } @TechReport{lo:disks, author = {Raymond Lo and Norman Matloff}, title = {A Probabilistic Limit on the Virtual Size of Replicated File Systems}, year = {1989}, institution = {Department of EE and CS, UC Davis}, keyword = {parallel I/O, replication, file system, disk mirroring, disk shadowing, pario-bib}, comment = {A look at shadowed disks. If you have $k$ disks set up to read from the disk with the shortest seek, but write to all disks, you have increased reliability, read time like the min of the seeks, and write time like the max of the seeks. It appears that with increasing $k$ you can get good performance. But this paper clearly shows, since writes move all disk heads to the same location, that the effective value of $k$ is actually quite low. Only 4--10 disks are likely to be useful for most traffic loads.} } @Article{lockey:characterization, author = {P. Lockey and R. Proctor and I. D. James}, title = {Characterization of {I/O} Requirements in a Massively Parallel Shelf Sea Model}, journal = {International Journal of Supercomputer Applications and High Performance Computing}, year = {1998}, note = {To appear in a Special Issue on I/O in Parallel Applications}, keyword = {verify volume number month year and pages, parallel I/O application, pario-bib}, abstract = {It is now recognized that a high level of I/O performance is crucial in making effective use of parallel machines for many scientific application codes. This paper considers the I/O requirements in one particular scientific application area; 3D modelling of continental shelf sea regions. We identify some of the scientific aims which drive the model development, and the consequent impact on the I/O needs. As a case study we take a parallel production code running a simulation of the North Sea on a Cray~T3D platform and investigate the I/O performance in dealing with the dominant I/O component; dumping of results data to disk. In order to place the performance issues in a more general framework we construct a simple theoretical model of I/O requirements, and use this to probe the impact of available I/O performance on current and proposed scientific objectives.} } @Article{long:swift-raid, author = {Darrell D. E. Long and Bruce R. Montague}, title = {{Swift/RAID}: A Distributed {RAID} System}, journal = {Computing Systems}, year = {1994}, month = {Summer}, volume = {7}, number = {3}, pages = {333--359}, keyword = {RAID, disk array, parallel I/O, distributed file system, pario-bib}, comment = {One of the features of this system is the way they develop and execute transaction plans as little scripts that are built by the client, sent to the servers, and then executed by interpreters.} } @InProceedings{loverso:sfs, author = {Susan J. LoVerso and Marshall Isman and Andy Nanopoulos and William Nesheim and Ewan D. Milne and Richard Wheeler}, title = {{\em sfs}: {A} Parallel File System for the {CM-5}}, booktitle = {Proceedings of the 1993 Summer USENIX Technical Conference}, year = {1993}, pages = {291--305}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {They took the Unix file system from SunOS and extended it to run on the CM-5. This involved handling non-power-of-two block sizes, parallel I/O calls, large file sizes, and more encouragement for extents to be allocated. The hardware is particularly suited to RAID~3 with a 16 byte striping unit, although in theory the software could do anything it wants. Geared to data-parallel model. Proc nodes (PNs) contact the timesharing daemon (TSD) on the control processor (CP), who gets block lists from the file system, which runs on one of the CPs. The TSD then arranges with the disk storage nodes (DSNs) to do the transfer directly with the PNs. Each DSN has 8~MB of buffer space, 8 disk drives, 4 SCSI busses, and a SPARC as controller. Partition managers mount non-local sfs via NFS. Performance results good. Up to 185~MB/s on 118 (2~MB/s) disks.} } @Article{mackay:groundwater, author = {David Mackay and G. Mahinthakumar and Ed D'Azevedo}, title = {A Study of {I/O} in a Parallel Finite Element Groundwater Transport Code}, journal = {International Journal of Supercomputer Applications and High Performance Computing}, year = {1998}, note = {To appear in a Special Issue on I/O in Parallel Applications}, keyword = {verify volume number month year and pages, parallel I/O application, pario-bib}, abstract = {A parallel finite element groundwater transport code is used to compare three different strategies for performing parallel I/O: (1) have a single processor collect data and perform sequential I/O in large blocks, (2) use variations of vendor specific I/O extensions (3) use the EDONIO I/O library. Each processor performs many writes of one to four kilobytes to reorganize localdata in a global shared file. Our findings suggest having a single processor collect data and perform large block-contiguous operations may be quite efficient and portable for up to 32 processor configurations. This approach does not scale well for a larger number of processors since the single processor becomes a bottleneck for gathering data. The effective application I/O rate observed, which includes times for opening and closing files, is only a fraction of the peak device read/write rates. Some form of data redistribution and buffering in remote memory as performed in EDONIO may yield significant improvements for non-contiguous data I/O access patterns and short requests. Implementors of parallel I/O systems may consider some form of buffering as performed in EDONIO to speed up such I/O requirements.} } @InProceedings{madhyastha:adaptive, author = {Tara M. Madhyastha and Daniel A. Reed}, title = {Intelligent, Adaptive File System Policy Selection}, booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation}, year = {1996}, month = {October}, pages = {172--179}, publisher = {IEEE Computer Society Press}, later = {madhyastha:thesis}, keyword = {parallel I/O, pario-bib}, abstract = {Traditionally, maximizing input/output performance has required tailoring application input/output patterns to the idiosyncrasies of specific input/output systems. The authors show that one can achieve high application input/output performance via a low overhead input/output system that automatically recognizes file access patterns and adaptively modifies system policies to match application requirements. This approach reduces the application developer's input/output optimization effort by isolating input/output optimization decisions within a retargetable file system infrastructure. To validate these claims, they have built a lightweight file system policy testbed that uses a trained learning mechanism to recognize access patterns. The file system then uses these access pattern classifications to select appropriate caching strategies, dynamically adapting file system policies to changing input/output demands throughout application execution. The experimental data show dramatic speedups on both benchmarks and input/output intensive scientific applications.}, comment = {See also madhyastha:thesis, and related papers.} } @InProceedings{madhyastha:classification, author = {Tara M. Madhyastha and Daniel A. Reed}, title = {Input/Output Access Pattern Classification Using Hidden {Markov} Models}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {57--67}, publisher = {ACM Press}, address = {San Jose, CA}, later = {madhyastha:thesis}, keyword = {workload characterization, file access pattern, parallel I/O, pario-bib}, abstract = {Input/output performance on current parallel file systems is sensitive to a good match of application access pattern to file system capabilities. Automatic input/output access classification can determine application access patterns at execution time, guiding adaptive file system policies. In this paper we examine a new method for access pattern classification that uses hidden Markov models, trained on access patterns from previous executions, to create a probabilistic model of input/output accesses. We compare this approach to a neural network classification framework, presenting performance results from parallel and sequential benchmarks and applications.}, comment = {The most interesting thing in this paper is the use of a Hidden Markov Model to understand the access pattern of an application to a file. After running the application on the file once, and simultaneously training their HMM, they use the result to tune the system for the next execution (cache size, cache partitioning, prefetching, Intel file mode, etc). They get much better performance in future runs. See also madhyastha:thesis, and related papers.} } @InProceedings{madhyastha:global, author = {Tara M. Madhyastha and Daniel A. Reed}, title = {Exploiting Global Input/Output Access Pattern Classification}, booktitle = {Proceedings of SC '97: High Performance Computing and Networking}, year = {1997}, month = {November}, publisher = {IEEE Computer Society Press}, address = {San Jose}, later = {madhyastha:thesis}, URL = {http://scxy.tc.cornell.edu/sc97/proceedings/TECH/MADHYAST/INDEX.HTM}, keyword = {file access pattern, parallel I/O, pario-bib}, abstract = {Parallel input/output systems attempt to alleviate the performance bottleneck that affects many input/output intensive applications. In such systems, an understanding of the application access pattern, especially how requests from multiple processors for different file regions are logically related, is important for optimizing file system performance. We propose a method for automatically classifying these global access patterns and using these global classifications to select and tune file system policies to improve input/output performance. We demonstrate this approach on benchmarks and scientific applications using global classification to automatically select appropriate underlying Intel PFS input/output modes and server buffering strategies.}, comment = {No page numbers: web and CDROM proceedings only. See also madhyastha:thesis and related papers.} } @InProceedings{madhyastha:optimizing, author = {Tara M. Madhyastha and Christopher L. Elford and Daniel A. Reed}, title = {Optimizing Input/Output Using Adaptive File System Policies}, booktitle = {Proceedings of the Fifth NASA Goddard conference on Mass Storage Systems}, year = {1996}, month = {September}, pages = {II:493--514}, later = {madhyastha:thesis}, keyword = {multiprocessor file system, prefetching, caching, parallel I/O, multiprocessor file system interface, pario-bib}, comment = {See also madhyastha:thesis, and related papers.} } @PhdThesis{madhyastha:thesis, author = {Tara Madhyastha}, title = {Automatic Classification of Input/Output Access Patterns}, year = {1997}, month = {August}, school = {University of Illinois, Urbana-Champaign}, URL = {http://www-pablo.cs.uiuc.edu/People/tara/thesis.html}, keyword = {parallel I/O, file access pattern, pario-bib}, comment = {See also madhyastha:classification, madhyastha:global, madhyastha:adaptive, madhyastha:optimizing.} } @InProceedings{majumdar:characterize, author = {S. Majumdar and Yiu Ming Leung}, title = {Characterization of applications with {I/O} for processor scheduling in multiprogrammed parallel systems}, booktitle = {Proceedings of the 1994 IEEE Symposium on Parallel and Distributed Processing}, year = {1994}, pages = {298--307}, publisher = {IEEE Computer Society Press}, keyword = {workload characterization, scheduling, parallel I/O, pario-bib}, abstract = {Most studies of processor scheduling in multiprogrammed parallel systems have ignored the I/O performed by applications. Recent studies have demonstrated that significant I/O operations are performed by a number of different classes of parallel applications. This paper focuses on some basic issues that underlie scheduling in multiprogrammed parallel environments running applications with I/O. Characterization of the I/O behavior of parallel applications is discussed first. Based on simulation models this research investigates the influence of these I/O characteristics on processor scheduling.} } @InProceedings{majumdar:management, author = {Shikaresh Majumdar and Faisal Shad}, title = {Characterization and Management of {I/O} on Multiprogrammed Parallel Systems}, booktitle = {Proceedings of the 1995 IEEE Symposium on Parallel and Distributed Processing}, year = {1995}, month = {October}, pages = {502--510}, publisher = {IEEE Computer Society Press}, address = {San Antonio, TX}, keyword = {workload characterization, parallel I/O, pario-bib}, comment = {Analytical workload model. Simulation studies. See also kwong:distribution.} } @InProceedings{malluhi:pss, author = {Qutaibah Malluhi and William E. Johnston}, title = {Approaches for a Reliable High-Performance Distributed-Parallel Storage System}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {500--509}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, abstract = {The paper studies different schemes to enhance the reliability, availability and security of a high performance distributed storage system. We have previously designed a distributed parallel storage system that employs the aggregate bandwidth of multiple data servers connected by a high speed wide area network to achieve scalability and high data throughput. The general approach of the paper employs erasure error correcting codes to add data redundancy that can be used to retrieve missing information caused by hardware, software, or human faults. The paper suggests techniques for reducing the communication and computation overhead incurred while retrieving missing data blocks form redundant information. These techniques include clustering, multidimensional coding, and the full two dimensional parity scheme.} } @Article{manuel:logjam, author = {Tom Manuel}, title = {Breaking the Data-rate Logjam with arrays of small disk drives}, journal = {Electronics}, year = {1989}, month = {February}, volume = {62}, number = {2}, pages = {97--100}, keyword = {parallel I/O, disk array, I/O bottleneck, pario-bib}, comment = {See also Electronics, Nov. 88 p 24, Dec. 88 p 112. Trade journal short on disk arrays. Very good intro. No new technical content. Concentrates on RAID project. Lists several commercial versions. Mostly concentrates on single-controller versions.} } @Misc{maspar:pario, key = {Mas}, title = {Parallel File {I/O} Routines}, year = {1992}, howpublished = {MasPar Computer Corporation}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, comment = {Man pages for MasPar file system interface. They have either a single shared file pointer, after which all processors read or write in an interleaved pattern, or individual (plural) file pointer, allowing arbitrary access patterns. Updated in 1992 with many more features.} } @Article{masters:pario, author = {Del Masters}, title = {Improve Disk Subsystem Performance with Multiple Serial Drives in Parallel}, journal = {Computer Technology Review}, year = {1987}, month = {July}, volume = {7}, number = {9}, pages = {76--77}, keyword = {parallel I/O, pario-bib}, comment = {Information about the early Maximum Strategy disk array, which striped over 4 disk drives, apparently synchronously.} } @Article{matloff:multidisk, author = {Norman S. Matloff}, title = {A Multiple-Disk System for both Fault Tolerance and Improved Performance}, journal = {IEEE Transactions on Reliability}, year = {1987}, month = {June}, volume = {R-36}, number = {2}, pages = {199--201}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, reliability, disk shadowing, disk mirroring, pario-bib}, comment = {Variation on mirrored disks using more than 2 disks, to spread the files around. Good performance increases.} } @InProceedings{matthews:hippi, author = {Kevin C. Matthews}, title = {Experiences Implementing a Shared File System on a {HIPPI} Disk Array}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {77--88}, publisher = {IEEE Computer Society Press}, URL = {http://www.computer.org/conferen/mss95/matthews/matthews.htm}, keyword = {mass storage, distributed file system, parallel I/O, pario-bib}, abstract = {Shared file systems which use a physically shared mass storage device have existed for many years, although not on UNIX based operating systems. This paper describes a shared file system (SFS) that was implemented first as a special project on the Gray Research Inc. (CRI) UNICOS operating system. A more general product was then built on top of this project using a HIPPI disk array for the shared mass storage. The design of SFS is outlined, as well as some performance experiences with the product. We describe how SFS interacts with the OSF distributed file service (DFS) and with the CRI data migration facility (DMF). We also describe possible development directions for the SFS product.}, comment = {They use hardware to tie the same storage device (a disk array) to several computers (Cray C90s). They build a custom piece of hardware just to service semaphore requests very fast. HIPPI is the interconnect. Details have a lot to do with the synchronization between processors trying to update the same metadata; that's why they use the semaphores.} } @InProceedings{matthijs:framework, author = {F. Matthijs and Y. Berbers and P. Verbaeten}, title = {A flexible {I/O} framework for parallel and distributed systems}, booktitle = {Proceedings of the Fifth International Workshop on Object Orientation in Operating Systems}, year = {1995}, pages = {187--190}, publisher = {IEEE Computer Society Press}, keyword = {input-output programs, object-oriented, parallel systems; I/O performance, migration, dynamic load balancing, fault tolerance, parallel I/O, pario-bib}, abstract = {We propose a framework for I/O in parallel and distributed systems. The framework is highly customizable and extendible, and enables programmers to offer high level objects in their applications, without requiring them to struggle with the low level and sometimes complex details of high performance distributed I/O. Also, the framework exploits application specific information to improve I/O performance by allowing specialized programmers to customize the framework. Internally, we use indirection and granularity control to support migration, dynamic load balancing, fault tolerance, etc. for objects of the I/O system, including those representing application data.} } @InProceedings{mcmurdy:unstripe, author = {Ronald K. McMurdy and Badrinath Roysam}, title = {Improving {RAID-5} Performance by Un-striping Moderate-sized Files}, booktitle = {Proceedings of the 1993 International Conference on Parallel Processing}, year = {1993}, pages = {II--279--282}, publisher = {CRC Press}, address = {St. Charles, IL}, keyword = {parallel I/O, disk array, pario-bib, RAID}, comment = {Allocate small- and medium-sized files entirely on one disk rather than striped, to cut seek and rotation latency that would happen if they were spread across many disks.} } @InProceedings{meador:array, author = {Wes E. Meador}, title = {Disk Array Systems}, booktitle = {Proceedings of IEEE Compcon}, year = {1989}, month = {Spring}, pages = {143--146}, keyword = {parallel I/O, disk array, disk striping, pario-bib}, comment = {Describes {\em Strategy 2 Disk Array Controller}, which allows 4 or 8 drives, hardware striped, with parity drive and 0-4 hot spares. Up to 4 channels to cpu(s). Logical block interface. Defects, errors, formatting, drive failures all handled automatically. Peak 40 MB/s data transfer on each channel.} } @Misc{meiko:cs2, key = {Meiko}, title = {Computing Surface {CS-2}: Technical Overview}, year = {1993}, howpublished = {Meiko brochure S1002-10M115.01A}, keyword = {multiprocessor architecture, parallel I/O, pario-bib}, comment = {Three node types: 4 SPARC (50 MHz), 1 SPARC + two Fujitsu vector procs, or 1 SPARC + 3 I/O ports. All have a special communications processor that supports remote memory access. Each has 128 MBytes in 16 banks. Memory-memory transfer operations using ``remote DMA'', supported by the communications processor. User-level comm interface, with protection. Uses multistage network with 8x8 crossbar switches, looks like a fat tree. S/BUS, separate from the memory bus, is used for I/O, either directly, or through 2 SCSI and 1 ethernet. Control and diagnostic networks. Parallel file system stripes across multiple partitions. Can use RAID. Communications processor has its own MMU; control registers are mapped to user space. Network-wide virtual addresses can support shared memory? Remote store, atomic operations, global operations. Comm proc can support I/O threads -- but can it talk to the disks? OS based on Solaris 2, plus global shared memory, parallel file system, and capability-based protection. Machine is logically partitioned into login, devices, and parallel computation.} } @InProceedings{menasce:mass, author = {Daniel Menasc\'e and Odysseas Ionnis Pentakalos and Yelena Yesha}, title = {An Analytic Model of Hierarchical Mass Storage Systems With Network-Attached Storage Devices}, booktitle = {Proceedings of the 1996 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems}, year = {1996}, month = {May}, pages = {180--189}, publisher = {ACM Press}, address = {Philadelphia, PA}, keyword = {network attached peripherals, analytic model, mass storage, parallel I/O, pario-bib} } @InProceedings{menon:compare, author = {Jai Menon}, title = {A Performance Comparison of {RAID-5} and Log-structured Arrays}, booktitle = {Proceedings of the Fourth IEEE International Symposium on High Performance Distributed Computing}, year = {1995}, month = {August}, pages = {167--178}, keyword = {RAID, disk array, parallel I/O, pario-bib}, comment = {He compares a RAID-5 disk array with a log-structured array (LSA). An LSA is essentially an implementation of a log-structured file system inside a disk controller. The disk controller buffers up writes in a non-volatile cache; when the outgoing data buffer is full, it is written to some large contiguous region of the disk. The controller manages a directory to keep track of the various segment locations, and does garbage collection (cleaning). They can insert a compression algorithm in front of the cache so that they get better cache and disk utilization by storing data in compressed form. for fair comparison they compare with a similar feature in the plain RAID5 array.} } @Article{menon:daisy, author = {Jai Menon and Kent Treiber}, title = {{Daisy}: Virtual-disk Hierarchical Storage Manager}, journal = {ACM SIGMETRICS Performance Evaluation Review}, year = {1997}, month = {December}, volume = {25}, number = {3}, pages = {37--44}, keyword = {hierarchical storage, tape storage, tertiary storage, tape robot, parallel I/O, pario-bib}, comment = {Part of a special issue on parallel and distributed I/O.} } @Article{merchant:striping, author = {Arif Merchant and Philip S. Yu}, title = {Analytic Modeling and Comparisons of Striping Strategies of Replicated Disk Arrays}, journal = {IEEE Transactions on Computers}, year = {1995}, month = {March}, volume = {44}, number = {3}, pages = {419--431}, publisher = {IEEE Computer Society Press}, keyword = {disk striping, disk array, RAID, parallel I/O, pario-bib} } @InProceedings{merriam:triangle, author = {Drshal L. Merriam}, title = {Parallel Implementation of an Algorithm for {Delaunay} Triangulation}, booktitle = {Proceedings of Computational Fluid Dynamics}, year = {1992}, volume = {2}, pages = {907--912}, keyword = {parallel I/O, file system workload, pario-bib}, comment = {This application runs on the NASA Ames iPSC/860. This application has some I/O: reading in the input file, which is a set of x,y,z data points. I/O was really slow if formatted (ie, ASCII instead of binary) or sequential instead of parallel. Any input record could go to any processor; the first step in the algorithm (after the points are read in) is essentially a kind of sort to move points around to localize points and balance load.} } @Article{michael:future, author = {Gavin Michael and Andrew Chien}, title = {Future Multicomputers: Beyond Minimalist Multiprocessors?}, journal = {Computer Architecture News}, year = {1992}, month = {December}, volume = {20}, number = {5}, pages = {6--12}, keyword = {multiprocessor architecture, compiler, parallel I/O, pario-bib}, comment = {Includes some comments by Randy Katz about parallel I/O, in particular, distinguishing between ``fat'' nodes (with many disks, e.g., a RAID), and ``thin'' nodes (with one disk).} } @TechReport{milenkovic:model, author = {Milan Milenkovi\'c}, title = {A Model for Multiprocessor {I/O}}, year = {1989}, month = {July}, number = {89-CSE-30}, institution = {Dept. of Computer Science and Engineering, Southern Methodist University}, keyword = {multiprocessor I/O, I/O architecture, distributed system, pario-bib}, comment = {Advocates using dedicated server processors for all I/O, e.g., disk server, terminal server, network server. Pass I/O requests and data via messages or RPC calls over the interconnect (here a shared bus). Server handles packaging, blocking, caching, errors, interrupts, and so forth, freeing the main processors and the interconnect from all this activity. Benefits: encapsulates I/O-related stuff in specific places, accommodates heterogeneity, improves performance. Nice idea, but allows for an I/O bottleneck, unless server can handle all the demand. Otherwise would need multiple servers, more expensive than just multiple controllers.} } @InProceedings{miller:iobehave, author = {Ethan L. Miller and Randy H. Katz}, title = {Input/Output Behavior of Supercomputer Applications}, booktitle = {Proceedings of Supercomputing '91}, year = {1991}, month = {November}, pages = {567--576}, publisher = {IEEE Computer Society Press}, address = {Albuquerque, NM}, keyword = {file access pattern, supercomputer, disk caching, prefetching, pario-bib}, comment = {Same as miller:iobehave-tr except without the appendix outlining trace format. Included in pario-bibliography not because it measures a parallel workload, but because it is so often cited in the parallel-IO community.} } @Article{miller:jrama, author = {Ethan L. Miller and Randy H. Katz}, title = {{RAMA}: An Easy-To-Use, High-Performance Parallel File System}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {419--446}, publisher = {North-Holland (Elsevier Scientific)}, earlier = {miller:rama2}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, abstract = {Modern massively parallel file systems provide high bandwidth file access by striping files across arrays of disks attached to a few specialized I/O nodes. However, these file systems are hard to use and difficult to integrate with workstations and tertiary storage. RAMA addresses these problems by providing a high-performance massively parallel file system with a simple interface. RAMA uses hashing to pseudo-randomly distribute data to all of its disks, insuring high bandwidth regardless of access pattern and eliminating bottlenecks in file block accesses. This flexibility does not cause a large loss of performance - RAMA's simulated performance is within 10-15\% of the optimum performance of a similarly-sized striped file system, and is a factor of 4 or more better than a striped file system with poorly laid out data.} } @Article{miller:pario, author = {L. L. Miller and A. R. Hurson}, title = {Multiprogramming and concurrency in parallel file environments}, journal = {International Journal of Mini and Microcomputers}, year = {1991}, volume = {13}, number = {2}, pages = {37--45}, keyword = {parallel file system, parallel I/O, database, pario-bib}, comment = {This is really for databases. They identify two types of file access: one where the file can be operated on as a set of subfiles, each independently by a processor (what they call MIMD mode), and another where the file must be operated on with a centralized control (SIMD mode), in their case to search a B-tree whose nodes span the set of processors. Basically it is a host connected to a controller, that is connected to a set of small I/O processors, each of which has access to disk. In many ways a uniprocessor perspective. Paper design, with simulation results.} } @Article{miller:pass, author = {L.~L. Miller and S.~R. Inglett and A.~R. Hurson}, title = {{PASS}--- A Multiuser Parallel File System Based on Microcomputers}, journal = {Journal of systems and software}, year = {1992}, month = {September}, volume = {19}, number = {1}, pages = {75--83}, keyword = {parallel I/O, parallel file system, multiprocessor file system, pario-bib}, abstract = {Data intensive computer applications suffer from inadequate use of parallelism for processing data stored on secondary storage devices. Devices such as database machines are useful in some applications, but many applications are too small or specialized to use such technology. To bridge this gap, the authors introduce the parallel secondary storage (PASS) system. PASS is based on a network of microcomputers. The individual microcomputers are assigned to a unit of secondary storage and the operations of the microcomputers are initiated and monitored by a control processor. The file system is capable of acting as either an SIMD or an MIMD machine. Communication between the individual microcomputers and the control processor is described. The integration of the multiple microcomputers into the primitive operations on a file is examined. Finally, the strategies employed to enhance performance in the multiprogramming environment are discussed.} } @Article{miller:pfs, author = {L. L. Miller and S. R. Inglett}, title = {Enhancing performance in a parallel file system}, journal = {Microprocessing and Microprogramming}, year = {1994}, month = {May}, volume = {40}, number = {4}, pages = {261--274}, keyword = {parallel I/O, parallel file system, pario-bib} } @InProceedings{miller:radar, author = {Craig Miller and David G. Payne and Thanh N. Phung and Herb Siegel and Roy Williams}, title = {Parallel Processing of Spaceborne Imaging Radar Data}, booktitle = {Proceedings of Supercomputing '95}, year = {1995}, publisher = {IEEE Computer Society Press}, address = {San Diego, CA}, URL = {http://www.supercomp.org/sc95/proceedings/012_PAYN/SC95.HTM}, keyword = {parallel I/O, pario-bib}, abstract = {We discuss the results of a collaborative project on parallel processing of Synthetic Aperture Radar (SAR) data, carried out between the NASA/Jet Propulsion Laboratory (JPL), the California Institute of Technology (Caltech) and Intel Scalable Systems Division (SSD). Through this collaborative effort, we have successfully parallelized the most compute-intensive SAR correlator phase of the Spaceborne Shuttle Imaging Radar-C/X-Band SAR (SIR-C/X-SAR) code, for the Intel Paragon. We describe the data decomposition, the scalable high-performance I/O model, and the node-level optimizations which enable us to obtain efficient processing throughput. In particular, we point out an interesting double level of parallelization arising in the data decomposition which increases substantially our ability to support ``high volume'' SAR. Results are presented from this code running in parallel on the Intel Paragon. A representative set of SAR data, of size 800 Megabytes, which was collected by the SIR-C/X-SAR instrument aboard NASA's Space Shuttle in 15 seconds, is processed in 55 seconds on the Concurrent Supercomputing Consortium's Paragon XP/S 35+. This compares well with a time of 12 minutes for the current SIR-C/X-SAR processing system at JPL. For the first time, a commercial system can process SIR-C/X-SAR data at a rate which is approaching the rate at which the SIR-C/X-SAR instrument can collect the data. This work has successfully demonstrated the viability of the Intel Paragon supercomputer for processing ``high volume'' Synthetic Aperture Radar data in near real-time.}, comment = {Available only on CD-ROM and WWW.} } @InProceedings{miller:rama, author = {Ethan L. Miller and Randy H. Katz}, title = {{RAMA:} A File System for Massively-Parallel Computers}, booktitle = {Proceedings of the Twelfth IEEE Symposium on Mass Storage Systems}, year = {1993}, pages = {163--168}, later = {miller:rama2}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {The multiprocessor's file system acts as a block cache for tertiary storage. Disk space is broken into ``lines'' of a few MB. Each line has a descriptor telling what blocks it has, and their status. (fileid, offset) hashed to find (disk, linenum). Intrinsic metadata stored at start of each file; positional metadata implicit in hashing, and line descriptors. Sequentiality parameter puts several blocks of a file in the same line, to improve medium-sized requests (otherwise generate lots of request-response net traffic). Not clear on best choice of size. No mention of atomicity wrt concurrent writes to same data. Blocks migrate to tertiary storage as they get old. Fetched on demand, by block (not file). Self-describing blocks have ids in block -- leads to screwy block sizes?} } @InProceedings{miller:rama2, author = {Ethan L. Miller and Randy H. Katz}, title = {{RAMA}: Easy Access to a High-Bandwidth Massively Parallel File System}, booktitle = {Proceedings of the 1995 USENIX Technical Conference}, year = {1995}, month = {January}, pages = {59--70}, earlier = {miller:rama}, later = {miller:jrama}, keyword = {parallel file system, pario-bib}, comment = {Simulation results. RAMA distributes blocks of each file randomly across disks, which are attached to all processor nodes, using a hash function. Thus there is no centralized metadata. The big benefit is uniform performance regardless of access pattern; they found one situation where it was 10\% slower than an optimal striped layout, but many cases where they were as much as 4 times faster than bad striped data layouts. So, they can give reasonable performance without the need for programmer- or manager-specified data layouts.} } @Article{milligan:bifs, author = {P. Milligan and L. C. Waring and A. S. C. Lee}, title = {{BIFS}: {A} filing system for multiprocessor based systems}, journal = {Microprocessing and Microprogramming}, year = {1991}, volume = {31}, pages = {9--12}, note = {Euromicro~'90 conference, Amsterdam}, keyword = {multiprocessor file system, pario-bib}, comment = {A simple file system for a transputer network, attached to a single disk device. Several procs are devoted to the file system, but really just act as buffers for the host processor that runs the disk. They provide sequential, random access, and indexed files, either byte- or record-oriented. Some prototypes; no results. They add buffering and double buffering, but don't really get into anything interesting.} } @Article{miya:biblio, author = {Eugene N. Miya}, title = {Multiprocessor/Distributed Processing Bibliography}, journal = {Computer Architecture News}, year = {1985}, month = {March}, volume = {13}, number = {1}, pages = {27--29}, note = {Much updated since then, now kept on-line}, keyword = {bibliography, parallel computing, distributed computing, pario-bib}, comment = {This reference is the original publication of Eugene's annotated bibliography. It has grown tremendously and is now huge. Because of the copyright considerations, you can't just nab it off the net, but it is free for the asking from Eugene. Send mail to eugene@nas.nasa.gov.} } @InProceedings{mogi:parity, author = {Kazuhiko Mogi and Masaru Kitsuregawa}, title = {Dynamic Parity Stripe Reorganizations for {RAID5} Disk Arrays}, booktitle = {Proceedings of the Third International Conference on Parallel and Distributed Information Systems}, year = {1994}, month = {September}, pages = {17--26}, keyword = {disk array, RAID, disk striping, parallel I/O, pario-bib}, abstract = {RAID5 disk arrays provide high performance and high reliability for reasonable cost. However RAIDS suffers a performance penalty during block updates. We examine the feasibility of using "dynamic parity striping" to improve the performance of block updates. Instead of updating each block independently, this method buffers a number of updates, generates a new stripe composed of the newly updated blocks, then writes the full stripe back to disk. Two implementations are considered in this paper. One is a log-structured file system (LFS) based method and the other is Virtual Striping. Both methods achieve much higher performance than conventional approaches. The performance characteristics of the LFS based method and the Virtual Striping method are clarified.} } @Article{mokhoff:pario, author = {Nicholas Mokhoff}, title = {Parallel Disk Assembly Packs 1.5 {GBytes}, runs at 4 {MBytes/s}}, journal = {Electronic Design}, year = {1987}, month = {November}, pages = {45--46}, keyword = {parallel I/O, I/O, disk architecture, disk striping, reliability, pario-bib}, comment = {Commercially available: Micropolis Systems' Parallel Disk 1800 series. Four disks plus one parity disk, synchronized and byte-interleaved. SCSI interface. Total capacity 1.5 GBytes, sustained transfer rate of 4 MBytes/s. MTTF 140,000 hours. Hard and soft errors corrected in real-time. Failed drives can be replaced while system is running.} } @TechReport{montague:swift, author = {Bruce R. Montague}, title = {The {Swift/RAID} Distributed Transaction Driver}, year = {1993}, month = {January}, number = {UCSC-CRL-93-99}, institution = {UC Santa Cruz}, keyword = {RAID, parallel I/O, distributed file system, transaction, pario-bib}, comment = {See other Swift papers, e.g., cabrera:pario and long:swift-raid. This paper describes the basic idea of a using a transaction driver to implement RAID over a distributed system. Then it spends most of the time describing the details of the implementation. The basic idea is that processors execute transaction drivers, which provide virtual CPUs to execute scripts of atomic 'instructions', where the instructions are high-level things like read block, write block, compute parity, etc. The transaction driver multiprocesses several scripts if necessary. (Although they describe it in the context of a RAID implementation it certainly could be used for other complex distributed services.) The instructions are often transaction pairs, which compile into a pair of instructions, one for this node and one for the remote node. This node sends the program to the remote node, and they execute them separately, keeping synchronized for transaction pairs when necessary. See also the newer paper in Computing Surveys, long:swift-raid.} } @Article{moon:declustering, author = {Bongki Moon and Joel H. Saltz}, title = {Scalability Analysis of Declustering Methods for for Multidimensional Range Queries}, journal = {IEEE Transactions on Knowledge and Data Engineering}, year = {1997}, note = {To appear}, URL = {ftp://hpsl.cs.umd.edu/pub/papers/ieee_tkde.ps.Z}, abstract = {Efficient storage and retrieval of multi-attribute datasets have become one of the essential requirements for many data-intensive applications. The Cartesian product file has been known as an effective multi-attribute file structure for partial-match and best-match queries. Several heuristic methods have been developed to decluster Cartesian product files across multiple disks to obtain high performance for disk accesses. Though the scalability of the declustering methods becomes increasingly important for systems equipped with a large number of disks, no analytic studies have been done so far. In this paper we derive formulas describing the scalability of two popular declustering methods Disk Modulo and Fieldwise Xor for range queries, which are the most common type of queries. These formulas disclose the limited scalability of the declustering methods and are corroborated by extensive simulation experiments. From the practical point of view, the formulas given in this paper provide a simple measure which can be used to predict the response time of a given range query and to guide the selection of a declustering method under various conditions.} } @Article{moore:ddio, author = {Jason A. Moore and Michael J. Quinn}, title = {Enhancing Disk-Directed {I/O} for Fine-Grained Redistribution of File Data}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {477--499}, publisher = {North-Holland (Elsevier Scientific)}, keyword = {parallel I/O, multiprocessor file system, interprocessor communication, pario-bib}, comment = {They propose several enhancements to disk-directed I/O (see kotz:diskdir) that aim to improve performance on fine-grained distributions, that is, where each block from the disk is broken into small pieces that are scattered among the compute processors. One enhancement combines multiple pieces, possibly from separate disk blocks, into a single message. Another is to use two-phase I/O (see delrosario:two-phase), but to use disk-directed I/O to read data from the disks into CP memories, efficiently, then permute. This latter technique is probably faster than normal two-phase I/O that uses a traditional file system, not disk-directed I/O, for the read.} } @InProceedings{moore:detection, author = {Jason A. Moore and Philip J. Hatcher and Michael J. Quinn}, title = {Efficient Data-Parallel Files via Automatic Mode Detection}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {1--14}, publisher = {ACM Press}, address = {Philadelphia}, URL = {http://www.cs.orst.edu/~moorej/iopads.ps.Z}, keyword = {parallel I/O, data parallelism, pario-bib}, abstract = {Parallel languages rarely specify parallel I/O constructs, and existing commercial systems provide the programmer with a low-level I/O interface. We present design principles for integrating I/O into languages and show how these principles are applied to a virtual-processor-oriented language. We illustrate how machine-independent modes are used to support both high performance and generality. We describe an automatic mode detection technique that saves the programmer from extra syntax and low-level file system details. We show how virtual processor file operations, typically small by themselves, are combined into efficient large-scale file system calls. Finally, we present a variety of benchmark results detailing design tradeoffs and the performance of various modes.}, comment = {Updated version of TR 95-80-9. See moore:stream. Interesting approach, where they permit a fairly normal fread and fwrite kind of interface, with each VP having its own stream. They choose their own format for the file, and switch between formats (and internal buffering) depending on the particulars of the fread and fwrite parameters. They seem to have good performance, and a familiar interface. They are left with a non-standard file format.} } @TechReport{moore:ocean, author = {Jason A. Moore}, title = {Parallel {I/O} Requirements of Four Oceanography Applications}, year = {1995}, month = {January}, number = {95-80-1}, institution = {Oregon State University}, URL = {http://www.cs.orst.edu/~moorej/ocean.ps.Z}, keyword = {data parallel, file system workload, parallel I/O, pario-bib}, abstract = {Brief descriptions of the I/O requirements for four production oceanography programs running at Oregon State University are presented. The applications all rely exclusively on array-oriented, sequential file operations. Persistent files are used for checkpointing and movie making, while temporary files are used to store out-of-core data.}, comment = {See moore:detection, moore:stream. Only three pages.} } @TechReport{moore:stream-tr, author = {Jason A. Moore and Philip J. Hatcher and Michael J. Quinn}, title = {Stream*: Fast, Flexible, Data-parallel {I/O}}, year = {1994}, number = {94-80-13}, institution = {Oregon State University}, note = {Updated September 1995.}, later = {moore:stream}, URL = {http://www.cs.orst.edu/~moorej/streamstar.ps.Z}, keyword = {data parallel, parallel I/O, pario-bib}, abstract = {Although hardware supporting parallel file I/O has improved greatly since the introduction of first-generation parallel computers, the programming interface has not. Each vendor provides a different logical view of parallel files as well as nonportable operations for manipulating files. Neither do parallel languages provide standards for performing I/O. In this paper, we describe a view of parallel files for data-parallel languages, dubbed Stream*, in which each virtual processor writes to and reads from its own stream. In this scheme each virtual processor's I/O operations have the same familiar, unambiguous meaning as in a sequential C program. We demonstrate how I/O operations in Stream* can run as fast as those of vendor-specific parallel file systems on the operations most often encountered in data-parallel programs. We show how this system supports general virtual processor operations for debugging and elemental functions. Finally, we present empirical results from a prototype Stream* system running on a Meiko CS-2 multicomputer.}, comment = {See moore:stream; nearly identical. See also moore:detection. This paper gives a little bit earlier description of the Stream* idea than does moore:detection, but you'd be pretty much complete just reading moore:detection.} } @InProceedings{moore:stream, author = {Jason A. Moore and Philip J. Hatcher and Michael J. Quinn}, title = {Stream*: Fast, Flexible, Data-parallel {I/O}}, booktitle = {Parallel Computing: State-of-the-Art and Perspectives (ParCo~'95)}, year = {1995}, month = {September}, pages = {287--294}, publisher = {Elsevier Science}, earlier = {moore:stream-tr}, keyword = {data parallel, parallel I/O, pario-bib} } @InProceedings{more:mtio, author = {Sachin More and Alok Choudhary and Ian Foster and Ming Q. Xu}, title = {{MTIO} A Multi-Threaded Parallel {I/O} System}, booktitle = {Proceedings of the Eleventh International Parallel Processing Symposium}, year = {1997}, month = {April}, URL = {http://www.ece.nwu.edu/~ssmore/ipps97.ps}, keyword = {verify pages, threads, parallel I/O, pario-bib}, abstract = {This paper presents the design and evaluation of a multi-threaded runtime library for parallel I/O. We extend the multi-threading concept to separate the compute and I/O tasks in two separate threads of control. Multi-threading in our design permits a) asynchronous I/O even if the underlying file system does not support asynchronous I/O; b) copy avoidance from the I/O thread to the compute thread by sharing address space; and c) a capability to perform collective I/O asynchronously without blocking the compute threads. Further, this paper presents techniques for collective I/O which maximize load balance and concurrency while reducing communication overhead in an integrated fashion. Performance results on IBM SP2 for various data distributions and access patterns are presented. The results show that there is a tradeoff between the amount of concurrency in I/O and the buffer size designated for I/O; and there is an optimal buffer size beyond which benefits of larger requests diminish due to large communication overheads.} } @Article{moren:controllers, author = {William D. Moren}, title = {Design of Controllers is Key Element in Disk Subsystem Throughput}, journal = {Computer Technology Review}, year = {1988}, month = {Spring}, pages = {71--73}, keyword = {parallel I/O, disk architecture, pario-bib}, comment = {A short paper on some basic techniques used by disk controllers to improve throughput: seek optimization, request combining, request queuing, using multiple drives in parallel, scatter/gather DMA, data caching, read-ahead, cross-track read-ahead, write-back caching, segmented caching, reduced latency (track buffering), and format skewing. [Most of these are already handled in Unix file systems.]} } @InProceedings{mourad:raid, author = {Antoine N. Mourad and W. Kent Fuchs and Daniel G. Saab}, title = {Performance of Redundant Disk Array Organizations in Transaction Processing Environments}, booktitle = {Proceedings of the 1993 International Conference on Parallel Processing}, year = {1993}, pages = {I--138--145}, publisher = {CRC Press}, address = {St. Charles, IL}, keyword = {parallel I/O, disk array, pario-bib, RAID}, comment = {Transaction-processing workload dominated by small I/Os. They compare RAID~5, Parity Striping (which was designed for TP because it avoids lots of seeks on medium-sized requests, by declustering parity but not data), mirroring, and RAID~0. RAID~5 does {\em better\/} than parity striping due to its load balancing ability on the skewed workload. RAID~5 also better as the load increases.} } @InProceedings{mowry:prefetch, author = {Todd C. Mowry and Angela K. Demke and Orran Krieger}, title = {Automatic compiler-inserted I/O prefetching for out-of-core applications}, booktitle = {Proceedings of the 1996 Symposium on Operating Systems Design and Implementation}, year = {1996}, month = {October}, pages = {3--17}, publisher = {USENIX Association}, URL = {http://www.usenix.org/publications/library/proceedings/osdi96/mowry.html}, keyword = {compiler, prefetch, parallel I/O, pario-bib}, abstract = {Current operating systems offer poor performance when a numeric application's working set does not fit in main memory. As a result, programmers who wish to solve ``out-of-core'' problems efficiently are typically faced with the onerous task of rewriting an application to use explicit I/O operations (e.g., read/write). In this paper, we propose and evaluate a fully-automatic technique which liberates the programmer from this task, provides high performance, and requires only minimal changes to current operating systems. In our scheme, the compiler provides the crucial information on future access patterns without burdening the programmer, the operating system supports non-binding prefetch and release hints for managing I/O, and the operating system cooperates with a run-time layer to accelerate performance by adapting to dynamic behavior and minimizing prefetch overhead. This approach maintains the abstraction of unlimited virtual memory for the programmer, gives the compiler the flexibility to aggressively move prefetches back ahead of references, and gives the operating system the flexibility to arbitrate between the competing resource demands of multiple applications. We have implemented our scheme using the SUIF compiler and the Hurricane operating system. Our experimental results demonstrate that our fully-automatic scheme effectively hides the I/O latency in out-of-core versions of the entire NAS Parallel benchmark suite, thus resulting in speedups of roughly twofold for five of the eight applications, with two applications speeding up by threefold or more.}, comment = {Best Paper Award.} } @Article{moyer:application, author = {S. Moyer and V. S. Sunderam}, title = {Parallel {I/O} as a Parallel Application}, journal = {International Journal of Supercomputer Applications}, year = {1995}, month = {Summer}, volume = {9}, number = {2}, pages = {95--107}, keyword = {parallel I/O, pario-bib}, comment = {An overview of PIOUS and its performance. Results for partitioned and self-scheduled access pattern. See other moyer:* papers. The big thing about PIOUS over previous parallel file systems is its internal use of transactions for concurrency control and user-selectable fault-tolerance guarantees, and its optional support of user-level transactions.} } @TechReport{moyer:characterize, author = {Steven A. Moyer and V.~S. Sunderam}, title = {Characterizing Concurrency Control Performance for the {PIOUS} Parallel File System}, year = {1995}, month = {June}, number = {CSTR-950601}, institution = {Emory University}, later = {moyer:jcharacterize}, URL = {ftp://ftp.mathcs.emory.edu/pub/cstr/CSTR950601.ps}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {Parallel file systems employ data declustering to increase I/O throughput. But because a single read or write operation can generate data accesses on multiple independent storage devices, a concurrency control mechanism must be employed to retain familiar file access semantics. Concurrency control negates some of the performance benefits of data declustering by introducing additional file access overhead. This paper examines the performance characteristics of the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. Results demonstrate that linearizability of file access operations is provided without loss of scalability or stability.}, comment = {``substantially different material than presented in a previous report,'' moyer:scalable-tr. But it seems like the moyer:scalable IOPADS paper is largely a subset of this TR. He describes how they use volatile transactions, and does some experiments with PIOUS to measure their efficiency. Basically, they use a 2-phase commit protocol, using timeouts to detect deadlock and transaction aborts to remedy the deadlock. Results for partitioned and sequential access patterns.} } @Article{moyer:jcharacterize, author = {Steven A. Moyer and V.S. Sunderam}, title = {Characterizing Concurrency Control Performance for the {PIOUS} Parallel File System}, journal = {Journal of Parallel and Distributed Computing}, year = {1996}, month = {October}, volume = {38}, number = {1}, pages = {81--91}, earlier = {moyer:characterize}, keyword = {parallel I/O, multiprocessor file system, pario-bib} } @InProceedings{moyer:pario, author = {Steven A. Moyer and V. S. Sunderam}, title = {A Parallel {I/O} System for High-Performance Distributed Computing}, booktitle = {Proceedings of the IFIP WG10.3 Working Conference on Programming Environments for Massively Parallel Distributed Systems}, year = {1994}, URL = {ftp://ftp.mathcs.emory.edu/pub/vss/piousifip94.ps}, keyword = {parallel I/O, parallel file system, workstation cluster, file system interface, pario-bib}, comment = {See moyer:pious. A further description of the PIOUS parallel file system for cluster computing. (Beta-test version available for ftp). They support parafiles, which are collections of segments, each segment residing on a different server. The segments can be viewed separately or can be interleaved into a linear sequence using an arbitrary chunk size. They also support transactions to support sequential consistency.} } @InProceedings{moyer:pious, author = {Steven A. Moyer and V. S. Sunderam}, title = {{PIOUS:} A Scalable Parallel {I/O} System for Distributed Computing Environments}, booktitle = {Proceedings of the Scalable High-Performance Computing Conference}, year = {1994}, pages = {71--78}, URL = {ftp://ftp.mathcs.emory.edu/pub/vss/piousshpcc94.ps}, keyword = {parallel I/O, parallel file system, workstation cluster, file system interface, pario-bib}, comment = {Basically, I/O for clusters of workstations; ideally, it is parallel, heterogeneous, fault tolerant, etc. File servers are independent, have only a local view. Single server used to coordinate open(). Client libraries implement the API and depend on the servers only for storage mechanism. Servers use transactions internally -- but usually these are lightweight transactions, only used for concurrency control and not recovery. Full transactions are supported for times when the user wants the extra fault tolerance. They have files that are in some sense 2-dimensional. Sequential consistency. User-controllable fault tolerance. Performance: 2 clients max out the transport (ethernet). ``Stable'' mode is slow, as is self-scheduled mode. No client caching. See moyer:pario.} } @InCollection{moyer:scalable-book, author = {Steven A. Moyer and V.~S. Sunderam}, title = {Scalable Concurrency Control for Parallel File Systems}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {10}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {225--243}, publisher = {Kluwer Academic Publishers}, earlier = {moyer:scalable}, keyword = {parallel I/O, parallel file system, concurrency control, synchronization, transaction, pario-bib}, abstract = {Parallel file systems employ data declustering to increase \mbox{I/O} throughput. As a result, a single read or write operation can generate concurrent data accesses on multiple storage devices. Unless a concurrency control mechanism is employed, familiar file access semantics are likely to be violated. This paper details the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. Performance results are presented demonstrating that sequential consistency semantics can be provided without loss of system scalability.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @TechReport{moyer:scalable-tr, author = {Steven A. Moyer and V.~S. Sunderam}, title = {Scalable Concurrency Control for Parallel File Systems}, year = {1995}, month = {February}, number = {CSTR-950202}, institution = {Emory University}, later = {moyer:scalable}, URL = {ftp://ftp.mathcs.emory.edu/pub/cstr/CSTR950202.ps}, keyword = {parallel I/O, parallel file system, pario-bib}, abstract = {Parallel file systems employ data declustering to increase I/O throughput. As a result, a single read or write operation can generate concurrent data accesses on multiple storage devices. Unless a concurrency control mechanism is employed, familiar file access semantics are likely to be violated. This paper details the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. Performance results are presented demonstrating that sequential consistency semantics can be provided without loss of system scalability.}, comment = {They describe {\em volatile transactions\/} as a way of providing the appopriate sequential consistency among file-read and -write operations (a feature not provided by most file systems). Their PIOUS library implements these transactions with strict 2-phase locking. They show some performance results, though only on a limited and relatively simple benchmark. If nothing else this paper reminds us all that atomicity of file-read and -write requests should be available to the user (eg, note how they are optional in Vesta). Published as moyer:scalable.} } @InProceedings{moyer:scalable, author = {Steven A. Moyer and V. S. Sunderam}, title = {Scalable Concurrency Control for Parallel File Systems}, booktitle = {Proceedings of the IPPS~'95 Workshop on Input/Output in Parallel and Distributed Systems}, year = {1995}, month = {April}, pages = {90--106}, earlier = {moyer:scalable-tr}, later = {moyer:scalable-book}, keyword = {parallel I/O, pario-bib}, abstract = {Parallel file systems employ data declustering to increase I/O throughput. As a result, a single read or write operation can generate concurrent data accesses on multiple storage devices. Unless a concurrency control mechanism is employed, familiar file access semantics are likely to be violated. This paper details the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. Performance results are presented demonstrating that sequential consistency semantics can be provided without loss of system scalability.}, comment = {Seems to be a subset of moyer:scalable-tr, and for that matter, moyer:characterize. Results for partitioned access pattern.} } @Misc{mpi-forum:mpi2, key = {MPI}, title = {{MPI-2}: Extensions to the Message-Passing Interface}, year = {1997}, month = {July}, howpublished = {{The MPI Forum}}, earlier = {mpi-ioc:mpi-io5}, URL = {http://www.mpi-forum.org/docs/docs.html}, keyword = {parallel I/O, message-passing, multiprocessor file system interface, pario-bib}, comment = {This is the definition of the MPI2 message-passing standard, which includes an interface for parallel I/O. Supercedes mpi-ioc:mpi-io5 and earlier versions. See the MPI2 web page at http://www.mpi-forum.org. The I/O section is at http://www.mpi-forum.org/docs/mpi-20-html/node172.html.} } @Misc{mpi-ioc:mpi-io5, key = {MPIO}, title = {{MPI-IO:} A Parallel File {I/O} Interface for {MPI}}, year = {1996}, month = {April}, howpublished = {{The MPI-IO Committee}}, note = {Version 0.5.}, earlier = {corbett:mpi-io4}, later = {mpi-forum:mpi2}, keyword = {parallel I/O, message-passing, multiprocessor file system interface, pario-bib}, comment = {Supercedes corbett:mpi-io4 and earlier versions. See the MPI-IO Web page at http://parallel.nas.nasa.gov/MPI-IO/.} } @InBook{mpi2-io, author = {{Message-Passing Interface Forum}}, title = {{MPI-2.0}: Extensions to the Message-Passing Interface}, chapter = {9}, year = {1997}, month = {June}, publisher = {MPI Forum}, URL = {http://www.mpi-forum.org/docs/docs.html}, keyword = {MPI, message passing, parallel computing, library, parallel I/O, pario-bib}, comment = {Chapter 9 is about I/O extensions.} } @InProceedings{mueck:multikey, author = {T.~A. Mueck and J. Witzmann}, title = {Multikey Index Support for Tuple Sets on Parallel Mass Storage Systems}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {136--145}, URL = {http://www.computer.org/conferen/mss95/mueck/mueck.htm}, keyword = {parallel database, mass storage, parallel I/O, pario-bib}, abstract = {The development and evaluation of a tuple set manager (TSM) based on multikey index data structures is a main part of the PARABASE project at the University of Vienna. The TSM provides access to parallel mass storage systems using tuple sets instead of conventional files as the central data structure for application programs. A proof-of-concept prototype TSM is already implemented and operational on an iPSC/2. It supports tuple insert and delete operations as well as exact match, partial match, and range queries at system call level. Available results are from this prototype on the one hand and from various performance evaluation figures. The evaluation results demonstrate the performance gain achieved by the implementation of the tuple set management concept on a parallel mass storage system.} } @InProceedings{muller:multi, author = {Keith Muller and Joseph Pasquale}, title = {A High Performance Multi-Structured File System Design}, booktitle = {Proceedings of the Thirteenth ACM Symposium on Operating Systems Principles}, year = {1991}, pages = {56--67}, publisher = {ACM Press}, address = {Pacific Grove, CA}, keyword = {file system, disk striping, disk mirroring, pario-bib} } @InProceedings{muntz:failure, author = {Richard R. Muntz and John C. S. Lui}, title = {Performance Analysis of Disk Arrays Under Failure}, booktitle = {Proceedings of the 16th International Conference on Very Large Data Bases}, year = {1990}, pages = {162--173}, keyword = {disk array, parallel, performance analysis, pario-bib}, comment = {Looked at RAID5 when in failure mode. For small-reads workload, could only get 50\% of normal. So they decouple cluster size and parity-group size, so that they decluster over more disks than group size; during failure, this causes less of a load increase on surviving disks.} } @InProceedings{mutisya:cache, author = {Gerald Mutisya and Bradley M. Broom}, title = {Distributed File Caching for the {AP1000}}, booktitle = {Proceedings of the Third Fujitsu-ANU CAP Workshop}, year = {1992}, month = {November}, keyword = {distributed file system, multiprocessor file system, pario-bib}, comment = {See also broom:acacia, broom:impl, lautenbach:pfs, and broom:cap. They examine ways to manage a distributed file cache, without replication. Since there is no replication, the concurrency control problems boil down to providing atomicity for multi-block, multi-site requests. This is handled essentially by serializing the request: send the request to the first site, and have it forward the request from site to site as each block is processed. This works fine but completely serializes all multi-block requests, somewhat defeating the purpose. Thus, they get concurrency between requests, by having multiple servers, but no parallelism within requests.} } @Article{myllymaki:buffering, author = {Jussi Myllymaki and Miron Livny}, title = {Efficient buffering for concurrent disk tape {I/O}}, journal = {Performance Evaluation: An International Journal}, year = {1996}, volume = {27/28}, pages = {453--471}, note = {Performance~'96}, keyword = {buffering, file caching, tertiary storage, tape robot, file migration, parallel I/O, pario-bib}, comment = {Ways to use secondary and tertiary storage in parallel, and buffering mechanisms for applications with concurrent I/O requirements.} } @InProceedings{nagaraj:hpfs, author = {U. Nagaraj and U. S. Shukla and A. Paulraj}, title = {Design and Evaluation of a High Performance File System for Message Passing Parallel Computers}, booktitle = {Proceedings of the Fifth International Parallel Processing Symposium}, year = {1991}, pages = {549--554}, keyword = {multiprocessor file system, pario-bib}, comment = {They describe a file system for general message-passing, distributed-memory, separate I/O and compute node, multicomputers. They provide few details, although they cite a lot of their tech reports. There are a few simulation results, but none show anything unintuitive.} } @InProceedings{nagashima:pario, author = {Umpei Nagashima and Takashi Shibata and Hiroshi Itoh and Minoru Gotoh}, title = {An Improvement of {I/O} Function for Auxiliary Storage: {Parallel I/O} for a Large Scale Supercomputing}, booktitle = {Proceedings of the 1990 ACM International Conference on Supercomputing}, year = {1990}, pages = {48--59}, keyword = {parallel I/O, pario-bib}, comment = {Using parallel I/O channels to access striped disks, in parallel from a supercomputer. They {\em chain}\/ (i.e., combine) requests to a disk for large contiguous accesses.} } @InProceedings{nakajo:ionet, author = {H. Nakajo and S. Ohtani and T. Matsumoto and M. Kohata and K. Hiraki and Y. Kaneda}, title = {An {I/O} Network for Architecture of the Distributed Shared-Memory Massively}, booktitle = {Proceedings of the 11th ACM International Conference on Supercomputing}, year = {1997}, month = {July}, publisher = {ACM Press}, keyword = {verify pages, collective I/O, multiprocessor file system, parallel I/O, pario-bib} } @InProceedings{nakajo:jump1, author = {Hironori Nakajo}, title = {A Simulation-based Evaluation of a Disk {I/O} Subsystem for a Massively Parallel Computer: {JUMP-1}}, booktitle = {Proceedings of the Sixteenth International Conference on Distributed Computer Systems}, year = {1996}, month = {May}, pages = {562--569}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, I/O architecture, pario-bib}, abstract = {JUMP-1 is a distributed shared-memory massively parallel computer and is composed of multiple clusters of interconnected network called RDT (Recursive Diagonal Torus). Each cluster in JUMP-1 consists of 4 element processors, secondary cache memories, and 2 MBP (Memory Based Processor) for high-speed synchronization and communication among clusters. The I/O subsystem is connected to a cluster via a high-speed serial link called STAFF-Link. The I/O buffer memory is mapped onto the JUMP-1 global shared-memory to permit each I/O access operation as memory access. In this paper we describe evaluation of the fundamental performance of the disk I/O subsystem using event-driven simulation, and estimated performance with a Video On Demand (VOD) application.} } @InProceedings{natarajan:clusterio, author = {Chita Natarajan and Ravishankar K. Iyer}, title = {Measurement and Simulation Based Performance Analysis of Parallel {I/O} in a High-Performance Cluster System}, booktitle = {Proceedings of the 1996 IEEE Symposium on Parallel and Distributed Processing}, year = {1996}, month = {October}, pages = {332--339}, publisher = {IEEE Computer Society Press}, keyword = {performance analysis, parallel I/O, pario-bib}, abstract = {This paper presents a measurement and simulation based study of parallel I/O in a high-performance cluster system: the Pittsburgh Supercomputing Center (PSC) DEC Alpha Supercluster. The measurements were used to characterize the performance bottlenecks and the throughput limits at the compute and I/O nodes, and to provide realistic input parameters to PioSim, a simulation environment we have developed to investigate parallel I/O performance issues in cluster systems. PioSim was used to obtain a detailed characterization of parallel I/O performance, in the high performance cluster system, for different regular access patterns and different system configurations. This paper also explores the use of local disks at the compute nodes for parallel I/O, and finds that the local disk architecture outperforms the traditional parallel I/O over remote I/O node disks architecture, even when as much as 68-75\% of the requests from each compute node goes to remote disks.} } @TechReport{ncr:3600, key = {NCR}, title = {{NCR 3600} Product Description}, year = {1991}, month = {September}, number = {ST-2119-91}, institution = {NCR}, address = {San Diego}, keyword = {multiprocessor architecture, MIMD, parallel I/O, pario-bib}, comment = {Has 1-32 50MHz Intel 486 processors. Parallel independent disks on the disk nodes, separate from the processor nodes. Tree interconnect. Aimed at database applications.} } @InProceedings{ng:diskarray, author = {Spencer Ng}, title = {Some Design Issues of Disk Arrays}, booktitle = {Proceedings of IEEE Compcon}, year = {1989}, month = {Spring}, pages = {137--142}, note = {San Francisco, CA}, keyword = {parallel I/O, disk array, pario-bib}, comment = {Discusses disk arrays and striping. Transfer size is important to striping success: small size transfers are better off with independent disks. Synchronized rotation is especially important for small transfer sizes, since then the increased rotational delays dominate. Fine grain striping involves less assembly/disassembly delay, but coarse grain (block) striping allows for request parallelism. Fine grain striping wastes capacity due to fixed size formatting overhead. He also derives exact MTTF equation for 1-failure tolerance and on-line repair.} } @InProceedings{ng:interleave, author = {S. Ng and D. Lang and R. Selinger}, title = {Trade-offs Between Devices and Paths in Achieving Disk Interleaving}, booktitle = {Proceedings of the 15th Annual International Symposium on Computer Architecture}, year = {1988}, pages = {196--201}, keyword = {parallel I/O, disk architecture, disk caching, I/O bottleneck, pario-bib}, comment = {Compares four different ways of restructuring IBM disk controllers and channels to obtain more parallelism. They use parallel heads or parallel actuators. The best results come when they replicate the control electronics to maintain the number of data paths through the controller. Otherwise the controller bottleneck reduces performance. Generally, for large or small transfer sizes, parallel heads with replication gave better performance.} } @Article{nicastro:fft, author = {L. Nicastro and N. {D'Amico}}, title = {An optimized mass storage {FFT} for vector computers}, journal = {Parallel Computing}, year = {1995}, month = {March}, volume = {21}, pages = {423--432}, publisher = {North-Holland (Elsevier Scientific)}, keyword = {out-of-core algorithm, parallel I/O algorithm, scientific computing, vector computer, pario-bib}, comment = {They describe an out-of-core FFT algorithm for vector computers (one disk, one vector processor). They implemented it on a Convex and show good performance. Basically, the segment the array, do FFTs on each segment, and do some transposing and other stuff to combine the segments. Each segment is basically a memoryload. Seems parallelizable too.} } @TechReport{nickolls:dpio, author = {John R. Nickolls and Ernie Rael}, title = {Data Parallel {Unix} Input/Output for a Massively Parallel Processor}, year = {1993}, number = {MP/P-17.93}, institution = {MasPar Computer Corporation}, keyword = {Unix, parallel I/O, data parallel, pario-bib}, comment = {Cite nickolls:maspar-io.} } @InProceedings{nickolls:maspar-io, author = {John R. Nickolls}, title = {The {MasPar} Scalable {Unix I/O} System.}, booktitle = {Proceedings of the Eighth International Parallel Processing Symposium}, year = {1994}, month = {April}, pages = {390--394}, address = {Cancun, Mexico}, keyword = {parallel I/O, multiprocessor file system, SIMD, pario-bib}, abstract = {Scalable parallel computers require I/O balanced with computational power to solve data-intensive problems. Distributed memory architectures call for I/O hardware and software beyond those of conventional scalar systems. \par This paper introduces the MasPar I/O system, designed to provide balanced and and scalable data-parallel Unix I/O. The architecture and implementation of the I/O hardware and software are described. Key elements include parallel access to conventional Unix file descriptors and a self-routing multistage network coupled with a buffer memory for flexible parallel I/O transfers. Performance measurements are presented for parallel Unix I/O with a scalable RAID disk array, a RAM disk, and a HIPPI interconnect.}, comment = {This provides the definitive reference for the Maspar parallel-I/O architecture and file system. This paper includes a brief discussion of the interface and performance results. Also includes some HIPPI interface performance results. This paper is the conference version of nickolls:dpio, so cite this one.} } @InProceedings{nieplocha:arrays, author = {Jarek Nieplocha and Ian Foster}, title = {Disk Resident Arrays: An Array-Oriented {I/O} Library for Out-Of-Core Computations}, booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation}, year = {1996}, month = {October}, pages = {196--204}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, abstract = {In out-of-core computations, disk storage is treated as another level in the memory hierarchy, below cache, local memory, and (in a parallel computer) remote memories. However the tools used to manage this storage are typically quite different from those used to manage access to local and remote memory. This disparity complicates implementation of out-of-core algorithms and hinders portability. We describe a programming model that addresses this problem. This model allows parallel programs to use essentially the same mechanisms to manage the movement of data between any two adjacent levels in a hierarchical memory system. We take as our starting point the Global Arrays shared-memory model and library, which support a variety of operations on distributed arrays, including transfer between local and remote memories. We show how this model can be extended to support explicit transfer between global memory and secondary storage, and we define a Disk Resident Arrays Library that supports such transfers. We illustrate the utility of the resulting model with two applications, an out-of-core matrix multiplication and a large computational chemistry program. We also describe implementation techniques on several parallel computers and present experimental results that demonstrate that the Disk Resident Arrays model can be implemented very efficiently on parallel computers.} } @Article{nieplocha:chemio, author = {Jarek Nieplocha and Ian Foster and Rick Kendall}, title = {{ChemIO}: High-Performance Parallel {I/O} for Computational Chemistry Applications}, journal = {International Journal of Supercomputer Applications and High Performance Computing}, year = {1998}, note = {To appear in a Special Issue on I/O in Parallel Applications}, earlier = {foster:chemio}, keyword = {verify volume number month year and pages, parallel I/O application, pario-bib}, abstract = {Recent developments in I/O systems on scalable parallel computers have sparked renewed interest in out-of-core methods for computational chemistry. These methods can improve execution time significantly relative to "direct" methods, which perform many redundant computations. However, the widespread use of such out-of-core methods requires efficient and portable implementations of often complex I/O patterns. The ChemIO project has addressed this problem by defining an I/O interface that captures the I/O patterns found in important computational chemistry applications and by providing high-performance implementations of this interface on multiple platforms. This development not only broadens the user community for parallel I/O techniques but also provides new insights into the functionality required in general-purpose scalable I/O libraries and the techniques required to achieve high performance I/O on scalable parallel computers.}, comment = {See also foster:chemio} } @InProceedings{nieuwejaar:galley-perf, author = {Nils Nieuwejaar and David Kotz}, title = {Performance of the {Galley} Parallel File System}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {83--94}, publisher = {ACM Press}, address = {Philadelphia}, later = {nieuwejaar:jgalley-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/nieuwejaar:galley-perf.ps.Z}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. This interface conceals the parallelism within the file system, which increases the ease of programmability, but makes it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. Furthermore, most current parallel file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic parallel workloads. Initial experiments, reported in this paper, indicate that Galley is capable of providing high-performance I/O to applications that access data in patterns that have been observed to be common.}, comment = {See also nieuwejaar:galley.} } @InProceedings{nieuwejaar:galley, author = {Nils Nieuwejaar and David Kotz}, title = {The {Galley} Parallel File System}, booktitle = {Proceedings of the 10th ACM International Conference on Supercomputing}, year = {1996}, month = {May}, pages = {374--381}, publisher = {ACM Press}, address = {Philadelphia, PA}, later = {nieuwejaar:jgalley-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/nieuwejaar:galley.ps.Z}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. This interface conceals the parallelism within the file system, which increases the ease of programmability, but makes it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. Furthermore, most current parallel file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic parallel workloads. We discuss Galley's file structure and application interface, as well as an application that has been implemented using that interface.}, comment = {See also nieuwejaar:galley-perf.} } @TechReport{nieuwejaar:jgalley-tr, author = {Nils Nieuwejaar and David Kotz}, title = {The {Galley} Parallel File System}, year = {1996}, month = {May}, number = {PCS-TR96-286}, institution = {Dept. of Computer Science, Dartmouth College}, earlier = {nieuwejaar:galley}, later = {nieuwejaar:jgalley}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR96-286.ps.Z}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {Most current multiprocessor file systems are designed to use multiple disks in parallel, using the high aggregate bandwidth to meet the growing I/O requirements of parallel scientific applications. Many multiprocessor file systems provide applications with a conventional Unix-like interface, allowing the application to access multiple disks transparently. This interface conceals the parallelism within the file system, increasing the ease of programmability, but making it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. In addition to providing an insufficient interface, most current multiprocessor file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic scientific multiprocessor workloads. We discuss Galley's file structure and application interface, as well as the performance advantages offered by that interface.} } @Article{nieuwejaar:jgalley, author = {Nils Nieuwejaar and David Kotz}, title = {The {Galley} Parallel File System}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {447--476}, publisher = {North-Holland (Elsevier Scientific)}, earlier = {nieuwejaar:jgalley-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/nieuwejaar:jgalley.ps.Z}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {Most current multiprocessor file systems are designed to use multiple disks in parallel, using the high aggregate bandwidth to meet the growing I/O requirements of parallel scientific applications. Many multiprocessor file systems provide applications with a conventional Unix-like interface, allowing the application to access multiple disks transparently. This interface conceals the parallelism within the file system, increasing the ease of programmability, but making it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. In addition to providing an insufficient interface, most current multiprocessor file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic scientific multiprocessor workloads. We discuss Galley's file structure and application interface, as well as the performance advantages offered by that interface.}, comment = {A revised version of nieuwejaar:jgalley-tr, which is a combination of nieuwejaar:galley and nieuwejaar:galley-perf.} } @TechReport{nieuwejaar:strided, author = {Nils Nieuwejaar and David Kotz}, title = {A Multiprocessor Extension to the Conventional File System Interface}, year = {1994}, month = {September}, number = {PCS-TR94-230}, institution = {Dept. of Computer Science, Dartmouth College}, later = {nieuwejaar:strided2-tr}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR94-230.ps.Z}, keyword = {parallel I/O, multiprocessor file system, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. By tracing all the activity of a parallel file system in a production, scientific computing environment, we show that many applications exhibit highly regular, but non-consecutive I/O access patterns. Since the conventional interface does not provide an efficient method of describing these patterns, we present an extension which supports {\em strided} and {\em nested-strided} I/O requests.} } @InCollection{nieuwejaar:strided2-book, author = {Nils Nieuwejaar and David Kotz}, title = {Low-level Interfaces for High-level Parallel {I/O}}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {9}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {205--223}, publisher = {Kluwer Academic Publishers}, earlier = {nieuwejaar:strided2}, keyword = {parallel I/O, multiprocessor file system, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. By tracing all the activity of a parallel file system in a production, scientific computing environment, we show that many applications exhibit highly regular, but non-consecutive I/O access patterns. Since the conventional interface does not provide an efficient method of describing these patterns, we present three extensions to the interface that support {\em strided}, {\em nested-strided}, and {\em nested-batched} I/O requests. We show how these extensions can be used to express common access patterns.}, comment = {Part of a whole book on parallel I/O; see iopads-book and nieuwejaar:strided2 (which is not much different).} } @TechReport{nieuwejaar:strided2-tr, author = {Nils Nieuwejaar and David Kotz}, title = {Low-level Interfaces for High-level Parallel {I/O}}, year = {1995}, month = {March}, number = {PCS-TR95-253}, institution = {Dept. of Computer Science, Dartmouth College}, note = {Revised 4/18/95 and appeared in IOPADS workshop at IPPS~'95}, earlier = {nieuwejaar:strided}, later = {nieuwejaar:strided2}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR95-253.ps.Z}, keyword = {parallel I/O, multiprocessor file system, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. By tracing all the activity of a parallel file system in a production, scientific computing environment, we show that many applications exhibit highly regular, but non-consecutive I/O access patterns. Since the conventional interface does not provide an efficient method of describing these patterns, we present three extensions to the interface that support {\em strided}, {\em nested-strided}, and {\em nested-batched} I/O requests. We show how these extensions can be used to express common access patterns.}, comment = {After revision, identical to nieuwejaar:strided2.} } @InProceedings{nieuwejaar:strided2, author = {Nils Nieuwejaar and David Kotz}, title = {Low-level Interfaces for High-level Parallel {I/O}}, booktitle = {Proceedings of the IPPS~'95 Workshop on Input/Output in Parallel and Distributed Systems}, year = {1995}, month = {April}, pages = {47--62}, earlier = {nieuwejaar:strided2-tr}, later = {nieuwejaar:strided2-book}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR95-253.ps.Z}, keyword = {parallel I/O, multiprocessor file system, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. By tracing all the activity of a parallel file system in a production, scientific computing environment, we show that many applications exhibit highly regular, but non-consecutive I/O access patterns. Since the conventional interface does not provide an efficient method of describing these patterns, we present three extensions to the interface that support {\em strided}, {\em nested-strided}, and {\em nested-batched} I/O requests. We show how these extensions can be used to express common access patterns.}, comment = {Identical to revised TR95-253, nieuwejaar:strided2-tr. Cite nieuwejaar:strided2-book.} } @PhdThesis{nieuwejaar:thesis, author = {Nils A. Nieuwejaar}, title = {Galley: A New Parallel File System for Parallel Applications}, year = {1996}, month = {November}, school = {Dept. of Computer Science, Dartmouth College}, note = {Available as Dartmouth Technical Report PCS-TR96-300}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR96-300.ps.Z}, keyword = {parallel I/O, multiprocessor file system, file system workload characterization, file access patterns, file system interface, pario-bib}, abstract = {Most current multiprocessor file systems are designed to use multiple disks in parallel, using the high aggregate bandwidth to meet the growing I/O requirements of parallel scientific applications. Most multiprocessor file systems provide applications with a conventional Unix-like interface, allowing the application to access those multiple disks transparently. This interface conceals the parallelism within the file system, increasing the ease of programmability, but making it difficult or impossible for sophisticated application and library programmers to use knowledge about their I/O to exploit that parallelism. In addition to providing an insufficient interface, most current multiprocessor file systems are optimized for a different workload than they are being asked to support. In this work we examine current multiprocessor file systems, as well as how those file systems are used by scientific applications. Contrary to the expectations of the designers of current parallel file systems, the workloads on those systems are dominated by requests to read and write small pieces of data. Furthermore, rather than being accessed sequentially and contiguously, as in uniprocessor and supercomputer workloads, files in multiprocessor file systems are accessed in regular, structured, but non-contiguous patterns. Based on our observations of multiprocessor workloads, we have designed Galley, a new parallel file system that is intended to efficiently support realistic scientific multiprocessor workloads. In this work, we introduce Galley and discuss its design and implementation. We describe Galley's new three-dimensional file structure and discuss how that structure can be used by parallel applications to achieve higher performance. We introduce several new data-access interfaces, which allow applications to explicitly describe the regular access patterns we found to be common in parallel file system workloads. We show how these new interfaces allow parallel applications to achieve tremendous increases in I/O performance. Finally, we discuss how Galley's new file structure and data-access interfaces can be useful in practice.} } @TechReport{nieuwejaar:workload-tr, author = {Nils Nieuwejaar and David Kotz and Apratim Purakayastha and Carla Schlatter Ellis and Michael Best}, title = {File-Access Characteristics of Parallel Scientific Workloads}, year = {1995}, month = {August}, number = {PCS-TR95-263}, institution = {Dept. of Computer Science, Dartmouth College}, earlier = {kotz:workload}, later = {nieuwejaar:workload}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/nieuwejaar:workload-tr.ps.Z}, keyword = {parallel I/O, file system workload, workload characterization, file access pattern, multiprocessor file system, dfk, pario-bib}, abstract = {Phenomenal improvements in the computational performance of multiprocessors have not been matched by comparable gains in I/O system performance. This imbalance has resulted in I/O becoming a significant bottleneck for many scientific applications. One key to overcoming this bottleneck is improving the performance of parallel file systems. \par The design of a high-performance parallel file system requires a comprehensive understanding of the expected workload. Unfortunately, until recently, no general workload studies of parallel file systems have been conducted. The goal of the CHARISMA project was to remedy this problem by characterizing the behavior of several production workloads, on different machines, at the level of individual reads and writes. The first set of results from the CHARISMA project describe the workloads observed on an Intel iPSC/860 and a Thinking Machines CM-5. This paper is intended to compare and contrast these two workloads for an understanding of their essential similarities and differences, isolating common trends and platform-dependent variances. Using this comparison, we are able to gain more insight into the general principles that should guide parallel file-system design.}, comment = {See also nieuwejaar:strided, ap:workload.} } @Article{nieuwejaar:workload, author = {Nils Nieuwejaar and David Kotz and Apratim Purakayastha and Carla Schlatter Ellis and Michael Best}, title = {File-Access Characteristics of Parallel Scientific Workloads}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1996}, month = {October}, volume = {7}, number = {10}, pages = {1075--1089}, publisher = {IEEE Computer Society Press}, earlier = {nieuwejaar:workload-tr}, URL = {http://www.computer.org/pubs/tpds/abs96.htm#1075td1096}, keyword = {parallel I/O, file system workload, workload characterization, file access pattern, multiprocessor file system, dfk, pario-bib}, abstract = {Phenomenal improvements in the computational performance of multiprocessors have not been matched by comparable gains in I/O system performance. This imbalance has resulted in I/O becoming a significant bottleneck for many scientific applications. One key to overcoming this bottleneck is improving the performance of multiprocessor file systems. \par The design of a high-performance multiprocessor file system requires a comprehensive understanding of the expected workload. Unfortunately, until recently, no general workload studies of multiprocessor file systems have been conducted. The goal of the CHARISMA project was to remedy this problem by characterizing the behavior of several production workloads, on different machines, at the level of individual reads and writes. The first set of results from the CHARISMA project describe the workloads observed on an Intel iPSC/860 and a Thinking Machines CM-5. This paper is intended to compare and contrast these two workloads for an understanding of their essential similarities and differences, isolating common trends and platform-dependent variances. Using this comparison, we are able to gain more insight into the general principles that should guide multiprocessor file-system design.}, comment = {See also kotz:workload, nieuwejaar:strided, ap:workload.} } @Article{ninghui:pfs, author = {Sun Ninghui}, title = {The design of parallel file systems}, journal = {Chinese Journal of Computers}, year = {1994}, month = {December}, volume = {17}, number = {12}, pages = {938--945}, note = {In Chinese}, keyword = {parallel file systems, parallel I/O, pario-bib}, comment = {From the abstract, it doesn't appear to offer anything new, but it's hard to tell.} } @InProceedings{nishino:sfs, author = {H. Nishino and S. Naka and K Ikumi}, title = {High Performance File System for Supercomputing Environment}, booktitle = {Proceedings of Supercomputing '89}, year = {1989}, pages = {747--756}, keyword = {supercomputer, file system, parallel I/O, pario-bib}, comment = {A modification to the Unix file system to allow for supercomputer access. Workload: file size from few KB to few GB, I/O operation size from few bytes to hundreds of MB. Generally programs split into I/O-bound and CPU-bound parts. Sequential and random access. Needs: giant files (bigger than device), peak hardware performance for large files, NFS access. Their FS is built into Unix ``transparently''. Space allocated in clusters, rather than blocks; clusters might be as big as a cylinder. Allows for efficient, large files. Mentions parallel disks as part of a ``virtual volume'' but does not elaborate. Prefetching within a cluster.} } @TechReport{nitzberg:cfs, author = {Bill Nitzberg}, title = {Performance of the {iPSC/860 Concurrent File System}}, year = {1992}, month = {December}, number = {RND-92-020}, institution = {NAS Systems Division, NASA Ames}, later = {krystynak:pario}, URL = {http://www.nas.nasa.gov/NAS/TechReports/RNDreports/RND-92-020/RND-92-020.html}, keyword = {Intel, parallel file system, performance measurement, parallel I/O, pario-bib}, abstract = {Typical scientific applications require vast amounts of processing power coupled with significant I/O capacity. Highly parallel computer systems can provide processing power at low cost, but tend to lack I/O capacity. By evaluating the performance and scalability of the Intel iPSC/860 Concurrent File System (CFS), we can get an idea of the current state of parallel I/O performance. I ran three types of tests on the iPSC/860 system at the Numerical Aerodynamic Simulation facility (NAS): broadcast, simulating initial data loading; partitioned, simulating reading and writing a one-dimensional decomposition; and interleaved, simulating reading and writing a two-dimensional decomposition. \par The CFS at NAS can sustain up to 7 megabytes per second writing and 8 megabytes per second reading. However, due to the limited disk cache size, partitioned read performance sharply drops to less than 1 megabyte per second on 128 nodes. In addition, interleaved read and write performance show a similar drop in performance for small block sizes. Although the CFS can sustain 70-80\% of peak I/O throughput, the I/O performance does not scale with the number of nodes. \par Obtaining maximum performance may require significant programming effort: pre-allocating files, overlapping computation and I/O, using large block sizes, and limiting I/O parallelism. A better approach would be to attack the problem by either fixing the CFS (e.g., add more cache to the I/O nodes), or hiding its idiosyncracies (e.g., implement a parallel I/O library).}, comment = {Straightforward measurements of an iPSC/860 with 128 compute nodes, 10 I/O nodes, and 10 disks. This is a bigger system than has been measured before. Has some basic MB/s measurements for some features in Tables 1--2. CFS bug prevents more than 2 asynch requests at a time. Another bug forced random-writes to use preallocated files. For low number of procs, they weren't able to pull the full disk bandwidth. Cache thrashing caused problems when they had a large number of procs, because each read prefetched 8 blocks, which were flushed by some other proc doing a subsequent read. Workaround by synchronizing procs to limit concurrency. Increasing cache size is the right answer, but is not scalable.} } @InProceedings{nitzberg:collective, author = {Bill Nitzberg and Virginia Lo}, title = {Collective Buffering: Improving Parallel {I/O} Performance}, booktitle = {Proceedings of the Sixth IEEE International Symposium on High Performance Distributed Computing}, year = {1997}, month = {August}, publisher = {IEEE Computer Society Press}, address = {Portland, OR}, keyword = {verify pages, parallel I/O, collective I/O, pario-bib}, abstract = {"Parallel I/O" is the support of a single parallel application run on many nodes; application data is distributed among the nodes, and is read or written to a single logical file, itself spread across nodes and disks. Parallel I/O is a mapping problem from the data layout in node memory to the file layout on disks. Since the mapping can be quite complicated and involve significant data movement, optimizing the mapping is critical for performance. \par We discuss our general model of the problem, describe four Collective Buffering algorithms we designed, and report experiments testing their performance on an Intel Paragon and an IBM SP2 both housed at NASA Ames Research Center. Our experiments show improvements of up to two order of magnitude over standard techniques and the potential to deliver peak performance with minimal hardware support.} } @TechReport{nitzberg:sc94tutorial, author = {Bill Nitzberg and Samuel A. Fineberg}, title = {Parallel {I/O} on Highly Parallel Systems--- Supercomputing '94 Tutorial {M11} Notes}, year = {1994}, month = {November}, number = {NAS-94-005}, institution = {NASA Ames Research Center}, later = {nitzberg:sc95tutorial}, URL = {http://www.nas.nasa.gov/NAS/TechReports/NASreports/NAS-94-005/NAS-94-005.html}, keyword = {parallel I/O, tutorial, pario-bib}, abstract = {Typical scientific applications require vast amounts of processing power coupled with significant I/O capacity. Highly parallel computer systems provide floating point processing power at low cost, but efficiently supporting a scientific workload also requires commensurate I/O performance. In order to achieve high I/O performance, these systems utilize parallelism in their I/O subsystems---supporting concurrent access to files by multiple nodes of a parallel application, and striping files across multiple disks. However, obtaining maximum I/O performance can require significant programming effort. \par This tutorial presents a snapshot of the state of I/O on highly parallel systems by comparing the well-balanced I/O performance of a traditional vector supercomputer (the Cray Y/MP C90) with the I/O performance of various highly parallel systems (Cray T3D, IBM SP-2, Intel iPSC/860 and Paragon, and Thinking Machines CM-5). In addition, the tutorial covers benchmarking techniques for evaluating current parallel I/O systems and techniques for improving parallel I/O performance. Finally, the tutorial presents several high level parallel I/O libraries and shows how they can help application programmers improve I/O performance.} } @TechReport{nitzberg:sc95tutorial, author = {Bill Nitzberg and Samuel A. Fineberg}, title = {Parallel {I/O} on Highly Parallel Systems--- Supercomputing '95 Tutorial {M6} Notes}, year = {1995}, month = {December}, number = {NAS-95-022}, institution = {NASA Ames Research Center}, later = {nitzberg:sc94tutorial}, URL = {http://www.nas.nasa.gov/NAS/TechReports/NASreports/NAS-95-022/NAS-95-022.html}, keyword = {parallel I/O, tutorial, pario-bib}, abstract = {Typical scientific applications require vast amounts of processing power coupled with significant I/O capacity. Highly parallel computer systems provide floating-point processing power at low cost, but efficiently supporting a scientific workload also requires commensurate I/O performance. To achieve high I/O performance, these systems use parallelism in their I/O subsystems, supporting concurrent access to files by multiple nodes of a parallel application and striping files across multiple disks. However, obtaining maximum I/O performance can require significant programming effort. This tutorial will present a comprehensive survey of the state of the art in parallel I/O from basic concepts to recent advances in the research community. Requirements, interfaces, architectures, and performance will be illustrated using concrete examples from commercial offerings (Cray T3D, IBM SP-2, Intel Paragon, Meiko CS-2, and workstation clusters) and academic research projects (MPI-IO, Panda, PASSION, PIOUS, and Vesta). The material covered is roughly 30\% beginner, 60\% intermediate, and 10\% advanced.} } @PhdThesis{nitzberg:thesis, author = {William J. Nitzberg}, title = {Collective Parallel {I/O}}, year = {1995}, month = {December}, school = {Department of Computer and Information Science, University of Oregon}, keyword = {parallel I/O, parallel algorithm, file system interface, pario-bib}, abstract = {Parallel I/O, the process of transferring a global data structure distributed among compute nodes to a file striped across storage devices, can be quite complicated and involve a significant amount of data movement. Optimizing parallel I/O with respect to data distribution, file layout, and machine architecture is critical for performance. In this work, we propose a solution to both the performance and portability problems plaguing the wide acceptance of distributed memory parallel computers for scientific computing: a collective parallel I/O interface and efficient algorithms to implement it. A collective interface allows the programmer to specify a file access as a high-level global operation rather than as a series of seeks and writes. This not only provides a more natural interface for the programmer, but also provides the system with both the opportunity and the semantic information necessary to optimize the file operation. \par We attack this problem in three steps: we evaluate an early parallel I/O system, the Intel iPSC/860 Concurrent File System; we design and analyze the performance of two classes of algorithms taking advantage of collective parallel I/O; and we design MPI-IO, a collective parallel I/O interface likely to become the standard for portable parallel I/O. \par The collective I/O algorithms fall into two broad categories: data block scheduling and collective buffering. Data block scheduling algorithms attempt to schedule the individual data transfers to minimize resource contention and to optimize for particular hardware characteristics. We develop and evaluate three data block scheduling algorithms: Grouping, Random, and Sliding Window. The data block scheduling algorithms improved performance by as much as a factor of eight. The collective buffering algorithms permute the data before writing or after reading in order to combine small file accesses into large blocks. We design and test a series of four collective buffering algorithms and demonstrate improvement in performance by two orders of magnitude over naive file I/O for the hardest, three-dimensional distributions.}, comment = {See also nitzberg:cfs and corbett:mpi-overview.} } @InProceedings{no:irregular-io, author = {Jaechun No and Sung-soon Park and Jesus Carretero and Alok Choudhary and Pang Chen}, title = {Design and Implementation of a Parallel {I/O} Runtime System for Irregular Applications}, booktitle = {Proceedings of the Joint International Parallel Processing Symposium and IEEE Symposium on Parallel and Distributed Processing}, year = {1998}, month = {March}, publisher = {IEEE Computer Society Press}, note = {To appear}, keyword = {verify pages, parallel I/O, pario-bib} } @InProceedings{nodine:deterministic, author = {M. H. Nodine and J. S. Vitter}, title = {Deterministic Distribution Sort in Shared and Distributed Memory Multiprocessors}, booktitle = {Proceedings of the Fifth Symposium on Parallel Algorithms and Architectures}, year = {1993}, pages = {120--129}, address = {Velen, Germany}, URL = {ftp://cs.duke.edu/pub/jsv/Papers/NoV93.distr_sorting.ps.Z}, abstract = {We present an elegant deterministic load balancing strategy for distribution sort that is applicable to a wide variety of parallel disks and parallel memory hierarchies with both single and parallel processors. The simplest application of the strategy is an optimal deterministic algorithm for external sorting with multiple disks and parallel processors. In each input/output (I/O) operation, each of the $D \geq 1$ disks can simultaneously transfer a block of $B$ contiguous records. Our two measures of performance are the number of I/Os and the amount of work done by the CPU(s); our algorithm is simultaneously optimal for both measures. We also show how to sort deterministically in parallel memory hierarchies. When the processors are interconnected by any sort of a PRAM, our algorithms are optimal for all parallel memory hierarchies; when the interconnection network is a hypercube, our algorithms are either optimal or best-known.}, comment = {Short version of nodine:sort2 and nodine:sortdisk.} } @TechReport{nodine:greed, author = {Mark H. Nodine and Jeffrey Scott Vitter}, title = {Greed Sort: An Optimal External Sorting Algorithm for Multiple Disks}, year = {1992}, number = {CS--91--20}, institution = {Brown University}, note = {A summary appears in SPAA~'91}, URL = {http://www.cs.brown.edu/publications/techreports/reports/CS-91-20}, keyword = {parallel I/O algorithms, sorting, pario-bib}, abstract = {We present an optimal deterministic algorithm for external sorting on multiple disks. Our measure of performance is the number of input/output (I/O) operations. In each I/O, each disk can simultaneously transfer a block of data. Our algorithm improves upon a recent randomized optimal algorithm and the (non-optimal) commonly used technique of disk striping. The code is simple enough for easy implementation.}, comment = {Summary is nodine:sort. This is revision of CS--91--04.} } @InProceedings{nodine:loadbalance, author = {Mark H. Nodine and Jeffrey Vitter}, title = {Load Balancing Paradigms for Optimal Use of Parallel Disks and Parallel Memory Hierarchies}, booktitle = {Proceedings of the 1993 DAGS/PC Symposium}, year = {1993}, month = {June}, pages = {26--39}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, keyword = {parallel I/O algorithm, memory hierarchy, load balance, sorting, pario-bib}, abstract = {We present several load balancing paradigms pertinent to optimizing I/O performance with disk and processor parallelism. We use sorting as our canonical application to illustrate the paradigms, and we survey a wide variety of applications in computational geometry. The use of parallel disks can help overcome the I/O bottleneck in sorting if the records in each read or write are evenly balanced among the disks. There are three known load balancing paradigms that lead to optimal I/O algorithms: using randomness to assign blocks to disks, using the disks predominantly independently, and deterministically balancing the blocks by matching. In this report, we describe all of these techniques in detail and compare their relative advantages. We show how randomized and deterministic balancing can be extended to provide sorting algorithms that are optimal both in terms of the number of I/Os and the internal processing time for parallel-processing machines with scalable I/O subsystems and with parallel memory hierarchies. We also survey results achieving optimal performance in the these models for a large range of online and batch problems in computational geometry.}, comment = {Invited speaker: Jeffrey Vitter.} } @InProceedings{nodine:opt-sort, author = {Mark H. Nodine and Jeffrey Scott Vitter}, title = {Paradigms for Optimal Sorting with Multiple Disks}, booktitle = {Proceedings of the Twenty-Sixth Annual Hawaii International Conference on System Sciences}, year = {1993}, volume = {I}, pages = {50--59}, keyword = {parallel I/O algorithms, sorting, pario-bib}, comment = {They compare three techniques for balancing I/O across parallel disks, using sorting as an example. The three are randomization, using disks independently (as in balance sort), or tricky matching techniques as in balance sort. They also look at parallel memory hierarchies. All in all, it seems to be mostly a survey of techniques in earlier papers.} } @InProceedings{nodine:sort, author = {Mark H. Nodine and Jeffrey Scott Vitter}, title = {Large-Scale Sorting in Parallel Memories}, booktitle = {Proceedings of the Third Symposium on Parallel Algorithms and Architectures}, year = {1991}, pages = {29--39}, URL = {ftp://cs.duke.edu/pub/jsv/Papers/ViN93.hierarchies.ps.Z}, keyword = {external sorting, file access pattern, parallel I/O, pario-bib}, comment = {Describes algorithms for external sorting that are optimal in the number of I/Os. Proposes a couple of fairly-realistic memory hierarchy models. See also journal version vitter:uniform.} } @TechReport{nodine:sort2, author = {Mark H. Nodine and Jeffrey Scott Vitter}, title = {Optimal Deterministic Sorting in Parallel Memory Hierarchies}, year = {1992}, month = {August}, number = {CS--92--38}, institution = {Brown University}, URL = {ftp://ftp.cs.brown.edu/pub/techreports/92/cs92-38.ps.Z}, keyword = {parallel I/O algorithms, parallel memory, sorting, pario-bib}, comment = {see nodine:deterministic.} } @TechReport{nodine:sortdisk, author = {Mark H. Nodine and Jeffrey Scott Vitter}, title = {Optimal Deterministic Sorting on Parallel Disks}, year = {1992}, month = {August}, number = {CS--92--08}, institution = {Brown University}, URL = {ftp://ftp.cs.brown.edu/pub/techreports/92/cs92-08.ps.Z}, keyword = {parallel I/O algorithms, sorting, pario-bib}, comment = {see nodine:deterministic.} } @InProceedings{nurmi:atm, author = {Marc A. Nurmi and William E. Bejcek and Rod N. Gregoire and K. C. Liu and Mark D. Pohl}, title = {Automatic Management of {CPU} and {I/O} Bottlenecks in Distributed Applications on {ATM} Networks}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {481--489}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, ATM, parallel networking, pario-bib}, abstract = {Existing parallel programming environments for networks of workstations improve the performance of computationally intensive applications by using message passing or virtual shared memory to alleviate CPU bottlenecks. This paper describes an approach based on message passing that addresses both CPU and I/O bottlenecks for a specific class of distributed applications on ATM networks. ATM provides the bandwidth required to utilize multiple I/O channels in parallel. This paper also describes an environment based on distributed process management and centralized application management that implements the approach. The environment adds processes to a running application when necessary to alleviate CPU and I/O bottlenecks while managing process connections in a manner that is transparent to the application.} } @TechReport{ober:seismic, author = {Curtis Ober and Ron Oldfield and John VanDyke and David Womble}, title = {Seismic Imaging on Massively Parallel Computers}, year = {1996}, month = {April}, number = {SAND96-1112}, institution = {Sandia National Laboratories}, URL = {ftp://ftp.cs.sandia.gov/pub/papers/dewombl/seismic_imaging_mpp.ps.Z}, keyword = {multiprocessor application, scientific computing, seismic data processing, parallel I/O, pario-bib}, abstract = {Fast, accurate imaging of complex, oil-bearing geologies, such as overthrusts and salt domes, is the key to reducing the costs of domestic oil and gas exploration. Geophysicists say that the known oil reserves in the Gulf of Mexico could be significantly increased if accurate seismic imaging beneath salt domes was possible. A range of techniques exist for imaging these regions, but the highly accurate techniques involve the solution of the wave equation and are characterized by large data sets and large computational demands. Massively parallel computers can provide the computational power for these highly accurate imaging techniques. \par A brief introduction to seismic processing will be presented, and the implementation of a seismic-imaging code for distributed memory computers will be discussed. The portable code, Salvo, performs a wave-equation-based, 3-D, prestack, depth imaging and currently runs on the Intel Paragon, the Cray T3D and SGI Challenge series. It uses MPI for portability, and has sustained 22 Mflops/sec/proc (compiled FORTRAN) on the Intel Paragon.}, comment = {2 pages about their I/O scheme, mostly regarding a calculation of the optimal balance between compute nodes and I/O nodes to achieve perfect overlap.} } @TechReport{oed:t3d, author = {Wilfried Oed}, title = {The {Cray Research} Massively Parallel Processor System {CRAY T3D}}, year = {1993}, month = {November 15}, institution = {Cray Research GmbH}, address = {M\"unchen, Germany}, keyword = {parallel architecture, shared memory, supercomputer, parallel I/O, pario-bib}, comment = {A MIMD, shared-memory machine, with 2-processor units embedded in a 3-d torus. Each link is bidirectional and runs 300 MB/s. Processors are 150 MHz ALPHA, plus 16--64 MB RAM, plus a memory interface unit. Global physical address space with remote-reference and block-transfer capability. Not clear about cache coherency. Separate tree network for global synchronization. Support for message send and optional interrupt. I/O is all done through interface nodes that hook to the YMP host and to its I/O clusters with 400 MB/s links. I/O is by default serialized, but they do support a ``broadcast'' read operation (but see pase:t3d-fortran). FORTRAN compiler supports the NUMA shared memory; PVM is used for C and message passing.} } @TechReport{ogata:diskarray, author = {Mikito Ogata and Michael J. Flynn}, title = {A Queueing Analysis for Disk Array Systems}, year = {1990}, number = {CSL-TR-90-443}, institution = {Stanford University}, keyword = {disk array, performance analysis, pario-bib}, comment = {Fairly complex analysis of a multiprocessor attached to a disk array system through a central server that is the buffer. Assumes task-oriented model for parallel system, where tasks can be assigned to any CPU; this makes for an easy model. Like Reddy, they compare declustering and striping (they call them striped and synchronized disks).} } @Article{oldfield:seismic, author = {Ron A. Oldfield and David E. Womble and Curtis C. Ober}, title = {Efficient Parallel {I/O} in Seismic Imaging}, journal = {International Journal of Supercomputer Applications and High Performance Computing}, year = {1998}, note = {To appear in a Special Issue on I/O in Parallel Applications}, URL = {http://www.cs.dartmouth.edu/~raoldfi/ijsa97}, keyword = {verify volume number month year and pages, parallel I/O application, pario-bib}, abstract = {While high performance computers tend to be measured by their processor and communications speeds, the bottleneck for many large-scale applications is the I/O performance rather than the computational or communication performance. One such application is the processing of 3D seismic data. Seismic data sets, consisting of recorded pressure waves, can be very large, sometimes more than a terabyte in size. Even if the computations can be performed in-core, the time required to read the initial seismic data and velocity model and write images is substantial. This paper will discuss our approach in handling the massive I/O requirements of seismic processing and show the performance of our imaging code (Salvo) on the Intel Paragon.} } @Article{olson:random, author = {Thomas M. Olson}, title = {Disk Array Performance in a Random {I/O} Environment}, journal = {Computer Architecture News}, year = {1989}, month = {September}, volume = {17}, number = {5}, pages = {71--77}, keyword = {I/O benchmark, transaction processing, pario-bib}, comment = {See wolman:iobench. Used IOBENCH to compare normal disk configuration with striped disks, RAID level 1, and RAID level 5, under a random I/O workload. Multiple disks with files on different disks gave good performance (high throughput and low response time) when multiple users. Striping ensures balanced load, similar performance. RAID level 1 or level 5 ensures reliability at performance cost over striping, but still good. Especially sensitive to write/read ratio --- performance lost for large number of writes.} } @InProceedings{oyang:m2io, author = {Yen-Jen Oyang}, title = {Architecture, Operating System, and {I/O} Subsystem Design of the {$M^2$} Database Machine}, booktitle = {Proceedings of the Parallel Systems Fair at the International Parallel Processing Symposium}, year = {1993}, pages = {31--38}, keyword = {parallel I/O, multiprocessor file system, parallel database, pario-bib}, comment = {A custom multiprocessor with a shared-memory clusters networked together and to shared disks. Runs Mach. Directory-based coherence protocol for the distributed file system. Background writeback.} } @InProceedings{pahuja:dpio, author = {Neena Pahuja and Gautam M. Shroff}, title = {A Data Parallel I/O Library for Workstation Networks}, booktitle = {Proceedings of the 1995 International Conference on High Performance Computing}, year = {1995}, month = {December}, pages = {423--428}, address = {New Delhi, India}, keyword = {disk array, multimedia, parallel I/O, pario-bib} } @InProceedings{paleczny:support, author = {Michael Paleczny and Ken Kennedy and Charles Koelbel}, title = {Compiler Support for Out-of-Core Arrays on Data Parallel Machines}, booktitle = {Proceedings of the Fifth Symposium on the Frontiers of Massively Parallel Computation}, year = {1995}, month = {February}, pages = {110--118}, address = {McLean, VA}, URL = {http://www.cs.rice.edu/~mpal/Frontiers95.ps}, keyword = {compilers, parallel I/O, out-of-core applications, pario-bib}, comment = {They are developing extensions to the FortranD compiler so that it supports I/O-related directives for out-of-core computations. The compiler then analyzes the computation, inserts the necessary I/O calls, and optimizes the I/O. They hand-compile a red-black relaxation program and an LU-factorization program. I/O was much faster than VM, particularly because they were able to make large requests rather than faulting on individual pages. Overlapping I/O and computation was also a big win. See also kennedy:sio, bordawekar:model.} } @InProceedings{panfilov:raid5, author = {Oleg A. Panfilov}, title = {Performance Analysis of {RAID-5} Disk Arrays}, booktitle = {Proceedings of the Twenty-Eighth Annual Hawaii International Conference on System Sciences}, year = {1995}, month = {January}, volume = {I}, pages = {49--60}, keyword = {RAID, disk array, parallel I/O, pario-bib} } @Article{papadopouli:vbr-streams, author = {Maria Papadopouli and Leana Golubchik}, title = {Support of {VBR} Video Streams Under Disk Bandwidth Limitations}, journal = {ACM SIGMETRICS Performance Evaluation Review}, year = {1997}, month = {December}, volume = {25}, number = {3}, pages = {13--20}, keyword = {multimedia, video on demand, parallel I/O, pario-bib}, comment = {Part of a special issue on parallel and distributed I/O.} } @Article{park:2disk, author = {{Chan-Ik} Park}, title = {Efficient Placement of Parity and Data To Tolerate Two Disk Failures in Disk Array Systems}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1995}, month = {November}, volume = {6}, number = {11}, pages = {1177--1184}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, disk array, reliability, fault tolerance, pario-bib}, abstract = {In this paper, we deal with the data/parity placement problem which is described as follows: how to place data and parity evenly across disks in order to tolerate two disk failures, given the number of disks N and the redundancy rate p which represents the amount of disk spaces to store parity information. To begin with, we transform the data/parity placement problem into the problem of constructing an N x N matrix such that the matrix will correspond to a solution to the problem. The method to construct a matrix has been proposed and we have shown how our method works through several illustrative examples. It is also shown that any matrix constructed by our proposed method can be mapped into a solution to the placement problem if a certain condition holds between N and p where N is the number of disks and p is a redundancy rate.} } @InProceedings{park:interface, author = {Yoonho Park and Ridgway Scott and Stuart Sechrest}, title = {Virtual Memory Versus File Interfaces for Large, Memory-intensive Scientific Applications}, booktitle = {Proceedings of Supercomputing '96}, year = {1996}, month = {November}, publisher = {ACM Press and IEEE Computer Society Press}, note = {Also available as UH Department of Computer Science Research Report UH-CH-96-7}, URL = {http://www.hpc.uh.edu/cenju/pub/vm_revisit.ps}, keyword = {virtual memory, file interface, scientific applications, out-of-core, parallel I/O, pario-bib}, abstract = {Scientific applications often require some strategy for temporary data storage to do the largest possible simulations. The use of virtual memory for temporary data storage has received criticism because of performance problems. However, modern virtual memory found in recent operating systems such as Cenju-3/DE give application writers control over virtual memory policies. We demonstrate that custom virtual memory policies can dramatically reduce virtual memory overhead and allow applications to run out-of-core efficiently. We also demonstrate that the main advantage of virtual memory, namely programming simplicity, is not lost.}, comment = {Web and CDROM only. They advocate the use of traditional demand-paged virtual memory systems in supporting out-of-core applications. They are implementing an operating system for the NEC Cenju-3/DE, a shared-nothing MIMD multiprocessor with a multistage interconnection network and disks on every node. The operating system is based on Mach, and they have extended Mach to allow user-provided [local] replacement policies. Basically, they argue that you can get good performance as long as you write your own replacement policy (even OPT is possible in certain applications), and that this is easier than (re)writing the application with explicit out-of-core file I/O calls. They measure the performance of two applications on their system, with OPT, FIFO, and a new replacement algorithm customized to one of the applications. They show that they can get much better performance with some replacement policies than with others, but despite the paper's title they do not compare with the performance of an equivalent program using file I/O.} } @TechReport{park:pario, author = {Arvin Park and K. Balasubramanian}, title = {Providing Fault Tolerance in Parallel Secondary Storage Systems}, year = {1986}, month = {November}, number = {CS-TR-057-86}, institution = {Department of Computer Science, Princeton University}, keyword = {parallel I/O, reliability, RAID, pario-bib}, comment = {They use ECC with one or more parity drives in bit-interleaved systems, and on-line regeneration of failed drives from spares. More cost-effective than mirrored disks. One of the earliest references to RAID-like concepts. Basically, they describe RAID3.} } @InProceedings{parsons:complex, author = {Ian Parsons and Jonathan Schaeffer and Duane Szafron and Ron Unrau}, title = {Using {PI/OT} to Support Complex Parallel {I/O}}, booktitle = {Proceedings of the Joint International Parallel Processing Symposium and IEEE Symposium on Parallel and Distributed Processing}, year = {1998}, month = {March}, publisher = {IEEE Computer Society Press}, note = {To appear}, keyword = {verify pages, parallel I/O, pario-bib} } @Article{parsons:templates, author = {Ian Parsons and Ron Unrau and Jonathan Schaeffer and Duane Szafron}, title = {{PI/OT}: Parallel {I/O} Templates}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {543--570}, publisher = {North-Holland (Elsevier Scientific)}, keyword = {parallel programming, parallel I/O, pario-bib}, abstract = {This paper presents a novel, top-down, high-level approach to parallelizing file I/O. Each parallel file descriptor is annotated with a high-level specification, or template, of the expected parallel behaviour. The annotations are external to and independent of the source code. At run-time, all I/O using a parallel file descriptor adheres to the semantics of the selected template. By separating the parallel I/O specifications from the code, a user can quickly change the I/O behaviour without rewriting code. Templates can be composed hierarchically to construct complex access patterns. \par Two sample parallel programs using these templates are compared against versions implemented in an existing parallel I/O system (PIOUS). The sample programs show that the use of parallel I/O templates are beneficial from both the performance and software engineering points of view.} } @TechReport{pase:t3d-fortran, author = {Douglas M. Pase and Tom MacDonald and Andrew Meltzer}, title = {{MPP Fortran} Programming Model}, year = {1993}, month = {October 11}, institution = {Cray Research, Inc.}, URL = {ftp://ftp.cray.com/product-info/program_env/program_model.html}, keyword = {compiler, parallel language, supercomputing, parallel I/O, pario-bib}, abstract = {This report describes the MPP Fortran programming model which will be supported on the first phase MPP systems. Based on existing and proposed standards, it is a work sharing model which combines features from existing models in a way that may be both efficiently implemented and useful.}, comment = {See also oed:t3d for T3D overview. I only read the part about I/O. The only I/O support, apparently, is for each processor to open and access the file independently from all other processors.} } @InProceedings{pasquale:dynamic, author = {Barbara K. Pasquale and George C. Polyzos}, title = {Dynamic {I/O} Characterization of {I/O} Intensive Scientific Applications}, booktitle = {Proceedings of Supercomputing '94}, year = {1994}, month = {November}, pages = {660--669}, publisher = {IEEE Computer Society Press}, address = {Washington, DC}, keyword = {scientific computing, file access patterns, I/O, pario-bib}, comment = {This paper extends some of their previous results, but the real bottom line here is that some scientific applications do a lot of I/O, the I/O us bursty, and the pattern of bursts is cyclic and regular. Seems like this cyclic nature could be a source of some optimization. Included in the parallel I/O bibliography because it is useful to that community, though they did not trace parallel workload.} } @InProceedings{pasquale:iowork, author = {Barbara K. Pasquale and George C. Polyzos}, title = {A Static Analysis of {I/O} Characteristics of Scientific Applications in a Production Workload}, booktitle = {Proceedings of Supercomputing '93}, year = {1993}, pages = {388--397}, publisher = {IEEE Computer Society Press}, address = {Portland, OR}, keyword = {scientific computing, file access patterns, pario-bib}, comment = {Analyzed one month of accounting records from Cray YMP8/864 in SDSC's production environment. Their base assumption is that scientific application I/O is regular and predictable, eg, repetitive periodic bursts, with distinct phases, repeating patterns, and sequential access. The goal is to characterize a set of I/O-intensive scientific applications and evaluate regularity of resource usage. They measure volumes and rates of applications and total system. Cumulative and average usage for each distinct non-system application. Most resource usage came from the 5\% of applications that were not system applications. ``Virtual I/O rate'' is the bytes transferred per CPU second, which is IMHO only a rough measure because sometimes I/O overlaps CPU time, and sometimes does not. They picked out long-running applications with a high virtual I/O rate. Top 50 applications had 71\% of bytes transferred and 10\% of CPU time. Of those, 4.66 MB/sec min, 131 MB/sec max. Of those they picked the ones executed most often. Cluster analysis showed only 1-2 clusters. Correlation between I/O and CPU time. Included in the parallel I/O bibliography because it is useful to that community, though they did not trace parallel workload.} } @Article{patt:iosubsystem, author = {Yale N. Patt}, title = {The {I/O} Subsystem: a Candidate for Improvement}, journal = {IEEE Computer}, year = {1994}, month = {March}, volume = {27}, number = {3}, pages = {15--16}, keyword = {I/O, file system, parallel I/O, pario-bib}, comment = {This is the intro to a special issue on I/O.} } @TechReport{patterson:informed-tr, author = {R. Hugo Patterson and Garth A. Gibson and Eka Ginting and Daniel Stodolsky and Jim Zelenka}, title = {Informed Prefetching and Caching}, year = {1995}, number = {CMU-CS-95-134}, institution = {School of Computer Science, Carnegie Mellon University}, later = {patterson:informed}, URL = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/pdl/ftp/TIP/TR.CMU-CS-95-134.ps}, keyword = {caching, prefetching, file system, hints, I/O, resource management, parallel I/O, pario-bib}, abstract = {The underutilization of disk parallelism and file cache buffers by traditional file systems induces I/O stall time that degrades the performance of modern microprocessor-based systems. In this paper, we present aggressive mechanisms that tailor file system resource management to the needs of I/O-intensive applications. In particular, we show how to use application-disclosed access patterns (hints) to expose and exploit I/O parallelism, and to dynamically allocate file buffers among three competing demands: prefetching hinted blocks, caching hinted blocks for reuse, and caching recently used data for unhinted accesses. Our approach estimates the impact of alternative buffer allocations on application execution time and applies a cost-benefit analysis to allocate buffers where they will have the greatest impact. We implemented informed prefetching and caching in DEC's OSF/1 operating system and measured its performance on a 150 MHz Alpha equipped with 15 disks. When running a range of applications including text search, 3D scientific visualization, relational database queries, speech recognition, and computational chemistry, informed prefetching reduces the execution time of four of these applications by 20% to 87%. Informed caching reduces the execution time of the fifth application by up to 30%.}, comment = {HTML at http://www.cs.cmu.edu/Web/Groups/PDL/HTML-Papers/tr95-134/tr95-134.fm.html.} } @InProceedings{patterson:informed, author = {R. Hugo Patterson and Garth A. Gibson and Eka Ginting and Daniel Stodolsky and Jim Zelenka}, title = {Informed prefetching and caching}, booktitle = {Proceedings of the Fifteenth ACM Symposium on Operating Systems Principles}, year = {1995}, month = {December}, pages = {79--95}, publisher = {ACM Press}, address = {Copper Mountain, CO}, earlier = {patterson:informed-tr}, URL = {http://www.cs.cmu.edu/afs/cs/project/pdl/WWW/ftp/TIP/SOSP15.ps}, keyword = {caching, prefetching, file system, hints, I/O, resource management, parallel I/O, pario-bib}, abstract = {In this paper, we present aggressive, proactive mechanisms that tailor file system resource management to the needs of I/O-intensive applications. In particular, we show how to use application-disclosed access patterns (hints) to expose and exploit I/O parallelism, and to dynamically allocate file buffers among three competing demands: prefetching hinted blocks, caching hinted blocks for reuse, and caching recently used data for unhinted accesses. Our approach estimates the impact of alternative buffer allocations on application execution time and applies cost-benefit analysis to allocate buffers where they will have the greatest impact. We have implemented informed prefetching and caching in Digitals OSF/1 operating system and measured its performance on a 150 MHz Alpha equipped with 15 disks running a range of applications. Informed prefetching reduces the execution time of text search, scientific visualization, relational database queries, speech recognition, and object linking by 20-83\%. Informed caching reduces the execution time of computational physics by up to 42\% and contributes to the performance improvement of the object linker and the database. Moreover, applied to multiprogrammed, I/O-intensive workloads, informed prefetching and caching increase overall throughput.}, comment = {See patterson:informed-tr for an earlier version. Programs may give hints to the file system about what they will read in the future, and in what order. Hints are used for informed prefetching and informed caching. Most interesting thing about this paper over the earlier ones is the buffer management. Prefetcher and demand fetcher both want buffers. LRU cache and hinted cache both could supply buffers (thru replacement). Each supplies a cost for giving up buffers and benefit for getting more buffers. These are expressed in a common 'currency', in terms of their expected effect on I/O service time, and a manager takes buffers from one and gives buffers to another when the benefits outweigh the costs. All is based on a simple model, which is further simplified in their implementation within OSF/1. Performance looks good, they can keep more disks busy in a parallel file system. Furthermore, informed caching helps reduce the number of I/Os. Indeed they 'discover' MRU replacement policy automatically.} } @InProceedings{patterson:latency, author = {R. H. Patterson and G. A. Gibson and M. Satyanarayanan}, title = {Using Transparent Informed Prefetching to Reduce File Read Latency}, booktitle = {Proceedings of the 1992 NASA Goddard conference on Mass Storage Systems}, year = {1992}, month = {September}, pages = {329--342}, later = {patterson:informed}, URL = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/pdl/ftp/TIP/MSST.ps}, keyword = {parallel I/O, file prefetching, file caching, pario-bib}, comment = {This 'paper' is really an annotated set of slides.} } @InProceedings{patterson:pdis-tip, author = {R. Hugo Patterson and Garth A. Gibson}, title = {Exposing {I/O} Concurrency with Informed Prefetching}, booktitle = {Proceedings of the Third International Conference on Parallel and Distributed Information Systems}, year = {1994}, month = {September}, pages = {7--16}, later = {patterson:informed}, URL = {http://www.cs.cmu.edu/afs/cs/project/pdl/ftp/TIP/PDIS.ps}, keyword = {prefetching, parallel I/O, pario-bib}, abstract = {Informed prefetching provides a simple mechanism for I/O-intensive, cache-ineffective applications to efficiently exploit highly-parallel I/O subsystems such as disk arrays. This mechanism, dynamic disclosure of future accesses, yields substantial benefits over sequential readahead mechanisms found in current file systems for non-sequen tial workloads. This paper reports the performance of the Transparent Informed Prefetching system (TIP), a minimal prototype implemented in a Mach 3.0 system with up to four disks. We measured reductions by factors of up to 1.9 and 3.7 in the execution time of two example applications: multi-file text search and scientific data visualization.}, comment = {Also available in HTML format at http://www.cs.cmu.edu/Web/Groups/PDL/HTML-Papers/PDIS94/final.fm.html.} } @InProceedings{patterson:raid, author = {David Patterson and Garth Gibson and Randy Katz}, title = {A case for redundant arrays of inexpensive disks {(RAID)}}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data}, year = {1988}, month = {June}, pages = {109--116}, publisher = {ACM Press}, address = {Chicago, IL}, keyword = {parallel I/O, RAID, reliability, cost analysis, I/O bottleneck, disk array, OS93W extra, OS92W, pario-bib}, comment = {Make a good case for the upcoming I/O crisis, compare single large expensive disks (SLED) with small cheap disks. Outline five levels of RAID the give different reliabilities, costs, and performances. Block-interleaved with a single check disk (level 4) or with check blocks interspersed (level 5) seem to give best performance for supercomputer I/O or database I/O or both. Note: the TR by the same name (UCB/CSD 87/391) is essentially identical.} } @InProceedings{patterson:raid2, author = {David Patterson and Peter Chen and Garth Gibson and Randy H. Katz}, title = {Introduction to Redundant Arrays of Inexpensive Disks {(RAID)}}, booktitle = {Proceedings of IEEE Compcon}, year = {1989}, month = {Spring}, pages = {112--117}, earlier = {patterson:raid}, keyword = {parallel I/O, RAID, reliability, cost analysis, I/O bottleneck, disk array, pario-bib}, comment = {A short version of patterson:raid, with some slight updates.} } @Article{patterson:tip, author = {R. Hugo Patterson and Garth A. Gibson and M. Satyanarayanan}, title = {A Status Report on Research in Transparent Informed Prefetching}, journal = {ACM Operating Systems Review}, year = {1993}, month = {April}, volume = {27}, number = {2}, pages = {21--34}, later = {patterson:informed}, URL = {http://www.cs.cmu.edu/afs/cs/project/pdl/ftp/TIP/OSRev.ps}, keyword = {file system, prefetching, operating system, pario-bib}, abstract = {This paper focuses on extending the power of caching and prefetching to reduce file read latencies by exploiting application level hints about future I/O accesses. We argue that systems that disclose high-level knowledge can transfer optimization information across module boundaries in a manner consistent with sound software engineering principles. Such Transparent Informed Prefetching (TIP) systems provide a technique for converting the high through put of new technologies such as disk arrays and log-structured file systems into low latency for applications. Our preliminary experiments show that even without a high-throughput I/O sub system TIP yields reduced execution time of up to 30% for applications obtaining data from a remote file server and up to 13% for applications obtaining data from a single local disk. These experiments indicate that greater performance benefits will be available when TIP is integrated with low level resource management policies and highly parallel I/O subsystems such as disk arrays.}, comment = {Not much new over previous TIP papers, but does have newer numbers. See patterson:tip1. Also appears in DAGS'93 (patterson:tip2). Previously appeared as TR CMU-CS-93-1.} } @InProceedings{patterson:tip2, author = {R. Hugo Patterson and Garth A. Gibson and M. Satyanarayanan}, title = {Informed Prefetching: Converting High Throughput to Low Latency}, booktitle = {Proceedings of the 1993 DAGS/PC Symposium}, year = {1993}, month = {June}, pages = {41--55}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, later = {patterson:informed}, keyword = {file system, prefetching, operating system, pario-bib}, abstract = {This paper focuses on extending the power of caching and prefetching to reduce file read latencies by exploiting application level hints about future I/O accesses. We argue that systems that disclose high-level knowledge can transfer optimization information across module boundaries in a manner consistent with sound software engineering principles. Such Transparent Informed Prefetching (TIP) systems provide a technique for converting the high throughput of new technologies such as disk arrays and log-structured file systems into low latency for applications. Our preliminary experiments show that even without a high-throughput I/O sub-system TIP yields reduced execution time of up to 30\% for applications obtaining data from a remote file server and up to 13\% for applications obtaining data from a single local disk. These experiments indicate that greater performance benefits will be available when TIP is integrated with low level resource management policies and highly parallel I/O subsystems such as disk arrays.}, comment = {Invited speaker: Garth Gibson. Similar paper appeared in ACM OSR April 1993 (patterson:tip)} } @Misc{patterson:vterabytes, author = {David Patterson}, title = {Terabytes $\gg$ Teraflops (or Why Work on Processors When {I/O} is Where the Action Is?)}, year = {1993}, howpublished = {Produced by University Video Communications}, note = {Videotape}, URL = {http://www.uvc.com/videos/06Patterson.video.html}, keyword = {videotape, computer architecture, parallel I/O, pario-bib}, abstract = {RISC pioneer and UC, Berkeley Computer Science Professor David Patterson is working to develop input/output systems to match the increasingly higher performance of new processors. Here he describes the results of the RAID (Redundant Arrays of Inexpensive Disks) project, which offers much greater performance, capacity, and reliability from I/O systems. Patterson also discusses a new project, Sequoia 2000, which looks at utilizing small helical scan tapes, such as digital-audiotapes or videotapes, to offer terabytes of storage for the price of a file/server. He believes that a 1000x increase in storage, available on most Ethernets, will have a much greater impact than a 1000x increase in processing speed.}, comment = {See patterson:trends. 58 minutes.} } @InProceedings{pawlowski:parsort, author = {Markus Pawlowski and Rudolf Bayer}, title = {Parallel Sorting of Large Data Volumes on Distributed Memory Multiprocessors}, booktitle = {Parallel Computer Architectures: Theory, Hardware, Software, Applications}, year = {1993}, series = {Lecture Notes in Computer Science}, volume = {732}, pages = {246--264}, publisher = {Springer-Verlag}, address = {Berlin}, keyword = {sorting, parallel I/O algorithm, pario-bib}, comment = {Main contribution appears to be a new sampling method for initial partition of data set. They approach it from a database point of view.} } @TechReport{perez:clfs, author = {F. {P\'erez} and J. Carretero and P. de~Miguel and F. {Garc\'{\i}a} and L. Alonso}, title = {{CLFS} Design: A Parallel File Manager for Multicomputers}, year = {1994}, number = {FIM/82.1/DATSI/94}, institution = {Universidad Politecnic Madrid}, address = {Madrid, Spain}, URL = {http://laurel.datsi.fi.upm.es/~gp/publications/datsi82.1.ps.Z}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, abstract = {This document describes the detailed design of the CLFS, one of the components of the Cache Coherent File System (CCFS). CCFS has three main components: Client File Server (CLFS), Local File Server (LFS), Concurrent Disk System (CDS). The Client File Servers are located on each processing node, to develop file manager functions in a per node basis. The CLFS will interact with the LFSs to provide block services, naming, locking, real input/output and to manage the disk system, partitions, distributed partitions, etc. The CLFS includes a standard POSIX interface (internally parallelized) and some parallel extensions It will be responsible of maintaining cache consistency, distributing accesses to servers, providing a file system interface to the user, etc.}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @Article{perez:evaluate, author = {F. Perez and J. Carretero and L. Alonso}, title = {Evaluating {ParFiSys}: A high-performance parallel and distributed file system}, journal = {Journal of Systems Architecture}, year = {1997}, month = {May}, volume = {43}, number = {8}, pages = {533--542}, keyword = {verify authors, parallel I/O, multiprocessor file system, pario-bib}, abstract = {We present an overview of ParFiSys, a coherent parallel file system developed at the UPM to provide I/O services to the GPMIMD machine, an MPP built within the ESPRIT project P-5404. Special emphasis is made on the results obtained during ParFiSys evaluation. They were obtained using several I/O benchmarks (PARKBENCH, IOBENCH, etc.) and several MPP platforms (T800, T9000, etc.) to cover a big spectrum of the ParFiSys features, being specifically oriented to measure throughput for scientific applications I/O patterns. ParFiSys is specially well suited to provide I/O services to scientific applications requiring high I/O bandwidth, to minimize application porting effort, and to exploit the parallelism of generic message-passing multicomputers.} } @InProceedings{philippsen:triton, author = {Michael Philippsen and Thomas M. Warschko and Walter F. Tichy and Christian G. Herter}, title = {{Project Triton:} Towards improved Programmability of Parallel Machines}, booktitle = {Proceedings of the Twenty-Sixth Annual Hawaii International Conference on System Sciences}, year = {1993}, volume = {I}, pages = {192--201}, keyword = {parallel programming, parallel architecture, parallel I/O, pario-bib}, comment = {A language- and application-driven proposal for parallel architecture, that mixes SIMD and MIMD, high-performance networking, large memory, shared address space, and so forth. Fairly convincing arguments. One disk per node. Little mention of a file system though. Email from student Udo Boehm:``We use in the version of Triton/1 with 256 PE's 72 Disks at the moment (the filesystem is scalable up to 256 Disks). These Disks are divided into 8 Groups with 9 Disks. In each group exists one parity disk. Our implementation of the filesystem is an parallel version of RAID Level 3 with some extensions. We use so called vector files for diskaccess. A file is always distributed over all disks of the diskarray. A vectorfile is divided in logical blocks. A logical block exist of 72 physical blocks, each block is on one of the 72 disks and all these 72 physical blocks have the same blocknumber on each disk. A logical block has 18432 Bytes, where 16384 Bytes are for Data. The filesystem uses these logical blocks to save data. We do not use special PE's for the I/O. All PE's can be (are) used to do I/O ! There exists no central which coordinates the PE's.''} } @InProceedings{pierce:pario, author = {Paul Pierce}, title = {A Concurrent File System for a Highly Parallel Mass Storage System}, booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications}, year = {1989}, month = {March}, pages = {155--160}, publisher = {Golden Gate Enterprises, Los Altos, CA}, address = {Monterey, CA}, keyword = {parallel I/O, hypercube, Intel iPSC/2, multiprocessor file system, pario-bib}, comment = {Intel iPSC/2 Concurrent File System. Chose to tailor system for high performance for large files, read in large chunks. Uniform logical file system view, Unix stdio interface. Blocks scattered over all disks, but not striped. Blocksize 4K optimizes message-passing performance without using blocks that are too big. Tree-directory is stored in ONE file and managed by ONE process, so opens are bottlenecked, but that is not their emphasis. File headers, however, are scattered. The file header info contains a list of blocks. File header is managed by disk process on its I/O node. Data caching is done only at the I/O node of the originating disk drive. Read-ahead is used but not detailed here.} } @TechReport{poole:sio-survey, author = {James T. Poole}, title = {Preliminary Survey of {I/O} Intensive Applications}, year = {1994}, number = {CCSF-38}, institution = {Scalable I/O Initiative}, address = {Caltech Concurrent Supercomputing Facilities, Caltech}, URL = {http://www.ccsf.caltech.edu/SIO/SIO_apps.ps}, keyword = {parallel I/O, pario-bib, multiprocessor file system, file access pattern, checkpoint}, comment = {Goal is to collect a set of representative applications from biology, chemistry, earth science, engineering, graphics, and physics, use performance-monitoring tools to analyze them, create templates and benchmarks that represent them, and then later to evaluate the performance of new I/O tools created by rest of the SIO initiative. Seem to be four categories of I/O needs: input, output, checkpoint, and virtual memory (``out-of-core'' scratch space). Not all types are significant in all applications. (Two groups mention databases and the need to perform computationally complex queries.) Large input is typically raw data (seismic soundings, astronomical observations, satellite remote sensing, weather information). Sometimes there are real-time constraints. Output is often periodic, e.g., the state of the system every few timesteps; typically the volume would increase along with I/O capacity and bandwidth. Checkpointing is a common request; preferably allowing application to choose what and when to checkpoint, and definitely including the state of files. Many kinds of out-of-core: 1) temp files between passes (often written and read sequentially), 2) regular patterns like FFT, matrix transpose, solvers, and single-pass read/compute/write, 3) random access, e.g., to precomputed tables of integrals. Distinct differences in the ways people choose to divide data into files; sometimes all in one huge file, sometimes many ``small'' files (e.g., one per processor, one per timestep, one per region, etc.). Important: overlap of computation and I/O, independent access by individual processors. Not always important: ordering of records read or written by different processors, exposing the I/O model to the application writer. Units of I/O seem to be either (sub)matrices (1--5 dimensions) or items in a collection of objects (100--10000 bytes each). Data sets varied up to 1~TB; bandwidth needs varied up to 1~GB/s. See also bagrodia:sio-character, choudhary:sio-language, bershad:sio-os.} } @InProceedings{poston:hpfs, author = {Alan Poston}, title = {A High Performance File System for {UNIX}}, booktitle = {Proceedings of the USENIX Workshop on UNIX and Supercomputers}, year = {1988}, pages = {215--226}, keyword = {file system, unix, parallel I/O, disk striping, pario-bib}, comment = {A new file system for Unix based on striped files. Better performance for sequential access, better for large-file random access and about the same for small-file random access. Allows full striping track prefetch, or even volume prefetch. Needs a little bit of buffer management change. Talks about buffer management and parity blocks.} } @InProceedings{prabhakar:browsing, author = {Sunil Prabhakar and Divyakant Agrawal and Amr {El Abbadi} and Ambuj Singh and Terence Smith}, title = {Browsing and Placement of Multiresolution Images on Parallel Disks}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {102--113}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {multimedia, parallel I/O, pario-bib}, abstract = {With rapid advances in computer and communication technologies, there is an increasing demand to build and maintain large image repositories. In order to reduce the demands on I/O and network resources, multiresolution representations are being proposed for the storage organization of images. Image decomposition techniques such as {\em wavelets} can be used to provide these multiresolution images. The original image is represented by several coefficients, one of them with visual similarity to the original image, but at a lower resolution. These visually similar coefficients can be thought of as {\em thumbnails} or {\em icons} of the original image. This paper addresses the problem of storing these multiresolution coefficients on disks so that thumbnail browsing as well as image reconstruction can be performed efficiently. Several strategies are evaluated to store the image coefficients on parallel disks. These strategies can be classified into two broad classes depending on whether the access pattern of the images is used in the placement. Disk simulation is used to evaluate the performance of these strategies. Simulation results are validated with results from experiments with real disks and are found to be in good agreement. The results indicate that significant performance improvements can be achieved with as few as four disks by placing image coefficients based upon browsing access patterns.}, comment = {They use simulation to study several different placement policies for the thumbnail and varying-resolution versions of images on a disk array.} } @InProceedings{pratt:twofs, author = {Terrence W. Pratt and James C. French and Phillip M. Dickens and Janet, Jr., Stanley A.}, title = {A Comparison of the Architecture and Performance of Two Parallel File Systems}, booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications}, year = {1989}, pages = {161--166}, publisher = {Golden Gate Enterprises, Los Altos, CA}, address = {Monterey, CA}, keyword = {parallel I/O, Intel iPSC/2, nCUBE, pario-bib}, comment = {Simple comparison of the iPSC/2 and nCUBE/10 parallel I/O systems. Short description of each system, with simple transfer rate measurements. See also french:ipsc2io-tr.} } @TechReport{prost:mpi-io, author = {Jean-Pierre Prost and Marc Snir and Peter Corbett and Dror Feitelson}, title = {{MPI-IO,} A Message-Passing Interface for Concurrent {I/O}}, year = {1994}, month = {August}, number = {RC~19712 (87394)}, institution = {IBM T.J. Watson Research Center}, keyword = {parallel I/O, message-passing, multiprocesor file system interface, pario-bib}, comment = {See newer version mpi-ioc:mpi-io5.} } @Booklet{rab:raidbook, key = {RAB}, title = {The {RAIDBook}: A Source Book for {RAID} Technology}, year = {1993}, month = {June 9}, howpublished = {The RAID Advisory Board}, address = {Lino Lakes, MN}, note = {First Edition}, keyword = {RAID, disk array, parallel I/O, pario-bib}, comment = {Basically, an educational piece about the basics of RAID technology. Helps to define terms across the industry. Written by the RAID advisory board, which is an industry consortium. Overviews RAID, RAID levels, non-Berkeley RAID levels. List of Board members. Bibliography.} } @TechReport{reddy:compiler-tr, author = {A. L. Narasimha Reddy and P. Banerjee and D. K. Chen}, title = {Compiler Support for Parallel {I/O} Operations}, year = {1991}, institution = {IBM Yorktown Heights}, note = {Also appeared in ICPP '91}, later = {reddy:compiler}, keyword = {parallel I/O, pario-bib, compilers} } @InProceedings{reddy:compiler, author = {A. L. Narasimha Reddy and P. Banerjee and D. K. Chen}, title = {Compiler Support for Parallel {I/O} Operations}, booktitle = {Proceedings of the 1991 International Conference on Parallel Processing}, year = {1991}, pages = {II:290--II:291}, publisher = {CRC Press}, address = {St. Charles, IL}, earlier = {reddy:compiler-tr}, keyword = {parallel I/O, pario-bib, compilers}, comment = {This version is only 2 pages. reddy:compiler-tr provides the full text. They discuss three primary issues. 1) Overlapping I/O with computation: the compiler's dependency analysis is used to decide when some I/O may be moved up and performed asynchronously with other computation. 2) Parallel execution of I/O statements: {\em if} all sizes are known at compile time, the compiler can insert seeks so that processes can access the file independently. When writing in the presence of conditionals they even propose skipping by the maximum and leaving holes in the file, and they claim that this doesn't hurt (!). 3) Parallel format conversion: again, if there are fixed-width fields the compiler can have processors seek to different locations, read data independently, and do format conversion in parallel. Really all this is saying is that fixed-width fields are good for parallelism, and that compilers could take advantage of them.} } @InProceedings{reddy:hyperio1, author = {A. L. Reddy and P. Banerjee and Santosh G. Abraham}, title = {{I/O} Embedding in Hypercubes}, booktitle = {Proceedings of the 1988 International Conference on Parallel Processing}, year = {1988}, volume = {1}, pages = {331--338}, publisher = {Pennsylvania State Univ. Press}, address = {St. Charles, IL}, later = {reddy:hyperio3}, keyword = {parallel I/O, hypercube, pario-bib}, comment = {Emphasis is on adjacency. It also implies (and they assume) that data is distributed well across the disks so no data needs to move beyond the neighbors of an I/O node. Still, the idea of adjacency is good since it allows for good data distribution while not requiring it, and for balancing I/O procs among procs in a good way. Also avoids messing up the hypercube regularity with (embedded) dedicated I/O nodes.} } @InProceedings{reddy:hyperio2, author = {A. L. Reddy and P. Banerjee}, title = {{I/O} issues for hypercubes}, booktitle = {ACM International Conference on Supercomputing}, year = {1989}, pages = {72--81}, later = {reddy:hyperio3}, keyword = {parallel I/O, hypercube, pario-bib}, comment = {See reddy:hyperio3 for extended version.} } @Article{reddy:hyperio3, author = {A. L. Narasimha Reddy and Prithviraj Banerjee}, title = {Design, Analysis, and Simulation of {I/O} Architectures for Hypercube Multiprocessors}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1990}, month = {April}, volume = {1}, number = {2}, pages = {140--151}, publisher = {IEEE Computer Society Press}, earlier = {reddy:hyperio1}, keyword = {parallel I/O, hypercube, pario-bib}, comment = {An overall paper restating their embedding technique from reddy:hyperio1, plus a little bit of evaluation along the lines of reddy:pario2, plus some ideas about matrix layout on the disks. They claim that declustering is important, since synchronized disks do not provide enough parallelism, especially in the communication across the hypercube (since the synchronized disks must hang off one node).} } @InProceedings{reddy:pario, author = {A. Reddy and P. Banerjee}, title = {An Evaluation of multiple-disk {I/O} systems}, booktitle = {Proceedings of the 1989 International Conference on Parallel Processing}, year = {1989}, pages = {I:315--322}, publisher = {Pennsylvania State Univ. Press}, address = {St. Charles, IL}, later = {reddy:pario2}, keyword = {parallel I/O, disk array, disk striping, pario-bib}, comment = {see also expanded version reddy:pario2} } @Article{reddy:pario2, author = {A. Reddy and P. Banerjee}, title = {Evaluation of multiple-disk {I/O} systems}, journal = {IEEE Transactions on Computers}, year = {1989}, month = {December}, volume = {38}, pages = {1680--1690}, publisher = {IEEE Computer Society Press}, earlier = {reddy:pario}, later = {reddy:pario3}, keyword = {parallel I/O, disk array, disk striping, pario-bib}, comment = {Compares declustered disks (sortof MIMD-like) to synchronized-interleaved (SIMD-like). Declustering needed for scalability, and is better for scientific workloads. Handles large parallelism needed for scientific workloads and for RAID-like architectures. Synchronized interleaving is better for general file system workloads due to better utilization and reduction of seek overhead.} } @Article{reddy:pario3, author = {A. L. Reddy and Prithviraj Banerjee}, title = {A Study of Parallel Disk Organizations}, journal = {Computer Architecture News}, year = {1989}, month = {September}, volume = {17}, number = {5}, pages = {40--47}, earlier = {reddy:pario2}, keyword = {parallel I/O, disk array, disk striping, pario-bib}, comment = {nothing new over expanded version reddy:pario2, little different from reddy:pario} } @InProceedings{reddy:perfectio, author = {A. L. Narasimha Reddy and Prithviraj Banerjee}, title = {A Study of {I/O} Behavior of {Perfect} Benchmarks on a Multiprocessor}, booktitle = {Proceedings of the 17th Annual International Symposium on Computer Architecture}, year = {1990}, pages = {312--321}, keyword = {parallel I/O, file access pattern, workload, multiprocessor file system, benchmark, pario-bib}, comment = {Using five applications from the Perfect benchmark suite, they studied both implicit (paging) and explicit (file) I/O activity. They found that the paging activity was relatively small and that sequential access to VM was common. All access to files was sequential, though this may be due to the programmer's belief that the file system is sequential. Buffered I/O would help to make transfers bigger and more efficient, but there wasn't enough rereferencing to make caching useful.} } @PhdThesis{reddy:thesis, author = {Narasimha {Reddy L. Annapareddy}}, title = {Parallel Input/Output Architectures for Multiprocessors}, year = {1990}, month = {August}, school = {University of Illinois at Urbana-Champaign}, note = {Available as technical report UILU-ENG-90-2235 or CRHC-90-5}, keyword = {parallel I/O, multiprocessor architecture, pario-bib}, comment = {Much of the material in this thesis has been published in other papers, i.e., reddy:io, reddy:notsame, reddy:hyperio1, reddy:hyperio2, reddy:hyperio3, reddy:pario, reddy:pario2, reddy:pario3, reddy:perfectio, reddy:mmio. He traces some ``Perfect'' benchmarks to determine paging and file access patterns. He simulates a variety of declustered, synchronized, and synchronized-declustered striping configurations under both ``file'' and ``scientific'' workloads to determine which is best. He proposes embeddings for I/O nodes in hypercubes, where the I/O nodes are just like regular nodes but with an additional I/O processor and disk(s). He studies the disk configurations again, when embedded in hypercubes. He proposes ways to lay out matrices (in blocked form) across disks in a hypercube. He proposes a new parity-based fault-tolerance scheme that prevents overloading during failure-mode access. And he considers compiler issues: overlapping I/O with computation, parallelizing I/O statements, and parallel format conversion.} } @Article{reed:panel, author = {Daniel A. Reed and Charles Catlett and Alok Choudhary and David Kotz and Marc Snir}, title = {Parallel {I/O}: Getting Ready for Prime Time}, journal = {IEEE Parallel and Distributed Technology}, year = {1995}, month = {Summer}, pages = {64--71}, publisher = {IEEE Computer Society Press}, note = {Edited transcript of panel discussion at the 1994 International Conference on Parallel Processing}, URL = {http://bugle.cs.uiuc.edu/Projects/IO/ICPP-Panel.ps.Z}, keyword = {parallel I/O, pario-bib, dfk}, comment = {This paper summarizes the presentations made by panel members at the ICPP panel discussion on parallel I/O, and the ensuing discussion. URL represents version just prior to IEEE PDT editing process.} } @Article{rettberg:monarch, author = {Randall D. Rettberg and William R. Crowther and Philip P. Carvey and Raymond S. Tomlinson}, title = {The {Monarch Parallel Processor} Hardware Design}, journal = {IEEE Computer}, year = {1990}, month = {April}, volume = {23}, number = {4}, pages = {18--30}, publisher = {IEEE Computer Society Press}, keyword = {MIMD, parallel architecture, shared memory, parallel I/O, pario-bib}, comment = {This describes the Monarch computer from BBN. It was never built. 65K processors and memory modules. 65GB RAM. Bfly-style switch in dance-hall layout. Switch is synchronous; one switch time is a {\em frame} (one microsecond, equal to 3 processor cycles) and all processors may reference memory in one frame time. Local I-cache only. Contention reduces full bandwidth by 16 percent. Full 64-bit machine. Custom VLSI. Each memory location has 8 tag bits. One allows for a location to be locked by a processor. Thus, any FetchAndOp or full/empty model can be supported. I/O is done by adding I/O processors (up to 2K in a 65K-proc machine) in the switch. They plan 200 disks, each with an I/O processor, for 65K nodes. They would spread each block over 9 disks, including one for parity (essentially RAID).} } @Unpublished{riesen:experience, author = {Rolf Riesen and Arthur B. Maccabe and Stephen R. Wheat}, title = {Experience in Implementing a Parallel File System}, year = {1993}, month = {March}, note = {Available for ftp?}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {They describe their experience building a file system for SUNMOS. Paper describes tuning the SCSI device, their striping strategy, their message-passing tricks, and some performance results.} } @Article{rochberg:ctip, author = {David Rochberg and Garth Gibson}, title = {Prefetching Over a Network: Early Experience with {CTIP}}, journal = {ACM SIGMETRICS Performance Evaluation Review}, year = {1997}, month = {December}, volume = {25}, number = {3}, pages = {29--36}, keyword = {file prefetching, distributed file system, parallel I/O, pario-bib}, comment = {Part of a special issue on parallel and distributed I/O.} } @InProceedings{rodriguez:nnt, author = {Bernardo Rodriguez and Leslie Hart and Tom Henderson}, title = {Programming Regular Grid-Based Weather Simulation Models for Portable and Fast Execution}, booktitle = {Proceedings of the 1995 International Conference on Parallel Processing}, year = {1995}, month = {August}, pages = {III:51--59}, publisher = {CRC Press}, address = {St. Charles, IL}, keyword = {weather simulation, scientific application, parallel I/O, pario-bib}, comment = {Related to hart:grid.} } @TechReport{rosales:cds, author = {F. Rosales and J. Carretero and F. {P\'erez} and P. de~Miguel and F. {Garc\'{\i}a} and L. Alonso}, title = {{CDS} Design: A Parallel Disk Server for Multicomputers}, year = {1994}, number = {FIM/83.1/DATSI/94}, institution = {Universidad Politecnic Madrid}, address = {Madrid, Spain}, URL = {http://laurel.datsi.fi.upm.es/~gp/publications/datsi83.1.ps.Z}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, abstract = {This document describes the detailed design of the CDS, one of the components of the Cache Coherent File System (CCFS). CCFS has three main components: Client File Server (CLFS), Local File Server (LFS), Concurrent Disk System (CDS). A CDSs is located on each disk node, to develop input/output functions in a per node basis. The CDS will interact with the microkernel drivers to execute real input/output and to manage the disk system. The CDS includes general services to distribute accesses to disks, controlling partition information, etc.}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @InProceedings{rothnie:ksr, author = {James Rothnie}, title = {{Kendall Square Research:} Introduction to the {KSR1}}, booktitle = {Proceedings of the 1992 DAGS/PC Symposium}, year = {1992}, month = {June 23--27}, pages = {200--210}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, keyword = {parallel architecture, shared memory, MIMD, interconnection network, parallel I/O, memory-mapped files, pario-bib}, comment = {Overview of the KSR1.} } @InProceedings{roy:unixfile, author = {Paul J. Roy}, title = {Unix File Access and Caching in a Multicomputer Environment}, booktitle = {Proceedings of the Usenix {Mach III} Symposium}, year = {1993}, pages = {21--37}, keyword = {multiprocessor file system, Unix, Mach, memory mapped file, pario-bib}, comment = {Describes the modifications to the OSF/1 AD file system for a multicomputer environment. Goal is for normal Unix files, not supercomputer access. The big thing was separation of the caching from backing store management, by pulling out the cache management into the Extended Memory Management (XMM) subsystem. Normally OSF/1 maps files to Mach memory objects, which are then accessed (through read() and write()) using bcopy(). XMM makes it possible to access these memory objects from any node in the system, providing coherent compute-node caching of pages from the memory object. It uses tokens controlled by the XMM server at the file's server node to support a single-reader, single-writer policy on the whole file, but migrating page by page. They plan to extend to multiple writers, but atomicity constraints on the file pointer and metadata make it difficult. Files are NOT striped across file servers or I/O nodes. Several hacks were necessary to work around Mach interface problems. Unix buffer caching is abandoned. Future includes supercomputer support in the form of turning off all caching. No performance evaluation included. See zajcew:osf1.} } @Unpublished{rullman:interface, author = {Brad Rullman and David Payne}, title = {An Efficient File {I/O} Interface for Parallel Applications}, year = {1995}, month = {February}, note = {DRAFT presented at the Workshop on Scalable I/O, Frontiers~'95}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, comment = {They believe that the API should be Unix-compatible, systems must support scalable performance on large transfers of data, and that systems must support very large files. Most of the paper is specifics about the Paragon PFS interface, which has many features not mentioned in earlier PFS papers. Contact brad@ssd.intel.com or payne@ssd.intel.com.} } @Misc{ryan:cfs, author = {Steve Ryan}, title = {{CFS} workload demonstration code}, year = {1991}, month = {July}, howpublished = {WWW ftp://ftp.cs.dartmouth.edu/pub/pario/examples/CFS3D.tar.Z}, note = {A simple program demonstrating CFS usage for ARC3D-like applications}, URL = {ftp://ftp.cs.dartmouth.edu/pub/pario/examples/CFS3D.tar.Z}, keyword = {parallel I/O workload, file access pattern, Intel, pario-bib}, comment = {A sample code that tries to behave like a parallel ARC3D in terms of its output. It writes two files, one containing three three-dimensional matrices X, Y, and Z, and the other containing the four-dimensional matrix Q. The matrices are spread over all the nodes, and each file is written in parallel by the processors. See also ryan:navier.} } @InProceedings{ryan:navier, author = {J. S. Ryan and S. K. Weeratunga}, title = {Parallel Computation of {3-D Navier-Stokes} Flowfields for Supersonic Vehicles}, booktitle = {31st Aerospace Sciences Meeting and Exhibit}, year = {1993}, address = {Reno, NV}, note = {AIAA Paper 93-0064}, URL = {http://www.nas.nasa.gov/HPCC/Pubs/ryan/AIAA93-0064.html}, keyword = {parallel application, CFD, parallel I/O, pario-bib}, comment = {This paper goes with the ryan:cfs code example. Describes their parallel implementation of the ARC3D code on the iPSC/860. A section of the paper considers I/O, which is to write out a large multidimensional matrix at each timestep. They found that it was actually faster to write to separate files because of congestion in the I/O nodes was hurting performance. They never got more than 2 MB/s, even so, on a system that should obtain 7-10 MB/s peak.} } @InProceedings{salem:diskstripe, author = {Kenneth Salem and Hector Garcia-Molina}, title = {Disk Striping}, booktitle = {Proceedings of the IEEE 1986 Conference on Data Engineering}, year = {1986}, pages = {336--342}, earlier = {salem:striping}, keyword = {parallel I/O, disk striping, disk array, pario-bib}, comment = {See the techreport salem:striping for a nearly identical but more detailed version.} } @TechReport{salem:striping, author = {Kenneth Salem and Hector Garcia-Molina}, title = {Disk Striping}, year = {1984}, month = {December}, number = {332}, institution = {EECS Dept. Princeton Univ.}, later = {salem:disktripe}, keyword = {parallel I/O, disk striping, disk array, pario-bib}, comment = {Cite salem:diskstripe instead. Basic paper on striping. For uniprocessor, single-user machine. Interleaving asynchronous, even without matching disk locations though this is discussed. All done with models.} } @InProceedings{salmon:cubix, author = {John Salmon}, title = {{CUBIX: Programming} Hypercubes without Programming Hosts}, booktitle = {Proceedings of the Second Conference on Hypercube Multiprocessors}, year = {1986}, pages = {3--9}, keyword = {hypercube, multiprocessor file system interface, pario-bib}, comment = {Previously, hypercubes were programmed as a combination of host and node programs. Salmon proposes to use a universal host program that acts essentially as a file server, responding to requests from the node programs. Two modes: crystalline, where node programs run in loose synchrony, and amorphous, where node programs are asynchronous. In the crystalline case, files have a single file pointer and are either single- or multiple- access; single access means all nodes must simultaneously issue the same request; multiple access means they all simultaneously issue the same request with different parameters, giving an interleaved pattern. Amorphous allows asynchronous activity, with separate file pointers per node.} } @InProceedings{salmon:nbody, author = {John Salmon and Michael Warren}, title = {Parallel Out-of-core Methods for {N}-body Simulation}, booktitle = {Proceedings of the Eighth SIAM Conference on Parallel Processing for Scientific Computing}, year = {1997}, URL = {http://www.cacr.caltech.edu/~johns/pubs/siam97/}, keyword = {verify pages and month, parallel I/O, out of core applications, scientific computing, pario-bib}, abstract = {Hierarchical treecodes have, to a large extent, converted the compute-bound N-body problem into a memory-bound problem. The large ratio of DRAM to disk pricing suggests use of out-of-core techniques to overcome memory capacity limitations. We will describe a parallel, out-of-core treecode library, targeted at machines with independent secondary storage associated with each processor. Borrowing the space-filling curve techniques from our in-core library, and ``manually'' paging, results in excellent spatial and temporal locality and very good performance.} } @TechReport{sanders:datatypes, author = {Darren Sanders and Yoonho Park and Maciej Brodowicz}, title = {Implementation and performance of {MPI-IO} file access using {MPI} datatypes}, year = {1996}, month = {November}, number = {UH-CS-96-12}, institution = {University of Houston}, URL = {http://www.hpc.uh.edu/cenju/pub/mpio.ps}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, abstract = {In this paper we document our experience implementing MPI-IO file access using MPI datatypes. We present performance results and discuss two significant problems that stem from the flexibility of MPI datatypes. First, MPI datatypes can be used to specify non-contiguous access patterns. Optimizing data transfers for such patterns is difficult. Second, the behavior of MPI datatypes in a heterogenous environment is not well-defined.}, comment = {They devise several file-access strategies for different situations, depending on the particulars of the etypes and filetypes in use: sequential, two-phase I/O, one file access per etype (random access), and one file access per etype element (random access with smaller pieces). They measure the performance of their system with example patterns that trigger each strategy. It would be nice to see a more extensive performance analysis of their implementation, and of their strategies.} } @InProceedings{savage:afraid, author = {Stefan Savage and John Wilkes}, title = {{AFRAID}--- A Frequently Redundant Array of Independent Disks}, booktitle = {Proceedings of the 1996 USENIX Technical Conference}, year = {1996}, month = {January}, pages = {27--39}, URL = {http://www.hpl.hp.com/personal/John_Wilkes/papers/AFRAID.ps.Z}, keyword = {RAID, disk array, parallel I/O, pario-bib}, comment = {RAID array that relaxes the consistency requirements, to not write parity during busy periods, then to go back and update parity during idle periods. Thus you sacrifice a little reliability for performance; you can select how much.} } @TechReport{scheuermann:partition, author = {Peter Scheuermann and Gerhard Weikum and Peter Zabback}, title = {Data Partitioning and Load Balancing in Parallel Disk Systems}, year = {1994}, month = {January}, number = {209}, institution = {ETH Zurich}, later = {scheuermann:partition2}, keyword = {parallel I/O, disk array, disk striping, load balance, pario-bib}, comment = {Updated as scheuermann:partition2. They describe a file system that attempts to choose both the degree of declustering and the striping unit size to accomodate the needs of different files. They also decsribe static and dynamic placement and migration policies to readjust the load across disks. Note that there are several references in the bib that are about their file system, called FIVE. Seems to be the same as scheuermann:tunable.} } @TechReport{scheuermann:partition2, author = {Peter Scheuermann and Gerhard Weikum and Peter Zabback}, title = {Data Partitioning and Load Balancing in Parallel Disk Systems}, year = {1996}, month = {April}, number = {A/02/96}, institution = {Universit\"at Des Saarlandes}, address = {SaarBr\"ucken, Germany}, note = {Submitted to VLDB Journal.}, earlier = {scheuermann:partition}, keyword = {verify, parallel I/O, disk array, disk striping, load balance, pario-bib}, comment = {Updated version of scheuermann:partition.} } @Unpublished{scheuermann:tunable, author = {Peter Scheuermann and Gerhard Weikum and Peter Zabback}, title = {The Case for Tunable Disk Arrays}, year = {1993}, note = {Submitted to IEEE Computer}, keyword = {verify, parallel I/O, disk array, disk striping, pario-bib}, comment = {Seems to be the same as scheuermann:partition.} } @InCollection{schloss:hcsa-book, author = {Gerhard A. Schloss and Michael Vernick}, title = {{HCSA}: A Hybrid Client-Server Architecture}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {15}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {333--351}, publisher = {Kluwer Academic Publishers}, earlier = {schloss:hcsa}, keyword = {parallel I/O architecture, pario-bib}, abstract = {The {\em HCSA} (Hybrid Client-Server Architecture), a flexible system layout that combines the advantages of the traditional Client-Server Architecture (CSA) with those of the Shared Disk Architecture (SDA), is introduced. In {\em HCSA}, the traditional CSA-style I/O subsystem is modified to give the clients network access to both the server and the server's set of disks. Hence, the {\em HCSA} is more fault-tolerant than the CSA since there are two paths between any client and the shared data. Moreover, a simulation study demonstrates that the {\em HCSA} is able to support a larger number of clients than the CSA or SDA under similar system workloads. Finally, the {\em HCSA} can run applications in either a CSA mode, an SDA mode, or a combination of the two, thus offering backward compatibility with a large number of existing applications.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @InProceedings{schloss:hcsa, author = {Gary Schloss and Michael Vernick}, title = {{HCSA:} A Hybrid Client-Server Architecture}, booktitle = {Proceedings of the IPPS~'95 Workshop on Input/Output in Parallel and Distributed Systems}, year = {1995}, month = {April}, pages = {63--77}, later = {schloss:hcsa-book}, URL = {http://www.cs.sunysb.edu/~vernick/Papers/iopads.ps}, keyword = {parallel I/O, pario-bib}, comment = {In the context of client-server database systems, they propose to make a compromise between shared-disk architectures, where the disks are all attached to the network and all machines are both clients and servers, and a system where the disks are attached to a single server. Their compromise attaches the disks to both the network and the server.} } @Misc{schneider:sp2-io, author = {David Schneider}, title = {Application {I/O} and Related Issues on the {SP2}}, year = {1995}, organization = {Cornell Theory Center, Cornell University}, note = {Available at \verb+http://www.tc.cornell.edu/SmartNodes/Newsletters/1994/V6N5/application.html+}, URL = {http://www.tc.cornell.edu/SmartNodes/Newsletters/1994/V6N5/application.html}, keyword = {parallel I/O, IBM SP-2, pario-bib} } @TechReport{schulze:raid, author = {Martin Schulze}, title = {Considerations in the Design of a {RAID} Prototype}, year = {1988}, month = {August}, number = {UCB/CSD 88/448}, institution = {UC Berkeley}, URL = {http://cs-tr.cs.berkeley.edu/TR/UCB:CSD-88-448}, keyword = {parallel I/O, RAID, disk array, disk architecture, pario-bib}, comment = {Very practical description of the RAID I prototype.} } @InProceedings{schulze:raid2, author = {Martin Schulze and Garth Gibson and Randy Katz and David Patterson}, title = {How Reliable is a {RAID}?}, booktitle = {Proceedings of IEEE Compcon}, year = {1989}, month = {Spring}, earlier = {chen:raid}, keyword = {parallel I/O, reliability, RAID, disk array, disk architecture, pario-bib}, comment = {Published version of second paper in chen:raid. Some overlap with schulze:raid, though that paper has more detail.} } @InProceedings{schwabe:flexible, author = {Eric J. Schwabe and Ian M. Sutherland}, title = {Flexible Use of Parity Storage Space in Disk Arrays}, booktitle = {Proceedings of the Eighth Symposium on Parallel Algorithms and Architectures}, year = {1996}, month = {June}, pages = {99--108}, publisher = {ACM Press}, address = {Padua, Italy}, keyword = {parallel disks, disk array, parity, RAID, pario-bib} } @Article{schwabe:jlayouts, author = {Eric J. Schwabe and Ian M. Sutherland and Bruce K. Holmer}, title = {Evaluating Approximately Balanced Parity-Declustered Data Layouts for Disk Arrays}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {501--523}, publisher = {North-Holland (Elsevier Scientific)}, earlier = {schwabe:layouts}, keyword = {disk array, parity, RAID, parallel I/O, pario-bib}, abstract = {Parity-declustered data layouts were developed to reduce the time for on-line failure recovery in disk arrays. They generally require perfect balancing of reconstruction workload among the disks; this restrictive balance condition makes such data layouts difficult to construct. In this paper, we consider approximately balanced data layouts, where some variation in the reconstruction workload over the disks is permitted. Such layouts are considerably easier to construct than perfectly balanced layouts. We consider three methods for constructing approximately balanced data layouts, and analyze their performance both theoretically and experimentally. We conclude that on uniform workloads, approximately balanced layouts have performance nearly identical to that of perfectly balanced layouts.} } @InProceedings{schwabe:layouts, author = {Eric J. Schwabe and Ian M. Sutherland and Bruce K. Holmer}, title = {Evaluating Approximately Balanced Parity-Declustered Data Layouts for Disk Arrays}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {41--54}, publisher = {ACM Press}, address = {Philadelphia}, later = {schwabe:jlayouts}, keyword = {parallel I/O, disk array, parity, RAID, pario-bib}, abstract = {Parity declustering has been used to reduce the time required to reconstruct a failed disk in a disk array. Most existing work on parity declustering uses BIBD-based data layouts, which distribute the workload of reconstructing a failed disk over the remaining disks of the array with perfect balance. For certain array sizes, however, there is no known BIBD-based layout. In this paper, we evaluate data layouts that are approximately balanced --- that is, that distribute the reconstruction workload over the disks of the array with only approximate balance. Approximately balanced layouts are considerably easier to construct than perfectly balanced layouts. We consider three methods for generating approximately balanced layouts: randomization, simulated annealing, and perturbing a BIBD-based layout whose size is near the desired size. We compare the performance of these approximately balanced layouts with that of perfectly balanced layouts using a disk array simulator. We conclude that, on uniform workloads, approximately balanced data layouts have performance nearly identical to that of perfectly balanced layouts. Approximately balanced layouts therefore provide the reconstruction performance benefits of perfectly balanced layouts for arrays where perfectly balanced layouts are either not known, or do not exist.} } @InProceedings{scott:matrix, author = {David S. Scott}, title = {Parallel {I/O} and Solving Out of Core Systems of Linear Equations}, booktitle = {Proceedings of the 1993 DAGS/PC Symposium}, year = {1993}, month = {June}, pages = {123--130}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, keyword = {parallel I/O, scientific computing, matrix factorization, Intel, pario-bib}, abstract = {Large systems of linear equations arise in a number of scientific and engineering applications. In this paper we describe the implementation of a family of disk based linear equation solvers and the required characteristics of the I/O system needed to support them.}, comment = {Invited speaker. See also scott:solvers. This gives a very brief overview of Intel's block solver and slab solver, both out-of-core linear-systems solvers. He notes a few optimizations that had to be made to CFS to make it work: data and metadata needed to have equal priority in the cache, because often the (higher-priority) metadata was crowding out the data; and they had to restrict some files to small subsets of disks to reduce the contention for the cache at each I/O node caused by large groups of processors all requesting at the same time (see nitzberg:cfs for the same problem).} } @InProceedings{scott:solvers, author = {David S. Scott}, title = {Out of Core Dense Solvers on {Intel} Parallel Supercomputers}, booktitle = {Proceedings of the Fourth Symposium on the Frontiers of Massively Parallel Computation}, year = {1992}, pages = {484--487}, keyword = {parallel I/O, scientific computing, Intel, pario-bib}, comment = {He discusses ProSolver-DES, which factors large matrices by swapping square submatrices in and out of memory, and Intel's new solver, which swaps column blocks in and out. The new solver is a little slower, but allows full pivoting, which is needed for stability in some matrices. A short paper with little detail. Some performance numbers. See scott:matrix.} } @InProceedings{seamons:compressed, author = {K. E. Seamons and M. Winslett}, title = {A Data Management Approach for Handling Large Compressed Arrays in High Performance Computing}, booktitle = {Proceedings of the Fifth Symposium on the Frontiers of Massively Parallel Computation}, year = {1995}, month = {February}, pages = {119--128}, URL = {http://bunny.cs.uiuc.edu/CADR/pubs/compression.ps}, keyword = {parallel I/O, pario-bib}, comment = {``This paper shows how compression can be used to speed up parallel i/o of large arrays. The current version of the paper focuses on improving write performance.'' They use chunked files like in seamons:interface but before writing they compress each chunk on its compute node, and after reading they decompress each chunk on its compute node. Presumably this is only useful when you plan to read back whole chunks. They find better performance for compressing in many cases, even when the compression time dominates the I/O time, because it reduces the I/O time so much. They found that the compression time and compression ratio can vary widely from chunk to chunk, leading to a tremendous load imbalance that unfortunately spoils some of the advanatages if all compute nodes must wait for the slowest to finish.} } @InProceedings{seamons:interface, author = {K. E. Seamons and M. Winslett}, title = {An Efficient Abstract Interface for Multidimensional Array {I/O}}, booktitle = {Proceedings of Supercomputing '94}, year = {1994}, month = {November}, pages = {650--659}, publisher = {IEEE Computer Society Press}, address = {Washington, DC}, later = {seamons:jpanda}, URL = {http://bunny.cs.uiuc.edu/CADR/pubs/super94.ps}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, comment = {``This paper shows what large performance gains can be made for parallel i/o of large arrays by using a carefully implemented library interface for i/o that makes use of array chunking. For example, the authors obtained a factor of 10 speedup in output of time step data by using the natural array chunks of the problem decomposition as the units of i/o on an Intel iPSC/860. The paper also presents results from experiments with the use of chunking in checkpointing and restarts on parallel architectures, and the use of chunking with memory-mapped data files in visualization on sequential architectures.'' They describe a library that supports chunked representations of matrices. That is, ways to checkpoint, output, or input multidimensional matrices to files in a blocked rather than row-major or column-major layout. This helps the file be more versatile for reading in a variety of dimensions. Their experiments show good performance improvements, although they only tried it for an application whose data set in memory was already in a blocked distribution -- I would guess that smaller improvements might come from column- or row-oriented memory distributions. Also, some of their performance improvement came from characteristics specific to the Intel CFS file system, having to do with its IOP-cache management policies. See also seamons:schemas and seamons:compressed.} } @Article{seamons:jpanda, author = {Kent E. Seamons and Marianne Winslett}, title = {Multidimensional Array {I/O} in {Panda~1.0}}, journal = {Journal of Supercomputing}, year = {1996}, volume = {10}, number = {2}, pages = {191--211}, earlier = {seamons:interface}, keyword = {parallel I/O, collective I/O, pario-bib} } @Misc{seamons:msio, author = {K. E. Seamons and Y. Chen and M. Winslett and Y. Cho and S. Kuo and P. Jones and J. Jozwiak and M. Subramanian}, title = {Fast and Easy {I/O} for Arrays in Large-scale Applications}, booktitle = {Workshop on Modeling and Specification of I/O}, year = {1995}, month = {October}, note = {At SPDP'95}, URL = {http://bunny.cs.uiuc.edu/CADR/pubs/msio.ps}, keyword = {parallel I/O, scientific computing, pario-bib}, abstract = {This four-page paper, written for an audience from the supercomputing/parallel i/o community, is a nice succinct introduction to Panda. Abstract and summary: \par Scientists with high-performance computing needs are plagued by applications suffering poor i/o performance and are burdened with the need to consider low-level physical storage details of persistent arrays in order to reach acceptable i/o performance levels, especially with existing parallel i/o facilities. The Panda i/o library (URL http://bunny.cs.uiuc.edu/CADR/panda.html) serves as a concrete example of a methodology for freeing application developers from unnecessary storage details through high-level abstract interfaces and providing them with increased performance and greater portability. \par Panda addresses these problems by introducing high-level application program interfaces for array i/o on both parallel and sequential machines, and by developing an efficient commodity-parts-based implementation of those interfaces across a variety of computer architectures. It is costly to build a file system from scratch and we designed Panda to run on top of existing commodity file systems such as AIX; excellent performance using this approach implies immediate and broad applicability. High-level interfaces provide ease of use, application portability, and, most importantly, allow plenty of flexibility for an efficient underlying implementation. A high-level view of an entire i/o operation, made possible with Panda's high level interfaces, allows Panda to optimize reading and writing arrays to the host file system on the i/o nodes using Panda's server-directed i/o architecture. \par Panda focuses specifically on multidimensional arrays, the data type at the root of i/o performance problems in scientific computing. The Panda i/o library exhibits excellent performance on the NASA Ames NAS IBM SP2, attaining 83--98\% of peak AIX performance on each i/o node in the experiments described in this paper. We expect high-level interfaces such as Panda's to become the interfaces of choice for scientific applications in the future. As Panda can be easily added on top of existing parallel file systems and ordinary file systems without changing them, Panda illustrates a way to obtain cheap, fast, and easy-to-use i/o for high-performance scientific applications.}, comment = {Just a short 4-page summary of the Panda I/O library, including some brief performance results.} } @InProceedings{seamons:panda, author = {K. E. Seamons and Y. Chen and P. Jones and J. Jozwiak and M. Winslett}, title = {Server-Directed Collective {I/O} in {Panda}}, booktitle = {Proceedings of Supercomputing '95}, year = {1995}, month = {December}, publisher = {IEEE Computer Society Press}, address = {San Diego, CA}, URL = {http://www.supercomp.org/sc95/proceedings/520_SEAM/SC95.HTM}, keyword = {collective I/O, parallel I/O, pario-bib}, abstract = {We present the architecture and implementation results for Panda 2.0, a library for input and output of multidimensional arrays on parallel and sequential platforms. Panda achieves remarkable performance levels on the IBM SP2, showing excellent scalability as data size increases and as the number of nodes increases, and provides throughputs close to the full capacity of the AIX file system on the SP2 we used. We argue that this good performance can be traced to Panda's use of server-directed i/o (a logical-level version of disk-directed i/o [Kotz94b]) to perform array i/o using sequential disk reads and writes, a very high level interface for collective i/o requests, and built-in facilities for arbitrary rearrangements of arrays during i/o. Other advantages of Panda's approach are ease of use, easy application portability, and a reliance on commodity system software.}, comment = {This rewrite of Panda (see seamons:interface) is in C++ and runs on the SP2. They provide simple ways to declare the distribution of your array in memory and on disk, to form a list of arrays to be output at each timestep or at each checkpoint, and then to call for a timestep or checkpoint. Then they use something like disk-directed I/O (kotz:jdiskdir) internally to accomplish the rearrangement and transfer of data from compute nodes to I/O nodes. Note proceedings only on CD-ROM and WWW.} } @InProceedings{seamons:schemas, author = {K. E. Seamons and M. Winslett}, title = {Physical Schemas for Large Multidimensional Arrays in Scientific Computing Applications}, booktitle = {Proceedings of the 7th International Working Conference on Scientific and Statistical Database Management}, year = {1994}, month = {September}, pages = {218--227}, URL = {http://bunny.cs.uiuc.edu/CADR/pubs/ssdbm.ps}, keyword = {parallel I/O, scientific database, scientific computing, pario-bib}, comment = {``This paper presents PANDA's high-level interfaces for i/o operations, including checkpoint, restart, and time step output, and explains the rationale behind them.'' Basically they provide a bit of detail for the file formats they use in seamons:interface} } @PhdThesis{seamons:thesis, author = {Kent E. Seamons}, title = {Panda: Fast Access to Persistent Arrays Using High Level Interfaces and Server Directed Input/Output}, year = {1996}, month = {May}, school = {University of Illinois at Urbana-Champaign}, URL = {http://bunny.cs.uiuc.edu/CADR/pubs/seamons-thesis.html}, keyword = {parallel I/O, persistent data, parallel computing, pario-bib}, abstract = {Multidimensional arrays are a fundamental data type in scientific computing and are used extensively across a broad range of applications. Often these arrays are persistent, i.e., they outlive the invocation of the program that created them. Portability and performance with respect to input and output (i/o) pose significant challenges to applications accessing large persistent arrays, especially in distributed-memory environments. A significant number of scientific applications perform conceptually simple array i/o operations, such as reading or writing a subarray, an entire array, or a list of arrays. However, the algorithms to perform these operations efficiently on a given platform may be complex and non-portable, and may require costly customizations to operating system software. \par This thesis presents a high-level interface for array i/o and three implementation architectures, embodied in the Panda (Persistence AND Arrays) array i/o library. The high-level interface contributes to application portability, by encapsulating unnecessary details and being easy to use. Performance results using Panda demonstrate that an i/o system can provide application programs with a high-level, portable, easy-to-use interface for array i/o without sacrificing performance or requiring custom system software; in fact, combining all these benefits may only be possible through a high-level interface due to the great freedom and flexibility a high-level interface provides for the underlying implementation. \par The Panda server-directed i/o architecture is a prime example of an efficient implementation of collective array i/o for closely synchronized applications in distributed-memory single-program multiple-data (SPMD) environments. A high-level interface is instrumental to the good performance of server-directed i/o, since it provides a global view of an upcoming collective i/o operation that Panda uses to plan sequential reads and writes. Performance results show that with server-directed i/o, Panda achieves throughputs close to the maximum AIX file system throughput on the i/o nodes of the IBM SP2 when reading and writing large multidimensional arrays.}, comment = {see also chen:panda, seamons:panda, seamons:compressed, seamons:interface, seamons:schemas, seamons:msio, seamons:jpanda} } @InProceedings{shin:hartsio, author = {Kang G. Shin and Greg Dykema}, title = {A Distributed {I/O} Architecture for {HARTS}}, booktitle = {Proceedings of the 17th Annual International Symposium on Computer Architecture}, year = {1990}, pages = {332--342}, keyword = {parallel I/O, multiprocessor architecture, MIMD, fault tolerance, pario-bib}, comment = {HARTS is a multicomputer connected with a wrapped hexagonal mesh, with an emphasis on real-time and fault tolerance. The mesh consists of network routing chips. Hanging off each is a small bus-based multiprocessor ``node''. They consider how to integrate I/O devices into this architecture: attach device controllers to processors, to network routers, to node busses, or via a separate network. They decided to compromise and hang each I/O controller off three network routers, in the triangles of the hexagonal mesh. This keeps the traffic off of the node busses, and allows multiple paths to each controller. They discuss the reachability and hop count in the presence of failed nodes and links.} } @InProceedings{shirriff:sawmill, author = {Ken Shirriff and John Ousterhout}, title = {Sawmill: A High-Bandwidth Logging File System}, booktitle = {Proceedings of the 1994 Summer USENIX Technical Conference}, year = {1994}, pages = {125--136}, keyword = {file system, parallel I/O, pario-bib, RAID}, comment = {This is a file system based on LFS and run on the RAID-II prototype (see drapeau:raid-ii). It uses the RAID-II controller's memory (32 MB) to pipeline data transfers from the RAID disks directly to (from) the network. Thus, data never flows through the server CPU or memory. The server remains in control, telling the controller where each block goes, etc. They get very high data rates. And despite being much faster than the RAID for small writes, they were still CPU-limited, because the CPU had to handle all the little requests.} } @TechReport{shriver:api-tr, author = {Elizabeth A.~M. Shriver and Leonard F. Wisniewski}, title = {An {API} for Choreographing Data Accesses}, year = {1995}, month = {November}, number = {PCS-TR95-267}, institution = {Dept. of Computer Science, Dartmouth College}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR95-267.ps.Z}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, abstract = {Current APIs for multiprocessor multi-disk file systems are not easy to use in developing out-of-core algorithms that choreograph parallel data accesses. Consequently, the efficiency of these algorithms is hard to achieve in practice. We address this deficiency by specifying an API that includes data-access primitives for data choreography. With our API, the programmer can easily access specific blocks from each disk in a single operation, thereby fully utilizing the parallelism of the underlying storage system. Our API supports the development of libraries of commonly-used higher-level routines such as matrix-matrix addition, matrix-matrix multiplication, and BMMC (bit-matrix-multiply/complement) permutations. We illustrate our API in implementations of these three high-level routines to demonstrate how easy it is to use.}, comment = {Also published as Courant Institute Tech Report 708.} } @InCollection{shriver:models-algs, author = {Elizabeth Shriver and Mark Nodine}, title = {An Introduction to Parallel {I/O} Models and Algorithms}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {2}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {31--68}, publisher = {Kluwer Academic Publishers}, keyword = {parallel I/O algorithms, out-of-core, pario-bib}, abstract = {Problems whose data are too large to fit into main memory are called {\it out-of-core} problems. Out-of-core parallel-I/O algorithms can handle much larger problems than in-memory variants and have much better performance than single-device variants. However, they are not commonly used---partly because the understanding of them is not widespread. Yet such algorithms ought to be growing in importance because they address the needs of users with ever-growing problem sizes and ever-increasing performance needs. \par This paper addresses this lack of understanding by presenting an introduction to the data-transfer models on which most of the out-of-core parallel-I/O algorithms are based, with particular emphasis on the Parallel Disk Model. Sample algorithms are discussed to demonstrate the paradigms (algorithmic techniques) used with these models. \par Our aim is to provide insight into both the paradigms and the particular algorithms described, thereby also providing a background for understanding a range of related solutions. It is hoped that this background would enable the appropriate selection of existing algorithms and the development of new ones for current and future out-of-core problems.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @InProceedings{si-woong:cluster, author = {Jang Si-Woong and Chung Ki-Dong and Sam Coleman}, title = {Design and Implementation of a Network-Wide Concurrent File System in a Workstation Cluster}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {239--245}, publisher = {IEEE Computer Society Press}, URL = {http://www.computer.org/conferen/mss95/woong/woong.htm}, keyword = {mass storage, cluster computing, distributed file system, parallel I/O, pario-bib}, abstract = {We estimate the performance of a network-wide concurrent file system implemented using conventional disks as disk arrays. Tests were carried out on both single system and network-wide environments. On single systems, a file was split across several disks to test the performance of file I/O operations. We concluded that performance was proportional to the number of disks, up to four, on a system with high computing power. Performance of a system with low computing power, however, did not increase, even with more than two disks. When we split a file across disks in a network-wide system called the Network-wide Concurrent File System (N-CFS), we found performance similar to or slightly higher than that of disk arrays on single systems. Since file access through N-CFS is transparent, this system enables traditional disks on single and networked systems to be used as disk arrays for I/O intensive jobs.} } @Article{sicola:storageworks, author = {Stephen J. Sicola}, title = {The Architecture and Design of {HS}-series {StorageWorks} Array Controllers}, journal = {Digital Technical Journal}, year = {1994}, month = {Fall}, volume = {6}, number = {4}, pages = {5--25}, keyword = {disk controller, RAID, parallel I/O, pario-bib}, comment = {Describes the RAID controller for the DEC StorageWorks product.} } @Article{simitci:patterns, author = {Huseyin Simitci and Daniel Reed}, title = {A Comparison of Logical and Physical Parallel {I/O} Patterns}, journal = {International Journal of Supercomputer Applications and High Performance Computing}, year = {1998}, note = {To appear in a Special Issue on I/O in Parallel Applications}, keyword = {verify volume number month year and pages, parallel I/O application, pario-bib}, abstract = {Although there are several extant studies of parallel scientific application request patterns, there is little experimental data on the correlation of physical input/output patterns with application input/output stimuli. To understand these correlations, we have instrumented the SCSI device drivers of the Intel Paragon OSF/1 operating system to record key physical input/output activities and have correlated this data with the input/output patterns of scientific applications captured via the Pablo analysis toolkit. Our analysis shows that disk hardware features profoundly affect the distribution of request delays and that current parallel file systems respond to parallel application input/output patterns in non-scalable ways.} } @InCollection{sinclair:instability-book, author = {J.~B. Sinclair and J. Tang and P.~J. Varman}, title = {Placement-Related Problems in Shared Disk {I/O}}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {12}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {271--289}, publisher = {Kluwer Academic Publishers}, earlier = {sinclair:instability}, keyword = {parallel I/O, pario-bib}, abstract = {In a shared-disk parallel I/O system, several processes may be accessing the disks concurrently. An important example is concurrent external merging arising in database management systems with multiple independent sort queries. Such a system may exhibit instability, with one of the processes racing ahead of the others and monopolizing I/O resources. This race can lead to serialization of the processes and poor disk utilization, even when the static load on the disks is balanced. The phenomenon can be avoided by proper layout of data on the disks, as well as through other I/O management strategies. This has implications for both data placement in multiple disk systems and task partitioning for parallel processing.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @InProceedings{sinclair:instability, author = {James B. Sinclair and Jay Tang and Peter J. Varman}, title = {Instability in Parallel {I/O} Systems}, booktitle = {Proceedings of the IPPS~'94 Workshop on Input/Output in Parallel Computer Systems}, year = {1994}, month = {April}, pages = {16--35}, organization = {Rice University}, note = {Also appeared in Computer Architecture News 22(4)}, later = {sinclair:instability-book}, keyword = {parallel I/O, pario-bib}, comment = {They study the performance of a parallel I/O system when several concurrent processes are accessing a shared set of disks, using a common buffer pool. They found that under certain circumstances the system can become unstable, in that some subset of processes monopolize all of the resources, bringing the others to a virtual halt. They use analytical models to show that instability can occur if every process has distinct input and output disks, reads are faster than writes, disk scheduling policy of a certain class, and processes don't wait for other resources.} } @InProceedings{sinclair:placement, author = {J. B. Sinclair and J. Tang and P. J. Varman and B. R. Iyer}, title = {Impact of Data Placement on Parallel {I/O} Systems}, booktitle = {Proceedings of the 1993 International Conference on Parallel Processing}, year = {1993}, pages = {III--276--279}, publisher = {CRC Press}, address = {St. Charles, IL}, keyword = {parallel I/O, pario-bib}, comment = {Several external merges (many sorted runs into one) are concurrently in action. Where do you put their input and output runs, that is, on which disks? Only input runs are striped, and usually on a subset of disks.} } @TechReport{singh:adopt, author = {Tarvinder Pal Singh and Alok Choudhary}, title = {{ADOPT}: A Dynamic scheme for Optimal PrefeTching in Parallel File Systems}, year = {1994}, month = {June}, institution = {NPAC}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/ADOPT.ps.Z}, keyword = {parallel I/O, pario-bib}, comment = {They describe a prefetching scheme where hints can be provided from the programmer, compiler, or runtime library to the I/O node. These hints seem to take the form of a sequence (all in order) or a set (only one of many, from conditional expressions). The hints come from each process, not collectively. Then, the I/O node keeps these specifications and uses them to drive prefetching when there is no other work to do. They rotate among the specifications of many processes. Later they hope to examine more complex scheduling strategies and buffer-space allocation strategies.} } @InProceedings{smirni:evolutionary, author = {Evgenia Smirni and Ruth A. Aydt and Andrew A. Chien and Daniel A. Reed}, title = {{I/O} Requirements of Scientific Applications: An Evolutionary View}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, pages = {49--59}, publisher = {IEEE Computer Society Press}, address = {Syracuse, NY}, URL = {http://www-pablo.cs.uiuc.edu/People/esmirni/docs/IOhpdc96.ps.Z}, keyword = {I/O, workload characterization, scientific computing, parallel I/O, pario-bib}, abstract = {The modest I/O configurations and file system limitations of many current high-performance systems preclude solution of problems with large I/O needs. I/O hardware and file system parallelism is the key to achieving high performance. We analyze the I/O behavior of several versions of two scientific applications on the Intel Paragon XP/S. The versions involve incremental application code enhancements across multiple releases of the operating system. Studying the evolution of I/O access patterns underscores the interplay between application access patterns and file system features. Our results show that both small and large request sizes are common, that at present, application developers must manually aggregate small requests to obtain high disk transfer rates, that concurrent file accesses are frequent, and that appropriate matching of the application access pattern and the file system access mode can significantly increase application I/O performance. Based on these results, we describe a set of file system design principles.}, comment = {They study two applications over several versions, using Pablo to capture the I/O activity. They thus watch as application developers improve the applications use of I/O modes and request sizes. Both applications move through three phases: initialization, computation (with out-of-core I/O or checkpointing I/O), and output. They found it necessary to tune the I/O request sizes to match the parameters of the I/O system. In the initial versions, the code used small read and write requests, which were (according to the developers) the "easiest and most natural implementation for their I/O." They restructured the I/O to make bigger requests, which better matched the capabilities of Intel PFS. They conclude that asynchronous and collective operations are imperative. They would like to see a file system that can adapt dynamically to adjust its policies to the apparent access patterns. Automatic request aggregation of some kind seems like a good idea; of course, that is one feature of a buffer cache.} } @Article{smotherman:taxonomy, author = {Mark Smotherman}, title = {A Sequencing-based Taxonomy of {I/O} Systems and Review of Historical Machines}, journal = {Computer Architecture News}, year = {1989}, month = {September}, volume = {17}, number = {5}, pages = {5--15}, keyword = {I/O architecture, historical summary, pario-bib}, comment = {Classifies I/O systems by how they initiate and terminate I/O. Uniprocessor and Multiprocessor systems.} } @Misc{snir:hpfio, author = {Marc Snir}, title = {Proposal for {IO}}, year = {1992}, month = {August 31,}, howpublished = {Posted to HPFF I/O Forum}, note = {Second Draft}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, comment = {An outline of two possible ways to specify mappings of arrays to storage nodes in a multiprocessor, and to make unformatted parallel transfers of multiple records. Seems to apply only to arrays, and to files that hold only arrays. It keeps the linear structure of files as sequences of records, but in some cases does not preserve the order of data items or of fields within subrecords. Tricky to understand unless you know HPF and Fortran 90.} } @InProceedings{soloviev:prefetching, author = {Valery V. Soloviev}, title = {Prefetching in Segmented Disk Cache for Multi-Disk Systems}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {69--82}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, prefetching, disk cache, disk array, pario-bib}, abstract = {This paper investigates the performance of a multi-disk storage system equipped with a segmented disk cache processing a workload of multiple relational scans. Prefetching is a popular method of improving the performance of scans. Many modern disks have a multisegment cache which can be used for prefetching. We observe that, exploiting declustering as a data placement method, prefetching in a segmented cache causes a load imbalance among several disks. A single disk becomes a bottleneck, degrading performance of the entire system. A variation in disk queue length is a primary factor of the imbalance. Using a precise simulation model, we investigate several approaches to achieving better balancing. Our metrics are a scan response time for the closed-end system and an ability to sustain a workload without saturating for the open-end system. We arrive at two main conclusions: (1) Prefetching in main memory is inexpensive and effective for balancing and can supplement or substitute prefetching in disk cache. (2) Disk-level prefetching provides about the same performance as main memory prefetching if request queues are managed in the disk controllers rather than in the host. Checking the disk cache before queuing requests provides not only better request response time but also drastically improves balancing. A single cache performs better than a segmented cache for this method.}, comment = {An interesting paper about disk-controller cache management in database workloads. Actually, the workloads are sequential scans of partitioned files, which could occur in many kinds of workloads. The declustering pattern (partitioning) is a little unusual for most scientific parallel I/O veterans, who are used to striping. And the cache-management algorithms seem a bit strange, particularly the fact that the cache appears to be used only for explicit prefetch requests. Turns out that it is best to put the prefetching and disk queueing in the same place, either on the controller or in main memory, to avoid load imbalance that arises from randomness in the workload, which is accentuated into a big bottleneck and a convoy effect.} } @InProceedings{solworth:mirror, author = {John A. Solworth and Cyril U. Orji}, title = {Distorted Mirrors}, booktitle = {Proceedings of the First International Conference on Parallel and Distributed Information Systems}, year = {1991}, month = {December}, pages = {10--17}, later = {solworth:mirror2}, keyword = {disk mirroring, parallel I/O, pario-bib}, comment = {Write one disk (the master) in the usual way, and write the slave disk at the closest free block. Actually, they propose to logically partition the two disks so that each disk has a master partition and a slave partition. Up to 80\% improvement in small-write performance, while retaining good sequential read performance.} } @Article{solworth:mirror2, author = {John A. Solworth and Cyril U. Orji}, title = {Distorted Mirrors}, journal = {Journal of Distributed and Parallel Databases}, year = {1993}, month = {January}, volume = {1}, number = {1}, pages = {81--102}, earlier = {solworth:mirror}, keyword = {disk mirroring, parallel I/O, pario-bib}, comment = {See solworth:mirror.} } @InProceedings{srinilta:strategies, author = {Chutimet Srinilta and Divyesh Jadav and Alok Choudhary}, title = {Design and Evaluation of Data Storage and Retrieval Strategies in a Distributed Memory Continuous Media Server}, booktitle = {Proceedings of the Eleventh International Parallel Processing Symposium}, year = {1997}, month = {April}, URL = {http://www.ece.nwu.edu/~csrinilt/ipps97.ps}, keyword = {verify pages, threads, parallel I/O, pario-bib}, abstract = {High performance servers and high-speed networks will form the backbone of the infra-structure required for distributed multimedia information systems. Given that the goal of such a server is to support hundreds of interactive data streams simultaneously, various tradeoffs are possible with respect to the storage of data on secondary memory, and its retrieval therefrom. In this paper we identify and evaluate these tradeoffs. We evaluate the effect of varying the stripe factor and also the performance of batched retrieval of disk--resident data. We develop a methodology to predict the stream capacity of such a server. The evaluation is done for both uniform and skewed access patterns. Experimental results on the Intel Paragon computer are presented.} } @MastersThesis{stabile:disks, author = {James Joseph Stabile}, title = {Disk Scheduling Algorithms for a Multiple Disk System}, year = {1988}, school = {UC Davis}, keyword = {parallel I/O, parallel file system, disk mirroring, disk scheduling, pario-bib}, comment = {Describes simulation based on model of disk access pattern. Multiple-disk system, much like in matloff:multidisk. Files stored in two copies, each on a separate disk, but there are more than two disks, so this differs from mirroring. He compares several disk scheduling algorithms. A variant of SCAN seems to be the best.} } @Article{steenkiste:net, author = {Peter Steenkiste}, title = {A High-Speed Network Interface for Distributed-Memory Systems: Architecture and Applications}, journal = {ACM Transactions on Computer Systems}, year = {1997}, month = {February}, volume = {15}, number = {1}, pages = {75--109}, publisher = {ACM Press}, keyword = {parallel computer architecture, interconnection network, network interface, distributed memory, systolic array, input/output, parallel I/O, pario-bib}, comment = {See also steenkiste:interface, kung:network, hemy:gigabit, bornstein:reshuffle, and gross:io.} } @Article{stodolsky:jlogging, author = {Daniel Stodolsky and Mark Holland and William V. {Courtright II} and Garth A. Gibson}, title = {Parity-Logging Disk Arrays}, journal = {ACM Transactions on Computer Systems}, year = {1994}, month = {August}, volume = {12}, number = {3}, pages = {206--235}, publisher = {ACM Press}, earlier = {stodolsky:logging}, keyword = {parallel I/O, RAID, redundancy, reliability, pario-bib}, comment = {See stodolsky:logging. An in-between version is CMU-CS-94-170, stodolsky:logging-tr.} } @TechReport{stodolsky:logging-tr, author = {Daniel Stodolsky and Mark Holland and William V. {Courtright II} and Garth A. Gibson}, title = {A Redundant Disk Array Architecture for Efficient Small Writes}, year = {1994}, month = {July}, number = {CMU-CS-94-170}, institution = {Carnegie Mellon University}, note = {Revised from CMU-CS-93-200.}, later = {stodolsky:logging}, URL = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/pdl/ftp/ParityLogging/TR94-170.ps}, keyword = {parallel I/O, disk array, RAID, redundancy, reliability, pario-bib}, abstract = {Parity encoded redundant disk arrays provide highly reliable, cost effective secondary storage with high performance for reads and large writes. Their performance on small writes, however, is much worse than mirrored disks - the traditional, highly reliable, but expensive organization for second ary storage. Unfortunately, small writes are a substantial portion of the I/O workload of many impor tant, demanding applications such as on-line transaction processing. This paper presents parity logging, a novel solution to the small write problem for redundant disk arrays. Parity logging applies journalling techniques to substantially reduce the cost of small writes. We provide detailed models of parity logging and competing schemes - mirroring, floating storage, and RAID level 5 - and verify these models by simulation. Parity logging provides performance competitive with mirroring, but with capacity overhead close to the minimum offered by RAID level 5. Finally, parity logging can exploit data caching more effectively than all three alternative approaches.} } @InProceedings{stodolsky:logging, author = {Daniel Stodolsky and Garth Gibson and Mark Holland}, title = {Parity Logging: Overcoming the Small Write Problem in Redundant Disk Arrays}, booktitle = {Proceedings of the 20th Annual International Symposium on Computer Architecture}, year = {1993}, pages = {64--75}, earlier = {stodolsky:logging-tr}, later = {stodolsky:jlogging}, URL = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/pdl/ftp/ParityLogging/ISCA93.ps}, keyword = {parallel I/O, RAID, redundancy, reliability, pario-bib}, abstract = {Parity encoded redundant disk arrays provide highly reliable, cost effective secondary storage with high performance for read accesses and large write accesses. Their performance on small writes, however, is much worse than mirrored disks - the traditional, highly reliable, but expensive organization for secondary storage. Unfortunately, small writes are a substantial portion of the I/O workload of many important, demanding applications such as on-line transaction processing. This paper presents parity logging, a novel solution to the small write problem for redundant disk arrays. Parity logging applies journalling techniques to substantially reduce the cost of small writes. We provide a detailed analysis of parity logging and competing schemes - mirroring, floating storage, and RAID level 5 - and verify these models by simulation. Parity logging provides performance competitive with mirroring, the best of the alternative single failure tolerating disk array organizations. However, its overhead cost is close to the minimum offered by RAID level 5. Finally, parity logging can exploit data caching much more effectively than all three alternative approaches.}, comment = {Cite stodolsky:jlogging. Earlier version is CMU-CS-93-200. Parity logging to improve small writes. Log all parity updates; when it fills, go redo parity disk. Actually distribute the parity and log across all disks. Performance is comparable to, or exceeding, mirroring. Also handling double failures.} } @Article{stone:query, author = {Harold S. Stone}, title = {Parallel Querying of Large Databases: {A} Case Study}, journal = {IEEE Computer}, year = {1987}, month = {October}, volume = {20}, number = {10}, pages = {11--21}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, database, SIMD, connection machine, pario-bib}, comment = {See also IEEE Computer, Jan 1988, p. 8 and 10. Examines a database query that is parallelized for the Connection Machine. He shows that in many cases, a smarter serial algorithm that reads only a portion of the database (through an index) will be faster than 64K processors reading the whole database. Uses a simple model for the machines to show this. Reemphasizes the point of Boral and DeWitt that I/O is the bottleneck of a database machine, and that parallelizing the processing will not necessarily help a great deal.} } @InProceedings{stonebraker:radd, author = {Michael Stonebraker and Gerhard A. Schloss}, title = {Distributed {RAID} --- {A} New Multiple Copy Algorithm}, booktitle = {Proceedings of 6th International Data Engineering Conference}, year = {1990}, pages = {430--437}, keyword = {disk striping, reliability, pario-bib}, comment = {This is about ``RADD'', a distributed form of RAID. Meant for cases where the disks are physically distributed around several sites, and no one controller controls them all. Much lower space overhead than any mirroring technique, with comparable normal-mode performance at the expense of failure-mode performance.} } @TechReport{stonebraker:xprs, author = {Michael Stonebraker and Randy Katz and David Patterson and John Ousterhout}, title = {The Design of {XPRS}}, year = {1988}, month = {March}, number = {UCB/ERL M88/19}, institution = {UC Berkeley}, keyword = {parallel I/O, disk array, RAID, Sprite, disk architecture, database, pario-bib}, comment = {Designing a DBMS for Sprite and RAID. High availability, high performance. Shared memory multiprocessor. Allocates extents to files that are a interleaved over a variable number of disks, and over a contiguous set of tracks on those disks.} } @MastersThesis{subramaniam:msthesis, author = {Mahesh Subramaniam}, title = {Efficient Implementation of Server-Directed I/O}, year = {1996}, month = {June}, school = {Dept. of Computer Science, University of Illinois}, URL = {http://bunny.cs.uiuc.edu/CDR/pubs/mahesh-thesis.html}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {Parallel computers are a cost effective approach to providing significant computational resources to a broad range of scientific and engineering applications. Due to the relatively lower performance of the I/O subsystems on these machines and due to the significant I/O requirements of these applications, the I/O performance can become a major bottleneck. Optimizing the I/O phase of these applications poses a significant challenge. A large number of these scientific and engineering applications perform simple operations on multidimensional arrays and providing an easy and efficient mechanism for implementing these operations is important. The Panda array I/O library provides simple high level interfaces to specify collective I/O operations on multidimensional arrays in a distributed memory single-program multiple-data (SPMD) environment. The high level information provided by the user through these interfaces allows the Panda array I/O library to produce an efficient implementation of the collective I/O request. The use of these high level interfaces also increases the portability of the application. \par This thesis presents an efficient and portable implementation of the Panda array I/O library. In this implementation, standard software components are used to build the I/O library to aid its portability. The implementation also provides a simple, flexible framework for the implementation and integration of the various collective I/O strategies. The server directed I/O and the reduced messages server directed I/O algorithms are implemented in the Panda array I/O library. This implementation supports the sharing of the I/O servers between multiple applications by extending the collective I/O strategies. Also, the implementation supports the use of part time I/O nodes where certain designated compute nodes act as the I/O servers during the I/O phase of the application. The performance of this implementation of the Panda array I/O library is measured on the IBM SP2 and the performance results show that for read and write operations, the collective I/O strategies used by the Panda array I/O library achieve throughputs close to the maximum throughputs provided by the underlying file system on each I/O node of the IBM SP2.} } @Unpublished{taber:metadisk, author = {David Taber}, title = {{MetaDisk} Driver Technical Description}, year = {1990}, month = {October}, note = {SunFlash electronic mailing list 22(9)}, keyword = {disk mirroring, parallel I/O, pario-bib}, comment = {MetaDisk is a addition to the Sun SPARCstation server kernel. It allows disk mirroring between any two local disk partitions, or concatenation of several disk partitions into one larger partition. Can span up to 4 partitions simultaneously. Appears not to be striped, just allows bigger partitions, and (by chance) some parallel I/O for large files.} } @TechReport{tan:pizzas, author = {Michael Tan and Nick Roussopoulos and Steve Kelley}, title = {The {Tower of Pizzas}}, year = {1995}, month = {April}, number = {UMIACS-TR-95-52}, institution = {University of Maryland Institute for Advanced Computer Studies (UMIACS)}, URL = {ftp://ftp.cs.umd.edu/pub/papers/papers/3462/3462.ps.Z}, keyword = {parallel I/O, pario-bib}, abstract = {CPU speeds are increasing at a much faster rate than secondary storage device speeds. Many important applications face an I/O bottleneck. We demonstrate that this bottleneck can be alleviated through 1) scalable striping of data and 2) caching/prefetching techniques. This paper describes the design and performance of the Tower of Pizzas (TOPs), a portable software system providing parallel I/O and buffering services.}, comment = {Same as CS-TR-3462 from Department of Computer Science. Basically, a parallel file system for a workstation cluster using the usual parallel file-system ideas. They do support client-side caching, using a client-side server process which shares memory with the client. Otherwise nothing really new.} } @Article{taylor:magic, author = {Herb Taylor and Danny Chin and Stan Knight}, title = {The {Magic} Video-on-Demand Server and Real-Time Simulation System}, journal = {IEEE Parallel and Distributed Technology}, year = {1995}, month = {Summer}, volume = {3}, number = {2}, pages = {40--51}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, multimedia, video on demand, pario-bib}, comment = {They describe a video server system being developed at the Sarnoff Real Time Corporation. This paper describes their simulated system. It is intended as more than a video-on-demand system, but also for capture and processing as well as playback. So they have a complex system of interconnected SIMD boards, each with a high-speed link to various devices, including a collection of disk drives. Data is striped across disks. They integrate playback scheduling and the disk striping in an interesting way.} } @TechReport{tennenhouse:debug, author = {Marsha Tennenhouse and Dror Zernik}, title = {Visual Debugging of Parallel File System Programs}, year = {1995}, month = {March}, institution = {IBM}, URL = {http://www.almaden.ibm.com/watson/pv/vestaabs.html}, keyword = {debugging, visualization, parallel file system, parallel I/O, pario-bib} } @TechReport{thakur:abstract-tr, author = {Rajeev Thakur and William Gropp and Ewing Lusk}, title = {An Abstract-Device Interface for Implementing Portable Parallel-{I/O} Interfaces}, year = {1996}, month = {May}, number = {MCS-P592-0596}, institution = {Argonne National Laboratory, Mathematics and Computer Science Division}, later = {thakur:abstract}, URL = {http://www.mcs.anl.gov/home/thakur/adio.ps}, keyword = {multiprocessor file system interface, parallel I/O, pario-bib}, comment = {They propose an intermediate interface that can serve as an implementation base for all parallel file-system APIs, and which can itself be implemented on top of all parallel file systems. This ``universal'' interface allows all apps to run on all file systems with no porting, and for people to experiment with different APIs.} } @InProceedings{thakur:abstract, author = {Rajeev Thakur and William Gropp and Ewing Lusk}, title = {An Abstract-Device Interface for Implementing Portable Parallel-{I/O} Interfaces}, booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation}, year = {1996}, month = {October}, pages = {180--187}, earlier = {thakur:abstract-tr}, URL = {http://www.mcs.anl.gov/home/thakur/adio.ps}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, abstract = {In this paper, we propose a strategy for implementing parallel-I/O interfaces portably and efficiently. We have defined an abstract-device interface for parallel I/O, called ADIO. Any parallel-I/O API can be implemented on multiple file systems by implementing the API portably on top of ADIO, and implementing only ADIO on different file systems. This approach simplifies the task of implementing an API and yet exploits the specific high-performance features of individual file systems. We have used ADIO to implement the Intel PFS interface and subsets of MPI-IO and IBM PIOFS interfaces on PFS, PIOFS, Unix, and NFS file systems. Our performance studies indicate that the overhead of using ADIO as an implementation strategy is very low.} } @Article{thakur:applications, author = {Rajeev Thakur and Ewing Lusk and William Gropp}, title = {{I/O} in Parallel Applications: The Weakest Link}, journal = {International Journal of Supercomputer Applications and High Performance Computing}, year = {1998}, note = {To appear in a Special Issue on I/O in Parallel Applications}, URL = {http://www.mcs.anl.gov/home/thakur/ijsa-article.ps.gz}, keyword = {verify volume number month year and pages, parallel I/O application, pario-bib}, abstract = {Parallel computers are increasingly being used to run large-scale applications that also have huge I/O requirements. However, many applications obtain poor I/O performance on modern parallel machines. This special issue of IJSA contains papers that describe the I/O requirements and the techniques used to perform I/O in real parallel applications. We first explain how the I/O application program interface (API) plays a critical role in enabling such applications to achieve high I/O performance. We describe how the commonly used Unix I/O interface is inappropriate for parallel I/O and how an explicitly parallel API with support for collective I/O can help the underlying I/O hardware and software perform I/O efficiently. We then describe MPI-IO, a recently defined, standard, portable API specifically designed for high-performance parallel I/O. We conclude with an overview of the papers in this special issue.} } @TechReport{thakur:astrophysics, author = {Rajeev Thakur and Ewing Lusk and William Gropp}, title = {{I/O} Characterization of a Portable Astrophysics Application on the {IBM SP} and {Intel Paragon}}, year = {1995}, month = {August}, number = {MCS-P534-0895}, institution = {Argonne National Laboratory}, note = {Revised October 1995}, URL = {http://www.mcs.anl.gov/home/thakur/astro.ps}, keyword = {file access pattern, workload characterization, parallel I/O, pario-bib}, abstract = {Many large-scale applications on parallel machines are bottlenecked by the I/O performance rather than the CPU or communication performance of the system. To improve the I/O performance, it is first necessary for system designers to understand the I/O requirements of various applications. This paper presents the results of a study of the I/O characteristics and performance of a real, I/O-intensive, portable, parallel application in astrophysics, on two different parallel machines---the IBM SP and the Intel Paragon. We instrumented the source code to record all I/O activity, and analyzed the resulting trace files. Our results show that, for this application, the I/O consists of fairly large writes, and writing data to files is faster on the Paragon, whereas opening and closing files are faster on the SP. We also discuss how the I/O performance of this application could be improved; particularly, we believe that this application would benefit from using collective I/O.}, comment = {Adds another data point to the collection of parallel scientific applications whose I/O has been characterized, a collection started in earnest by crandall:iochar. It's a pretty straightforward application; it just writes its matrices every few timesteps. The application writes whole matrices; the OS sees request sizes that are more a factor of the Chameleon library than of the application. Most of the I/O itself is not implemented in parallel, because they used UniTree on the SP, and because the Chameleon library sequentializes this kind of I/O through one node. Other numbers from the paper don't add much insight into the workload. Revised slightly in October 1995; the abstract represents that revision.} } @TechReport{thakur:evaluation-tr, author = {Rajeev Thakur and William Gropp and Ewing Lusk}, title = {An Experimental Evaluation of the Parallel {I/O} Systems of the {IBM~SP} and {Intel Paragon} Using a Production Application}, year = {1996}, month = {February}, number = {MCS-P569--0296}, institution = {Argonne National Laboratory}, later = {thakur:evaluation}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {This paper presents the results of an experimental evaluation of the parallel I/O systems of the IBM SP and Intel Paragon. For the evaluation, we used a full, three-dimensional application code that is in production use for studying the nonlinear evolution of Jeans instability in self-gravitating gaseous clouds. The application performs I/O by using library routines that we developed and optimized separately for parallel I/O on the SP and Paragon. The I/O routines perform two-phase I/O and use the PIOFS file system on the SP and PFS on the Paragon. We studied the I/O performance for two different sizes of the application. We found that for the small case, I/O was faster on the SP, whereas for the large case, I/O took almost the same time on both systems. Communication required for I/O was faster on the Paragon in both cases. The highest read bandwidth obtained was 48 Mbytes/sec. and the highest write bandwidth obtained was 31.6 Mbytes/sec., both on the SP.}, comment = {This version no longer on the web.} } @InProceedings{thakur:evaluation, author = {Rajeev Thakur and William Gropp and Ewing Lusk}, title = {An Experimental Evaluation of the Parallel {I/O} Systems of the {IBM~SP} and {Intel Paragon} Using a Production Application}, booktitle = {Proceedings of the Third International Conference of the Austrian Center for Parallel Computation (ACPC)}, year = {1996}, month = {September}, series = {Lecture Notes in Computer Science}, volume = {1127}, pages = {24--35}, publisher = {Springer-Verlag}, earlier = {thakur:evaluation-tr}, URL = {http://www.mcs.anl.gov/home/thakur/io-eval.ps}, keyword = {parallel I/O, multiprocessor file system, workload characterization, pario-bib}, abstract = {We present the results of an experimental evaluation of the parallel I/O systems of the IBM SP and Intel Paragon using a real three-dimensional parallel application code. This application, developed by scientists at the University of Chicago, simulates the gravitational collapse of self-gravitating gaseous clouds. It performs parallel I/O by using library routines that we developed and optimized separately for the SP and Paragon. The I/O routines perform two-phase I/O and use the parallel file systems PIOFS on the SP and PFS on the Paragon. We studied the I/O performance for two different sizes of the application. In the small case, we found that I/O was much faster on the SP. In the large case, open, close, and read operations were only slightly faster, and seeks were significantly faster, on the SP; whereas, writes were slightly faster on the Paragon. The communication required within our I/O routines was faster on the Paragon in both cases. The highest read bandwidth obtained was 48\,Mbytes/sec., and the highest write bandwidth obtained was 31.6\,Mbytes/sec., both on the SP.} } @TechReport{thakur:ext2phase, author = {Rajeev Thakur and Alok Choudhary}, title = {Accessing Sections of Out-of-Core Arrays Using an Extended Two-Phase Method}, year = {1995}, month = {January}, number = {SCCS-685}, institution = {NPAC}, address = {Syracuse University}, later = {thakur:ext2phase2}, keyword = {parallel I/O, pario-bib}, abstract = {In out-of-core computations, data needs to be moved back and forth between main memory and disks during program execution. In this paper, we propose a technique called the Extended Two-Phase Method, for accessing sections of out-of-core arrays efficiently. This is an extension and generalization of the Two-Phase Method for reading in-core arrays from files, which was previously proposed in [Rosario93,Bordawekar93]. The Extended Two-Phase Method uses collective I/O in which all processors cooperate to perform I/O in an efficient manner by combining several I/O requests into fewer larger requests, eliminating multiple disk accesses for the same data and reducing contention for disks. We describe the algorithms for reading as well as writing array sections. Performance results on the Intel Touchstone Delta for many different access patterns are presented and analyzed. It is observed that the Extended Two-Phase Method gives consistently good performance over a wide range of access patterns.}, comment = {Revised as thakur:ext2phase2 and thakur:jext2phase.} } @TechReport{thakur:ext2phase2, author = {Rajeev Thakur and Alok Choudhary}, title = {An Extended Two-Phase Method for Accessing Sections of Out-of-Core Arrays}, year = {1995}, month = {June}, number = {CACR-103}, institution = {Scalable I/O Initiative}, address = {Center for Advanced Computing Research, Caltech}, note = {Revised November 1995}, earlier = {thakur:ext2phase}, later = {thakur:jext2phase}, URL = {http://www.mcs.anl.gov/home/thakur/cacr-103.ps}, keyword = {parallel I/O, pario-bib}, abstract = {A number of applications on parallel computers deal with very large data sets which cannot fit in main memory. In such cases, data must be stored in files on disks and fetched into main memory during program execution. In programs with large out-of-core arrays stored in files, it is necessary to read/write smaller sections of the arrays from/to files. This paper describes a method, called the {\em extended two-phase method}, for accessing sections of out-of-core arrays in an efficient manner. This method uses collective I/O in which processors cooperate to combine several I/O requests into fewer larger granularity requests, reorder requests so that the file is accessed in proper sequence, and eliminate simultaneous I/O requests for the same data. The I/O workload is divided among processors dynamically, depending on the access requests. We present performance results for two real, out-of-core, parallel applications --- matrix multiplication and a Laplace's equation solver --- and several synthetic access patterns. The results indicate that the extended two-phase method provides a significant performance improvement over a direct method for I/O.}, comment = {Revised version of thakur:ext2phase. The tech report was itself revised in November 1995; the abstract represents that revision.} } @Article{thakur:jext2phase, author = {Rajeev Thakur and Alok Choudhary}, title = {{An Extended Two-Phase Method for Accessing Sections of Out-of-Core Arrays}}, journal = {Scientific Programming}, year = {1996}, month = {Winter}, volume = {5}, number = {4}, pages = {301--317}, earlier = {thakur:ext2phase2}, URL = {http://www.mcs.anl.gov/home/thakur/ext2ph.ps}, keyword = {parallel I/O, pario-bib}, abstract = {A number of applications on parallel computers deal with very large data sets that cannot fit in main memory. In such applications, data must be stored in files on disks and fetched into memory during program execution. Parallel programs with large out-of-core arrays stored in files must read/write smaller sections of the arrays from/to files. In this article, we describe a method for accessing sections of out-of-core arrays efficiently. Our method, the extended two-phase method, uses collective I/O: Processors cooperate to combine several I/O requests into fewer larger granularity requests, reorder requests so that the file is accessed in proper sequence, and eliminate simultaneous I/O requests for the same data. In addition, the I/O workload is divided among processors dynamically, depending on the access requests. We present performance results obtained from two real out-of-core parallel applications---matrix multiplication and a Laplace's equation solver---and several synthetic access patterns, all on the Intel Touchstone Delta. These results indicate that the extended two-phase method significantly outperformed a direct (noncollective) method for accessing out-of-core array sections.} } @Article{thakur:jpassion, author = {Rajeev Thakur and Alok Choudhary and Rajesh Bordawekar and Sachin More and Sivaramakrishna Kuditipudi}, title = {Passion: Optimized {I/O} for Parallel Applications}, journal = {IEEE Computer}, year = {1996}, month = {June}, volume = {29}, number = {6}, pages = {70--78}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, comment = {See thakur:passion, choudhary:passion.} } @InCollection{thakur:out-of-core-book, author = {Rajeev Thakur and Alok Choudhary}, title = {Runtime Support for Out-of-Core Parallel Programs}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {6}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {147--165}, publisher = {Kluwer Academic Publishers}, earlier = {thakur:out-of-core}, keyword = {parallel I/O, out-of-core, pario-bib}, abstract = {In parallel programs with large out-of-core arrays stored in files, it is necessary to read/write smaller sections of the arrays from/to files. We describe a runtime method for accessing sections of out-of-core arrays efficiently. This method, called the {\em extended two-phase method}, uses collective I/O in which processors cooperate to read/write out-of-core data in an efficient manner. The I/O workload is divided among processors dynamically, depending on the access requests. Performance results on the Intel Touchstone Delta show that the extended two-phase method performs considerably better than a direct method for different access patterns, array sizes, and number of processors. We have used the extended two-phase method in the PASSION runtime library for parallel I/O.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @InProceedings{thakur:out-of-core, author = {Rajeev Thakur and Rajesh Bordawekar and Alok Choudhary}, title = {Compilation of Out-Of-Core Data Parallel Programs for Distributed Memory Machines}, booktitle = {Proceedings of the IPPS~'94 Workshop on Input/Output in Parallel Computer Systems}, year = {1994}, month = {April}, pages = {54--72}, organization = {Syracuse University}, note = {Also appeared in Computer Architecture News 22(4)}, later = {thakur:out-of-core-book}, keyword = {parallel I/O, pario-bib}, comment = {Earlier version available as NPAC/Syracuse tech report. They describe the design of an HPF compiler that can translate out-of-core programs into plain programs with explicit I/O. For the most part, they discuss many of the issues involved in manipulating the arrys, and some of the alternatives for run-time support. The out-of-core array is broken into pieces, one per processor. Each processor keeps its local array piece in a file on its own logical disk, and reads and writes pieces of that file as needed. Some of the tradeoffs appear to contrast the amount of I/O with the ability to optimize communication: they choose a method called ``out-of-core communication'' because it simplifies the analysis of communication patterns, although it requires more I/O. The compiler depends on run-time routines for support; the run-time routines hide a lot of the architectural details, simplifying the job of the compiler and making the resulting program more portable. There are some preliminary performance numbers.} } @InProceedings{thakur:passion, author = {Rajeev Thakur and Rajesh Bordawekar and Alok Choudhary and Ravi Ponnusamy and Tarvinder Singh}, title = {{PASSION} Runtime Library for Parallel {I/O}}, booktitle = {Proceedings of the Scalable Parallel Libraries Conference}, year = {1994}, month = {October}, pages = {119--128}, later = {thakur:jpassion}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/splc94_passion_runtime.ps.Z}, keyword = {parallel I/O, pario-bib}, abstract = {We are developing a compiler and runtime support system called PASSION: Parallel And Scalable Software for Input-Output. PASSION provides software support for I/O intensive out-of-core loosely synchronous problems. This paper gives an overview of the PASSION Runtime Library and describes two of the optimizations incorporated in it, namely Data Prefetching and Data Sieving. Performance improvements provided by these optimizations on the Intel Touchstone Delta are discussed, together with an out-of-core Median Filtering application.}, comment = {See thakur:jpassion. They describe the PASSION library for parallel I/O, though the description is fairly high-level. The main things that this paper adds to earlier papers from this group is a discussion of Data Prefetching (which is really just an asynchronous I/O interface that their compiler uses for prefetching) and Data Sieving, which they use when the application needs to read some array section that is not contiguous in the file; for example, a submatrix of a 2-d matrix from in a file stored row-major. Their solution is to read the complete set of rows (or columns, depending on file layout) in one huge read, into a memory buffer, and then extract the necessary data. Basically, this is another form of the two-phase strategy.} } @TechReport{thakur:romio-users, author = {Rajeev Thakur and Ewing Lusk and William Gropp}, title = {Users Guide for {ROMIO}: A High-Performance, Portable {MPI-IO} Implementation}, year = {1997}, month = {October}, number = {ANL/MCS-TM-234}, institution = {Mathematics and Computer Science Division, Argonne National Laboratory}, URL = {ftp://ftp.mcs.anl.gov/pub/thakur/romio/users-guide.ps.gz}, keyword = {file system interface, parallel I/O, pario-bib}, abstract = {ROMIO is a high-performance, portable implementation of MPI-IO (the I/O chapter in MPI-2). This document describes how to install and use ROMIO version~1.0.0 on various machines.} } @InProceedings{thakur:runtime, author = {R. Thakur and R. Bordawekar and A. Choudhary}, title = {{Compiler and Runtime Support for Out-of-Core HPF Programs}}, booktitle = {Proceedings of the 8th ACM International Conference on Supercomputing}, year = {1994}, month = {July}, pages = {382--391}, publisher = {ACM Press}, address = {Manchester, UK}, URL = {ftp://erc.cat.syr.edu/ece/choudhary/PASSION/ics94-out-of-core-hpf.ps.Z}, keyword = {parallel I/O, pario-bib}, abstract = {This paper describes the design of a compiler which can translate out-of-core programs written in a data parallel language like HPF. Such a compiler is required for compiling large scale scientific applications, such as the Grand Challenge applications, which deal with enormous quantities of data. We propose a framework by which a compiler together with appropriate runtime support can translate an out-of-core HPF program to a message passing node program with explicit parallel I/O. We describe the basic model of the compiler and the various transformations made by the compiler. We also discuss the runtime routines used by the compiler for I/O and communication. In order to minimize I/O, the runtime support system can reuse data already fetched into memory. The working of the compiler is illustrated using two out-of-core applications, namely a Laplace equation solver and LU Decomposition, together with performance results on the Intel Touchstone Delta.}, comment = {They describe ways to make HPF handle out-of-core arrays. Basically, they add directives to say which arrays are out of core, and how much memory to devote to the in-core portion of the array. Then the compiler distributes the array across processors, as in HPF, to form local arrays. Each local array is broken into slabs, where each slab can fit in local memory. The local array is kept in a local array file, from which slabs are loaded and stored. Ghost nodes are also handled. They were careful to avoid double I/O when one slab is another slab's ghost node. They found it most convenient to do all the communication between iterations, then do all the computation for that iteration, where the iteration itself required a loop including both computation and I/O. This means that there may need to be I/O during the communication phase, to store ghost nodes coming in from other places. They do not mention use of asynchronous I/O for overlap. See also bordawekar:efficient.} } @PhdThesis{thakur:thesis, author = {Rajeev Thakur}, title = {{Runtime Support for In-Core and Out-of-Core Data-Parallel Programs}}, year = {1995}, month = {May}, school = {Department of Electrical and Computer Engineering, Syracuse University}, URL = {http://www.mcs.anl.gov/home/thakur/phd_thesis.ps}, keyword = {parallel I/O, runtime library, pario-bib}, abstract = {Distributed memory parallel computers or distributed computer systems are widely recognized as the only cost-effective means of achieving teraflops performance in the near future. However, the fact remains that they are difficult to program and advances in software for these machines have not kept pace with advances in hardware. This thesis addresses several issues in providing runtime support for in-core as well as out-of-core programs on distributed memory parallel computers. This runtime support can be directly used in application programs for greater efficiency, portability and ease of programming. It can also be used together with a compiler to translate programs written in a high-level data-parallel language like High Performance Fortran (HPF) to node programs for distributed memory machines. \par In distributed memory programs, it is often necessary to change the distribution of arrays during program execution. This thesis presents efficient and portable algorithms for runtime array redistribution. The algorithms have been implemented on the Intel Touchstone Delta and are found to scale well with the number of processors and array size. This thesis also presents algorithms for all-to-all collective communication on fat-tree and two-dimensional mesh interconnection topologies. The performance of these algorithms on the CM-5 and Touchstone Delta is studied extensively. A model for estimating the time taken by these algorithms on the basis of system parameters is developed and validated by comparing with experimental results. \par A number of applications deal with very large data sets which cannot fit in main memory, and hence have to be stored in files on disks, resulting in out-of-core programs. This thesis also describes the design and implementation of efficient runtime support for out-of-core computations. Several optimizations for accessing out-of-core data are presented. An Extended Two-Phase Method is proposed for accessing sections of out-of-core arrays efficiently. This method uses collective I/O and the I/O workload is divided among processors dynamically, depending on the access requests. Performance results obtained using this runtime support for out-of-core programs on the Touchstone Delta are presented.} } @TechReport{think:cm-2, key = {TMC}, title = {{Connection Machine} Model {CM-2} Technical Summary}, year = {1987}, month = {April}, number = {HA87-4}, institution = {Thinking Machines}, keyword = {parallel I/O, connection machine, disk array, disk architecture, SIMD, pario-bib}, comment = {I/O and Data Vault, pp. 27--30} } @Book{think:cm5, key = {TMC}, title = {The {Connection Machine} {CM-5} Technical Summary}, year = {1991}, month = {October}, publisher = {Thinking Machines Corporation}, keyword = {computer architecture, connection machine, MIMD, SIMD, parallel I/O, pario-bib}, comment = {Some detail but still skips over some key aspects (like communication topology. Neat communications support makes for user-mode message-passing, broadcasting, reductions, all built in. Lots of info here. File system calls allows data to be transferred in parallel directly from I/O node to processing node, bypassing the partition and I/O management nodes. Multiple I/O devices (even DataVaults) can be logically striped. See also best:cmmdio, loverso:sfs, think:cmmd, think:sda.} } @Misc{think:cm5io, key = {TMC}, title = {The {CM-5} {I/O} system}, year = {1993}, howpublished = {Thinking Machines Corporation glossy}, keyword = {parallel I/O, disk array, striping, RAID, HIPPI, pario-bib}, comment = {More detail about I/O nodes than think:sda, including info about disk storage nodes, HIPPI nodes, and tape nodes (ITS).} } @Manual{think:cmmd, key = {TMC}, title = {{CMMD} User's Guide}, year = {1992}, month = {January}, organization = {Thinking Machines Corporation}, keyword = {MIMD, parallel programming, parallel I/O, message-passing, pario-bib} } @Misc{think:sda, key = {TMC}, title = {{CM-5} Scalable Disk Array}, year = {1992}, month = {November}, howpublished = {Thinking Machines Corporation glossy}, keyword = {parallel I/O, disk array, striping, RAID, pario-bib}, comment = {Disk storage nodes (processor, network interface, buffer, 4 SCSI controllers, 8 disks) attach individually to the CM-5 network. The software stripes across all nodes in the system. Thus, the collection of nodes is called a disk array. Multiple file systems across the array. Flexible redundancy. RAID~3 is used, i.e., bit-striped and a single parity disk. Remote access via NFS supported. Files stored in canonical order, with special hardware to help distribute data across processors. See best:cmmdio.} } @TechReport{thomas:panda, author = {Joel T. Thomas}, title = {The {Panda} Array {I/O} Library on the {Galley} Parallel File System}, year = {1996}, month = {June}, number = {PCS-TR96-288}, institution = {Dept. of Computer Science, Dartmouth College}, note = {Senior Honors Thesis.}, URL = {http://www.cs.dartmouth.edu/reports/abstracts/TR96-288/}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, abstract = {The Panda Array I/O library, created at the University of Illinois, Urbana-Champaign, was built especially to address the needs of high-performance scientific applications. I/O has been one of the most frustrating bottlenecks to high performance for quite some time, and the Panda project is an attempt to ameliorate this problem while still providing the user with a simple, high-level interface. The Galley File System, with its hierarchical structure of files and strided requests, is another attempt at addressing the performance problem. My project was to redesign the Panda Array library for use on the Galley file system. This project involved porting Panda's three main functions: a checkpoint function for writing a large array periodically for 'safekeeping,' a restart function that would allow a checkpointed file to be read back in, and finally a timestep function that would allow the user to write a group of large arrays several times in a sequence. Panda supports several different distributions in both the compute-node memories and I/O-node disks. \par We have found that the Galley File System provides a good environment on which to build high-performance libraries, and that the mesh of Panda and Galley was a successful combination.}, comment = {See seamons:thesis.} } @Manual{tmc:cmio, key = {TMC}, title = {Programming the {CM I/O} System}, year = {1990}, month = {November}, organization = {Thinking Machines Corporation}, keyword = {parallel I/O, file system interface, multiprocessor file system, pario-bib}, comment = {Have two types of files, parallel and serial, differing in the way data is laid out internally. Also have three modes for reading the file: synchronous, streaming (asynchronous), and buffered.} } @InProceedings{toledo:solar, author = {Sivan Toledo and Fred G. Gustavson}, title = {The Design and Implementation of {SOLAR}, a Portable Library for Scalable Out-of-Core Linear Algebra Computations}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {28--40}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, out-of-core, linear algebra, pario-bib}, abstract = {SOLAR is a portable high-performance library for out-of-core dense matrix computations. It combines portability with high performance by using existing high-performance in-core subroutine libraries and by using an optimized matrix input-output library. SOLAR works on parallel computers, workstations, and personal computers. It supports in-core computations on both shared-memory and distributed-memory machines, and its matrix input-output library supports both conventional I/O interfaces and parallel I/O interfaces. This paper discusses the overall design of SOLAR, its interfaces, and the design of several important subroutines. Experimental results show that SOLAR can factor on a single workstation an out-of-core positive-definite symmetric matrix at a rate exceeding 215 Mflops, and an out-of-core general matrix at a rate exceeding 195 Mflops. Less than 16\% of the running time is spent on I/O in these computations. These results indicate that SOLAR's portability does not compromise its performance. We expect that the combination of portability, modularity, and the use of a high-level I/O interface will make the library an important platform for research on out-of-core algorithms and on parallel I/O.}, comment = {Sounds great. Library package that supports LAPACK-like functionality on in-core and out-of-core matrices. Good performance. Good portability (IBM workstation, IBM SP-2, and OS/2 laptop). They separate the matrix algorithms from the underlying I/O routines in an interesting way (read and write submatrices), leaving just enough information to allow the I/O system to do some higher-level optimizations.} } @Article{towsley:cpuio-parallel, author = {D. Towsley and K. M. Chandy and J. C. Browne}, title = {Models for Parallel Processing within Programs: {Application} to {CPU: I/O} and {I/O: I/O} Overlap}, journal = {Communications of the ACM}, year = {1978}, month = {October}, volume = {21}, number = {10}, pages = {821--831}, keyword = {parallel processing, parallel I/O, pario-bib}, comment = {Models CPU:I/O and I/O:I/O overlap within a program. ``Overlapping is helpful only when it allows a device to be utilized which would not be utilized without overlapping.'' In general the overlapping seems to help.} } @InProceedings{towsley:cpuio, author = {Donald F. Towsley}, title = {The Effects of {CPU: I/O} Overlap in Computer System Configurations}, booktitle = {Proceedings of the 5th Annual International Symposium on Computer Architecture}, year = {1978}, month = {April}, pages = {238--241}, keyword = {parallel processing, parallel I/O, pario-bib}, comment = {Difficult to follow since it is missing its figures. ``Our most important result is that multiprocessor systems can benefit considerably more than single processor systems with the introduction of CPU: I/O overlap.'' They overlap I/O needed by some future CPU sequence with the current CPU operation. They claim it looks good for large numbers of processors. Their orientation seems to be for multiprocessors operating on independent tasks.} } @InProceedings{trabado:io, author = {Guillermo P. Trabado and E. L. Zapata}, title = {Support for Massive Data Input/Output on Parallel Computers}, booktitle = {Proceedings of the Fifth Workshop on Compilers for Parallel Computers}, year = {1995}, month = {June}, pages = {347--356}, keyword = {parallel I/O, sparse matrix, pario-bib}, comment = {They discuss a library to support irregular data structures, really sparse matrices, on distributed-memory machines. Their library supports several in-memory and out-of-core data distributions, and routines to read and write matrices in those distributions. The paper is sketchy and poorly written. There is little material on I/O.} } @InProceedings{tridgell:hidios, author = {Andrew Tridgell and David Walsh}, title = {The {HiDIOS} filesystem}, booktitle = {Parallel Computing Workshop}, year = {1995}, month = {September}, address = {England}, URL = {ftp://nimbus.anu.edu.au/pub/tridge/HiDIOS/hidios.ps.gz}, keyword = {verify, parallel file system, pario-bib}, comment = {A description of their new parallel file system for the AP-1000. Conceptually, not much new here.} } @InProceedings{trieber:raid5, author = {Kent Treiber and Jai Menon}, title = {Simulation Study of Cached {RAID5} Designs}, booktitle = {Proceedings of the First Conference on High-Performance Computer Architecture}, year = {1995}, month = {January}, pages = {186--197}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, RAID, disk array, pario-bib}, abstract = {This paper considers the performance of cached RAID5 using simulations that are driven by database I/O traces collected at customer sites. This is in contrast to previous performance studies using analytical modelling or random-number simulations. We studied issues of cache size, disk buffering, cache replacement policies, cache allocation policies, destage policies and striping. Our results indicate that: read caching has considerable value; a small amount of cache should be used for writes fast write logic can reduce disk utilization for writes by an order of magnitude; priority queueing should be supported at the disks; disk buffering prefetch should be used; for large caches, it pays to cache sequentially accessed blocks; RAID5 with cylinder striping is superior to parity striping.} } @MastersThesis{vaitzblit:media, author = {Lev Vaitzblit}, title = {The Design and Implementation of a High-Bandwidth File Service for Continuous Media}, year = {1991}, month = {September}, school = {MIT}, keyword = {multimedia, distributed file system, disk striping, pario-bib}, comment = {A DFS for multimedia. Expect large files, read-mostly, highly sequential. Temporal synchronization is key. An administration server handles opens and closes, and provides guarantees on performance (like Swift). The interface at the client nodes talks to the admin server transparently, and stripes requests over all storage nodes. Storage nodes may internally use RAIDs, I suppose. Files are a series of frames, rather than bytes. Each frame has a time offset in seconds. Seeks can be by frame number or time offset. File containers contain several files, and have attributes that specify performance requirements. Interface does prefetching, based on read direction (forward or backward) and any frame skips. But frames are not transmitted from storage server to client node until requested (client pacing). Claim that synchronous disk interleaving with a striping unit of one frame is best. Could get 30 frames/sec (3.5MB/s) with 2 DECstation 5000s and 4 disks, serving a client DEC 5000.} } @InProceedings{vandegoor:unixio, author = {A. J. {van de Goor} and A. Moolenaar}, title = {{UNIX I/O} in a Multiprocessor System}, booktitle = {Proceedings of the 1988 Winter USENIX Conference}, year = {1988}, pages = {251--258}, keyword = {unix, multiprocessor file system, pario-bib}, comment = {How to split up the internals of the Unix I/O system to run on a shared-memory multiprocessor in a non-symmetric OS. They decided to split the functionality just above the buffer cache level, putting the buffer cache management and device drivers on the special I/O processors.} } @InCollection{vanderleest:contention-book, author = {Steven H. VanderLeest and Ravishankar K. Iyer}, title = {Heterogeneous {I/O} Contention in a Single-bus Multiprocessor}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {14}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {313--331}, publisher = {Kluwer Academic Publishers}, earlier = {vanderleest:contention}, keyword = {parallel I/O, pario-bib}, abstract = {None.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @InProceedings{vanderleest:contention, author = {Steven VanderLeest and Ravishankar K. Iyer}, title = {Measurement of {I/O} Bus Contention and Correlation among Heterogeneous Device Types in a Single-bus Multiprocessor System}, booktitle = {Proceedings of the IPPS~'94 Workshop on Input/Output in Parallel Computer Systems}, year = {1994}, month = {April}, pages = {36--53}, organization = {Univ of Illinois, Urbana-Champaign}, note = {Also appeared in Computer Architecture News 22(4)}, later = {sinclair:instability-book}, keyword = {parallel I/O, pario-bib}, comment = {Using a hardware monitor they measure the I/O-bus usage on a 4-processor Sun workstation. They characterize the bus contention caused by multiple different devices (disk, screen, and network). The contention sometimes caused significant performance degradation (to the end-user) despite the bus not being overloaded.} } @InProceedings{vengroff:TPIE, author = {Darren Erik Vengroff}, title = {A Transparent Parallel {I/O} Environment}, booktitle = {Proceedings of the 1994 DAGS/PC Symposium}, year = {1994}, month = {July}, pages = {117--134}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, URL = {ftp://ftp.cs.duke.edu/pub/dev/papers/tpie-dags94.ps.Z}, keyword = {parallel I/O, parallel I/O algorithms, pario-bib}, comment = {Interesting interface, providing high-level data-parallel access to vectors of data on disk. Implementation expectation is to use raw disk devices. Goals: abstraction, support for algorithmic optimality, flexible, portable, and extensible. TPIE is a set of C++ templates and libraries, where the user supplies callback functions to TPIE access methods. TPIE contains a small variety of access methods, each of which operates on a set of input and output streams, calling the user's function once for each set of input records. They can do scan, merge, distribution, sort, permute, batch filter, and distribution-sweep. There is a single thread of control (at least conceptually). Their first prototype is on a Sun SPARCstation; later, clusters of workstations and then a multiprocessor. See vengroff:efficient, vengroff:tpie-man.} } @TechReport{vengroff:efficient-tr, author = {Darren Erik Vengroff and Jeffrey Scott Vitter}, title = {{I/O}-Efficient Scientific Computation Using {TPIE}}, year = {1995}, month = {July}, number = {CS--1995--18}, institution = {Dept. of Computer Science, Duke University}, later = {vengroff:efficient}, URL = {ftp://cs.duke.edu/dist/techreport/1995/1995-18.ps.Z}, keyword = {parallel I/O algorithm, scientific computing, runtime library, pario-bib}, comment = {Expanded version of vengroff:efficient.} } @InProceedings{vengroff:efficient, author = {Darren Erik Vengroff and Jeffrey Scott Vitter}, title = {Supporting {I/O}-Efficient Scientific Computation in {TPIE}}, booktitle = {Proceedings of the 1995 IEEE Symposium on Parallel and Distributed Processing}, year = {1995}, month = {October}, pages = {74--77}, publisher = {IEEE Computer Society Press}, address = {San Antonio, TX}, earlier = {vengroff:efficient-tr}, later = {vengroff:efficient2}, keyword = {parallel I/O, algorithm, run-time library, pario-bib}, comment = {Shorter version of vengroff:efficient2. Excellent paper. This paper does not describe TPIE itself very much, but more about a set of benchmarks using TPIE. All of the benchmarks are run on one disk and one processor. TPIE can use multiple disks and one processor, with plans to extend it to multiple processors later. See vengroff:tpie and vengroff:efficient-tr. Same as vengroff:efficient2?} } @InProceedings{vengroff:efficient2, author = {Darren Erik Vengroff and Jeffrey Scott Vitter}, title = {{I/O}-Efficient Scientific Computation Using {TPIE}}, booktitle = {Proceedings of the Fifth NASA Goddard conference on Mass Storage Systems}, year = {1996}, month = {September}, pages = {II:553--570}, earlier = {vengroff:efficient}, URL = {ftp://cs.duke.edu/pub/jsv/Papers/VeV96.Scientific_TPIE.ps.gz}, keyword = {parallel I/O algorithms, run-time support, parallel I/O, multiprocessor file system interface, pario-bib}, comment = {Longer version of vengroff:efficient.} } @PhdThesis{vengroff:thesis, author = {Darren Erik Vengroff}, title = {The Theory and Practice of {I/O}-Efficient Computation}, year = {1996}, month = {April}, school = {Department of Computer Science, Brown University}, address = {Providence, RI}, keyword = {parallel I/O algorithm, pario-bib} } @Misc{vengroff:tpie-man, author = {Darren Erik Vengroff}, title = {{TPIE} User Manual and Reference}, year = {1995}, month = {January}, howpublished = {Available on the WWW at http://www.cs.duke.edu/\~{}dev/tpie_home_page.html}, note = {Alpha release}, URL = {http://www.cs.duke.edu/~dev/tpie_home_page.html}, keyword = {parallel I/O, parallel I/O algorithm, file system interface, pario-bib}, comment = {Currently an alpha version. It is in the process of being updated. The most current version is generally available on the WWW. See vengroff:tpie, vengroff:efficient.} } @InProceedings{venugopal:delays, author = {C.~R. Venugopal and S.~S.~S.~P. Rao}, title = {Impact of Delays in Parallel {I/O} System: An Empirical Study}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {490-499}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, abstract = {Performance of I/O intensive applications on a multiprocessor system depends mostly on the variety of disk access delays encountered in the I/O system. Over the years, the improvement in disk performance has taken place more slowly than the corresponding increase in processor speeds. It is therefore necessary to model I/O delays and evaluate performance benefits of moving an application to a better multiprocessor system. We perform such an analysis by measuring I/O delays for a synthesized application that uses a parallel distributed file system. The aim of this study is to evaluate the performance benefits of better disks in a multiprocessor system. We report on how the I/O performance would be affected if an application were to run on a system which would have better disks and communication links. In this study, we show a substantial improvement in the performance of an I/O system with better disks and communication links with respect to the existing system.} } @InProceedings{vitter:jprefetch, author = {Jeffrey Scott Vitter and P. Krishnan}, title = {Optimal Prefetching via Data Compression}, booktitle = {Foundations of Computer Science}, year = {1991}, pages = {121--130}, earlier = {vitter:prefetch}, keyword = {prefetching, data compression, pario-bib} } @InProceedings{vitter:optimal, author = {Jeffrey Scott Vitter and Elizabeth A.~M. Shriver}, title = {Optimal Disk {I/O} with Parallel Block Transfer}, booktitle = {Proceedings of the 22nd Annual ACM Symposium on Theory of Computing (STOC~'90)}, year = {1990}, month = {May}, pages = {159--169}, keyword = {parallel I/O algorithms, parallel memory, pario-bib}, comment = {Summary of vitter:parmem1 and vitter:parmem2.} } @TechReport{vitter:parmem1-tr, author = {Jeffrey Scott Vitter and Elizabeth A. M. Shriver}, title = {Algorithms for Parallel Memory {I}: Two-level Memories}, year = {1993}, month = {January}, number = {CS--93--01}, institution = {Dept. of Computer Science, Duke University}, note = {A summary appears in STOC '90. Revised version of Brown CS--92--04. Appeared in Algorithmica August 1994.}, later = {vitter:parmem1}, URL = {ftp://ftp.cs.duke.edu/dist/techreport/1993/1993-01.ps.Z}, keyword = {parallel I/O algorithms, parallel memory, pario-bib}, comment = {Summarized in vitter:optimal. Published as vitter:parmem1.} } @Article{vitter:parmem1, author = {Jeffrey Scott Vitter and Elizabeth A. M. Shriver}, title = {Algorithms for Parallel Memory {I}: Two-level Memories}, journal = {Algorithmica}, year = {1994}, month = {August and September}, volume = {12}, number = {2/3}, pages = {110--147}, earlier = {vitter:parmem1-tr}, keyword = {parallel I/O algorithms, parallel memory, pario-bib}, abstract = {We provide the first optimal algorithms in terms of the number of input/outputs (I/Os) required between internal memory and multiple secondary storage devices for the problems of sorting, FFT, matrix transposition, standard matrix multiplication, and related problems. Our two-level memory model is new and gives a realistic treatment of {\em parallel block transfer}, in which during a single~I/O each of the $P$ secondary storage devices can simultaneously transfer a contiguous block of $B$ records. The model pertains to a large-scale uniprocessor system or parallel multiprocessor system with $P$ disks. In addition, the sorting, FFT, permutation network, and standard matrix multiplication algorithms are typically optimal in terms of the amount of internal processing time. The difficulty in developing optimal algorithms is to cope with the partitioning of memory into $P$~separate physical devices. Our algorithms' performance can be significantly better than those obtained by the well-known but nonoptimal technique of disk striping. Our optimal sorting algorithm is randomized, but practical; the probability of using more than $\ell$ times the optimal number of I/Os is exponentially small in $\ell (\log \ell) \log (M/B)$, where $M$ is the internal memory size.}, comment = {See shorter version vitter:optimal. See TR version vitter:parmem1-tr. See also vitter:parmem2.} } @TechReport{vitter:parmem2-tr, author = {Jeffrey Scott Vitter and Elizabeth A. M. Shriver}, title = {Algorithms for Parallel Memory {II}: Hierarchical Multilevel Memories}, year = {1993}, month = {January}, number = {CS--93--02}, institution = {Dept. of Computer Science, Duke University}, note = {A summary appears in STOC '90. Revised version of Brown CS--92--05. Appeared in Algorithmica 12(2,3).}, later = {vitter:parmem2}, URL = {ftp://ftp.cs.duke.edu/dist/techreport/1993/1993-02.ps.Z}, keyword = {parallel I/O algorithms, parallel memory, pario-bib}, comment = {Summarized in vitter:optimal.} } @Article{vitter:parmem2, author = {Jeffrey Scott Vitter and Elizabeth A. M. Shriver}, title = {Algorithms for Parallel Memory {II}: Hierarchical Multilevel Memories}, journal = {Algorithmica}, year = {1994}, month = {August and September}, volume = {12}, number = {2/3}, pages = {148--169}, earlier = {vitter:parmem2-tr}, keyword = {parallel I/O algorithms, parallel memory, pario-bib}, abstract = {In this paper we introduce parallel versions of two hierarchical memory models and give optimal algorithms in these models for sorting, FFT, and matrix multiplication. In our parallel models, there are $P$ memory hierarchies operating simultaneously; communication among the hierarchies takes place at a base memory level. Our optimal sorting algorithm is randomized and is based upon the probabilistic partitioning technique developed in the companion paper for optimal disk sorting in a two-level memory with parallel block transfer. The probability of using $\ell$ times the optimal running time is exponentially small in~$\ell (\log \ell) \log P$.}, comment = {Summarized in vitter:optimal.} } @TechReport{vitter:prefetch, author = {Jeffrey Scott Vitter and P. Krishnan}, title = {Optimal Prefetching via Data Compression}, year = {1991}, month = {July}, number = {CS--91--46}, institution = {Brown University}, note = {A summary appears in FOCS '91}, later = {vitter:jprefetch}, URL = {ftp://ftp.cs.brown.edu/pub/techreports/91/cs91-46.ps.Z}, keyword = {parallel I/O algorithms, disk prefetching, pario-bib}, abstract = {Caching and prefetching are important mechanisms for speeding up access time to data on secondary storage. Recent work in competitive online algorithms has uncovered several promising new algorithms for caching. In this paper, we apply a form of the competitive philosophy for the first time to the problem of prefetching to develop an optimal universal prefetcher in terms of fault ratio, with particular applications to large-scale databases and hypertext systems. Our algorithms for prefetching are novel in that they are based on data compression techniques that are both theoretically optimal and good in practice. Intuitively, in order to compress data effectively, you have to be able to predict future data well, and thus good data compressors should be able to predict well for purposes of prefetching. We show for powerful models such as Markov sources and $m$th order Markov sources that the page fault rates incurred by our prefetching algorithms are optimal in the limit for almost all sequences of page accesses.}, comment = {``This... is on prefetching, but I think the ideas will have a lot of use with parallel disks. The implementations we have now are doing amazingly well compared to LRU.'' [Vitter]. See vitter:jprefetch.} } @InProceedings{vitter:summary, author = {Jeffrey Scott Vitter}, title = {Efficient Memory Access in Large-Scale Computation}, booktitle = {Proceedings of the 1991 Symposium on Theoretical Aspects of Computer Science (STACS~'91)}, year = {1991}, pages = {26--41}, publisher = {Springer-Verlag}, address = {Berlin}, note = {Published as Lecture Notes in Computer Science volume 480}, keyword = {parallel I/O algorithms, sorting, pario-bib}, comment = {Good overview of all the other papers.} } @Article{vitter:uniform, author = {Jeffrey Scott Vitter and Mark H. Nodine}, title = {Large-Scale Sorting in Uniform Memory Hierarchies}, journal = {Journal of Parallel and Distributed Computing}, year = {1993}, month = {January and February}, volume = {17}, number = {1--2}, pages = {107--114}, publisher = {Academic Press}, keyword = {parallel I/O algorithm, sorting, pario-bib}, comment = {Summary is nodine:sort.} } @Manual{vms:stripe, key = {DEC}, title = {{VAX} Disk Striping Driver for {VMS}}, year = {1989}, month = {December}, organization = {Digital Equipment Corporation}, note = {Order Number AA-NY13A-TE}, keyword = {disk striping, pario-bib}, comment = {Describes the VAX disk striping driver. Stripes an apparently arbitrary number of disk devices. All devices must be the same type, and apparently completely used. Manager can specify ``chunksize'', the number of logical blocks per striped block. They suggest using the track size of the device as the chunk size. They also point out that multiple controllers should be used in order to gain parallelism.} } @InProceedings{waltz:database, author = {David L. Waltz}, title = {Innovative Massively Parallel {AI} Applications}, booktitle = {Proceedings of the 1993 DAGS/PC Symposium}, year = {1993}, month = {June}, pages = {132--138}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, keyword = {database, AI, artificial intelligence, pario-bib}, abstract = {Massively parallel applications must address problems that will be too large for workstations for the next several years, or else it will not make sense to expend development costs on them. Suitable applications include one or more of the following properties: 1) large amounts of data; 2) intensive computations; 3) requirement for very fast response times; 4) ways to trade computations for human effort, as in developing applications using learning methods. Most of the suitable applications that we have found come from the general area of very large databases. Massively parallel machines have proved to be important not only in being able to run large applications, but in accelerating development (allowing the use of simpler algorithms, cutting the time to test performance on realistic databases) and allowing many different algorithms and parameter settings to be tried and compared for a particular task. This paper summarizes four such applications. \par The applications described are: 1) prediction of credit card "defaulters" (non-payers) and "attritters" (people who didn't renew their cards) from a credit card database; 2) prediction of the continuation of time series, e.g. stock price movements; 3) automatic keyword assignment for news articles; and 4) protein secondary structure prediction. These add to a list identified in an earlier paper [Waltz 90] including: 5) automatic classification of U.S. Census Bureau long forms, using MBR -- Memory-Based Reasoning [Creecy et al 92, Waltz 89, Stanfill \& Waltz 86]; 6) generating catalogs for a mail order company that maximize expected net returns (revenues from orders minus cost of the catalogs and mailings) using genetically-inspired methods; and 7) text-based intelligent systems for information retrieval, decision support, etc.}, comment = {Invited speaker.} } @TechReport{wang:paging, author = {Kuei Yu Wang and Dan C. Marinescu}, title = {An Analysis of the Paging Activity of Parallel Programs, Part {I}: Correlation of the Paging Activity of Individual Node Programs in the {SPMD} Execution Mode}, year = {1994}, month = {June}, number = {CSD-TR-94-042}, institution = {Purdue University}, keyword = {parallel I/O, virtual memory, paging, characterization, pario-bib}, comment = {They measured the paging behavior of programs running on a Paragon, and analyze the results. To do so, they sample the OSF paging statistics periodically. The general conclusions: they found a surprising amount of dissimilarity in the paging behaviors of nodes within the same program, both in terms of the amount of paging and the timing of peak paging activity. These characteristics do not bode well for systems that use gang scheduling, or applications that have a lot of barriers.} } @InProceedings{watson:hpss, author = {Richard W. Watson and Robert A. Coyne}, title = {The Parallel {I/O} Architecture of the High-Performance Storage System ({HPSS})}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {27--44}, publisher = {IEEE Computer Society Press}, URL = {http://www.computer.org/conferen/mss95/watson/watson.htm}, keyword = {mass storage, parallel I/O, multiprocessor file system interface, pario-bib}, abstract = {Datasets up to terabyte size and petabyte total capacities have created a serious imbalance between I/O and storage-system performance and system functionality. One promising approach is the use of parallel data-transfer techniques for client access to storage, peripheral-to-peripheral transfers, and remote file transfers. This paper describes the parallel I/O architecture and mechanisms, parallel transport protocol (PTP), parallel FTP, and parallel client application programming interface (API) used by the high-performance storage system (HPSS). Parallel storage integration issues with a local parallel file system are also discussed.} } @InProceedings{wilkes:autoraid-sosp, author = {John Wilkes and Richard Golding and Carl Staelin and Tim Sullivan}, title = {The {HP AutoRAID} Hierarchical Storage System}, booktitle = {Proceedings of the Fifteenth ACM Symposium on Operating Systems Principles}, year = {1995}, month = {December}, pages = {96--108}, publisher = {ACM Press}, address = {Copper Mountain, CO}, later = {wilkes:autoraid}, URL = {http://www.hpl.hp.com/personal/John_Wilkes/papers/AutoRAID.SOSP95.ps.Z}, keyword = {RAID, disk array, parallel I/O, pario-bib}, comment = {Cite wilkes:autoraid. A commercial RAID box that transparently manages a hierarchy of two RAID systems, a RAID-1 mirrored system and a RAID-5 system. The goal is easy-to-use high performance, and they appear to have achieved that goal. Data in current use are kept in the RAID-1, and other data in RAID-5. This design gives performance of RAID-1 with cost of RAID-5. They have a clever scheme for spreading both RAIDs across most disks, including a hot spare. Dual controllers, power supplies, fans, etc. The design is a fairly standard RAID hardware controller, using standard SCSI disks, but with all the new tricks done in controller software. The paper gives a few results from the prototype hardware, and a lot of simulation results.} } @Article{wilkes:autoraid, author = {John Wilkes and Richard Golding and Carl Staelin and Tim Sullivan}, title = {The {HP AutoRAID} Hierarchical Storage System}, journal = {ACM Transactions on Computer Systems}, year = {1996}, month = {February}, volume = {14}, number = {1}, pages = {108--136}, publisher = {ACM Press}, earlier = {wilkes:autoraid-sosp}, URL = {http://www.hpl.hp.com/personal/John_Wilkes/papers/AutoRAID.TOCS.ps.Z}, keyword = {RAID, disk array, parallel I/O, pario-bib} } @TechReport{wilkes:datamesh, author = {John Wilkes}, title = {{DataMesh}--- scope and objectives: a commentary}, year = {1989}, month = {July}, number = {HP-DSD-89-44}, institution = {Hewlett-Packard}, later = {wilkes:datamesh1}, keyword = {parallel I/O, distributed file system, disk caching, heterogeneous file system, pario-bib}, comment = {Hooks a heterogeneous set of storage devices together over a fast interconnect, each with its own identical processor. The whole would then act as a file server for a network. Data storage devices would range from fast to slow (e.g. optical jukebox), varying availability, etc.. Many ideas here but few concrete suggestions. Very little mention of algorithms they might use to control the thing. See also wilkes:datamesh1, cao:tickertaip, chao:datamesh, wilkes:houses, wilkes:lessons.} } @InProceedings{wilkes:datamesh1, author = {John Wilkes}, title = {{DataMesh} Research Project, Phase 1}, booktitle = {Proceedings of the USENIX File Systems Workshop}, year = {1992}, month = {May}, pages = {63--69}, earlier = {wilkes:datamesh}, keyword = {distributed file system, parallel I/O, disk scheduling, disk layout, pario-bib}, comment = {See chao:datamesh} } @Article{wilkes:houses, author = {John Wilkes}, title = {{DataMesh}, house-building, and distributed systems technology}, journal = {ACM Operating Systems Review}, year = {1993}, month = {April}, volume = {27}, number = {2}, pages = {104--108}, earlier = {wilkes:datamesh1}, later = {wilkes:lessons}, keyword = {file system, distributed computing, pario-bib}, comment = {Same as wilkes:lessons. See that for comments.} } @InProceedings{wilkes:lessons, author = {John Wilkes}, title = {{DataMesh}, house-building, and distributed systems technology}, booktitle = {Proceedings of the 1993 DAGS/PC Symposium}, year = {1993}, month = {June}, pages = {1--5}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, earlier = {wilkes:houses}, keyword = {file system, parallel I/O, RAID, disk array, pario-bib}, comment = {Invited speaker. Also appeared in ACM OSR April 1993 (wilkes:houses). This gives his viewpoint that we should be focusing more on architecture than on components, to design frameworks rather than just individual policies and mechanisms. It also gives a quick overview of DataMesh. For more DataMesh info, though, see cao:tickertaip, chao:datamesh, wilkes:datamesh1, wilkes:datamesh, wilkes:houses.} } @InProceedings{willeman:pario, author = {Ray Willeman and Susan Phillips and Ron Fargason}, title = {An Integrated Library For Parallel Processing: The Input/Output Component}, booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications}, year = {1989}, pages = {573--575}, publisher = {Golden Gate Enterprises, Los Altos, CA}, address = {Monterey, CA}, keyword = {parallel I/O, pario-bib}, comment = {Like the CUBIX interface, in some ways. Meant for parallel access to non-striped (sequential) file. Self-describing format so that the reader can read the formatting information and distribute data accordingly.} } @InProceedings{wisniewski:in-place, author = {Leonard F. Wisniewski}, title = {Structured Permuting in Place on Parallel Disk Systems}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {128--139}, publisher = {ACM Press}, address = {Philadelphia}, keyword = {parallel I/O, parallel I/O algorithm, permutation, out-of-core, pario-bib}, abstract = {The ability to perform permutations of large data sets in place reduces the amount of necessary available disk storage. The simplest way to perform a permutation often is to read the records of a data set from a source portion of data storage, permute them in memory, and write them to a separate target portion of the same size. It can be quite expensive, however, to provide disk storage that is twice the size of very large data sets. Permuting in place reduces the expense by using only a small amount of extra disk storage beyond the size of the data set. \par \newcommand{\ceil}[1]{\lceil #1\rceil} \newcommand{\rank}[1]{\mathop{\rm rank}\nolimits #1} This paper features in-place algorithms for commonly used structured permutations. We have developed an asymptotically optimal algorithm for performing BMMC (bit-matrix-multiply/complement) permutations in place that requires at most $\frac{2N}{BD}\left( 2\ceil{\frac{\rank{\gamma}}{\lg (M/B)}} + \frac{7}{2}\right)$ parallel disk accesses, as long as $M \geq 2BD$, where $N$ is the number of records in the data set, $M$ is the number of records that can fit in memory, $D$ is the number of disks, $B$ is the number of records in a block, and $\gamma$ is the lower left $\lg (N/B) \times \lg B$ submatrix of the characteristic matrix for the permutation. This algorithm uses $N+M$ records of disk storage and requires only a constant factor more parallel disk accesses and insignificant additional computation than a previously published asymptotically optimal algorithm that uses $2N$ records of disk storage. \par We also give algorithms to perform mesh and torus permutations on a $d$-dimensional mesh. The in-place algorithm for mesh permutations requires at most $3\ceil{N/BD}$ parallel I/Os and the in-place algorithm for torus permutations uses at most $4dN/BD$ parallel I/Os. The algorithms for mesh and torus permutations require no extra disk space as long as the memory size $M$ is at least $3BD$. The torus algorithm improves upon the previous best algorithm in terms of both time and space.} } @InProceedings{witkowski:hyper-fs, author = {Andrew Witkowski and Kumar Chandrakumar and Greg Macchio}, title = {Concurrent {I/O} System for the Hypercube Multiprocessor}, booktitle = {Proceedings of the Third Conference on Hypercube Concurrent Computers and Applications}, year = {1988}, pages = {1398--1407}, publisher = {ACM Press}, address = {Pasadena, CA}, keyword = {parallel I/O, hypercube, parallel file system, pario-bib}, comment = {Concrete system for the hypercube. Files resident on one disk only. Little support for cooperation except for sequentialized access to parts of the file, or broadcast. No mention of random-access files. I/O nodes are distinguished from computation nodes. I/O nodes have separate comm. network. No parallel access. I/O hooked to front-end too.} } @InProceedings{wolf:dasd, author = {Joel L. Wolf and Philip S. Yu and Hadas Shachnai}, title = {{DASD} Dancing: A Disk Load Balancing Optimization Scheme for Video-on-Demand Computer Systems}, booktitle = {Proceedings of the 1995 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems}, year = {1995}, month = {May}, pages = {157--166}, keyword = {parallel I/O, video server, multimedia, pario-bib} } @Article{wolman:iobench, author = {Barry L. Wolman and Thomas M. Olson}, title = {{IOBENCH:} A System Independent {IO} Benchmark}, journal = {Computer Architecture News}, year = {1989}, month = {September}, volume = {17}, number = {5}, pages = {55--70}, keyword = {I/O benchmark, transaction processing, pario-bib}, comment = {Not about parallel I/O, but see olson:random. Defines a new I/O benchmark that is fairly system-independent. Focus is for transaction processing systems. Cranks up many tasks (users) all doing repetitive read/writes for a specified time, using optional locking, and optional computation. Whole suite of results for comparison with others. See also chen:iobench.} } @Article{womble:intro, author = {David E. Womble and David S. Greenberg}, title = {Parallel {I/O}: An introduction}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {403--417}, publisher = {North-Holland (Elsevier Scientific)}, keyword = {parallel I/O, pario-bib}, comment = {A brief introduction to the topic of parallel I/O (what, why, current research), followed by a roundtable discussion among the authors of the papers in womble:special-issue. The discussion focused on three questions: 1) What are the biggest gaps in current I/O services? 2) Why have vendors failed to adopt new file system technologies? 3) How much direct low-level control over I/O resources should be given to the users and why?} } @InProceedings{womble:outofcore, author = {David Womble and David Greenberg and Rolf Riesen and Stephen Wheat}, title = {Out of Core, Out of Mind: Practical Parallel {I/O}}, booktitle = {Proceedings of the Scalable Parallel Libraries Conference}, year = {1993}, month = {October}, pages = {10--16}, address = {Mississippi State University}, URL = {ftp://ftp.cs.sandia.gov/pub/papers/dewombl/parallel_io_scl93.ps.Z}, keyword = {parallel I/O, parallel file system, pario-bib}, abstract = {Parallel computers are becoming more powerful and more complex in response to the demand for computing power by scientists and engineers. Inevitably, new and more complex I/O systems will be developed for these systems. In particular we believe that the I/O system must provide the programmer with the ability to explicitly manage storage (despite the trend toward complex parallel file systems and caching schemes). One method of doing so is to have a partitioned secondary storage in which each processor owns a logical disk. Along with operating system enhancements which allow overheads such as buffer copying to be avoided and libraries to support optimal remapping of data, this sort of I/O system meets the needs of high performance computing.}, comment = {They argue that it is important to allow the programmer to explicitly control their storage in some way. In particular, they advocate the Partitioned Secondary Storage (PSS) model, in which each processor has its own logical disk, rather than using a parallel file system (PFS) which automatically stripes a linear file across many disks. Basically, programmer knows best. Of course, libraries can help. They note that you will often need data in a different format than it comes, and may need it output in a different format; so, permutation algorithms are needed. Also important to be able to overlap computation with I/O. They use LU factorization as an example, and give an algorithm. On the nCUBE with the PUMA operating system, they get good performance. See womble:pario.} } @InProceedings{womble:pario, author = {David Womble and David Greenberg and Stephen Wheat and Rolf Riesen}, title = {Beyond Core: Making Parallel Computer {I/O} Practical}, booktitle = {Proceedings of the 1993 DAGS/PC Symposium}, year = {1993}, month = {June}, pages = {56--63}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, URL = {ftp://ftp.cs.sandia.gov/pub/papers/dewombl/parallel_io_dags93.ps.Z}, keyword = {parallel I/O, out-of-core, parallel algorithm, scientific computing, multiprocessor file system, pario-bib}, abstract = {The solution of Grand Challenge Problems will require computations which are too large to fit in the memories of even the largest machines. Inevitably, new designs of I/O systems will be necessary to support them. Through our implementations of an out-of-core LU factorization we have learned several important lessons about what I/O systems should be like. In particular we believe that the I/O system must provide the programmer with the ability to explicitly manage storage. One method of doing so is to have a partitioned secondary storage in which each processor owns a logical disk. Along with operating system enhancements which allow overheads such as buffer copying to be avoided, this sort of I/O system meets the needs of high performance computing.}, comment = {See womble:outofcore. See thakur:runtime, kotz:lu, brunet:factor for other out-of-core LU results.} } @Article{womble:special-issue, author = {David E. Womble and David S. Greenberg}, title = {Parallel {I/O}}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {iii}, publisher = {North-Holland (Elsevier Scientific)}, note = {Introduction to a special issue.}, keyword = {parallel I/O, pario-bib}, comment = {A one-page introduction to this special issue of Parallel Computing, which includes many papers about parallel I/O. See also womble:intro, nieuwejaar:jgalley, moore:ddio, barve:jmergesort, miller:jrama, schwabe:jlayouts, parsons:templates, cormen:early-vic, carretero:performance,} } @Article{woodward:scivi, author = {Paul R. Woodward}, title = {Interactive Scientific Visualization of Fluid Flow}, journal = {IEEE Computer}, year = {1993}, month = {October}, volume = {26}, number = {10}, pages = {13--25}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O architecture, scientific visualization, pario-bib}, comment = {This paper is interesting for its impressive usage of RAIDs and parallel networks to support scientific visualization. In particular, the proposed Gigawall (a 10-foot by 6-foot gigapixel-per-second display) is run by 24 SGI processors and 32 9-disk RAIDs, connected to an MPP of some kind through an ATM switch. 512 GBytes of storage, playable at 450 MBytes per second, for 19 minutes of animation.} } @InProceedings{wu:thrashing, author = {Kun-Lung Wu and Philip S. Yu and James Z. Teng}, title = {Performance Comparison of Thrashing Control Policies for Concurrent Mergesorts with Parallel Prefetching}, booktitle = {Proceedings of the 1993 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems}, year = {1993}, pages = {171--182}, keyword = {disk prefetching, parallel I/O, disk caching, sorting, pario-bib}, comment = {They discuss prefetching and caching in database machines where mergesorts merge several input streams, each from its own disk, to one output stream, to its own disk. There are concurrent merges going on. A merge can cause thrashing when writes grab a clean buffer that holds an unused prefetch, thus forcing the block to later be read again. They consider several policies to handle this, but it seemed to me like they missed an obvious alternative, that may have been better: whenever you need a clean buffer to write into, but all the clean buffers hold unused-prefetched blocks, stall and wait while the dirty blocks are flushed (presumably started earlier when the clean-block count got too low). It seems better to wait for a half-finished write than to toss out a prefetched block and later have to read it again. Their simulations show that their techniques help a lot.} } @InCollection{yokota:nets-book, author = {Haruo Yokota and Yasuyuki Mimatsu}, title = {A Scalable Disk System with Data Reconstruction Functions}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {16}, editor = {Ravi Jain and John Werth and James C. Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {353--372}, publisher = {Kluwer Academic Publishers}, earlier = {yokota:nets}, keyword = {parallel I/O architecture, disk array, pario-bib}, abstract = {Scalable disk systems are required to implement well-balanced computer systems. We have proposed DR-nets, Data-Reconstruction networks, to construct the scalable parallel disk systems with high reliability. Each node of a DR-net has disks, and is connected by links to form an interconnection network. To realize the high reliability, nodes in a sub-network of the interconnection network organize a group of parity calculation proposed for RAIDs. Inter-node communication for calculating parity keeps the locality of data transfer, and it inhibits bottlenecks from occurring, even if the size of the network becomes very large. We have developed an experimental system using Transputers. In this chapter, we provide execution models for estimating the response time and throughput of DR-nets, and compare them to experimental results. We also discuss the reliability of the DR-nets and RAIDs.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @InProceedings{yokota:nets, author = {Haruo Yokota}, title = {{DR-Nets}: Data-Reconstruction Networks for Highly Reliable Parallel Disk Systems}, booktitle = {Proceedings of the IPPS~'94 Workshop on Input/Output in Parallel Computer Systems}, year = {1994}, month = {April}, pages = {105--116}, organization = {Japan Advanced Institute of Science and Technology (JAIST)}, note = {Also appeared in Computer Architecture News 22(4)}, later = {yokota:nets-book}, keyword = {parallel I/O, pario-bib}, comment = {They propose to link a set of disks with its own interconnect, e.g., a torus, to allow the disks to communicate to compute multi-dimensional parity and to respond to disk failures, without using the primary interconnect of the multiprocessor or distributed system. In this sense it is reminiscent of TickerTAIP or DataMesh.} } @MastersThesis{youssef:thesis, author = {Rachad Youssef}, title = {{RAID} for Mobile Computers}, year = {1995}, month = {August}, school = {Carnegie Mellon University Information Networking Institute}, note = {Available as INI-TR 1995-3}, URL = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/pdl/ftp/MOBILE/thesis.ps}, keyword = {parallel I/O, disk array, RAID, mobile computing, pario-bib}, comment = {low-power, highly available disk arrays for mobile computers.} } @InProceedings{zajcew:osf1, author = {Roman Zajcew and Paul Roy and David Black and Chris Peak and Paulo Guedes and Bradford Kemp and John LoVerso and Michael Leibensperger and Michael Barnett and FaraMarz Rabii and Durriya Netterwala}, title = {An {OSF/1 UNIX} for Massively Parallel Multicomputers}, booktitle = {Proceedings of the 1993 Winter USENIX Technical Conference}, year = {1993}, month = {January}, pages = {449--468}, keyword = {unix, parallel operating system, multiprocessor file system, pario-bib}, comment = {Describing the changes to OSF/1 to make OSF/1 AD TNC, primarily intended for NORMA MIMD multicomputers. Enhancements include a new file system, distributed implementation of sockets, and process management. The file system still has traditional file systems, each in its own partition, with a global name space built by mounting file systems on each other. The change is that mounts can be remote, ie, managed by a different file server on another node. They plan to use prefix tables for pathname translation (welch:prefix,nelson:sprite). They use a token-based protocol to provide atomicity of read and write calls, and to maintain consistency of client-node caches. See also roy:unixfile. Process enhancements include a new SIGMIGRATE, rfork(), and rforkmulti().} }