@Article{kotz:prefetch,
  author = {David F. Kotz and Carla Schlatter Ellis},
  title = {Prefetching in File Systems for {MIMD} Multiprocessors},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  year = {1990},
  month = {April},
  volume = {1},
  number = {2},
  pages = {218--230},
  publisher = {IEEE Computer Society Press},
  earlier = {ellis:prefetch},
  later = {kotz:thesis},
  keyword = {dfk, parallel file system, prefetching, MIMD, disk caching, parallel I/O, pario-bib},
  abstract = {The problem of providing file I/O to parallel programs has been largely neglected in the development of multiprocessor systems. There are two essential elements of any file system design intended for a highly parallel environment: parallel I/O and effective caching schemes. This paper concentrates on the second aspect of file system design and specifically, on the question of whether prefetching blocks of the file into the block cache can effectively reduce overall execution time of a parallel computation, even under favorable assumptions.
    \par Experiments have been conducted with an interleaved file system testbed on the Butterfly Plus multiprocessor. Results of these experiments suggest that 1) the hit ratio, the accepted measure in traditional caching studies, may not be an adequate measure of performance when the workload consists of parallel computations and parallel file access patterns, 2) caching with prefetching can significantly improve the hit ratio and the average time to perform an I/O operation, and 3) an improvement in overall execution time has been observed in most cases. In spite of these gains, prefetching sometimes results in increased execution times (a negative result, given the optimistic nature of the study).
\par We explore why is it not trivial to translate savings on individual I/O requests into consistently better overall performance and identify the key problems that need to be addressed in order to improve the potential of prefetching techniques in this environment.}
}

@PhdThesis{kotz:thesis,
  author = {David Kotz},
  title = {Prefetching and Caching Techniques in File Systems for {MIMD} Multiprocessors},
  year = {1991},
  month = {April},
  school = {Duke University},
  note = {Available as technical report CS-1991-016},
  URL = {http://www.cs.dartmouth.edu/~dfk/papers/thesis_note.html},
  keyword = {dfk, parallel file system, prefetching, MIMD, disk caching, parallel I/O, pario-bib},
  abstract = {The increasing speed of the most powerful computers, especially multiprocessors, makes it difficult to provide sufficient I/O bandwidth to keep them running at full speed for the largest problems. Trends show that the difference in the speed of disk hardware and the speed of processors is increasing, with I/O severely limiting the performance of otherwise fast machines. This widening access-time gap is known as the ``I/O bottleneck crisis.'' One solution to the crisis, suggested by many researchers, is to use many disks in parallel to increase the overall bandwidth.
    \par This dissertation studies some of the file system issues needed to get high performance from parallel disk systems, since parallel hardware alone cannot guarantee good performance. The target systems are large MIMD multiprocessors used for scientific applications, with large files spread over multiple disks attached in parallel. The focus is on automatic caching and prefetching techniques. We show that caching and prefetching can transparently provide the power of parallel disk hardware to both sequential and parallel applications using a conventional file system interface.
We also propose a new file system interface (compatible with the conventional interface) that could make it easier to use parallel disks effectively.
    \par Our methodology is a mixture of implementation and simulation, using a software testbed that we built to run on a BBN GP1000 multiprocessor. The testbed simulates the disks and fully implements the caching and prefetching policies. Using a synthetic workload as input, we use the testbed in an extensive set of experiments. The results show that prefetching and caching improved the performance of parallel file systems, often dramatically.},
  comment = {Published as kotz:prefetch, kotz:jwriteback, kotz:jpractical, kotz:fsint2.}
}

@TechReport{kotz:throughput,
  author = {David Kotz},
  title = {Throughput of Existing Multiprocessor File Systems},
  year = {1993},
  month = {May},
  number = {PCS-TR93-190},
  institution = {Dept. of Math and Computer Science, Dartmouth College},
  URL = {ftp://ftp.cs.dartmouth.edu/TR/TR93-190.ps.Z},
  keyword = {parallel I/O, multiprocessor file system, performance, survey, dfk, pario-bib},
  comment = {A brief note on the reported performance of existing file systems (Intel CFS, nCUBE, CM-2, CM-5, and Cray). Many have disappointingly low absolute throughput, in MB/s.}
}

@TechReport{kotz:tuning,
  author = {David Kotz},
  title = {Tuning {STARFISH}},
  year = {1996},
  month = {October},
  number = {PCS-TR96-296},
  institution = {Dept. of Computer Science, Dartmouth College},
  URL = {ftp://ftp.cs.dartmouth.edu/TR/TR96-296.ps.Z},
  keyword = {parallel I/O, multiprocessor file system, pario-bib},
  abstract = {STARFISH is a parallel file-system simulator we built for our research into the concept of disk-directed I/O. In this report, we detail steps taken to tune the file systems supported by STARFISH, which include a traditional parallel file system (with caching) and a disk-directed I/O system.
In particular, we now support two-phase I/O, use smarter disk scheduling, increased the maximum number of outstanding requests that a compute processor may make to each disk, and added gather/scatter block transfer. We also present results of the experiments driving the tuning effort.},
  comment = {Reports on some new changes to the STARFISH simulator that implements traditional caching and disk-directed I/O. This is meant mainly as a companion to kotz:jdiskdir. See also kotz:jdiskdir, kotz:diskdir, kotz:expand.}
}

@TechReport{kotz:workload-tr,
  author = {David Kotz and Nils Nieuwejaar},
  title = {Dynamic File-Access Characteristics of a Production Parallel Scientific Workload},
  year = {1994},
  month = {April},
  number = {PCS-TR94-211},
  institution = {Dept. of Math and Computer Science, Dartmouth College},
  note = {Revised May 11, 1994},
  later = {kotz:workload},
  URL = {ftp://ftp.cs.dartmouth.edu/TR/TR94-211.ps.Z},
  keyword = {parallel file system, file access pattern, multiprocessor file system workload, parallel I/O, pario-bib, dfk},
  abstract = {Multiprocessors have permitted astounding increases in computational performance, but many cannot meet the intense I/O requirements of some scientific applications. An important component of any solution to this I/O bottleneck is a parallel file system that can provide high-bandwidth access to tremendous amounts of data {\em in parallel\/} to hundreds or thousands of processors.
    \par Most successful systems are based on a solid understanding of the characteristics of the expected workload, but until now there have been no comprehensive workload characterizations of multiprocessor file systems. We began the CHARISMA project in an attempt to fill that gap. We instrumented the common node library on the iPSC/860 at NASA Ames to record all file-related activity over a two-week period.
Our instrumentation is different from previous efforts in that it collects information about every read and write request and about the {\em mix\/} of jobs running in the machine (rather than from selected applications).
    \par The trace analysis in this paper leads to many recommendations for designers of multiprocessor file systems. First, the file system should support simultaneous access to many different files by many jobs. Second, it should expect to see many small requests, predominantly sequential and regular access patterns (although of a different form than in uniprocessors), little or no concurrent file-sharing between jobs, significant byte- and block-sharing between processes within jobs, and strong interprocess locality. Third, our trace-driven simulations showed that these characteristics led to great success in caching, both at the compute nodes and at the I/O~nodes. Finally, we recommend supporting strided I/O requests in the file-system interface, to reduce overhead and allow more performance optimization by the file system.}
}

@InProceedings{kotz:workload,
  author = {David Kotz and Nils Nieuwejaar},
  title = {Dynamic File-Access Characteristics of a Production Parallel Scientific Workload},
  booktitle = {Proceedings of Supercomputing '94},
  year = {1994},
  month = {November},
  pages = {640--649},
  publisher = {IEEE Computer Society Press},
  address = {Washington, DC},
  earlier = {kotz:workload-tr},
  later = {kotz:jworkload},
  URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/kotz:workload.ps.Z},
  keyword = {parallel file system, file access pattern, multiprocessor file system workload, parallel I/O, pario-bib, dfk},
  abstract = {Multiprocessors have permitted astounding increases in computational performance, but many cannot meet the intense I/O requirements of some scientific applications.
An important component of any solution to this I/O bottleneck is a parallel file system that can provide high-bandwidth access to tremendous amounts of data {\em in parallel\/} to hundreds or thousands of processors.
    \par Most successful systems are based on a solid understanding of the characteristics of the expected workload, but until now there have been no comprehensive workload characterizations of multiprocessor file systems. We began the CHARISMA project in an attempt to fill that gap. We instrumented the common node library on the iPSC/860 at NASA Ames to record all file-related activity over a two-week period. Our instrumentation is different from previous efforts in that it collects information about every read and write request and about the {\em mix\/} of jobs running in the machine (rather than from selected applications).
    \par The trace analysis in this paper leads to many recommendations for designers of multiprocessor file systems. First, the file system should support simultaneous access to many different files by many jobs. Second, it should expect to see many small requests, predominantly sequential and regular access patterns (although of a different form than in uniprocessors), little or no concurrent file-sharing between jobs, significant byte- and block-sharing between processes within jobs, and strong interprocess locality. Third, our trace-driven simulations showed that these characteristics led to great success in caching, both at the compute nodes and at the I/O~nodes.
Finally, we recommend supporting strided I/O requests in the file-system interface, to reduce overhead and allow more performance optimization by the file system.}
}

@InProceedings{kotz:writeback,
  author = {David Kotz and Carla Schlatter Ellis},
  title = {Caching and Writeback Policies in Parallel File Systems},
  booktitle = {1991 IEEE Symposium on Parallel and Distributed Processing},
  year = {1991},
  month = {December},
  pages = {60--67},
  earlier = {kotz:thesis},
  later = {kotz:jwriteback},
  URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/kotz:writeback.ps.Z},
  keyword = {dfk, parallel file system, disk caching, parallel I/O, MIMD, pario-bib},
  abstract = {Improvements in the processing speed of multiprocessors are outpacing improvements in the speed of disk hardware. Parallel disk I/O subsystems have been proposed as one way to close the gap between processor and disk speeds. Such parallel disk systems require parallel file system software to avoid performance-limiting bottlenecks. We discuss cache management techniques that can be used in a parallel file system implementation. We examine several writeback policies, and give results of experiments that test their performance.},
  comment = {See also kotz:jpractical, kotz:fsint2, cormen:integrate.}
}

@TechReport{krieger:asf-tr,
  author = {Orran Krieger and Michael Stumm and Ronald Unrau},
  title = {The {Alloc Stream Facility}: A Redesign of Application-level Stream {I/O}},
  year = {1992},
  month = {October},
  number = {CSRI-275},
  institution = {Computer Systems Research Institute, University of Toronto},
  address = {Toronto, Canada, M5S 1A1},
  later = {krieger:asf},
  URL = {ftp://ftp.csri.utoronto.edu/csri-technical-reports/275/275.ps.Z},
  keyword = {memory-mapped file, file system, parallel I/O, pario-bib},
  abstract = {This paper describes the design and implementation of a new application level I/O facility, called the Alloc Stream Facility. The Alloc Stream Facility has several key advantages.
First, performance is substantially improved as a result of a) the structure of the facility that allows it to take advantage of system specific features like mapped files, and b) a reduction in data copying and the number of I/O system calls. Second, the facility is designed for multi-threaded applications running on multiprocessors and allows for a high degree of concurrency. Finally, the facility can support a variety of I/O interfaces, including stdio, emulated Unix I/O, ASI, and C++ streams, in a way that allows applications to freely intermix calls to the different interfaces, resulting in improved code reusability.
    \par We show that on several Unix workstation platforms the performance of Unix applications using the Alloc Stream Facility can be substantially better than when the applications use the original I/O facilities.},
  comment = {See also krieger:mapped. ``This is an extended version of the paper with the same title in the March, 1994 edition of IEEE Computer.'' A 3-level interface structure: interface, backplane, and stream-specific modules. Different interfaces available: unix, stdio, ASI (theirs), C++. Common backplane. Stream-specific implementations that export operations like salloc and sfree, which return pointers to data buffers. ASI exports that interface to the user, for maximum efficiency. Performance is best when using mapped files as underlying implementation. Many stdio or unix apps are faster only after relinking. ASI is even faster.
In addition to better performance, also get multithreading support, multiple interfaces, and extensibility.}
}

@Article{krieger:asf,
  author = {Orran Krieger and Michael Stumm and Ronald Unrau},
  title = {The {Alloc Stream Facility}: A Redesign of Application-level Stream {I/O}},
  journal = {IEEE Computer},
  year = {1994},
  month = {March},
  volume = {27},
  number = {3},
  pages = {75--82},
  publisher = {IEEE Computer Society Press},
  earlier = {krieger:asf-tr},
  keyword = {memory-mapped file, file system, parallel I/O, pario-bib}
}

@InProceedings{krieger:hfs,
  author = {Orran Krieger and Michael Stumm},
  title = {{HFS}: A Flexible File System for large-scale Multiprocessors},
  booktitle = {Proceedings of the 1993 DAGS/PC Symposium},
  year = {1993},
  month = {June},
  pages = {6--14},
  organization = {Dartmouth Institute for Advanced Graduate Studies},
  address = {Hanover, NH},
  later = {krieger:hfs2},
  URL = {ftp://ftp.cs.toronto.edu/pub/parallel/Krieger_Stumm_DAGS93.ps.Z},
  keyword = {multiprocessor file system, parallel I/O, operating system, shared memory, pario-bib},
  abstract = {The {H{\sc urricane}} File System (HFS) is a new file system being developed for large-scale shared memory multiprocessors with distributed disks. The main goal of this file system is scalability; that is, the file system is designed to handle demands that are expected to grow linearly with the number of processors in the system. To achieve this goal, HFS is designed using a new structuring technique called Hierarchical Clustering. HFS is also designed to be flexible in supporting a variety of policies for managing file data and for managing file system state. This flexibility is necessary to support in a scalable fashion the diverse workloads we expect for a multiprocessor file system.},
  comment = {This paper is now out of date; see krieger:thesis. Designed for scalability on the hierarchical clustering model (see unrau:cluster), the Hurricane File System for NUMA shared-memory MIMD machines.
Each cluster has its own full file system, which communicates with those in other clusters. Pieces are name server, open-file server, and block-file server. On first access, the file is mapped into the application space. VM system calls BFS to arrange transfers. Open questions: policies for file state management, block distribution, caching, and prefetching. Object-oriented approach used to allow for flexibility and extendability. Local disk file systems are log-structured.}
}

@InProceedings{krieger:hfs2,
  author = {Orran Krieger and Michael Stumm},
  title = {{HFS}: A Performance-Oriented Flexible File System Based on Building-Block Compositions},
  booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems},
  year = {1996},
  month = {May},
  pages = {95--108},
  publisher = {ACM Press},
  address = {Philadelphia},
  earlier = {krieger:hfs},
  later = {krieger:hfs3},
  keyword = {parallel I/O, parallel file system, object-oriented, pario-bib},
  abstract = {The Hurricane File System (HFS) is designed for (potentially large-scale) shared memory multiprocessors. Its architecture is based on the principle that, in order to maximize performance for applications with diverse requirements, a file system must support a wide variety of file structures, file system policies and I/O interfaces. Files in HFS are implemented using simple building blocks composed in potentially complex ways. This approach yields great flexibility, allowing an application to customize the structure and policies of a file to exactly meet its requirements. For example, a file's structure can be optimized for concurrent random-access write-only operations by ten processes. Similarly, the prefetching, locking, and file cache management policies can all be chosen to match an application's access pattern. In contrast, most existing parallel file systems support a single file structure and a small set of policies.
\par We have implemented HFS as part of the Hurricane operating system running on the Hector shared memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead. We also show that for a number of file access patterns HFS is able to deliver to the applications the full I/O bandwidth of the disks on our system.},
  comment = {A published form of krieger:hfs and the thesis krieger:thesis. Their main point is that the file system is constructed from building-block objects. When you create a file you choose a few building blocks, for example, a replication block that mirrors the file, and some distribution blocks that distribute each replica across a set of disks. When you open the file you plug in some more building blocks, e.g., to do prefetching or to provide the kind of interface that you want to use. They point out that this flexibility is critical to be able to get good performance, because different file-access patterns need different structures and policies. They found that mapped files minimize copying costs and improve performance. They were able to obtain full disk bandwidth. Great paper.}
}

@Article{krieger:hfs3,
  author = {Orran Krieger and Michael Stumm},
  title = {{HFS}: A Performance-Oriented Flexible File System Based on Building-Block Compositions},
  journal = {ACM Transactions on Computer Systems},
  year = {1997},
  month = {August},
  volume = {15},
  number = {3},
  pages = {286--321},
  earlier = {krieger:hfs2},
  URL = {http://www.acm.org/pubs/citations/journals/tocs/1997-15-3/p286-krieger/},
  keyword = {parallel I/O, parallel file system, object-oriented, pario-bib},
  abstract = {The Hurricane File System (HFS) is designed for (potentially large-scale) shared-memory multiprocessors. Its architecture is based on the principle that, in order to maximize performance for applications with diverse requirements, a file system must support a wide variety of file structures, file system policies, and I/O interfaces.
Files in HFS are implemented using simple building blocks composed in potentially complex ways. This approach yields great flexibility, allowing an application to customize the structure and policies of a file to exactly meet its requirements. As an extreme example, HFS allows a file's structure to be optimized for concurrent random-access write-only operations by 10 threads, something no other file system can do. Similarly, the prefetching, locking, and file cache management policies can all be chosen to match an application's access pattern. In contrast, most parallel file systems support a single file structure and a small set of policies. We have implemented HFS as part of the Hurricane operating system running on the Hector shared-memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead. We also show that for a number of file access patterns, HFS is able to deliver to the applications the full I/O bandwidth of the disks on our system.}
}

@PhdThesis{krieger:thesis,
  author = {Orran Krieger},
  title = {{HFS}: A flexible file system for shared-memory multiprocessors},
  year = {1994},
  month = {October},
  school = {University of Toronto},
  URL = {ftp://ftp.cs.toronto.edu/pub/parallel/Okrieg_PhD.ps.Z},
  keyword = {parallel I/O, multiprocessor file system, shared memory, memory-mapped I/O, pario-bib},
  abstract = {The Hurricane File System (HFS) is designed for large-scale, shared-memory multiprocessors. Its architecture is based on the principle that a file system must support a wide variety of file structures, file system policies and I/O interfaces to maximize performance for a wide variety of applications. HFS uses a novel, object-oriented building-block approach to provide the flexibility needed to support this variety of file structures, policies, and I/O interfaces.
File structures can be defined in HFS that optimize for sequential or random access, read-only, write-only or read/write access, sparse or dense data, large or small file sizes, and different degrees of application concurrency. Policies that can be defined on a per-file or per-open instance basis include locking policies, prefetching policies, compression/decompression policies and file cache management policies. In contrast, most existing file systems have been designed to support a single file structure and a small set of policies.
    \par We have implemented large portions of HFS as part of the Hurricane operating system running on the Hector shared-memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead. Also, we show that HFS is able to deliver the full I/O bandwidth of the disks on our system to the applications.},
  comment = {Excellent work. HFS uses an object-oriented building-block approach to provide flexible, scalable high performance. Indeed, HFS appears to be one of the most flexible parallel file systems available, allowing users to independently control (or redefine) policies for prefetching, caching, redundancy and fault tolerance, and declustering.}
}

@TechReport{krystynak:datavault,
  author = {John Krystynak},
  title = {{I/O} Performance on the {Connection Machine DataVault} System},
  year = {1992},
  month = {May},
  number = {RND-92-011},
  institution = {NAS Systems Division, NASA Ames},
  later = {krystynak:pario},
  URL = {http://www.nas.nasa.gov/NAS/TechReports/RNDreports/RND-92-011/RND-92-011.html},
  keyword = {parallel I/O, parallel file system, performance measurement, pario-bib},
  comment = {Short measurements of CM-2 Datavault. Faster if you access through Paris. Can get nearly full 32 MB/s bandwidth.
Problem in its ability to use multiple CMIO busses.}
}

@InProceedings{krystynak:pario,
  author = {John Krystynak and Bill Nitzberg},
  title = {Performance Characteristics of the {iPSC/860} and {CM-2} {I/O} Systems},
  booktitle = {Proceedings of the Seventh International Parallel Processing Symposium},
  year = {1993},
  pages = {837--841},
  publisher = {IEEE Computer Society Press},
  address = {Newport Beach, CA},
  earlier = {krystynak:datavault},
  keyword = {parallel I/O, multiprocessor file system, pario-bib},
  comment = {Essentially a (short) combination of krystynak:datavault and nitzberg:cfs.}
}

@InProceedings{kucera:libc,
  author = {Julie Kucera},
  title = {Making {\em libc}\/ Suitable for Use by Parallel Programs},
  booktitle = {Proceedings of the USENIX Distributed and Multiprocessor Systems Workshop},
  year = {1989},
  pages = {145--152},
  keyword = {parallel file system interface, pario-bib},
  comment = {Experience making libc reentrant, adding semaphores, etc., on a Convex. Some problems with I/O. Added semaphores and private memory to make libc calls reentrant, i.e., callable in parallel by multiple threads.}
}

@InProceedings{kwan:cm5io,
  author = {Thomas T. Kwan and Daniel A. Reed},
  title = {Performance of the {CM-5} Scalable File System},
  booktitle = {Proceedings of the 8th ACM International Conference on Supercomputing},
  year = {1994},
  month = {July},
  pages = {156--165},
  publisher = {ACM Press},
  address = {Manchester, UK},
  keyword = {parallel I/O, parallel architecture, multiprocessor file system, pario-bib},
  comment = {They measure the performance of the CM-5 Scalable File System using synthetic benchmarks. They compare CM-Fortran with CMMD. The hardware-dependent (``physical'') modes were much faster than the generic-format modes, which have to reorder data between the processor distribution and the disk distribution. The network turned out to be a bottleneck for the performance when reordering was needed.
They conclude that more user control over the I/O would be very helpful.}
}

@PhdThesis{kwan:sort,
  author = {Sai Choi Kwan},
  title = {External Sorting: {I/O} Analysis and Parallel Processing Techniques},
  year = {1986},
  month = {January},
  school = {University of Washington},
  note = {Available as technical report 86--01--01},
  keyword = {parallel I/O, sorting, pario-bib},
  comment = {Examines external sorting techniques such as merge sort, tag sort, multi-pass distribution sort, and one-pass distribution sort. The model is one where I/O complexity is included, assuming a linear seek time distribution and a cost of 1/2 rotation for each seek. Parallel I/O or computing are not considered until the distribution sorts. Architectural model on page 58.}
}

@InProceedings{kwong:distribution,
  author = {Peter Kwong and Shikharesh Majumdar},
  title = {Study of Data Distribution Strategies for Parallel {I/O} Management},
  booktitle = {Proceedings of the Third International Conference of the Austrian Center for Parallel Computation (ACPC)},
  year = {1996},
  month = {September},
  series = {Lecture Notes in Computer Science},
  volume = {1127},
  pages = {12--23},
  publisher = {Springer-Verlag},
  keyword = {parallel I/O, pario-bib},
  abstract = {Recent studies have demonstrated that a significant number of I/O operations are performed by a number of classes of different parallel applications. Appropriate I/O management strategies are required however for harnessing the power of parallel I/O. This paper focuses on two I/O management issues that affect system performance in multiprogrammed parallel environments. Characterization of I/O behavior of parallel applications in terms of four different models is discussed first, followed by an investigation of the performance of a number of different data distribution strategies. Using computer simulations this research shows that I/O characteristics of applications and data distribution have an important effect on system performance.
Applications that can simultaneously do computation and I/O, plus strategies that can incorporate centralized I/O management are found to be beneficial for a multiprogrammed parallel environment.},
  comment = {See majumdar:management.}
}

@InProceedings{lake:pario,
  author = {Brian Lake and Chris Gray},
  title = {Parallel {I/O} for {MIMD} Machines},
  booktitle = {Proceedings of SS'93: High Performance Computing},
  year = {1993},
  month = {June},
  pages = {301--308},
  address = {Calgary},
  keyword = {parallel I/O, MIMD, multiprocessor file system, pario-bib},
  comment = {They describe the I/O system for the Myrias SPS-3 parallel computer. The SPS is a no-remote-access (NORMA) machine with a software shared memory abstraction. They provide a standard C/FORTRAN I/O interface, with a few extensions. The user's parallel program is considered a client, and an I/O processor (IOP) is the server. No striping across IOPs, which makes it relatively simple for them to have the server manage the shared file pointer. Their extensions allow atomic, file-pointer update, returning the actual position where I/O occurred, and atomic access to fixed- and variable-length records. They have three protocols, for different transfer sizes; small using simple request/response; medium using sliding window; and large using scatter/gather and special hardware double buffering at the IOP. They use scatter/gather DMA, and page-table fiddling, for messaging. Performance is 89--96\% of hardware peak, limited by IOP's VME backplane.}
}

@Misc{large-scale-memories,
  key = {Algorithmica},
  title = {Special issue on Large-Scale Memories},
  year = {1994},
  volume = {12},
  number = {2},
  howpublished = {Algorithmica}
}

@Article{latifi:network,
  author = {S. Latifi and M. Moraes de Azevedo and N.
Bagherzadeh},
  title = {A star-based {I/O}-bounded network for massively parallel systems},
  journal = {IEE Proceedings--- Computers and Digital Techniques},
  year = {1995},
  month = {January},
  volume = {142},
  number = {1},
  pages = {5--14},
  keyword = {verify authors, parallel I/O, parallel computer architecture, pario-bib},
  abstract = {The paper describes a new interconnection network for massively parallel systems, referred to as star-connected cycles (SCC). The SCC graph presents an I/O-bounded structure that results in several advantages over variable degree graphs like the star and the hypercube. The description of the SCC graph includes issues such as labelling of nodes, degree, diameter and symmetry. The paper also presents an optimal routeing algorithm for the SCC and efficient broadcasting algorithms with O(n) running time, with n being the dimensionality of the graph. A comparison with the cube-connected cycles (CCC) and other interconnection networks is included, indicating that, for even n, an n-SCC and a CCC of similar sizes have about the same diameter. In addition, it is shown that one-port broadcasting in an n-SCC graph can be accomplished with a running time better than or equal to that required by an n-star containing (n-1) times fewer nodes.}
}

@InProceedings{lautenbach:pfs,
  author = {Berin F. Lautenbach and Bradley M. Broom},
  title = {A Parallel File System for the {AP1000}},
  booktitle = {Proceedings of the Third Fujitsu-ANU CAP Workshop},
  year = {1992},
  month = {November},
  keyword = {distributed file system, multiprocessor file system, pario-bib},
  comment = {See also broom:acacia, broom:impl, mutisya:cache, and broom:cap. The Acacia file system has file access modes that are much like those in Intel CFS and TMC CMMD. By default all processes have their own file pointer, but they can switch to another mode either all together or in row- or column-subsets.
The other modes include a replicated mode (where all read or write the same data), and a variety of shared modes, with arbitrary, fixed, or unspecified ordering among processors, and with fixed or variable-sized records. They also have a parallel-open operation, support for logical records, control over the striping width (number of disks) and height (block size), and control over redundancy. A prototype is running.}
}

@Article{lawlor:parity,
  author = {F.~D. Lawlor},
  title = {Efficient mass storage parity recovery mechanism},
  journal = {IBM Technical Disclosure Bulletin},
  year = {1981},
  month = {July},
  volume = {24},
  number = {2},
  pages = {986--987},
  keyword = {parallel I/O, disk array, RAID, pario-bib},
  comment = {An early paper, perhaps the earliest, that describes the techniques that later became RAID. Lawlor notes how to use parity to recover data lost due to disk crash, as in RAID3, addresses the read-before-write problem by caching the old data block as well as the new data block, and shows how two-dimensional parity can protect against two or more failures.}
}

@InProceedings{lee:external,
  author = {Jang Sun Lee and Sunghoon Ko and Sanjay Ranka and Byung Eui Min},
  title = {High-Performance External Computations Using User-Controllable {I/O}},
  booktitle = {Proceedings of the Joint International Parallel Processing Symposium and IEEE Symposium on Parallel and Distributed Processing},
  year = {1998},
  month = {March},
  publisher = {IEEE Computer Society Press},
  note = {To appear},
  keyword = {verify pages, parallel I/O, pario-bib}
}

@TechReport{lee:impl,
  author = {Edward K. Lee},
  title = {Software and Performance Issues in the Implementation of a {RAID} Prototype},
  year = {1990},
  month = {May},
  number = {UCB/CSD 90/573},
  institution = {EECS, Univ. California at Berkeley},
  URL = {http://cs-tr.cs.berkeley.edu/TR/UCB:CSD-90-573},
  keyword = {parallel I/O, disk striping, performance, pario-bib},
  comment = {Details of their prototype. Defines terms like stripe unit.
Explores ways to lay out parity. Does performance simulations. Describes ops needed in device driver. Good to read if you plan to implement a RAID. Results: small R+W, or high loads, don't care about parity placement; in low load, there are different best cases for large R+W. Best all-around is left-symmetric. See also lee:parity.} } @Article{lee:jparity, author = {Edward K. Lee and Randy H. Katz}, title = {The Performance of Parity Placements in Disk Arrays}, journal = {IEEE Transactions on Computers}, year = {1993}, month = {June}, volume = {42}, number = {6}, pages = {651--664}, publisher = {IEEE Computer Society Press}, earlier = {lee:parity}, keyword = {RAID, reliability, parallel I/O, disk striping, pario-bib}, comment = {Journal version of lee:parity.} } @InProceedings{lee:logical-disks, author = {Jang Sun Lee and Jungmin Kim and P. Bruce Berra and Sanjay Ranka}, title = {Logical Disks: User-Controllable {I/O} For Scientific Applications}, booktitle = {Proceedings of the 1996 IEEE Symposium on Parallel and Distributed Processing}, year = {1996}, month = {October}, pages = {340--347}, publisher = {IEEE Computer Society Press}, keyword = {logical disks, parallel I/O, pario-bib}, abstract = {In this paper we propose user-controllable I/O operations and explore the effects of them with some synthetic access patterns. The operations allow users to determine a file structure matching the access patterns, control the layout and distribution of data blocks on physical disks, and present various access patterns with a minimum number of I/O operations. The operations do not use a file pointer to access data as in typical file systems, which eliminates the overhead of managing the offset of the file, making it easy to share data and reducing the number of I/O operations.} } @InProceedings{lee:pario, author = {K-K. Lee and P. 
Varman}, title = {Prefetching and {I/O} Parallelism in Multiple Disk Systems}, booktitle = {Proceedings of the 1995 International Conference on Parallel Processing}, year = {1995}, month = {August}, pages = {III:160--163}, publisher = {CRC Press}, address = {St. Charles, IL}, keyword = {parallel I/O, prefetching, disk array, pario-bib} } @InProceedings{lee:parity, author = {Edward K. Lee and Randy H. Katz}, title = {Performance Consequences of Parity Placement in Disk Arrays}, booktitle = {Proceedings of the Fourth International Conference on Architectural Support for Programming Languages and Operating Systems}, year = {1991}, pages = {190--199}, later = {lee:jparity}, keyword = {RAID, reliability, parallel I/O, pario-bib}, comment = {Interesting comparison of several parity placement schemes. Boils down to two basic choices, depending on whether read performance or write performance is more important to you.} } @InProceedings{lee:petal, author = {Edward K. Lee and Chandramohan A. Thekkath}, title = {Petal: Distributed Virtual Disks}, booktitle = {Proceedings of the Seventh International Conference on Architectural Support for Programming Languages and Operating Systems}, year = {1996}, month = {October}, pages = {84--92}, address = {Cambridge, MA}, URL = {http://www.research.digital.com/SRC/personal/Chandu_Thekkath/Papers/petal-asplos96.ps}, keyword = {parallel I/O, distributed file system, declustering, reliability, pario-bib}, comment = {They are trying to build a file server that is easier to manage than most of today's distributed file systems, because disks are cheap but management is expensive. They describe a distributed file server that spreads blocks of all files across many disks and many servers. They use chained declustering so that they can survive loss of server or disk. They dynamically balance load. They dynamically reconfigure when new virtual disks are created or new physical disks are added. 
They've built it all and are now going to look at possible file systems that can take advantage of the features of Petal.} } @InProceedings{lee:raidmodel, author = {Edward K. Lee and Randy H. Katz}, title = {An Analytic Performance Model of Disk Arrays}, booktitle = {Proceedings of the 1993 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems}, year = {1993}, pages = {98--109}, keyword = {disk array, parallel I/O, RAID, analytic model, pario-bib} } @TechReport{lee:redist, author = {Jang Sun Lee and Sanjay Ranka and Ravi V. Shankar}, title = {Communication-Efficient and Memory-Bounded External Redistribution}, year = {1995}, institution = {Syracuse University}, URL = {ftp://top.cis.syr.edu/users/ranka/ParallelComputing/ExternalRedistribution.ps.Z}, keyword = {parallel I/O algorithm, out-of-core, pario-bib}, abstract = {This paper presents communication-efficient algorithms for the external data redistribution problem. Deterministic lower bounds and upper bounds are presented for the number of I/O operations, communication time and the memory requirements of external redistribution. Our algorithms differ from most other algorithms presented for out-of-core applications in that it is optimal (within a small constant factor) not only in the number of I/O operations, but also in the time taken for communication. 
A coarse-grained MIMD architecture with I/O subsystems attached to each processor is assumed, but the results are expected to be applicable over a wider variety of architectures.}, comment = {See shankar:transport for the underlying communication primitives.} } @InProceedings{lee:support, author = {Jenq Kuen Lee and Ing-Kuen Tsaur and San-Yih Huang}, title = {Language and Environmental Support for Parallel Object {I/O} on Distributed Memory Environments}, booktitle = {Proceedings of the Seventh SIAM Conference on Parallel Processing for Scientific Computing}, year = {1995}, month = {February}, pages = {756--761}, publisher = {SIAM}, keyword = {parallel I/O, object oriented, distributed memory, pario-bib}, abstract = {The paper describes a parallel file object environment to support distributed array store on shared nothing distributed computing environments. Our environment enables programmers to extend the concept of array distribution from memory levels to file levels. It allows parallel I/O according to the distribution of objects in an application. When objects are read and/or written by multiple applications using different distributions, we present a novel scheme to help programmers to select the best data distribution pattern according to minimum amount of remote data movements for the store of array objects on distributed file systems.} } @InProceedings{lee:userio, author = {Jang Sun Lee and Sang-Gue Oh and P. Bruce Berra and Sanjay Ranka}, title = {User-Controllable {I/O} for Parallel Computers}, booktitle = {International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA~'96)}, year = {1996}, month = {August}, pages = {442--453}, keyword = {parallel I/O, pario-bib}, abstract = {This paper presents the design of UPIO, a software for user-controllable parallel input and output. UPIO is designed to maximize I/O performance for scientific applications on MIMD multicomputers.
The most important features of UPIO are: It supports a domain-specific file model and a variety of application interfaces to present numerous access patterns. UPIO provides user-controllable I/O operations that allow users to control data access, file structure, and data distribution. The domain-specific file model and user controllability give low I/O overhead and allow programmers to exploit the aggregate bandwidth of parallel disks.}, comment = {They describe an interface that seems to allow easier access for programmers that want to map matrices onto parallel files. The concepts are not well explained, so it's hard to really understand what is new and different. They make no explicit comparison with other advanced interfaces like that in Vesta or Galley. No performance results.} } @Article{li:bfxm, author = {Qun Li and Jie Jing and Li Xie}, title = {{BFXM}: A Parallel File System Model Based on the Mechanism of Distributed Shared Memory}, journal = {ACM Operating Systems Review}, year = {1997}, month = {October}, volume = {31}, number = {4}, pages = {30--40}, keyword = {parallel I/O, multiprocessor file system, pario-bib} } @Article{li:jmodels, author = {Zhiyong Li and Peter H. Mills and John H. Reif}, title = {Models and Resource Metrics for Parallel and Distributed Computation}, journal = {Parallel Algorithms and Applications}, year = {1996}, volume = {8}, pages = {35--59}, earlier = {li:models}, keyword = {parallel I/O algorithm, pario-bib} } @InProceedings{li:models, author = {Zhiyong Li and Peter H. Mills and John H.
Reif}, title = {Models and Resource Metrics for Parallel and Distributed Computation}, booktitle = {Proceedings of the Twenty-Eighth Annual Hawaii International Conference on System Sciences}, year = {1995}, month = {January}, pages = {51--60}, address = {Hawaii}, later = {li:jmodels}, URL = {file://ftp.cs.unc.edu/pub/projects/proteus/reports/models_hicss95.ps.gz}, keyword = {parallel I/O algorithm, pario-bib}, abstract = {This paper presents a framework of using {\em resource metrics} to characterize the various models of parallel computation. Our framework reflects the approach of recent models to abstract architectural details into several generic parameters, which we call resource metrics. We examine the different resource metrics chosen by different parallel models, categorizing the models into four classes: the basic synchronous models, and extensions of the basic models which more accurately reflect practical machines by incorporating notions of asynchrony, communication cost and memory hierarchy. We then present a new parallel computation model, the LogP-HMM model, as an illustration of design principles based on the framework of resource metrics. The LogP-HMM model extends an existing parameterized network model (LogP) with a sequential hierarchical memory model (HMM) characterizing each processor. The result accurately captures both network communication costs and the effects of multileveled memory such as local cache and I/O. We examine the potential utility of our model in the design of near optimal sorting and FFT algorithms.} } @TechReport{li:recursive-tr, author = {Zhiyong Li and John H. Reif and Sandeep K. S. Gupta}, title = {Synthesizing Efficient Out-of-Core Programs for Block Recursive Algorithms using Block-Cyclic Data Distributions}, year = {1996}, month = {March}, number = {96-04}, institution = {Dept.
of Computer Science, Duke University}, later = {li:recursive}, URL = {ftp://ftp.cs.duke.edu/pub/zli/papers/TR-96-04.ps.gz}, keyword = {parallel I/O, out-of-core algorithm, pario-bib}, abstract = {In this paper, we present a framework for synthesizing I/O efficient out-of-core programs for block recursive algorithms, such as the fast Fourier transform (FFT) and block matrix transposition algorithms. Our framework uses an algebraic representation which is based on tensor products and other matrix operations. The programs are optimized for the striped Vitter and Shriver's two-level memory model in which data can be distributed using various cyclic(B) distributions in contrast to the normally used {\it physical track} distribution cyclic(B_d), where B_d is the physical disk block size. \par We first introduce tensor bases to capture the semantics of block-cyclic data distributions of out-of-core data and also data access patterns to out-of-core data. We then present program generation techniques for tensor products and matrix transposition. We accurately represent the number of parallel I/O operations required for the synthesized programs for tensor products and matrix transposition as a function of tensor bases and data distributions. We introduce an algorithm to determine the data distribution which optimizes the performance of the synthesized programs. Further, we formalize the procedure of synthesizing efficient out-of-core programs for tensor product formulas with various block-cyclic distributions as a dynamic programming problem. \par We demonstrate the effectiveness of our approach through several examples. 
We show that the choice of an appropriate data distribution can reduce the number of passes to access out-of-core data by as large as eight times for a tensor product, and the dynamic programming approach can largely reduce the number of passes to access out-of-core data for the overall tensor product formulas.} } @TechReport{li:synthesizing-tr, author = {Zhiyong Li and John H. Reif and Sandeep K. S. Gupta}, title = {Synthesizing Efficient Out-of-Core Programs for Block Recursive Algorithms using Block-Cyclic Data Distributions}, year = {1996}, month = {March}, number = {TR-96-04}, institution = {Dept. of Computer Science, Duke University}, later = {li:synthesizing}, URL = {ftp://ftp.cs.duke.edu/pub/zli/papers/TR-96-04.ps.gz}, keyword = {parallel I/O algorithm, pario-bib}, abstract = {In this paper, we present a framework for synthesizing I/O efficient out-of-core programs for block recursive algorithms, such as the fast Fourier transform (FFT) and block matrix transposition algorithms. Our framework uses an algebraic representation which is based on tensor products and other matrix operations. The programs are optimized for the striped Vitter and Shriver's two-level memory model in which data can be distributed using various cyclic(B) distributions in contrast to the normally used {\it physical track} distribution cyclic(B_d), where B_d is the physical disk block size. \par We first introduce tensor bases to capture the semantics of block-cyclic data distributions of out-of-core data and also data access patterns to out-of-core data. We then present program generation techniques for tensor products and matrix transposition. We accurately represent the number of parallel I/O operations required for the synthesized programs for tensor products and matrix transposition as a function of tensor bases and data distributions. We introduce an algorithm to determine the data distribution which optimizes the performance of the synthesized programs. 
Further, we formalize the procedure of synthesizing efficient out-of-core programs for tensor product formulas with various block-cyclic distributions as a dynamic programming problem. \par We demonstrate the effectiveness of our approach through several examples. We show that the choice of an appropriate data distribution can reduce the number of passes to access out-of-core data by as large as eight times for a tensor product, and the dynamic programming approach can largely reduce the number of passes to access out-of-core data for the overall tensor product formulas.} } @InProceedings{li:synthesizing, author = {Zhiyong Li and John H. Reif and Sandeep K. S. Gupta}, title = {Synthesizing Efficient Out-of-Core Programs for Block Recursive Algorithms using Block-Cyclic Data Distributions}, booktitle = {Proceedings of the 1996 International Conference on Parallel Processing}, year = {1996}, month = {August}, pages = {II:142--149}, publisher = {IEEE Computer Society Press}, address = {St. Charles, IL}, earlier = {li:synthesizing-tr}, keyword = {parallel I/O algorithm, pario-bib}, abstract = {This paper presents a framework for synthesizing I/O-efficient out-of-core programs for block recursive algorithms, such as the fast Fourier transform and matrix transpositions. The programs are synthesized from tensor (Kronecker) product representations of algorithms. These programs are optimized for a striped two-level memory model wherein the out-of-core data can have block-cyclic distributions on multiple disks.} } @InProceedings{ligon:pfs, author = {W. B. Ligon and R. B.
Ross}, title = {Implementation and Performance of a Parallel File System for High Performance Distributed Applications}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {471--480}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, cluster computing, parallel file system, pario-bib}, abstract = {Dedicated cluster parallel computers (DCPCs) are emerging as low-cost high performance environments for many important applications in science and engineering. A significant class of applications that perform well on a DCPC are coarse-grain applications that involve large amounts of file I/O. Current research in parallel file systems for distributed systems is providing a mechanism for adapting these applications to the DCPC environment. We present the Parallel Virtual File System (PVFS), a system that provides disk striping across multiple nodes in a distributed parallel computer and file partitioning among tasks in a parallel program. PVFS is unique among similar systems in that it uses a stream-based approach that represents each file access with a single set of request parameters and decouples the number of network messages from details of the file striping and partitioning. PVFS also provides support for efficient collective file accesses and allows overlapping file partitions. 
We present results of early performance experiments that show PVFS achieves excellent speedups in accessing moderately sized file segments.} } @InProceedings{lin:clusterio, author = {Zheng Lin and Songnian Zhou}, title = {Parallelizing {I/O} Intensive Applications for a Workstation Cluster: a Case Study}, booktitle = {Proceedings of the IPPS~'93 Workshop on Input/Output in Parallel Computer Systems}, year = {1993}, pages = {17--36}, address = {Newport Beach, CA}, note = {Also published in Computer Architecture News 21(5), December 1993, pages 15--22}, keyword = {parallel I/O, workstation cluster, text retrieval, pario-bib}, comment = {They implement a parallel text retrieval application on a cluster of DEC~5000 workstations.} } @InProceedings{livny:stripe, author = {M. Livny and S. Khoshafian and H. Boral}, title = {Multi-Disk Management Algorithms}, booktitle = {Proceedings of the 1987 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems}, year = {1987}, month = {May}, pages = {69--77}, keyword = {parallel I/O, disk striping, disk array, pario-bib} } @TechReport{lo:disks, author = {Raymond Lo and Norman Matloff}, title = {A Probabilistic Limit on the Virtual Size of Replicated File Systems}, year = {1989}, institution = {Department of EE and CS, UC Davis}, keyword = {parallel I/O, replication, file system, disk mirroring, disk shadowing, pario-bib}, comment = {A look at shadowed disks. If you have $k$ disks set up to read from the disk with the shortest seek, but write to all disks, you have increased reliability, read time like the min of the seeks, and write time like the max of the seeks. It appears that with increasing $k$ you can get good performance. But this paper clearly shows, since writes move all disk heads to the same location, that the effective value of $k$ is actually quite low. Only 4--10 disks are likely to be useful for most traffic loads.} } @Article{lockey:characterization, author = {P. Lockey and R. Proctor and I. D. 
James}, title = {Characterization of {I/O} Requirements in a Massively Parallel Shelf Sea Model}, journal = {International Journal of Supercomputer Applications and High Performance Computing}, year = {1998}, note = {To appear in a Special Issue on I/O in Parallel Applications}, keyword = {verify volume number month year and pages, parallel I/O application, pario-bib}, abstract = {It is now recognized that a high level of I/O performance is crucial in making effective use of parallel machines for many scientific application codes. This paper considers the I/O requirements in one particular scientific application area; 3D modelling of continental shelf sea regions. We identify some of the scientific aims which drive the model development, and the consequent impact on the I/O needs. As a case study we take a parallel production code running a simulation of the North Sea on a Cray~T3D platform and investigate the I/O performance in dealing with the dominant I/O component; dumping of results data to disk. In order to place the performance issues in a more general framework we construct a simple theoretical model of I/O requirements, and use this to probe the impact of available I/O performance on current and proposed scientific objectives.} } @Article{long:swift-raid, author = {Darrell D. E. Long and Bruce R. Montague}, title = {{Swift/RAID}: A Distributed {RAID} System}, journal = {Computing Systems}, year = {1994}, month = {Summer}, volume = {7}, number = {3}, pages = {333--359}, keyword = {RAID, disk array, parallel I/O, distributed file system, pario-bib}, comment = {One of the features of this system is the way they develop and execute transaction plans as little scripts that are built by the client, sent to the servers, and then executed by interpreters.} } @InProceedings{loverso:sfs, author = {Susan J. LoVerso and Marshall Isman and Andy Nanopoulos and William Nesheim and Ewan D. 
Milne and Richard Wheeler}, title = {{\em sfs}: {A} Parallel File System for the {CM-5}}, booktitle = {Proceedings of the 1993 Summer USENIX Technical Conference}, year = {1993}, pages = {291--305}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {They took the Unix file system from SunOS and extended it to run on the CM-5. This involved handling non-power-of-two block sizes, parallel I/O calls, large file sizes, and more encouragement for extents to be allocated. The hardware is particularly suited to RAID~3 with a 16 byte striping unit, although in theory the software could do anything it wants. Geared to data-parallel model. Proc nodes (PNs) contact the timesharing daemon (TSD) on the control processor (CP), who gets block lists from the file system, which runs on one of the CPs. The TSD then arranges with the disk storage nodes (DSNs) to do the transfer directly with the PNs. Each DSN has 8~MB of buffer space, 8 disk drives, 4 SCSI busses, and a SPARC as controller. Partition managers mount non-local sfs via NFS. Performance results good. Up to 185~MB/s on 118 (2~MB/s) disks.} } @Article{mackay:groundwater, author = {David Mackay and G. Mahinthakumar and Ed D'Azevedo}, title = {A Study of {I/O} in a Parallel Finite Element Groundwater Transport Code}, journal = {International Journal of Supercomputer Applications and High Performance Computing}, year = {1998}, note = {To appear in a Special Issue on I/O in Parallel Applications}, keyword = {verify volume number month year and pages, parallel I/O application, pario-bib}, abstract = {A parallel finite element groundwater transport code is used to compare three different strategies for performing parallel I/O: (1) have a single processor collect data and perform sequential I/O in large blocks, (2) use variations of vendor specific I/O extensions (3) use the EDONIO I/O library. Each processor performs many writes of one to four kilobytes to reorganize local data in a global shared file.
Our findings suggest having a single processor collect data and perform large block-contiguous operations may be quite efficient and portable for up to 32 processor configurations. This approach does not scale well for a larger number of processors since the single processor becomes a bottleneck for gathering data. The effective application I/O rate observed, which includes times for opening and closing files, is only a fraction of the peak device read/write rates. Some form of data redistribution and buffering in remote memory as performed in EDONIO may yield significant improvements for non-contiguous data I/O access patterns and short requests. Implementors of parallel I/O systems may consider some form of buffering as performed in EDONIO to speed up such I/O requirements.} } @InProceedings{madhyastha:adaptive, author = {Tara M. Madhyastha and Daniel A. Reed}, title = {Intelligent, Adaptive File System Policy Selection}, booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation}, year = {1996}, month = {October}, pages = {172--179}, publisher = {IEEE Computer Society Press}, later = {madhyastha:thesis}, keyword = {parallel I/O, pario-bib}, abstract = {Traditionally, maximizing input/output performance has required tailoring application input/output patterns to the idiosyncrasies of specific input/output systems. The authors show that one can achieve high application input/output performance via a low overhead input/output system that automatically recognizes file access patterns and adaptively modifies system policies to match application requirements. This approach reduces the application developer's input/output optimization effort by isolating input/output optimization decisions within a retargetable file system infrastructure. To validate these claims, they have built a lightweight file system policy testbed that uses a trained learning mechanism to recognize access patterns. 
The file system then uses these access pattern classifications to select appropriate caching strategies, dynamically adapting file system policies to changing input/output demands throughout application execution. The experimental data show dramatic speedups on both benchmarks and input/output intensive scientific applications.}, comment = {See also madhyastha:thesis, and related papers.} } @InProceedings{madhyastha:classification, author = {Tara M. Madhyastha and Daniel A. Reed}, title = {Input/Output Access Pattern Classification Using Hidden {Markov} Models}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {57--67}, publisher = {ACM Press}, address = {San Jose, CA}, later = {madhyastha:thesis}, keyword = {workload characterization, file access pattern, parallel I/O, pario-bib}, abstract = {Input/output performance on current parallel file systems is sensitive to a good match of application access pattern to file system capabilities. Automatic input/output access classification can determine application access patterns at execution time, guiding adaptive file system policies. In this paper we examine a new method for access pattern classification that uses hidden Markov models, trained on access patterns from previous executions, to create a probabilistic model of input/output accesses. We compare this approach to a neural network classification framework, presenting performance results from parallel and sequential benchmarks and applications.}, comment = {The most interesting thing in this paper is the use of a Hidden Markov Model to understand the access pattern of an application to a file. After running the application on the file once, and simultaneously training their HMM, they use the result to tune the system for the next execution (cache size, cache partitioning, prefetching, Intel file mode, etc). They get much better performance in future runs. 
See also madhyastha:thesis, and related papers.} } @InProceedings{madhyastha:global, author = {Tara M. Madhyastha and Daniel A. Reed}, title = {Exploiting Global Input/Output Access Pattern Classification}, booktitle = {Proceedings of SC '97: High Performance Computing and Networking}, year = {1997}, month = {November}, publisher = {IEEE Computer Society Press}, address = {San Jose}, later = {madhyastha:thesis}, URL = {http://scxy.tc.cornell.edu/sc97/proceedings/TECH/MADHYAST/INDEX.HTM}, keyword = {file access pattern, parallel I/O, pario-bib}, abstract = {Parallel input/output systems attempt to alleviate the performance bottleneck that affects many input/output intensive applications. In such systems, an understanding of the application access pattern, especially how requests from multiple processors for different file regions are logically related, is important for optimizing file system performance. We propose a method for automatically classifying these global access patterns and using these global classifications to select and tune file system policies to improve input/output performance. We demonstrate this approach on benchmarks and scientific applications using global classification to automatically select appropriate underlying Intel PFS input/output modes and server buffering strategies.}, comment = {No page numbers: web and CDROM proceedings only. See also madhyastha:thesis and related papers.} } @InProceedings{madhyastha:optimizing, author = {Tara M. Madhyastha and Christopher L. Elford and Daniel A. 
Reed}, title = {Optimizing Input/Output Using Adaptive File System Policies}, booktitle = {Proceedings of the Fifth NASA Goddard Conference on Mass Storage Systems}, year = {1996}, month = {September}, pages = {II:493--514}, later = {madhyastha:thesis}, keyword = {multiprocessor file system, prefetching, caching, parallel I/O, multiprocessor file system interface, pario-bib}, comment = {See also madhyastha:thesis, and related papers.} } @PhdThesis{madhyastha:thesis, author = {Tara Madhyastha}, title = {Automatic Classification of Input/Output Access Patterns}, year = {1997}, month = {August}, school = {University of Illinois, Urbana-Champaign}, URL = {http://www-pablo.cs.uiuc.edu/People/tara/thesis.html}, keyword = {parallel I/O, file access pattern, pario-bib}, comment = {See also madhyastha:classification, madhyastha:global, madhyastha:adaptive, madhyastha:optimizing.} } @InProceedings{majumdar:characterize, author = {S. Majumdar and Yiu Ming Leung}, title = {Characterization of applications with {I/O} for processor scheduling in multiprogrammed parallel systems}, booktitle = {Proceedings of the 1994 IEEE Symposium on Parallel and Distributed Processing}, year = {1994}, pages = {298--307}, publisher = {IEEE Computer Society Press}, keyword = {workload characterization, scheduling, parallel I/O, pario-bib}, abstract = {Most studies of processor scheduling in multiprogrammed parallel systems have ignored the I/O performed by applications. Recent studies have demonstrated that significant I/O operations are performed by a number of different classes of parallel applications. This paper focuses on some basic issues that underlie scheduling in multiprogrammed parallel environments running applications with I/O. Characterization of the I/O behavior of parallel applications is discussed first.
Based on simulation models this research investigates the influence of these I/O characteristics on processor scheduling.} } @InProceedings{majumdar:management, author = {Shikharesh Majumdar and Faisal Shad}, title = {Characterization and Management of {I/O} on Multiprogrammed Parallel Systems}, booktitle = {Proceedings of the 1995 IEEE Symposium on Parallel and Distributed Processing}, year = {1995}, month = {October}, pages = {502--510}, publisher = {IEEE Computer Society Press}, address = {San Antonio, TX}, keyword = {workload characterization, parallel I/O, pario-bib}, comment = {Analytical workload model. Simulation studies. See also kwong:distribution.} } @InProceedings{malluhi:pss, author = {Qutaibah Malluhi and William E. Johnston}, title = {Approaches for a Reliable High-Performance Distributed-Parallel Storage System}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {500--509}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, abstract = {The paper studies different schemes to enhance the reliability, availability and security of a high performance distributed storage system. We have previously designed a distributed parallel storage system that employs the aggregate bandwidth of multiple data servers connected by a high speed wide area network to achieve scalability and high data throughput. The general approach of the paper employs erasure error correcting codes to add data redundancy that can be used to retrieve missing information caused by hardware, software, or human faults. The paper suggests techniques for reducing the communication and computation overhead incurred while retrieving missing data blocks from redundant information.
These techniques include clustering, multidimensional coding, and the full two dimensional parity scheme.} } @Article{manuel:logjam, author = {Tom Manuel}, title = {Breaking the Data-rate Logjam with arrays of small disk drives}, journal = {Electronics}, year = {1989}, month = {February}, volume = {62}, number = {2}, pages = {97--100}, keyword = {parallel I/O, disk array, I/O bottleneck, pario-bib}, comment = {See also Electronics, Nov. 88 p 24, Dec. 88 p 112. Trade journal short on disk arrays. Very good intro. No new technical content. Concentrates on RAID project. Lists several commercial versions. Mostly concentrates on single-controller versions.} } @Misc{maspar:pario, key = {Mas}, title = {Parallel File {I/O} Routines}, year = {1992}, howpublished = {MasPar Computer Corporation}, keyword = {parallel I/O, multiprocessor file system interface, pario-bib}, comment = {Man pages for MasPar file system interface. They have either a single shared file pointer, after which all processors read or write in an interleaved pattern, or individual (plural) file pointer, allowing arbitrary access patterns. Updated in 1992 with many more features.} } @Article{masters:pario, author = {Del Masters}, title = {Improve Disk Subsystem Performance with Multiple Serial Drives in Parallel}, journal = {Computer Technology Review}, year = {1987}, month = {July}, volume = {7}, number = {9}, pages = {76--77}, keyword = {parallel I/O, pario-bib}, comment = {Information about the early Maximum Strategy disk array, which striped over 4 disk drives, apparently synchronously.} } @Article{matloff:multidisk, author = {Norman S. 
Matloff}, title = {A Multiple-Disk System for both Fault Tolerance and Improved Performance}, journal = {IEEE Transactions on Reliability}, year = {1987}, month = {June}, volume = {R-36}, number = {2}, pages = {199--201}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, reliability, disk shadowing, disk mirroring, pario-bib}, comment = {Variation on mirrored disks using more than 2 disks, to spread the files around. Good performance increases.} } @InProceedings{matthews:hippi, author = {Kevin C. Matthews}, title = {Experiences Implementing a Shared File System on a {HIPPI} Disk Array}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {77--88}, publisher = {IEEE Computer Society Press}, URL = {http://www.computer.org/conferen/mss95/matthews/matthews.htm}, keyword = {mass storage, distributed file system, parallel I/O, pario-bib}, abstract = {Shared file systems which use a physically shared mass storage device have existed for many years, although not on UNIX based operating systems. This paper describes a shared file system (SFS) that was implemented first as a special project on the Cray Research Inc. (CRI) UNICOS operating system. A more general product was then built on top of this project using a HIPPI disk array for the shared mass storage. The design of SFS is outlined, as well as some performance experiences with the product. We describe how SFS interacts with the OSF distributed file service (DFS) and with the CRI data migration facility (DMF). We also describe possible development directions for the SFS product.}, comment = {They use hardware to tie the same storage device (a disk array) to several computers (Cray C90s). They build a custom piece of hardware just to service semaphore requests very fast. HIPPI is the interconnect. 
Details have a lot to do with the synchronization between processors trying to update the same metadata; that's why they use the semaphores.} } @InProceedings{matthijs:framework, author = {F. Matthijs and Y. Berbers and P. Verbaeten}, title = {A flexible {I/O} framework for parallel and distributed systems}, booktitle = {Proceedings of the Fifth International Workshop on Object Orientation in Operating Systems}, year = {1995}, pages = {187--190}, publisher = {IEEE Computer Society Press}, keyword = {input-output programs, object-oriented, parallel systems, I/O performance, migration, dynamic load balancing, fault tolerance, parallel I/O, pario-bib}, abstract = {We propose a framework for I/O in parallel and distributed systems. The framework is highly customizable and extendible, and enables programmers to offer high level objects in their applications, without requiring them to struggle with the low level and sometimes complex details of high performance distributed I/O. Also, the framework exploits application specific information to improve I/O performance by allowing specialized programmers to customize the framework. Internally, we use indirection and granularity control to support migration, dynamic load balancing, fault tolerance, etc. for objects of the I/O system, including those representing application data.} } @InProceedings{mcmurdy:unstripe, author = {Ronald K. McMurdy and Badrinath Roysam}, title = {Improving {RAID-5} Performance by Un-striping Moderate-sized Files}, booktitle = {Proceedings of the 1993 International Conference on Parallel Processing}, year = {1993}, pages = {II--279--282}, publisher = {CRC Press}, address = {St. Charles, IL}, keyword = {parallel I/O, disk array, pario-bib, RAID}, comment = {Allocate small- and medium-sized files entirely on one disk rather than striped, to cut seek and rotation latency that would happen if they were spread across many disks.} } @InProceedings{meador:array, author = {Wes E. 
Meador}, title = {Disk Array Systems}, booktitle = {Proceedings of IEEE Compcon}, year = {1989}, month = {Spring}, pages = {143--146}, keyword = {parallel I/O, disk array, disk striping, pario-bib}, comment = {Describes {\em Strategy 2 Disk Array Controller}, which allows 4 or 8 drives, hardware striped, with parity drive and 0-4 hot spares. Up to 4 channels to cpu(s). Logical block interface. Defects, errors, formatting, drive failures all handled automatically. Peak 40 MB/s data transfer on each channel.} } @Misc{meiko:cs2, key = {Meiko}, title = {Computing Surface {CS-2}: Technical Overview}, year = {1993}, howpublished = {Meiko brochure S1002-10M115.01A}, keyword = {multiprocessor architecture, parallel I/O, pario-bib}, comment = {Three node types: 4 SPARC (50 MHz), 1 SPARC + two Fujitsu vector procs, or 1 SPARC + 3 I/O ports. All have a special communications processor that supports remote memory access. Each has 128 MBytes in 16 banks. Memory-memory transfer operations using ``remote DMA'', supported by the communications processor. User-level comm interface, with protection. Uses multistage network with 8x8 crossbar switches, looks like a fat tree. S/BUS, separate from the memory bus, is used for I/O, either directly, or through 2 SCSI and 1 ethernet. Control and diagnostic networks. Parallel file system stripes across multiple partitions. Can use RAID. Communications processor has its own MMU; control registers are mapped to user space. Network-wide virtual addresses can support shared memory? Remote store, atomic operations, global operations. Comm proc can support I/O threads -- but can it talk to the disks? OS based on Solaris 2, plus global shared memory, parallel file system, and capability-based protection. 
Machine is logically partitioned into login, devices, and parallel computation.} } @InProceedings{menasce:mass, author = {Daniel Menasc\'e and Odysseas Ioannis Pentakalos and Yelena Yesha}, title = {An Analytic Model of Hierarchical Mass Storage Systems With Network-Attached Storage Devices}, booktitle = {Proceedings of the 1996 ACM Sigmetrics Conference on Measurement and Modeling of Computer Systems}, year = {1996}, month = {May}, pages = {180--189}, publisher = {ACM Press}, address = {Philadelphia, PA}, keyword = {network attached peripherals, analytic model, mass storage, parallel I/O, pario-bib} } @InProceedings{menon:compare, author = {Jai Menon}, title = {A Performance Comparison of {RAID-5} and Log-structured Arrays}, booktitle = {Proceedings of the Fourth IEEE International Symposium on High Performance Distributed Computing}, year = {1995}, month = {August}, pages = {167--178}, keyword = {RAID, disk array, parallel I/O, pario-bib}, comment = {He compares a RAID-5 disk array with a log-structured array (LSA). An LSA is essentially an implementation of a log-structured file system inside a disk controller. The disk controller buffers up writes in a non-volatile cache; when the outgoing data buffer is full, it is written to some large contiguous region of the disk. The controller manages a directory to keep track of the various segment locations, and does garbage collection (cleaning). They can insert a compression algorithm in front of the cache so that they get better cache and disk utilization by storing data in compressed form. 
For fair comparison they compare with a similar feature in the plain RAID5 array.} } @Article{menon:daisy, author = {Jai Menon and Kent Treiber}, title = {{Daisy}: Virtual-disk Hierarchical Storage Manager}, journal = {ACM SIGMETRICS Performance Evaluation Review}, year = {1997}, month = {December}, volume = {25}, number = {3}, pages = {37--44}, keyword = {hierarchical storage, tape storage, tertiary storage, tape robot, parallel I/O, pario-bib}, comment = {Part of a special issue on parallel and distributed I/O.} } @Article{merchant:striping, author = {Arif Merchant and Philip S. Yu}, title = {Analytic Modeling and Comparisons of Striping Strategies of Replicated Disk Arrays}, journal = {IEEE Transactions on Computers}, year = {1995}, month = {March}, volume = {44}, number = {3}, pages = {419--431}, publisher = {IEEE Computer Society Press}, keyword = {disk striping, disk array, RAID, parallel I/O, pario-bib} } @InProceedings{merriam:triangle, author = {Marshal L. Merriam}, title = {Parallel Implementation of an Algorithm for {Delaunay} Triangulation}, booktitle = {Proceedings of Computational Fluid Dynamics}, year = {1992}, volume = {2}, pages = {907--912}, keyword = {parallel I/O, file system workload, pario-bib}, comment = {This application runs on the NASA Ames iPSC/860. This application has some I/O: reading in the input file, which is a set of x,y,z data points. I/O was really slow if formatted (ie, ASCII instead of binary) or sequential instead of parallel. 
Any input record could go to any processor; the first step in the algorithm (after the points are read in) is essentially a kind of sort to move points around to localize points and balance load.} } @Article{michael:future, author = {Gavin Michael and Andrew Chien}, title = {Future Multicomputers: Beyond Minimalist Multiprocessors?}, journal = {Computer Architecture News}, year = {1992}, month = {December}, volume = {20}, number = {5}, pages = {6--12}, keyword = {multiprocessor architecture, compiler, parallel I/O, pario-bib}, comment = {Includes some comments by Randy Katz about parallel I/O, in particular, distinguishing between ``fat'' nodes (with many disks, e.g., a RAID), and ``thin'' nodes (with one disk).} } @TechReport{milenkovic:model, author = {Milan Milenkovi\'c}, title = {A Model for Multiprocessor {I/O}}, year = {1989}, month = {July}, number = {89-CSE-30}, institution = {Dept. of Computer Science and Engineering, Southern Methodist University}, keyword = {multiprocessor I/O, I/O architecture, distributed system, pario-bib}, comment = {Advocates using dedicated server processors for all I/O, e.g., disk server, terminal server, network server. Pass I/O requests and data via messages or RPC calls over the interconnect (here a shared bus). Server handles packaging, blocking, caching, errors, interrupts, and so forth, freeing the main processors and the interconnect from all this activity. Benefits: encapsulates I/O-related stuff in specific places, accommodates heterogeneity, improves performance. Nice idea, but allows for an I/O bottleneck, unless server can handle all the demand. Otherwise would need multiple servers, more expensive than just multiple controllers.} } @InProceedings{miller:iobehave, author = {Ethan L. Miller and Randy H. 
Katz}, title = {Input/Output Behavior of Supercomputer Applications}, booktitle = {Proceedings of Supercomputing '91}, year = {1991}, month = {November}, pages = {567--576}, publisher = {IEEE Computer Society Press}, address = {Albuquerque, NM}, keyword = {file access pattern, supercomputer, disk caching, prefetching, pario-bib}, comment = {Same as miller:iobehave-tr except without the appendix outlining trace format. Included in pario-bibliography not because it measures a parallel workload, but because it is so often cited in the parallel-IO community.} } @Article{miller:jrama, author = {Ethan L. Miller and Randy H. Katz}, title = {{RAMA}: An Easy-To-Use, High-Performance Parallel File System}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {419--446}, publisher = {North-Holland (Elsevier Scientific)}, earlier = {miller:rama2}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, abstract = {Modern massively parallel file systems provide high bandwidth file access by striping files across arrays of disks attached to a few specialized I/O nodes. However, these file systems are hard to use and difficult to integrate with workstations and tertiary storage. RAMA addresses these problems by providing a high-performance massively parallel file system with a simple interface. RAMA uses hashing to pseudo-randomly distribute data to all of its disks, insuring high bandwidth regardless of access pattern and eliminating bottlenecks in file block accesses. This flexibility does not cause a large loss of performance - RAMA's simulated performance is within 10-15\% of the optimum performance of a similarly-sized striped file system, and is a factor of 4 or more better than a striped file system with poorly laid out data.} } @Article{miller:pario, author = {L. L. Miller and A. R. 
Hurson}, title = {Multiprogramming and concurrency in parallel file environments}, journal = {International Journal of Mini and Microcomputers}, year = {1991}, volume = {13}, number = {2}, pages = {37--45}, keyword = {parallel file system, parallel I/O, database, pario-bib}, comment = {This is really for databases. They identify two types of file access: one where the file can be operated on as a set of subfiles, each independently by a processor (what they call MIMD mode), and another where the file must be operated on with a centralized control (SIMD mode), in their case to search a B-tree whose nodes span the set of processors. Basically it is a host connected to a controller, that is connected to a set of small I/O processors, each of which has access to disk. In many ways a uniprocessor perspective. Paper design, with simulation results.} } @Article{miller:pass, author = {L.~L. Miller and S.~R. Inglett and A.~R. Hurson}, title = {{PASS}--- A Multiuser Parallel File System Based on Microcomputers}, journal = {Journal of systems and software}, year = {1992}, month = {September}, volume = {19}, number = {1}, pages = {75--83}, keyword = {parallel I/O, parallel file system, multiprocessor file system, pario-bib}, abstract = {Data intensive computer applications suffer from inadequate use of parallelism for processing data stored on secondary storage devices. Devices such as database machines are useful in some applications, but many applications are too small or specialized to use such technology. To bridge this gap, the authors introduce the parallel secondary storage (PASS) system. PASS is based on a network of microcomputers. The individual microcomputers are assigned to a unit of secondary storage and the operations of the microcomputers are initiated and monitored by a control processor. The file system is capable of acting as either an SIMD or an MIMD machine. Communication between the individual microcomputers and the control processor is described. 
The integration of the multiple microcomputers into the primitive operations on a file is examined. Finally, the strategies employed to enhance performance in the multiprogramming environment are discussed.} } @Article{miller:pfs, author = {L. L. Miller and S. R. Inglett}, title = {Enhancing performance in a parallel file system}, journal = {Microprocessing and Microprogramming}, year = {1994}, month = {May}, volume = {40}, number = {4}, pages = {261--274}, keyword = {parallel I/O, parallel file system, pario-bib} } @InProceedings{miller:radar, author = {Craig Miller and David G. Payne and Thanh N. Phung and Herb Siegel and Roy Williams}, title = {Parallel Processing of Spaceborne Imaging Radar Data}, booktitle = {Proceedings of Supercomputing '95}, year = {1995}, publisher = {IEEE Computer Society Press}, address = {San Diego, CA}, URL = {http://www.supercomp.org/sc95/proceedings/012_PAYN/SC95.HTM}, keyword = {parallel I/O, pario-bib}, abstract = {We discuss the results of a collaborative project on parallel processing of Synthetic Aperture Radar (SAR) data, carried out between the NASA/Jet Propulsion Laboratory (JPL), the California Institute of Technology (Caltech) and Intel Scalable Systems Division (SSD). Through this collaborative effort, we have successfully parallelized the most compute-intensive SAR correlator phase of the Spaceborne Shuttle Imaging Radar-C/X-Band SAR (SIR-C/X-SAR) code, for the Intel Paragon. We describe the data decomposition, the scalable high-performance I/O model, and the node-level optimizations which enable us to obtain efficient processing throughput. In particular, we point out an interesting double level of parallelization arising in the data decomposition which increases substantially our ability to support ``high volume'' SAR. Results are presented from this code running in parallel on the Intel Paragon. 
A representative set of SAR data, of size 800 Megabytes, which was collected by the SIR-C/X-SAR instrument aboard NASA's Space Shuttle in 15 seconds, is processed in 55 seconds on the Concurrent Supercomputing Consortium's Paragon XP/S 35+. This compares well with a time of 12 minutes for the current SIR-C/X-SAR processing system at JPL. For the first time, a commercial system can process SIR-C/X-SAR data at a rate which is approaching the rate at which the SIR-C/X-SAR instrument can collect the data. This work has successfully demonstrated the viability of the Intel Paragon supercomputer for processing ``high volume'' Synthetic Aperture Radar data in near real-time.}, comment = {Available only on CD-ROM and WWW.} } @InProceedings{miller:rama, author = {Ethan L. Miller and Randy H. Katz}, title = {{RAMA:} A File System for Massively-Parallel Computers}, booktitle = {Proceedings of the Twelfth IEEE Symposium on Mass Storage Systems}, year = {1993}, pages = {163--168}, later = {miller:rama2}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, comment = {The multiprocessor's file system acts as a block cache for tertiary storage. Disk space is broken into ``lines'' of a few MB. Each line has a descriptor telling what blocks it has, and their status. (fileid, offset) hashed to find (disk, linenum). Intrinsic metadata stored at start of each file; positional metadata implicit in hashing, and line descriptors. Sequentiality parameter puts several blocks of a file in the same line, to improve medium-sized requests (otherwise generate lots of request-response net traffic). Not clear on best choice of size. No mention of atomicity wrt concurrent writes to same data. Blocks migrate to tertiary storage as they get old. Fetched on demand, by block (not file). Self-describing blocks have ids in block -- leads to screwy block sizes?} } @InProceedings{miller:rama2, author = {Ethan L. Miller and Randy H. 
Katz}, title = {{RAMA}: Easy Access to a High-Bandwidth Massively Parallel File System}, booktitle = {Proceedings of the 1995 USENIX Technical Conference}, year = {1995}, month = {January}, pages = {59--70}, earlier = {miller:rama}, later = {miller:jrama}, keyword = {parallel file system, pario-bib}, comment = {Simulation results. RAMA distributes blocks of each file randomly across disks, which are attached to all processor nodes, using a hash function. Thus there is no centralized metadata. The big benefit is uniform performance regardless of access pattern; they found one situation where it was 10\% slower than an optimal striped layout, but many cases where they were as much as 4 times faster than bad striped data layouts. So, they can give reasonable performance without the need for programmer- or manager-specified data layouts.} } @Article{milligan:bifs, author = {P. Milligan and L. C. Waring and A. S. C. Lee}, title = {{BIFS}: {A} filing system for multiprocessor based systems}, journal = {Microprocessing and Microprogramming}, year = {1991}, volume = {31}, pages = {9--12}, note = {Euromicro~'90 conference, Amsterdam}, keyword = {multiprocessor file system, pario-bib}, comment = {A simple file system for a transputer network, attached to a single disk device. Several procs are devoted to the file system, but really just act as buffers for the host processor that runs the disk. They provide sequential, random access, and indexed files, either byte- or record-oriented. Some prototypes; no results. They add buffering and double buffering, but don't really get into anything interesting.} } @Article{miya:biblio, author = {Eugene N. 
Miya}, title = {Multiprocessor/Distributed Processing Bibliography}, journal = {Computer Architecture News}, year = {1985}, month = {March}, volume = {13}, number = {1}, pages = {27--29}, note = {Much updated since then, now kept on-line}, keyword = {bibliography, parallel computing, distributed computing, pario-bib}, comment = {This reference is the original publication of Eugene's annotated bibliography. It has grown tremendously and is now huge. Because of the copyright considerations, you can't just nab it off the net, but it is free for the asking from Eugene. Send mail to eugene@nas.nasa.gov.} } @InProceedings{mogi:parity, author = {Kazuhiko Mogi and Masaru Kitsuregawa}, title = {Dynamic Parity Stripe Reorganizations for {RAID5} Disk Arrays}, booktitle = {Proceedings of the Third International Conference on Parallel and Distributed Information Systems}, year = {1994}, month = {September}, pages = {17--26}, keyword = {disk array, RAID, disk striping, parallel I/O, pario-bib}, abstract = {RAID5 disk arrays provide high performance and high reliability for reasonable cost. However RAID5 suffers a performance penalty during block updates. We examine the feasibility of using ``dynamic parity striping'' to improve the performance of block updates. Instead of updating each block independently, this method buffers a number of updates, generates a new stripe composed of the newly updated blocks, then writes the full stripe back to disk. Two implementations are considered in this paper. One is a log-structured file system (LFS) based method and the other is Virtual Striping. Both methods achieve much higher performance than conventional approaches. 
The performance characteristics of the LFS based method and the Virtual Striping method are clarified.} } @Article{mokhoff:pario, author = {Nicholas Mokhoff}, title = {Parallel Disk Assembly Packs 1.5 {GBytes}, runs at 4 {MBytes/s}}, journal = {Electronic Design}, year = {1987}, month = {November}, pages = {45--46}, keyword = {parallel I/O, I/O, disk architecture, disk striping, reliability, pario-bib}, comment = {Commercially available: Micropolis Systems' Parallel Disk 1800 series. Four disks plus one parity disk, synchronized and byte-interleaved. SCSI interface. Total capacity 1.5 GBytes, sustained transfer rate of 4 MBytes/s. MTTF 140,000 hours. Hard and soft errors corrected in real-time. Failed drives can be replaced while system is running.} } @TechReport{montague:swift, author = {Bruce R. Montague}, title = {The {Swift/RAID} Distributed Transaction Driver}, year = {1993}, month = {January}, number = {UCSC-CRL-93-99}, institution = {UC Santa Cruz}, keyword = {RAID, parallel I/O, distributed file system, transaction, pario-bib}, comment = {See other Swift papers, e.g., cabrera:pario and long:swift-raid. This paper describes the basic idea of using a transaction driver to implement RAID over a distributed system. Then it spends most of the time describing the details of the implementation. The basic idea is that processors execute transaction drivers, which provide virtual CPUs to execute scripts of atomic 'instructions', where the instructions are high-level things like read block, write block, compute parity, etc. The transaction driver multiprocesses several scripts if necessary. (Although they describe it in the context of a RAID implementation it certainly could be used for other complex distributed services.) The instructions are often transaction pairs, which compile into a pair of instructions, one for this node and one for the remote node. 
This node sends the program to the remote node, and they execute them separately, keeping synchronized for transaction pairs when necessary. See also the newer paper in Computing Surveys, long:swift-raid.} } @Article{moon:declustering, author = {Bongki Moon and Joel H. Saltz}, title = {Scalability Analysis of Declustering Methods for Multidimensional Range Queries}, journal = {IEEE Transactions on Knowledge and Data Engineering}, year = {1997}, note = {To appear}, URL = {ftp://hpsl.cs.umd.edu/pub/papers/ieee_tkde.ps.Z}, abstract = {Efficient storage and retrieval of multi-attribute datasets have become one of the essential requirements for many data-intensive applications. The Cartesian product file has been known as an effective multi-attribute file structure for partial-match and best-match queries. Several heuristic methods have been developed to decluster Cartesian product files across multiple disks to obtain high performance for disk accesses. Though the scalability of the declustering methods becomes increasingly important for systems equipped with a large number of disks, no analytic studies have been done so far. In this paper we derive formulas describing the scalability of two popular declustering methods Disk Modulo and Fieldwise Xor for range queries, which are the most common type of queries. These formulas disclose the limited scalability of the declustering methods and are corroborated by extensive simulation experiments. From the practical point of view, the formulas given in this paper provide a simple measure which can be used to predict the response time of a given range query and to guide the selection of a declustering method under various conditions.} } @Article{moore:ddio, author = {Jason A. Moore and Michael J. 
Quinn}, title = {Enhancing Disk-Directed {I/O} for Fine-Grained Redistribution of File Data}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {477--499}, publisher = {North-Holland (Elsevier Scientific)}, keyword = {parallel I/O, multiprocessor file system, interprocessor communication, pario-bib}, comment = {They propose several enhancements to disk-directed I/O (see kotz:diskdir) that aim to improve performance on fine-grained distributions, that is, where each block from the disk is broken into small pieces that are scattered among the compute processors. One enhancement combines multiple pieces, possibly from separate disk blocks, into a single message. Another is to use two-phase I/O (see delrosario:two-phase), but to use disk-directed I/O to read data from the disks into CP memories, efficiently, then permute. This latter technique is probably faster than normal two-phase I/O that uses a traditional file system, not disk-directed I/O, for the read.} } @InProceedings{moore:detection, author = {Jason A. Moore and Philip J. Hatcher and Michael J. Quinn}, title = {Efficient Data-Parallel Files via Automatic Mode Detection}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {1--14}, publisher = {ACM Press}, address = {Philadelphia}, URL = {http://www.cs.orst.edu/~moorej/iopads.ps.Z}, keyword = {parallel I/O, data parallelism, pario-bib}, abstract = {Parallel languages rarely specify parallel I/O constructs, and existing commercial systems provide the programmer with a low-level I/O interface. We present design principles for integrating I/O into languages and show how these principles are applied to a virtual-processor-oriented language. We illustrate how machine-independent modes are used to support both high performance and generality. 
We describe an automatic mode detection technique that saves the programmer from extra syntax and low-level file system details. We show how virtual processor file operations, typically small by themselves, are combined into efficient large-scale file system calls. Finally, we present a variety of benchmark results detailing design tradeoffs and the performance of various modes.}, comment = {Updated version of TR 95-80-9. See moore:stream. Interesting approach, where they permit a fairly normal fread and fwrite kind of interface, with each VP having its own stream. They choose their own format for the file, and switch between formats (and internal buffering) depending on the particulars of the fread and fwrite parameters. They seem to have good performance, and a familiar interface. They are left with a non-standard file format.} } @TechReport{moore:ocean, author = {Jason A. Moore}, title = {Parallel {I/O} Requirements of Four Oceanography Applications}, year = {1995}, month = {January}, number = {95-80-1}, institution = {Oregon State University}, URL = {http://www.cs.orst.edu/~moorej/ocean.ps.Z}, keyword = {data parallel, file system workload, parallel I/O, pario-bib}, abstract = {Brief descriptions of the I/O requirements for four production oceanography programs running at Oregon State University are presented. The applications all rely exclusively on array-oriented, sequential file operations. Persistent files are used for checkpointing and movie making, while temporary files are used to store out-of-core data.}, comment = {See moore:detection, moore:stream. Only three pages.} } @TechReport{moore:stream-tr, author = {Jason A. Moore and Philip J. Hatcher and Michael J. 
Quinn}, title = {Stream*: Fast, Flexible, Data-parallel {I/O}}, year = {1994}, number = {94-80-13}, institution = {Oregon State University}, note = {Updated September 1995.}, later = {moore:stream}, URL = {http://www.cs.orst.edu/~moorej/streamstar.ps.Z}, keyword = {data parallel, parallel I/O, pario-bib}, abstract = {Although hardware supporting parallel file I/O has improved greatly since the introduction of first-generation parallel computers, the programming interface has not. Each vendor provides a different logical view of parallel files as well as nonportable operations for manipulating files. Neither do parallel languages provide standards for performing I/O. In this paper, we describe a view of parallel files for data-parallel languages, dubbed Stream*, in which each virtual processor writes to and reads from its own stream. In this scheme each virtual processor's I/O operations have the same familiar, unambiguous meaning as in a sequential C program. We demonstrate how I/O operations in Stream* can run as fast as those of vendor-specific parallel file systems on the operations most often encountered in data-parallel programs. We show how this system supports general virtual processor operations for debugging and elemental functions. Finally, we present empirical results from a prototype Stream* system running on a Meiko CS-2 multicomputer.}, comment = {See moore:stream; nearly identical. See also moore:detection. This paper gives a little bit earlier description of the Stream* idea than does moore:detection, but you'd be pretty much complete just reading moore:detection.} } @InProceedings{moore:stream, author = {Jason A. Moore and Philip J. Hatcher and Michael J. 
Quinn}, title = {Stream*: Fast, Flexible, Data-parallel {I/O}}, booktitle = {Parallel Computing: State-of-the-Art and Perspectives (ParCo~'95)}, year = {1995}, month = {September}, pages = {287--294}, publisher = {Elsevier Science}, earlier = {moore:stream-tr}, keyword = {data parallel, parallel I/O, pario-bib} } @InProceedings{more:mtio, author = {Sachin More and Alok Choudhary and Ian Foster and Ming Q. Xu}, title = {{MTIO}: A Multi-Threaded Parallel {I/O} System}, booktitle = {Proceedings of the Eleventh International Parallel Processing Symposium}, year = {1997}, month = {April}, URL = {http://www.ece.nwu.edu/~ssmore/ipps97.ps}, keyword = {verify pages, threads, parallel I/O, pario-bib}, abstract = {This paper presents the design and evaluation of a multi-threaded runtime library for parallel I/O. We extend the multi-threading concept to separate the compute and I/O tasks in two separate threads of control. Multi-threading in our design permits a) asynchronous I/O even if the underlying file system does not support asynchronous I/O; b) copy avoidance from the I/O thread to the compute thread by sharing address space; and c) a capability to perform collective I/O asynchronously without blocking the compute threads. Further, this paper presents techniques for collective I/O which maximize load balance and concurrency while reducing communication overhead in an integrated fashion. Performance results on IBM SP2 for various data distributions and access patterns are presented. The results show that there is a tradeoff between the amount of concurrency in I/O and the buffer size designated for I/O; and there is an optimal buffer size beyond which benefits of larger requests diminish due to large communication overheads.} } @Article{moren:controllers, author = {William D. 
Moren}, title = {Design of Controllers is Key Element in Disk Subsystem Throughput}, journal = {Computer Technology Review}, year = {1988}, month = {Spring}, pages = {71--73}, keyword = {parallel I/O, disk architecture, pario-bib}, comment = {A short paper on some basic techniques used by disk controllers to improve throughput: seek optimization, request combining, request queuing, using multiple drives in parallel, scatter/gather DMA, data caching, read-ahead, cross-track read-ahead, write-back caching, segmented caching, reduced latency (track buffering), and format skewing. [Most of these are already handled in Unix file systems.]} } @InProceedings{mourad:raid, author = {Antoine N. Mourad and W. Kent Fuchs and Daniel G. Saab}, title = {Performance of Redundant Disk Array Organizations in Transaction Processing Environments}, booktitle = {Proceedings of the 1993 International Conference on Parallel Processing}, year = {1993}, pages = {I--138--145}, publisher = {CRC Press}, address = {St. Charles, IL}, keyword = {parallel I/O, disk array, pario-bib, RAID}, comment = {Transaction-processing workload dominated by small I/Os. They compare RAID~5, Parity Striping (which was designed for TP because it avoids lots of seeks on medium-sized requests, by declustering parity but not data), mirroring, and RAID~0. RAID~5 does {\em better\/} than parity striping due to its load balancing ability on the skewed workload. RAID~5 also better as the load increases.} } @InProceedings{mowry:prefetch, author = {Todd C. Mowry and Angela K. 
Demke and Orran Krieger}, title = {Automatic compiler-inserted {I/O} prefetching for out-of-core applications}, booktitle = {Proceedings of the 1996 Symposium on Operating Systems Design and Implementation}, year = {1996}, month = {October}, pages = {3--17}, publisher = {USENIX Association}, URL = {http://www.usenix.org/publications/library/proceedings/osdi96/mowry.html}, keyword = {compiler, prefetch, parallel I/O, pario-bib}, abstract = {Current operating systems offer poor performance when a numeric application's working set does not fit in main memory. As a result, programmers who wish to solve ``out-of-core'' problems efficiently are typically faced with the onerous task of rewriting an application to use explicit I/O operations (e.g., read/write). In this paper, we propose and evaluate a fully-automatic technique which liberates the programmer from this task, provides high performance, and requires only minimal changes to current operating systems. In our scheme, the compiler provides the crucial information on future access patterns without burdening the programmer, the operating system supports non-binding prefetch and release hints for managing I/O, and the operating system cooperates with a run-time layer to accelerate performance by adapting to dynamic behavior and minimizing prefetch overhead. This approach maintains the abstraction of unlimited virtual memory for the programmer, gives the compiler the flexibility to aggressively move prefetches back ahead of references, and gives the operating system the flexibility to arbitrate between the competing resource demands of multiple applications. We have implemented our scheme using the SUIF compiler and the Hurricane operating system. 
Our experimental results demonstrate that our fully-automatic scheme effectively hides the I/O latency in out-of-core versions of the entire NAS Parallel benchmark suite, thus resulting in speedups of roughly twofold for five of the eight applications, with two applications speeding up by threefold or more.}, comment = {Best Paper Award.} } @Article{moyer:application, author = {S. Moyer and V. S. Sunderam}, title = {Parallel {I/O} as a Parallel Application}, journal = {International Journal of Supercomputer Applications}, year = {1995}, month = {Summer}, volume = {9}, number = {2}, pages = {95--107}, keyword = {parallel I/O, pario-bib}, comment = {An overview of PIOUS and its performance. Results for partitioned and self-scheduled access pattern. See other moyer:* papers. The big thing about PIOUS over previous parallel file systems is its internal use of transactions for concurrency control and user-selectable fault-tolerance guarantees, and its optional support of user-level transactions.} } @TechReport{moyer:characterize, author = {Steven A. Moyer and V.~S. Sunderam}, title = {Characterizing Concurrency Control Performance for the {PIOUS} Parallel File System}, year = {1995}, month = {June}, number = {CSTR-950601}, institution = {Emory University}, later = {moyer:jcharacterize}, URL = {ftp://ftp.mathcs.emory.edu/pub/cstr/CSTR950601.ps}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {Parallel file systems employ data declustering to increase I/O throughput. But because a single read or write operation can generate data accesses on multiple independent storage devices, a concurrency control mechanism must be employed to retain familiar file access semantics. Concurrency control negates some of the performance benefits of data declustering by introducing additional file access overhead. 
This paper examines the performance characteristics of the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. Results demonstrate that linearizability of file access operations is provided without loss of scalability or stability.}, comment = {``substantially different material than presented in a previous report,'' moyer:scalable-tr. But it seems like the moyer:scalable IOPADS paper is largely a subset of this TR. He describes how they use volatile transactions, and does some experiments with PIOUS to measure their efficiency. Basically, they use a 2-phase commit protocol, using timeouts to detect deadlock and transaction aborts to remedy the deadlock. Results for partitioned and sequential access patterns.} } @Article{moyer:jcharacterize, author = {Steven A. Moyer and V. S. Sunderam}, title = {Characterizing Concurrency Control Performance for the {PIOUS} Parallel File System}, journal = {Journal of Parallel and Distributed Computing}, year = {1996}, month = {October}, volume = {38}, number = {1}, pages = {81--91}, earlier = {moyer:characterize}, keyword = {parallel I/O, multiprocessor file system, pario-bib} } @InProceedings{moyer:pario, author = {Steven A. Moyer and V. S. Sunderam}, title = {A Parallel {I/O} System for High-Performance Distributed Computing}, booktitle = {Proceedings of the IFIP WG10.3 Working Conference on Programming Environments for Massively Parallel Distributed Systems}, year = {1994}, URL = {ftp://ftp.mathcs.emory.edu/pub/vss/piousifip94.ps}, keyword = {parallel I/O, parallel file system, workstation cluster, file system interface, pario-bib}, comment = {See moyer:pious. A further description of the PIOUS parallel file system for cluster computing. (Beta-test version available for ftp). They support parafiles, which are collections of segments, each segment residing on a different server. 
The segments can be viewed separately or can be interleaved into a linear sequence using an arbitrary chunk size. They also support transactions to support sequential consistency.} } @InProceedings{moyer:pious, author = {Steven A. Moyer and V. S. Sunderam}, title = {{PIOUS:} A Scalable Parallel {I/O} System for Distributed Computing Environments}, booktitle = {Proceedings of the Scalable High-Performance Computing Conference}, year = {1994}, pages = {71--78}, URL = {ftp://ftp.mathcs.emory.edu/pub/vss/piousshpcc94.ps}, keyword = {parallel I/O, parallel file system, workstation cluster, file system interface, pario-bib}, comment = {Basically, I/O for clusters of workstations; ideally, it is parallel, heterogeneous, fault tolerant, etc. File servers are independent, have only a local view. Single server used to coordinate open(). Client libraries implement the API and depend on the servers only for storage mechanism. Servers use transactions internally -- but usually these are lightweight transactions, only used for concurrency control and not recovery. Full transactions are supported for times when the user wants the extra fault tolerance. They have files that are in some sense 2-dimensional. Sequential consistency. User-controllable fault tolerance. Performance: 2 clients max out the transport (ethernet). ``Stable'' mode is slow, as is self-scheduled mode. No client caching. See moyer:pario.} } @InCollection{moyer:scalable-book, author = {Steven A. Moyer and V.~S. Sunderam}, title = {Scalable Concurrency Control for Parallel File Systems}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {10}, editor = {Ravi Jain and John Werth and James C. 
Browne}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {225--243}, publisher = {Kluwer Academic Publishers}, earlier = {moyer:scalable}, keyword = {parallel I/O, parallel file system, concurrency control, synchronization, transaction, pario-bib}, abstract = {Parallel file systems employ data declustering to increase \mbox{I/O} throughput. As a result, a single read or write operation can generate concurrent data accesses on multiple storage devices. Unless a concurrency control mechanism is employed, familiar file access semantics are likely to be violated. This paper details the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. Performance results are presented demonstrating that sequential consistency semantics can be provided without loss of system scalability.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @TechReport{moyer:scalable-tr, author = {Steven A. Moyer and V.~S. Sunderam}, title = {Scalable Concurrency Control for Parallel File Systems}, year = {1995}, month = {February}, number = {CSTR-950202}, institution = {Emory University}, later = {moyer:scalable}, URL = {ftp://ftp.mathcs.emory.edu/pub/cstr/CSTR950202.ps}, keyword = {parallel I/O, parallel file system, pario-bib}, abstract = {Parallel file systems employ data declustering to increase I/O throughput. As a result, a single read or write operation can generate concurrent data accesses on multiple storage devices. Unless a concurrency control mechanism is employed, familiar file access semantics are likely to be violated. This paper details the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. 
Performance results are presented demonstrating that sequential consistency semantics can be provided without loss of system scalability.}, comment = {They describe {\em volatile transactions\/} as a way of providing the appropriate sequential consistency among file-read and -write operations (a feature not provided by most file systems). Their PIOUS library implements these transactions with strict 2-phase locking. They show some performance results, though only on a limited and relatively simple benchmark. If nothing else this paper reminds us all that atomicity of file-read and -write requests should be available to the user (e.g., note how they are optional in Vesta). Published as moyer:scalable.} } @InProceedings{moyer:scalable, author = {Steven A. Moyer and V. S. Sunderam}, title = {Scalable Concurrency Control for Parallel File Systems}, booktitle = {Proceedings of the IPPS~'95 Workshop on Input/Output in Parallel and Distributed Systems}, year = {1995}, month = {April}, pages = {90--106}, earlier = {moyer:scalable-tr}, later = {moyer:scalable-book}, keyword = {parallel I/O, pario-bib}, abstract = {Parallel file systems employ data declustering to increase I/O throughput. As a result, a single read or write operation can generate concurrent data accesses on multiple storage devices. Unless a concurrency control mechanism is employed, familiar file access semantics are likely to be violated. This paper details the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. Performance results are presented demonstrating that sequential consistency semantics can be provided without loss of system scalability.}, comment = {Seems to be a subset of moyer:scalable-tr, and for that matter, moyer:characterize. 
Results for partitioned access pattern.} } @Misc{mpi-forum:mpi2, key = {MPI}, title = {{MPI-2}: Extensions to the Message-Passing Interface}, year = {1997}, month = {July}, howpublished = {{The MPI Forum}}, earlier = {mpi-ioc:mpi-io5}, URL = {http://www.mpi-forum.org/docs/docs.html}, keyword = {parallel I/O, message-passing, multiprocessor file system interface, pario-bib}, comment = {This is the definition of the MPI2 message-passing standard, which includes an interface for parallel I/O. Supersedes mpi-ioc:mpi-io5 and earlier versions. See the MPI2 web page at http://www.mpi-forum.org. The I/O section is at http://www.mpi-forum.org/docs/mpi-20-html/node172.html.} } @Misc{mpi-ioc:mpi-io5, key = {MPIO}, title = {{MPI-IO:} A Parallel File {I/O} Interface for {MPI}}, year = {1996}, month = {April}, howpublished = {{The MPI-IO Committee}}, note = {Version 0.5.}, earlier = {corbett:mpi-io4}, later = {mpi-forum:mpi2}, keyword = {parallel I/O, message-passing, multiprocessor file system interface, pario-bib}, comment = {Supersedes corbett:mpi-io4 and earlier versions. See the MPI-IO Web page at http://parallel.nas.nasa.gov/MPI-IO/.} } @InBook{mpi2-io, author = {{Message-Passing Interface Forum}}, title = {{MPI-2.0}: Extensions to the Message-Passing Interface}, chapter = {9}, year = {1997}, month = {June}, publisher = {MPI Forum}, URL = {http://www.mpi-forum.org/docs/docs.html}, keyword = {MPI, message passing, parallel computing, library, parallel I/O, pario-bib}, comment = {Chapter 9 is about I/O extensions.} } @InProceedings{mueck:multikey, author = {T.~A. Mueck and J. 
Witzmann}, title = {Multikey Index Support for Tuple Sets on Parallel Mass Storage Systems}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {136--145}, URL = {http://www.computer.org/conferen/mss95/mueck/mueck.htm}, keyword = {parallel database, mass storage, parallel I/O, pario-bib}, abstract = {The development and evaluation of a tuple set manager (TSM) based on multikey index data structures is a main part of the PARABASE project at the University of Vienna. The TSM provides access to parallel mass storage systems using tuple sets instead of conventional files as the central data structure for application programs. A proof-of-concept prototype TSM is already implemented and operational on an iPSC/2. It supports tuple insert and delete operations as well as exact match, partial match, and range queries at system call level. Available results are from this prototype on the one hand and from various performance evaluation figures. The evaluation results demonstrate the performance gain achieved by the implementation of the tuple set management concept on a parallel mass storage system.} } @InProceedings{muller:multi, author = {Keith Muller and Joseph Pasquale}, title = {A High Performance Multi-Structured File System Design}, booktitle = {Proceedings of the Thirteenth ACM Symposium on Operating Systems Principles}, year = {1991}, pages = {56--67}, publisher = {ACM Press}, address = {Pacific Grove, CA}, keyword = {file system, disk striping, disk mirroring, pario-bib} } @InProceedings{muntz:failure, author = {Richard R. Muntz and John C. S. Lui}, title = {Performance Analysis of Disk Arrays Under Failure}, booktitle = {Proceedings of the 16th International Conference on Very Large Data Bases}, year = {1990}, pages = {162--173}, keyword = {disk array, parallel, performance analysis, pario-bib}, comment = {Looked at RAID5 when in failure mode. 
For small-reads workload, could only get 50\% of normal. So they decouple cluster size and parity-group size, so that they decluster over more disks than group size; during failure, this causes less of a load increase on surviving disks.} } @InProceedings{mutisya:cache, author = {Gerald Mutisya and Bradley M. Broom}, title = {Distributed File Caching for the {AP1000}}, booktitle = {Proceedings of the Third Fujitsu-ANU CAP Workshop}, year = {1992}, month = {November}, keyword = {distributed file system, multiprocessor file system, pario-bib}, comment = {See also broom:acacia, broom:impl, lautenbach:pfs, and broom:cap. They examine ways to manage a distributed file cache, without replication. Since there is no replication, the concurrency control problems boil down to providing atomicity for multi-block, multi-site requests. This is handled essentially by serializing the request: send the request to the first site, and have it forward the request from site to site as each block is processed. This works fine but completely serializes all multi-block requests, somewhat defeating the purpose. Thus, they get concurrency between requests, by having multiple servers, but no parallelism within requests.} } @Article{myllymaki:buffering, author = {Jussi Myllymaki and Miron Livny}, title = {Efficient buffering for concurrent disk and tape {I/O}}, journal = {Performance Evaluation: An International Journal}, year = {1996}, volume = {27/28}, pages = {453--471}, note = {Performance~'96}, keyword = {buffering, file caching, tertiary storage, tape robot, file migration, parallel I/O, pario-bib}, comment = {Ways to use secondary and tertiary storage in parallel, and buffering mechanisms for applications with concurrent I/O requirements.} } @InProceedings{nagaraj:hpfs, author = {U. Nagaraj and U. S. Shukla and A. 
Paulraj}, title = {Design and Evaluation of a High Performance File System for Message Passing Parallel Computers}, booktitle = {Proceedings of the Fifth International Parallel Processing Symposium}, year = {1991}, pages = {549--554}, keyword = {multiprocessor file system, pario-bib}, comment = {They describe a file system for general message-passing, distributed-memory, separate I/O and compute node, multicomputers. They provide few details, although they cite a lot of their tech reports. There are a few simulation results, but none show anything unintuitive.} } @InProceedings{nagashima:pario, author = {Umpei Nagashima and Takashi Shibata and Hiroshi Itoh and Minoru Gotoh}, title = {An Improvement of {I/O} Function for Auxiliary Storage: {Parallel I/O} for a Large Scale Supercomputing}, booktitle = {Proceedings of the 1990 ACM International Conference on Supercomputing}, year = {1990}, pages = {48--59}, keyword = {parallel I/O, pario-bib}, comment = {Using parallel I/O channels to access striped disks, in parallel from a supercomputer. They {\em chain\/} (i.e., combine) requests to a disk for large contiguous accesses.} } @InProceedings{nakajo:ionet, author = {H. Nakajo and S. Ohtani and T. Matsumoto and M. Kohata and K. Hiraki and Y. 
Kaneda}, title = {An {I/O} Network for Architecture of the Distributed Shared-Memory Massively}, booktitle = {Proceedings of the 11th ACM International Conference on Supercomputing}, year = {1997}, month = {July}, publisher = {ACM Press}, keyword = {verify pages, collective I/O, multiprocessor file system, parallel I/O, pario-bib} } @InProceedings{nakajo:jump1, author = {Hironori Nakajo}, title = {A Simulation-based Evaluation of a Disk {I/O} Subsystem for a Massively Parallel Computer: {JUMP-1}}, booktitle = {Proceedings of the Sixteenth International Conference on Distributed Computer Systems}, year = {1996}, month = {May}, pages = {562--569}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, I/O architecture, pario-bib}, abstract = {JUMP-1 is a distributed shared-memory massively parallel computer and is composed of multiple clusters of interconnected network called RDT (Recursive Diagonal Torus). Each cluster in JUMP-1 consists of 4 element processors, secondary cache memories, and 2 MBP (Memory Based Processor) for high-speed synchronization and communication among clusters. The I/O subsystem is connected to a cluster via a high-speed serial link called STAFF-Link. The I/O buffer memory is mapped onto the JUMP-1 global shared-memory to permit each I/O access operation as memory access. In this paper we describe evaluation of the fundamental performance of the disk I/O subsystem using event-driven simulation, and estimated performance with a Video On Demand (VOD) application.} } @InProceedings{natarajan:clusterio, author = {Chita Natarajan and Ravishankar K. 
Iyer}, title = {Measurement and Simulation Based Performance Analysis of Parallel {I/O} in a High-Performance Cluster System}, booktitle = {Proceedings of the 1996 IEEE Symposium on Parallel and Distributed Processing}, year = {1996}, month = {October}, pages = {332--339}, publisher = {IEEE Computer Society Press}, keyword = {performance analysis, parallel I/O, pario-bib}, abstract = {This paper presents a measurement and simulation based study of parallel I/O in a high-performance cluster system: the Pittsburgh Supercomputing Center (PSC) DEC Alpha Supercluster. The measurements were used to characterize the performance bottlenecks and the throughput limits at the compute and I/O nodes, and to provide realistic input parameters to PioSim, a simulation environment we have developed to investigate parallel I/O performance issues in cluster systems. PioSim was used to obtain a detailed characterization of parallel I/O performance, in the high performance cluster system, for different regular access patterns and different system configurations. This paper also explores the use of local disks at the compute nodes for parallel I/O, and finds that the local disk architecture outperforms the traditional parallel I/O over remote I/O node disks architecture, even when as much as 68-75\% of the requests from each compute node goes to remote disks.} } @TechReport{ncr:3600, key = {NCR}, title = {{NCR 3600} Product Description}, year = {1991}, month = {September}, number = {ST-2119-91}, institution = {NCR}, address = {San Diego}, keyword = {multiprocessor architecture, MIMD, parallel I/O, pario-bib}, comment = {Has 1-32 50MHz Intel 486 processors. Parallel independent disks on the disk nodes, separate from the processor nodes. Tree interconnect. 
Aimed at database applications.} } @InProceedings{ng:diskarray, author = {Spencer Ng}, title = {Some Design Issues of Disk Arrays}, booktitle = {Proceedings of IEEE Compcon}, year = {1989}, month = {Spring}, pages = {137--142}, note = {San Francisco, CA}, keyword = {parallel I/O, disk array, pario-bib}, comment = {Discusses disk arrays and striping. Transfer size is important to striping success: small size transfers are better off with independent disks. Synchronized rotation is especially important for small transfer sizes, since then the increased rotational delays dominate. Fine grain striping involves less assembly/disassembly delay, but coarse grain (block) striping allows for request parallelism. Fine grain striping wastes capacity due to fixed size formatting overhead. He also derives exact MTTF equation for 1-failure tolerance and on-line repair.} } @InProceedings{ng:interleave, author = {S. Ng and D. Lang and R. Selinger}, title = {Trade-offs Between Devices and Paths in Achieving Disk Interleaving}, booktitle = {Proceedings of the 15th Annual International Symposium on Computer Architecture}, year = {1988}, pages = {196--201}, keyword = {parallel I/O, disk architecture, disk caching, I/O bottleneck, pario-bib}, comment = {Compares four different ways of restructuring IBM disk controllers and channels to obtain more parallelism. They use parallel heads or parallel actuators. The best results come when they replicate the control electronics to maintain the number of data paths through the controller. Otherwise the controller bottleneck reduces performance. Generally, for large or small transfer sizes, parallel heads with replication gave better performance.} } @Article{nicastro:fft, author = {L. Nicastro and N. 
{D'Amico}}, title = {An optimized mass storage {FFT} for vector computers}, journal = {Parallel Computing}, year = {1995}, month = {March}, volume = {21}, pages = {423--432}, publisher = {North-Holland (Elsevier Scientific)}, keyword = {out-of-core algorithm, parallel I/O algorithm, scientific computing, vector computer, pario-bib}, comment = {They describe an out-of-core FFT algorithm for vector computers (one disk, one vector processor). They implemented it on a Convex and show good performance. Basically, they segment the array, do FFTs on each segment, and do some transposing and other stuff to combine the segments. Each segment is basically a memoryload. Seems parallelizable too.} } @TechReport{nickolls:dpio, author = {John R. Nickolls and Ernie Rael}, title = {Data Parallel {Unix} Input/Output for a Massively Parallel Processor}, year = {1993}, number = {MP/P-17.93}, institution = {MasPar Computer Corporation}, keyword = {Unix, parallel I/O, data parallel, pario-bib}, comment = {Cite nickolls:maspar-io.} } @InProceedings{nickolls:maspar-io, author = {John R. Nickolls}, title = {The {MasPar} Scalable {Unix I/O} System}, booktitle = {Proceedings of the Eighth International Parallel Processing Symposium}, year = {1994}, month = {April}, pages = {390--394}, address = {Cancun, Mexico}, keyword = {parallel I/O, multiprocessor file system, SIMD, pario-bib}, abstract = {Scalable parallel computers require I/O balanced with computational power to solve data-intensive problems. Distributed memory architectures call for I/O hardware and software beyond those of conventional scalar systems. \par This paper introduces the MasPar I/O system, designed to provide balanced and scalable data-parallel Unix I/O. The architecture and implementation of the I/O hardware and software are described. Key elements include parallel access to conventional Unix file descriptors and a self-routing multistage network coupled with a buffer memory for flexible parallel I/O transfers. 
Performance measurements are presented for parallel Unix I/O with a scalable RAID disk array, a RAM disk, and a HIPPI interconnect.}, comment = {This provides the definitive reference for the Maspar parallel-I/O architecture and file system. This paper includes a brief discussion of the interface and performance results. Also includes some HIPPI interface performance results. This paper is the conference version of nickolls:dpio, so cite this one.} } @InProceedings{nieplocha:arrays, author = {Jarek Nieplocha and Ian Foster}, title = {Disk Resident Arrays: An Array-Oriented {I/O} Library for Out-Of-Core Computations}, booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation}, year = {1996}, month = {October}, pages = {196--204}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, abstract = {In out-of-core computations, disk storage is