@TechReport{moore:ocean, author = {Jason A. Moore}, title = {Parallel {I/O} Requirements of Four Oceanography Applications}, year = {1995}, month = {January}, number = {95-80-1}, institution = {Oregon State University}, keyword = {data parallel, file system workload, parallel I/O, pario-bib}, abstract = {Brief descriptions of the I/O requirements for four production oceanography programs running at Oregon State University are presented. The applications all rely exclusively on array-oriented, sequential file operations. Persistent files are used for checkpointing and movie making, while temporary files are used to store out-of-core data.}, comment = {See moore:detection, moore:stream. Only three pages.} } @TechReport{moore:stream-tr, author = {Jason A. Moore and Philip J. Hatcher and Michael J. Quinn}, title = {Stream*: Fast, Flexible, Data-parallel {I/O}}, year = {1994}, number = {94-80-13}, institution = {Oregon State University}, note = {Updated September 1995.}, later = {moore:stream}, keyword = {data parallel, parallel I/O, pario-bib}, abstract = {Although hardware supporting parallel file I/O has improved greatly since the introduction of first-generation parallel computers, the programming interface has not. Each vendor provides a different logical view of parallel files as well as nonportable operations for manipulating files. Neither do parallel languages provide standards for performing I/O. In this paper, we describe a view of parallel files for data-parallel languages, dubbed Stream*, in which each virtual processor writes to and reads from its own stream. In this scheme each virtual processor's I/O operations have the same familiar, unambiguous meaning as in a sequential C program. We demonstrate how I/O operations in Stream* can run as fast as those of vendor-specific parallel file systems on the operations most often encountered in data-parallel programs. 
We show how this system supports general virtual processor operations for debugging and elemental functions. Finally, we present empirical results from a prototype Stream* system running on a Meiko CS-2 multicomputer.}, comment = {See moore:stream; nearly identical. See also moore:detection. This paper gives a little bit earlier description of the Stream* idea than does moore:detection, but you'd be pretty much complete just reading moore:detection.} } @InProceedings{moore:stream, author = {Jason A. Moore and Philip J. Hatcher and Michael J. Quinn}, title = {Stream*: Fast, Flexible, Data-parallel {I/O}}, booktitle = {Parallel Computing: State-of-the-Art and Perspectives (ParCo~'95)}, year = {1995}, month = {September}, pages = {287--294}, publisher = {Elsevier Science}, earlier = {moore:stream-tr}, keyword = {data parallel, parallel I/O, pario-bib} } @InProceedings{more:mtio, author = {Sachin More and Alok Choudhary and Ian Foster and Ming Q. Xu}, title = {{MTIO}: A Multi-Threaded Parallel {I/O} System}, booktitle = {Proceedings of the Eleventh International Parallel Processing Symposium}, year = {1997}, month = {April}, pages = {368--373}, URL = {http://www.ece.nwu.edu/~ssmore/ipps97.ps}, keyword = {threads, parallel I/O, pario-bib}, abstract = {This paper presents the design and evaluation of a multi-threaded runtime library for parallel I/O. We extend the multi-threading concept to separate the compute and I/O tasks in two separate threads of control. Multi-threading in our design permits a) asynchronous I/O even if the underlying file system does not support asynchronous I/O; b) copy avoidance from the I/O thread to the compute thread by sharing address space; and c) a capability to perform collective I/O asynchronously without blocking the compute threads. Further, this paper presents techniques for collective I/O which maximize load balance and concurrency while reducing communication overhead in an integrated fashion. 
Performance results on IBM SP2 for various data distributions and access patterns are presented. The results show that there is a tradeoff between the amount of concurrency in I/O and the buffer size designated for I/O; and there is an optimal buffer size beyond which benefits of larger requests diminish due to large communication overheads.} } @Article{moren:controllers, author = {William D. Moren}, title = {Design of Controllers is Key Element in Disk Subsystem Throughput}, journal = {Computer Technology Review}, year = {1988}, month = {Spring}, pages = {71--73}, keyword = {parallel I/O, disk architecture, pario-bib}, comment = {A short paper on some basic techniques used by disk controllers to improve throughput: seek optimization, request combining, request queuing, using multiple drives in parallel, scatter/gather DMA, data caching, read-ahead, cross-track read-ahead, write-back caching, segmented caching, reduced latency (track buffering), and format skewing. [Most of these are already handled in Unix file systems.]} } @InProceedings{mourad:raid, author = {Antoine N. Mourad and W. Kent Fuchs and Daniel G. Saab}, title = {Performance of Redundant Disk Array Organizations in Transaction Processing Environments}, booktitle = {Proceedings of the 1993 International Conference on Parallel Processing}, year = {1993}, pages = {I--138--145}, publisher = {CRC Press}, address = {St. Charles, IL}, keyword = {parallel I/O, disk array, pario-bib, RAID}, comment = {Transaction-processing workload dominated by small I/Os. They compare RAID~5, Parity Striping (which was designed for TP because it avoids lots of seeks on medium-sized requests, by declustering parity but not data), mirroring, and RAID~0. RAID~5 does {\em better\/} than parity striping due to its load balancing ability on the skewed workload. RAID~5 also better as the load increases.} } @InProceedings{mowry:prefetch, author = {Todd C. Mowry and Angela K. 
Demke and Orran Krieger}, title = {Automatic compiler-inserted {I/O} prefetching for out-of-core applications}, booktitle = {Proceedings of the 1996 Symposium on Operating Systems Design and Implementation}, year = {1996}, month = {October}, pages = {3--17}, publisher = {USENIX Association}, later = {mowry:jprefetch}, URL = {http://www.usenix.org/publications/library/proceedings/osdi96/mowry.html}, keyword = {compiler, prefetch, parallel I/O, pario-bib}, abstract = {Current operating systems offer poor performance when a numeric application's working set does not fit in main memory. As a result, programmers who wish to solve ``out-of-core'' problems efficiently are typically faced with the onerous task of rewriting an application to use explicit I/O operations (e.g., read/write). In this paper, we propose and evaluate a fully-automatic technique which liberates the programmer from this task, provides high performance, and requires only minimal changes to current operating systems. In our scheme, the compiler provides the crucial information on future access patterns without burdening the programmer, the operating system supports non-binding prefetch and release hints for managing I/O, and the operating system cooperates with a run-time layer to accelerate performance by adapting to dynamic behavior and minimizing prefetch overhead. This approach maintains the abstraction of unlimited virtual memory for the programmer, gives the compiler the flexibility to aggressively move prefetches back ahead of references, and gives the operating system the flexibility to arbitrate between the competing resource demands of multiple applications. We have implemented our scheme using the SUIF compiler and the Hurricane operating system. 
Our experimental results demonstrate that our fully-automatic scheme effectively hides the I/O latency in out-of-core versions of the entire NAS Parallel benchmark suite, thus resulting in speedups of roughly twofold for five of the eight applications, with two applications speeding up by threefold or more.}, comment = {Best Paper Award.} } @Article{moyer:application, author = {S. Moyer and V. S. Sunderam}, title = {Parallel {I/O} as a Parallel Application}, journal = {International Journal of Supercomputer Applications}, year = {1995}, month = {Summer}, volume = {9}, number = {2}, pages = {95--107}, keyword = {parallel I/O, pario-bib}, comment = {An overview of PIOUS and its performance. Results for partitioned and self-scheduled access pattern. See other moyer:* papers. The big thing about PIOUS over previous parallel file systems is its internal use of transactions for concurrency control and user-selectable fault-tolerance guarantees, and its optional support of user-level transactions.} } @TechReport{moyer:characterize, author = {Steven A. Moyer and V.~S. Sunderam}, title = {Characterizing Concurrency Control Performance for the {PIOUS} Parallel File System}, year = {1995}, month = {June}, number = {CSTR-950601}, institution = {Emory University}, later = {moyer:jcharacterize}, URL = {ftp://ftp.mathcs.emory.edu/pub/cstr/CSTR950601.ps}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {Parallel file systems employ data declustering to increase I/O throughput. But because a single read or write operation can generate data accesses on multiple independent storage devices, a concurrency control mechanism must be employed to retain familiar file access semantics. Concurrency control negates some of the performance benefits of data declustering by introducing additional file access overhead. 
This paper examines the performance characteristics of the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. Results demonstrate that linearizability of file access operations is provided without loss of scalability or stability.}, comment = {``substantially different material than presented in a previous report,'' moyer:scalable-tr. But it seems like the moyer:scalable IOPADS paper is largely a subset of this TR. He describes how they use volatile transactions, and does some experiments with PIOUS to measure their efficiency. Basically, they use a 2-phase commit protocol, using timeouts to detect deadlock and transaction aborts to remedy the deadlock. Results for partitioned and sequential access patterns.} } @Article{moyer:jcharacterize, author = {Steven A. Moyer and V. S. Sunderam}, title = {Characterizing Concurrency Control Performance for the {PIOUS} Parallel File System}, journal = {Journal of Parallel and Distributed Computing}, year = {1996}, month = {October}, volume = {38}, number = {1}, pages = {81--91}, earlier = {moyer:characterize}, keyword = {parallel I/O, multiprocessor file system, pario-bib} } @InProceedings{moyer:pario, author = {Steven A. Moyer and V. S. Sunderam}, title = {A Parallel {I/O} System for High-Performance Distributed Computing}, booktitle = {Proceedings of the IFIP WG10.3 Working Conference on Programming Environments for Massively Parallel Distributed Systems}, year = {1994}, URL = {ftp://ftp.mathcs.emory.edu/pub/vss/piousifip94.ps}, keyword = {parallel I/O, parallel file system, workstation cluster, file system interface, pario-bib}, comment = {See moyer:pious. A further description of the PIOUS parallel file system for cluster computing. (Beta-test version available for ftp). They support parafiles, which are collections of segments, each segment residing on a different server. 
The segments can be viewed separately or can be interleaved into a linear sequence using an arbitrary chunk size. They also support transactions to support sequential consistency.} } @InProceedings{moyer:pious, author = {Steven A. Moyer and V. S. Sunderam}, title = {{PIOUS:} A Scalable Parallel {I/O} System for Distributed Computing Environments}, booktitle = {Proceedings of the Scalable High-Performance Computing Conference}, year = {1994}, pages = {71--78}, URL = {ftp://ftp.mathcs.emory.edu/pub/vss/piousshpcc94.ps.Z}, keyword = {parallel I/O, parallel file system, workstation cluster, file system interface, pario-bib}, comment = {Basically, I/O for clusters of workstations; ideally, it is parallel, heterogeneous, fault tolerant, etc. File servers are independent, have only a local view. Single server used to coordinate open(). Client libraries implement the API and depend on the servers only for storage mechanism. Servers use transactions internally -- but usually these are lightweight transactions, only used for concurrency control and not recovery. Full transactions are supported for times when the user wants the extra fault tolerance. They have files that are in some sense 2-dimensional. Sequential consistency. User-controllable fault tolerance. Performance: 2 clients max out the transport (ethernet). ``Stable'' mode is slow, as is self-scheduled mode. No client caching. See moyer:pario.} } @InCollection{moyer:scalable-book, author = {Steven A. Moyer and V.~S. Sunderam}, title = {Scalable Concurrency Control for Parallel File Systems}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {10}, editor = {Ravi Jain and John Werth and James C. 
Browne}, crossref = {iopads-book}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {225--243}, publisher = {Kluwer Academic Publishers}, earlier = {moyer:scalable}, keyword = {parallel I/O, parallel file system, concurrency control, synchronization, transaction, pario-bib}, abstract = {Parallel file systems employ data declustering to increase \mbox{I/O} throughput. As a result, a single read or write operation can generate concurrent data accesses on multiple storage devices. Unless a concurrency control mechanism is employed, familiar file access semantics are likely to be violated. This paper details the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. Performance results are presented demonstrating that sequential consistency semantics can be provided without loss of system scalability.}, comment = {Part of a whole book on parallel I/O; see iopads-book.} } @TechReport{moyer:scalable-tr, author = {Steven A. Moyer and V.~S. Sunderam}, title = {Scalable Concurrency Control for Parallel File Systems}, year = {1995}, month = {February}, number = {CSTR-950202}, institution = {Emory University}, later = {moyer:scalable}, URL = {ftp://ftp.mathcs.emory.edu/pub/cstr/CSTR950202.ps}, keyword = {parallel I/O, parallel file system, pario-bib}, abstract = {Parallel file systems employ data declustering to increase I/O throughput. As a result, a single read or write operation can generate concurrent data accesses on multiple storage devices. Unless a concurrency control mechanism is employed, familiar file access semantics are likely to be violated. This paper details the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. 
Performance results are presented demonstrating that sequential consistency semantics can be provided without loss of system scalability.}, comment = {They describe {\em volatile transactions\/} as a way of providing the appropriate sequential consistency among file-read and -write operations (a feature not provided by most file systems). Their PIOUS library implements these transactions with strict 2-phase locking. They show some performance results, though only on a limited and relatively simple benchmark. If nothing else this paper reminds us all that atomicity of file-read and -write requests should be available to the user (e.g., note how they are optional in Vesta). Published as moyer:scalable.} } @InProceedings{moyer:scalable, author = {Steven A. Moyer and V. S. Sunderam}, title = {Scalable Concurrency Control for Parallel File Systems}, booktitle = {Proceedings of the IPPS~'95 Workshop on Input/Output in Parallel and Distributed Systems}, year = {1995}, month = {April}, pages = {90--106}, earlier = {moyer:scalable-tr}, later = {moyer:scalable-book}, keyword = {parallel I/O, pario-bib}, abstract = {Parallel file systems employ data declustering to increase I/O throughput. As a result, a single read or write operation can generate concurrent data accesses on multiple storage devices. Unless a concurrency control mechanism is employed, familiar file access semantics are likely to be violated. This paper details the transaction-based concurrency control mechanism implemented in the PIOUS parallel file system. Performance results are presented demonstrating that sequential consistency semantics can be provided without loss of system scalability.}, comment = {Seems to be a subset of moyer:scalable-tr, and for that matter, moyer:characterize. 
Results for partitioned access pattern.} } @Misc{mpi-forum:mpi2, key = {MPI}, title = {{MPI-2}: Extensions to the Message-Passing Interface}, year = {1997}, month = {July}, howpublished = {{The MPI Forum}}, earlier = {mpi-ioc:mpi-io5}, URL = {http://www.mpi-forum.org/docs/docs.html}, keyword = {parallel I/O, message-passing, multiprocessor file system interface, pario-bib}, comment = {This is the definition of the MPI2 message-passing standard, which includes an interface for parallel I/O. Supersedes mpi-ioc:mpi-io5 and earlier versions. See the MPI2 web page at http://www.mpi-forum.org. The I/O section is at http://www.mpi-forum.org/docs/mpi-20-html/node172.html.} } @Misc{mpi-ioc:mpi-io5, key = {MPIO}, title = {{MPI-IO:} A Parallel File {I/O} Interface for {MPI}}, year = {1996}, month = {April}, howpublished = {{The MPI-IO Committee}}, note = {Version 0.5.}, earlier = {corbett:mpi-io4}, later = {mpi-forum:mpi2}, keyword = {parallel I/O, message-passing, multiprocessor file system interface, pario-bib}, comment = {Supersedes corbett:mpi-io4 and earlier versions. See the MPI-IO Web page at http://parallel.nas.nasa.gov/MPI-IO/.} } @InBook{mpi2-io, author = {{Message-Passing Interface Forum}}, title = {{MPI-2.0}: Extensions to the Message-Passing Interface}, chapter = {9}, year = {1997}, month = {June}, publisher = {MPI Forum}, URL = {http://www.mpi-forum.org/docs/docs.html}, keyword = {MPI, message passing, parallel computing, library, parallel I/O, pario-bib}, comment = {Chapter 9 is about I/O extensions.} } @InProceedings{mueck:multikey, author = {T.~A. Mueck and J. 
Witzmann}, title = {Multikey Index Support for Tuple Sets on Parallel Mass Storage Systems}, booktitle = {Proceedings of the Fourteenth IEEE Symposium on Mass Storage Systems}, year = {1995}, month = {September}, pages = {136--145}, URL = {http://www.computer.org/conferen/mss95/mueck/mueck.htm}, keyword = {parallel database, mass storage, parallel I/O, pario-bib}, abstract = {The development and evaluation of a tuple set manager (TSM) based on multikey index data structures is a main part of the PARABASE project at the University of Vienna. The TSM provides access to parallel mass storage systems using tuple sets instead of conventional files as the central data structure for application programs. A proof-of-concept prototype TSM is already implemented and operational on an iPSC/2. It supports tuple insert and delete operations as well as exact match, partial match, and range queries at system call level. Available results are from this prototype on the one hand and from various performance evaluation figures. The evaluation results demonstrate the performance gain achieved by the implementation of the tuple set management concept on a parallel mass storage system.} } @InProceedings{muller:multi, author = {Keith Muller and Joseph Pasquale}, title = {A High Performance Multi-Structured File System Design}, booktitle = {Proceedings of the Thirteenth ACM Symposium on Operating Systems Principles}, year = {1991}, pages = {56--67}, publisher = {ACM Press}, address = {Pacific Grove, CA}, keyword = {file system, disk striping, disk mirroring, pario-bib} } @InProceedings{muntz:failure, author = {Richard R. Muntz and John C. S. Lui}, title = {Performance Analysis of Disk Arrays Under Failure}, booktitle = {Proceedings of the 16th International Conference on Very Large Data Bases}, year = {1990}, pages = {162--173}, keyword = {disk array, parallel, performance analysis, pario-bib}, comment = {Looked at RAID5 when in failure mode. 
For small-reads workload, could only get 50\% of normal. So they decouple cluster size and parity-group size, so that they decluster over more disks than group size; during failure, this causes less of a load increase on surviving disks.} } @Article{muntz:intro, author = {Richard R. Muntz and Leana Golubchik}, title = {Parallel Data Servers and Applications}, journal = {Parallel Computing}, year = {1998}, month = {January}, volume = {24}, number = {1}, pages = {1--4}, keyword = {parallel I/O, multimedia, databases, pario-bib}, comment = {Introduction to a special issue.} } @InProceedings{mutisya:cache, author = {Gerald Mutisya and Bradley M. Broom}, title = {Distributed File Caching for the {AP1000}}, booktitle = {Proceedings of the Third Fujitsu-ANU CAP Workshop}, year = {1992}, month = {November}, keyword = {distributed file system, multiprocessor file system, pario-bib}, comment = {See also broom:acacia, broom:impl, lautenbach:pfs, and broom:cap. They examine ways to manage a distributed file cache, without replication. Since there is no replication, the concurrency control problems boil down to providing atomicity for multi-block, multi-site requests. This is handled essentially by serializing the request: send the request to the first site, and have it forward the request from site to site as each block is processed. This works fine but completely serializes all multi-block requests, somewhat defeating the purpose. 
Thus, they get concurrency between requests, by having multiple servers, but no parallelism within requests.} } @Article{myllymaki:buffering, author = {Jussi Myllymaki and Miron Livny}, title = {Efficient buffering for concurrent disk and tape {I/O}}, journal = {Performance Evaluation: An International Journal}, year = {1996}, volume = {27/28}, pages = {453--471}, note = {Performance~'96}, keyword = {buffering, file caching, tertiary storage, tape robot, file migration, parallel I/O, pario-bib}, comment = {Ways to use secondary and tertiary storage in parallel, and buffering mechanisms for applications with concurrent I/O requirements.} } @InProceedings{nagaraj:hpfs, author = {U. Nagaraj and U. S. Shukla and A. Paulraj}, title = {Design and Evaluation of a High Performance File System for Message Passing Parallel Computers}, booktitle = {Proceedings of the Fifth International Parallel Processing Symposium}, year = {1991}, pages = {549--554}, keyword = {multiprocessor file system, pario-bib}, comment = {They describe a file system for general message-passing, distributed-memory, separate I/O and compute node, multicomputers. They provide few details, although they cite a lot of their tech reports. There are a few simulation results, but none show anything unintuitive.} } @InProceedings{nagashima:pario, author = {Umpei Nagashima and Takashi Shibata and Hiroshi Itoh and Minoru Gotoh}, title = {An Improvement of {I/O} Function for Auxiliary Storage: {Parallel I/O} for a Large Scale Supercomputing}, booktitle = {Proceedings of the 1990 ACM International Conference on Supercomputing}, year = {1990}, pages = {48--59}, keyword = {parallel I/O, pario-bib}, comment = {Using parallel I/O channels to access striped disks, in parallel from a supercomputer. They {\em chain\/} (i.e., combine) requests to a disk for large contiguous accesses.} } @InProceedings{nakajo:ionet, author = {H. Nakajo and S. Ohtani and T. Matsumoto and M. Kohata and K. Hiraki and Y. 
Kaneda}, title = {An {I/O} Network for Architecture of the Distributed Shared-Memory Massively Parallel Computer {JUMP-1}}, booktitle = {Proceedings of the 11th ACM International Conference on Supercomputing}, year = {1997}, month = {July}, pages = {253--260}, publisher = {ACM Press}, keyword = {collective I/O, multiprocessor file system, parallel I/O, pario-bib} } @InProceedings{nakajo:jump1, author = {Hironori Nakajo}, title = {A Simulation-based Evaluation of a Disk {I/O} Subsystem for a Massively Parallel Computer: {JUMP-1}}, booktitle = {Proceedings of the Sixteenth International Conference on Distributed Computing Systems}, year = {1996}, month = {May}, pages = {562--569}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, I/O architecture, pario-bib}, abstract = {JUMP-1 is a distributed shared-memory massively parallel computer and is composed of multiple clusters of interconnected network called RDT (Recursive Diagonal Torus). Each cluster in JUMP-1 consists of 4 element processors, secondary cache memories, and 2 MBP (Memory Based Processor) for high-speed synchronization and communication among clusters. The I/O subsystem is connected to a cluster via a high-speed serial link called STAFF-Link. The I/O buffer memory is mapped onto the JUMP-1 global shared-memory to permit each I/O access operation as memory access. In this paper we describe evaluation of the fundamental performance of the disk I/O subsystem using event-driven simulation, and estimated performance with a Video On Demand (VOD) application.} } @InProceedings{natarajan:clusterio, author = {Chita Natarajan and Ravishankar K. 
Iyer}, title = {Measurement and Simulation Based Performance Analysis of Parallel {I/O} in a High-Performance Cluster System}, booktitle = {Proceedings of the 1996 IEEE Symposium on Parallel and Distributed Processing}, year = {1996}, month = {October}, pages = {332--339}, publisher = {IEEE Computer Society Press}, keyword = {performance analysis, parallel I/O, pario-bib}, abstract = {This paper presents a measurement and simulation based study of parallel I/O in a high-performance cluster system: the Pittsburgh Supercomputing Center (PSC) DEC Alpha Supercluster. The measurements were used to characterize the performance bottlenecks and the throughput limits at the compute and I/O nodes, and to provide realistic input parameters to PioSim, a simulation environment we have developed to investigate parallel I/O performance issues in cluster systems. PioSim was used to obtain a detailed characterization of parallel I/O performance, in the high performance cluster system, for different regular access patterns and different system configurations. This paper also explores the use of local disks at the compute nodes for parallel I/O, and finds that the local disk architecture outperforms the traditional parallel I/O over remote I/O node disks architecture, even when as much as 68-75\% of the requests from each compute node goes to remote disks.} } @TechReport{ncr:3600, key = {NCR}, title = {{NCR 3600} Product Description}, year = {1991}, month = {September}, number = {ST-2119-91}, institution = {NCR}, address = {San Diego}, keyword = {multiprocessor architecture, MIMD, parallel I/O, pario-bib}, comment = {Has 1-32 50MHz Intel 486 processors. Parallel independent disks on the disk nodes, separate from the processor nodes. Tree interconnect. 
Aimed at database applications.} } @InProceedings{ng:diskarray, author = {Spencer Ng}, title = {Some Design Issues of Disk Arrays}, booktitle = {Proceedings of IEEE Compcon}, year = {1989}, month = {Spring}, pages = {137--142}, note = {San Francisco, CA}, keyword = {parallel I/O, disk array, pario-bib}, comment = {Discusses disk arrays and striping. Transfer size is important to striping success: small size transfers are better off with independent disks. Synchronized rotation is especially important for small transfer sizes, since then the increased rotational delays dominate. Fine grain striping involves less assembly/disassembly delay, but coarse grain (block) striping allows for request parallelism. Fine grain striping wastes capacity due to fixed size formatting overhead. He also derives exact MTTF equation for 1-failure tolerance and on-line repair.} } @InProceedings{ng:interleave, author = {S. Ng and D. Lang and R. Selinger}, title = {Trade-offs Between Devices and Paths in Achieving Disk Interleaving}, booktitle = {Proceedings of the 15th Annual International Symposium on Computer Architecture}, year = {1988}, pages = {196--201}, keyword = {parallel I/O, disk architecture, disk caching, I/O bottleneck, pario-bib}, comment = {Compares four different ways of restructuring IBM disk controllers and channels to obtain more parallelism. They use parallel heads or parallel actuators. The best results come when they replicate the control electronics to maintain the number of data paths through the controller. Otherwise the controller bottleneck reduces performance. Generally, for large or small transfer sizes, parallel heads with replication gave better performance.} } @Article{nicastro:fft, author = {L. Nicastro and N. 
{D'Amico}}, title = {An optimized mass storage {FFT} for vector computers}, journal = {Parallel Computing}, year = {1995}, month = {March}, volume = {21}, pages = {423--432}, publisher = {North-Holland (Elsevier Scientific)}, keyword = {out-of-core algorithm, parallel I/O algorithm, scientific computing, vector computer, pario-bib}, comment = {They describe an out-of-core FFT algorithm for vector computers (one disk, one vector processor). They implemented it on a Convex and show good performance. Basically, they segment the array, do FFTs on each segment, and do some transposing and other stuff to combine the segments. Each segment is basically a memoryload. Seems parallelizable too.} } @TechReport{nickolls:dpio, author = {John R. Nickolls and Ernie Rael}, title = {Data Parallel {Unix} Input/Output for a Massively Parallel Processor}, year = {1993}, number = {MP/P-17.93}, institution = {MasPar Computer Corporation}, keyword = {Unix, parallel I/O, data parallel, pario-bib}, comment = {Cite nickolls:maspar-io.} } @InProceedings{nickolls:maspar-io, author = {John R. Nickolls}, title = {The {MasPar} Scalable {Unix I/O} System}, booktitle = {Proceedings of the Eighth International Parallel Processing Symposium}, year = {1994}, month = {April}, pages = {390--394}, address = {Cancun, Mexico}, keyword = {parallel I/O, multiprocessor file system, SIMD, pario-bib}, abstract = {Scalable parallel computers require I/O balanced with computational power to solve data-intensive problems. Distributed memory architectures call for I/O hardware and software beyond those of conventional scalar systems. \par This paper introduces the MasPar I/O system, designed to provide balanced and scalable data-parallel Unix I/O. The architecture and implementation of the I/O hardware and software are described. Key elements include parallel access to conventional Unix file descriptors and a self-routing multistage network coupled with a buffer memory for flexible parallel I/O transfers. 
Performance measurements are presented for parallel Unix I/O with a scalable RAID disk array, a RAM disk, and a HIPPI interconnect.}, comment = {This provides the definitive reference for the MasPar parallel-I/O architecture and file system. This paper includes a brief discussion of the interface and performance results. Also includes some HIPPI interface performance results. This paper is the conference version of nickolls:dpio, so cite this one.} } @InProceedings{nieplocha:arrays, author = {Jarek Nieplocha and Ian Foster}, title = {Disk Resident Arrays: An Array-Oriented {I/O} Library for Out-Of-Core Computations}, booktitle = {Proceedings of the Sixth Symposium on the Frontiers of Massively Parallel Computation}, year = {1996}, month = {October}, pages = {196--204}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, pario-bib}, abstract = {In out-of-core computations, disk storage is treated as another level in the memory hierarchy, below cache, local memory, and (in a parallel computer) remote memories. However the tools used to manage this storage are typically quite different from those used to manage access to local and remote memory. This disparity complicates implementation of out-of-core algorithms and hinders portability. We describe a programming model that addresses this problem. This model allows parallel programs to use essentially the same mechanisms to manage the movement of data between any two adjacent levels in a hierarchical memory system. We take as our starting point the Global Arrays shared-memory model and library, which support a variety of operations on distributed arrays, including transfer between local and remote memories. We show how this model can be extended to support explicit transfer between global memory and secondary storage, and we define a Disk Resident Arrays Library that supports such transfers. 
We illustrate the utility of the resulting model with two applications, an out-of-core matrix multiplication and a large computational chemistry program. We also describe implementation techniques on several parallel computers and present experimental results that demonstrate that the Disk Resident Arrays model can be implemented very efficiently on parallel computers.} } @Article{nieplocha:chemio, author = {Jarek Nieplocha and Ian Foster and Rick Kendall}, title = {{ChemIO}: High-Performance Parallel {I/O} for Computational Chemistry Applications}, journal = {The International Journal of High Performance Computing Applications}, year = {1998}, month = {Fall}, volume = {12}, number = {3}, pages = {345--363}, earlier = {foster:chemio}, keyword = {verify volume number month year and pages, parallel I/O application, pario-bib}, abstract = {Recent developments in I/O systems on scalable parallel computers have sparked renewed interest in out-of-core methods for computational chemistry. These methods can improve execution time significantly relative to ``direct'' methods, which perform many redundant computations. However, the widespread use of such out-of-core methods requires efficient and portable implementations of often complex I/O patterns. The ChemIO project has addressed this problem by defining an I/O interface that captures the I/O patterns found in important computational chemistry applications and by providing high-performance implementations of this interface on multiple platforms. 
This development not only broadens the user community for parallel I/O techniques but also provides new insights into the functionality required in general-purpose scalable I/O libraries and the techniques required to achieve high performance I/O on scalable parallel computers.}, comment = {In a Special Issue on I/O in Parallel Applications, volume 12, numbers 3 and 4.} } @InProceedings{nieuwejaar:galley-perf, author = {Nils Nieuwejaar and David Kotz}, title = {Performance of the {Galley} Parallel File System}, booktitle = {Proceedings of the Fourth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1996}, month = {May}, pages = {83--94}, publisher = {ACM Press}, address = {Philadelphia}, later = {nieuwejaar:jgalley-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/nieuwejaar:galley-perf.ps.Z}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. This interface conceals the parallelism within the file system, which increases the ease of programmability, but makes it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. Furthermore, most current parallel file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic parallel workloads. 
Initial experiments, reported in this paper, indicate that Galley is capable of providing high-performance I/O to applications that access data in patterns that have been observed to be common.}, comment = {See also nieuwejaar:galley.} } @InProceedings{nieuwejaar:galley, author = {Nils Nieuwejaar and David Kotz}, title = {The {Galley} Parallel File System}, booktitle = {Proceedings of the 10th ACM International Conference on Supercomputing}, year = {1996}, month = {May}, pages = {374--381}, publisher = {ACM Press}, address = {Philadelphia, PA}, later = {nieuwejaar:jgalley-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/nieuwejaar:galley.ps.Z}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. This interface conceals the parallelism within the file system, which increases the ease of programmability, but makes it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. Furthermore, most current parallel file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic parallel workloads. We discuss Galley's file structure and application interface, as well as an application that has been implemented using that interface.}, comment = {See also nieuwejaar:galley-perf. 
Also available at http://www.acm.org/pubs/citations/proceedings/supercomputing/237578/p374-nieuwejaar/} } @TechReport{nieuwejaar:jgalley-tr, author = {Nils Nieuwejaar and David Kotz}, title = {The {Galley} Parallel File System}, year = {1996}, month = {May}, number = {PCS-TR96-286}, institution = {Dept. of Computer Science, Dartmouth College}, earlier = {nieuwejaar:galley}, later = {nieuwejaar:jgalley}, URL = {http://www.cs.dartmouth.edu/reports/abstracts/TR96-286/}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {Most current multiprocessor file systems are designed to use multiple disks in parallel, using the high aggregate bandwidth to meet the growing I/O requirements of parallel scientific applications. Many multiprocessor file systems provide applications with a conventional Unix-like interface, allowing the application to access multiple disks transparently. This interface conceals the parallelism within the file system, increasing the ease of programmability, but making it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. In addition to providing an insufficient interface, most current multiprocessor file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic scientific multiprocessor workloads. 
We discuss Galley's file structure and application interface, as well as the performance advantages offered by that interface.} } @Article{nieuwejaar:jgalley, author = {Nils Nieuwejaar and David Kotz}, title = {The {Galley} Parallel File System}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {447--476}, publisher = {North-Holland (Elsevier Scientific)}, earlier = {nieuwejaar:jgalley-tr}, URL = {ftp://ftp.cs.dartmouth.edu/pub/kotz/papers/nieuwejaar:jgalley.ps.Z}, keyword = {parallel file system, parallel I/O, multiprocessor file system interface, pario-bib, dfk}, abstract = {Most current multiprocessor file systems are designed to use multiple disks in parallel, using the high aggregate bandwidth to meet the growing I/O requirements of parallel scientific applications. Many multiprocessor file systems provide applications with a conventional Unix-like interface, allowing the application to access multiple disks transparently. This interface conceals the parallelism within the file system, increasing the ease of programmability, but making it difficult or impossible for sophisticated programmers and libraries to use knowledge about their I/O needs to exploit that parallelism. In addition to providing an insufficient interface, most current multiprocessor file systems are optimized for a different workload than they are being asked to support. We introduce Galley, a new parallel file system that is intended to efficiently support realistic scientific multiprocessor workloads. 
We discuss Galley's file structure and application interface, as well as the performance advantages offered by that interface.}, comment = {A revised version of nieuwejaar:jgalley-tr, which is a combination of nieuwejaar:galley and nieuwejaar:galley-perf.} } @TechReport{nieuwejaar:strided, author = {Nils Nieuwejaar and David Kotz}, title = {A Multiprocessor Extension to the Conventional File System Interface}, year = {1994}, month = {September}, number = {PCS-TR94-230}, institution = {Dept. of Computer Science, Dartmouth College}, later = {nieuwejaar:strided2-tr}, URL = {http://www.cs.dartmouth.edu/reports/abstracts/TR94-230/}, keyword = {parallel I/O, multiprocessor file system, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. By tracing all the activity of a parallel file system in a production, scientific computing environment, we show that many applications exhibit highly regular, but non-consecutive I/O access patterns. Since the conventional interface does not provide an efficient method of describing these patterns, we present an extension which supports {\em strided} and {\em nested-strided} I/O requests.} } @InCollection{nieuwejaar:strided2-book, author = {Nils Nieuwejaar and David Kotz}, title = {Low-level Interfaces for High-level Parallel {I/O}}, booktitle = {Input/Output in Parallel and Distributed Computer Systems}, chapter = {9}, editor = {Ravi Jain and John Werth and James C. 
Browne}, crossref = {iopads-book}, year = {1996}, series = {The Kluwer International Series in Engineering and Computer Science}, volume = {362}, pages = {205--223}, publisher = {Kluwer Academic Publishers}, earlier = {nieuwejaar:strided2}, keyword = {parallel I/O, multiprocessor file system, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. By tracing all the activity of a parallel file system in a production, scientific computing environment, we show that many applications exhibit highly regular, but non-consecutive I/O access patterns. Since the conventional interface does not provide an efficient method of describing these patterns, we present three extensions to the interface that support {\em strided}, {\em nested-strided}, and {\em nested-batched} I/O requests. We show how these extensions can be used to express common access patterns.}, comment = {Part of a whole book on parallel I/O; see iopads-book and nieuwejaar:strided2 (which is not much different).} } @TechReport{nieuwejaar:strided2-tr, author = {Nils Nieuwejaar and David Kotz}, title = {Low-level Interfaces for High-level Parallel {I/O}}, year = {1995}, month = {March}, number = {PCS-TR95-253}, institution = {Dept. 
of Computer Science, Dartmouth College}, note = {Revised 4/18/95 and appeared in IOPADS workshop at IPPS~'95}, earlier = {nieuwejaar:strided}, later = {nieuwejaar:strided2}, URL = {http://www.cs.dartmouth.edu/reports/abstracts/TR95-253/}, keyword = {parallel I/O, multiprocessor file system, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. By tracing all the activity of a parallel file system in a production, scientific computing environment, we show that many applications exhibit highly regular, but non-consecutive I/O access patterns. Since the conventional interface does not provide an efficient method of describing these patterns, we present three extensions to the interface that support {\em strided}, {\em nested-strided}, and {\em nested-batched} I/O requests. We show how these extensions can be used to express common access patterns.}, comment = {After revision, identical to nieuwejaar:strided2.} } @InProceedings{nieuwejaar:strided2, author = {Nils Nieuwejaar and David Kotz}, title = {Low-level Interfaces for High-level Parallel {I/O}}, booktitle = {Proceedings of the IPPS~'95 Workshop on Input/Output in Parallel and Distributed Systems}, year = {1995}, month = {April}, pages = {47--62}, earlier = {nieuwejaar:strided2-tr}, later = {nieuwejaar:strided2-book}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR95-253.ps.Z}, keyword = {parallel I/O, multiprocessor file system, pario-bib, dfk}, abstract = {As the I/O needs of parallel scientific applications increase, file systems for multiprocessors are being designed to provide applications with parallel access to multiple disks. 
Many parallel file systems present applications with a conventional Unix-like interface that allows the application to access multiple disks transparently. By tracing all the activity of a parallel file system in a production, scientific computing environment, we show that many applications exhibit highly regular, but non-consecutive I/O access patterns. Since the conventional interface does not provide an efficient method of describing these patterns, we present three extensions to the interface that support {\em strided}, {\em nested-strided}, and {\em nested-batched} I/O requests. We show how these extensions can be used to express common access patterns.}, comment = {Identical to revised TR95-253, nieuwejaar:strided2-tr. Cite nieuwejaar:strided2-book.} } @PhdThesis{nieuwejaar:thesis, author = {Nils A. Nieuwejaar}, title = {Galley: A New Parallel File System for Parallel Applications}, year = {1996}, month = {November}, school = {Dept. of Computer Science, Dartmouth College}, note = {Available as Dartmouth Technical Report PCS-TR96-300}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR96-300.ps.Z}, keyword = {parallel I/O, multiprocessor file system, file system workload characterization, file access patterns, file system interface, pario-bib}, abstract = {Most current multiprocessor file systems are designed to use multiple disks in parallel, using the high aggregate bandwidth to meet the growing I/O requirements of parallel scientific applications. Most multiprocessor file systems provide applications with a conventional Unix-like interface, allowing the application to access those multiple disks transparently. This interface conceals the parallelism within the file system, increasing the ease of programmability, but making it difficult or impossible for sophisticated application and library programmers to use knowledge about their I/O to exploit that parallelism. 
In addition to providing an insufficient interface, most current multiprocessor file systems are optimized for a different workload than they are being asked to support. In this work we examine current multiprocessor file systems, as well as how those file systems are used by scientific applications. Contrary to the expectations of the designers of current parallel file systems, the workloads on those systems are dominated by requests to read and write small pieces of data. Furthermore, rather than being accessed sequentially and contiguously, as in uniprocessor and supercomputer workloads, files in multiprocessor file systems are accessed in regular, structured, but non-contiguous patterns. Based on our observations of multiprocessor workloads, we have designed Galley, a new parallel file system that is intended to efficiently support realistic scientific multiprocessor workloads. In this work, we introduce Galley and discuss its design and implementation. We describe Galley's new three-dimensional file structure and discuss how that structure can be used by parallel applications to achieve higher performance. We introduce several new data-access interfaces, which allow applications to explicitly describe the regular access patterns we found to be common in parallel file system workloads. We show how these new interfaces allow parallel applications to achieve tremendous increases in I/O performance. Finally, we discuss how Galley's new file structure and data-access interfaces can be useful in practice.} } @TechReport{nieuwejaar:workload-tr, author = {Nils Nieuwejaar and David Kotz and Apratim Purakayastha and Carla Schlatter Ellis and Michael Best}, title = {File-Access Characteristics of Parallel Scientific Workloads}, year = {1995}, month = {August}, number = {PCS-TR95-263}, institution = {Dept. 
of Computer Science, Dartmouth College}, earlier = {kotz:workload}, later = {nieuwejaar:workload}, URL = {http://www.cs.dartmouth.edu/reports/abstracts/TR95-263/}, keyword = {parallel I/O, file system workload, workload characterization, file access pattern, multiprocessor file system, dfk, pario-bib}, abstract = {Phenomenal improvements in the computational performance of multiprocessors have not been matched by comparable gains in I/O system performance. This imbalance has resulted in I/O becoming a significant bottleneck for many scientific applications. One key to overcoming this bottleneck is improving the performance of parallel file systems. \par The design of a high-performance parallel file system requires a comprehensive understanding of the expected workload. Unfortunately, until recently, no general workload studies of parallel file systems have been conducted. The goal of the CHARISMA project was to remedy this problem by characterizing the behavior of several production workloads, on different machines, at the level of individual reads and writes. The first set of results from the CHARISMA project describe the workloads observed on an Intel iPSC/860 and a Thinking Machines CM-5. This paper is intended to compare and contrast these two workloads for an understanding of their essential similarities and differences, isolating common trends and platform-dependent variances. 
Using this comparison, we are able to gain more insight into the general principles that should guide parallel file-system design.}, comment = {See also nieuwejaar:strided, ap:workload.} } @Article{nieuwejaar:workload, author = {Nils Nieuwejaar and David Kotz and Apratim Purakayastha and Carla Schlatter Ellis and Michael Best}, title = {File-Access Characteristics of Parallel Scientific Workloads}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1996}, month = {October}, volume = {7}, number = {10}, pages = {1075--1089}, publisher = {IEEE Computer Society Press}, earlier = {nieuwejaar:workload-tr}, URL = {http://www.computer.org/tpds/td1996/l1075abs.htm}, keyword = {parallel I/O, file system workload, workload characterization, file access pattern, multiprocessor file system, dfk, pario-bib}, abstract = {Phenomenal improvements in the computational performance of multiprocessors have not been matched by comparable gains in I/O system performance. This imbalance has resulted in I/O becoming a significant bottleneck for many scientific applications. One key to overcoming this bottleneck is improving the performance of multiprocessor file systems. \par The design of a high-performance multiprocessor file system requires a comprehensive understanding of the expected workload. Unfortunately, until recently, no general workload studies of multiprocessor file systems have been conducted. The goal of the CHARISMA project was to remedy this problem by characterizing the behavior of several production workloads, on different machines, at the level of individual reads and writes. The first set of results from the CHARISMA project describe the workloads observed on an Intel iPSC/860 and a Thinking Machines CM-5. This paper is intended to compare and contrast these two workloads for an understanding of their essential similarities and differences, isolating common trends and platform-dependent variances. 
Using this comparison, we are able to gain more insight into the general principles that should guide multiprocessor file-system design.}, comment = {See also kotz:workload, nieuwejaar:strided, ap:workload.} } @Article{ninghui:pfs, author = {Sun Ninghui}, title = {The design of parallel file systems}, journal = {Chinese Journal of Computers}, year = {1994}, month = {December}, volume = {17}, number = {12}, pages = {938--945}, note = {In Chinese}, keyword = {parallel file systems, parallel I/O, pario-bib}, comment = {From the abstract, it doesn't appear to offer anything new, but it's hard to tell.} } @InProceedings{nishino:sfs, author = {H. Nishino and S. Naka and K. Ikumi}, title = {High Performance File System for Supercomputing Environment}, booktitle = {Proceedings of Supercomputing '89}, year = {1989}, pages = {747--756}, keyword = {supercomputer, file system, parallel I/O, pario-bib}, comment = {A modification to the Unix file system to allow for supercomputer access. Workload: file size from few KB to few GB, I/O operation size from few bytes to hundreds of MB. Generally programs split into I/O-bound and CPU-bound parts. Sequential and random access. Needs: giant files (bigger than device), peak hardware performance for large files, NFS access. Their FS is built into Unix ``transparently''. Space allocated in clusters, rather than blocks; clusters might be as big as a cylinder. Allows for efficient, large files. Mentions parallel disks as part of a ``virtual volume'' but does not elaborate. 
Prefetching within a cluster.} } @TechReport{nitzberg:cfs, author = {Bill Nitzberg}, title = {Performance of the {iPSC/860 Concurrent File System}}, year = {1992}, month = {December}, number = {RND-92-020}, institution = {NAS Systems Division, NASA Ames}, later = {krystynak:pario}, URL = {http://www.nas.nasa.gov/NAS/TechReports/RNDreports/RND-92-020/RND-92-020.html}, keyword = {Intel, parallel file system, performance measurement, parallel I/O, pario-bib}, abstract = {Typical scientific applications require vast amounts of processing power coupled with significant I/O capacity. Highly parallel computer systems can provide processing power at low cost, but tend to lack I/O capacity. By evaluating the performance and scalability of the Intel iPSC/860 Concurrent File System (CFS), we can get an idea of the current state of parallel I/O performance. I ran three types of tests on the iPSC/860 system at the Numerical Aerodynamic Simulation facility (NAS): broadcast, simulating initial data loading; partitioned, simulating reading and writing a one-dimensional decomposition; and interleaved, simulating reading and writing a two-dimensional decomposition. \par The CFS at NAS can sustain up to 7 megabytes per second writing and 8 megabytes per second reading. However, due to the limited disk cache size, partitioned read performance sharply drops to less than 1 megabyte per second on 128 nodes. In addition, interleaved read and write performance show a similar drop in performance for small block sizes. Although the CFS can sustain 70-80\% of peak I/O throughput, the I/O performance does not scale with the number of nodes. \par Obtaining maximum performance may require significant programming effort: pre-allocating files, overlapping computation and I/O, using large block sizes, and limiting I/O parallelism. 
A better approach would be to attack the problem by either fixing the CFS (e.g., add more cache to the I/O nodes), or hiding its idiosyncrasies (e.g., implement a parallel I/O library).}, comment = {Straightforward measurements of an iPSC/860 with 128 compute nodes, 10 I/O nodes, and 10 disks. This is a bigger system than has been measured before. Has some basic MB/s measurements for some features in Tables 1--2. CFS bug prevents more than 2 asynch requests at a time. Another bug forced random-writes to use preallocated files. For low number of procs, they weren't able to pull the full disk bandwidth. Cache thrashing caused problems when they had a large number of procs, because each read prefetched 8 blocks, which were flushed by some other proc doing a subsequent read. Workaround by synchronizing procs to limit concurrency. Increasing cache size is the right answer, but is not scalable.} } @InProceedings{nitzberg:collective, author = {Bill Nitzberg and Virginia Lo}, title = {Collective Buffering: Improving Parallel {I/O} Performance}, booktitle = {Proceedings of the Sixth IEEE International Symposium on High Performance Distributed Computing}, year = {1997}, month = {August}, pages = {148--157}, publisher = {IEEE Computer Society Press}, address = {Portland, OR}, keyword = {parallel I/O, collective I/O, pario-bib}, abstract = {``Parallel I/O'' is the support of a single parallel application run on many nodes; application data is distributed among the nodes, and is read or written to a single logical file, itself spread across nodes and disks. Parallel I/O is a mapping problem from the data layout in node memory to the file layout on disks. Since the mapping can be quite complicated and involve significant data movement, optimizing the mapping is critical for performance. 
\par We discuss our general model of the problem, describe four Collective Buffering algorithms we designed, and report experiments testing their performance on an Intel Paragon and an IBM SP2 both housed at NASA Ames Research Center. Our experiments show improvements of up to two orders of magnitude over standard techniques and the potential to deliver peak performance with minimal hardware support.} } @TechReport{nitzberg:sc94tutorial, author = {Bill Nitzberg and Samuel A. Fineberg}, title = {Parallel {I/O} on Highly Parallel Systems--- Supercomputing '94 Tutorial {M11} Notes}, year = {1994}, month = {November}, number = {NAS-94-005}, institution = {NASA Ames Research Center}, later = {nitzberg:sc95tutorial}, URL = {http://www.nas.nasa.gov/NAS/TechReports/NASreports/NAS-94-005/NAS-94-005.html}, keyword = {parallel I/O, tutorial, pario-bib}, abstract = {Typical scientific applications require vast amounts of processing power coupled with significant I/O capacity. Highly parallel computer systems provide floating point processing power at low cost, but efficiently supporting a scientific workload also requires commensurate I/O performance. In order to achieve high I/O performance, these systems utilize parallelism in their I/O subsystems---supporting concurrent access to files by multiple nodes of a parallel application, and striping files across multiple disks. However, obtaining maximum I/O performance can require significant programming effort. \par This tutorial presents a snapshot of the state of I/O on highly parallel systems by comparing the well-balanced I/O performance of a traditional vector supercomputer (the Cray Y/MP C90) with the I/O performance of various highly parallel systems (Cray T3D, IBM SP-2, Intel iPSC/860 and Paragon, and Thinking Machines CM-5). In addition, the tutorial covers benchmarking techniques for evaluating current parallel I/O systems and techniques for improving parallel I/O performance. 
Finally, the tutorial presents several high level parallel I/O libraries and shows how they can help application programmers improve I/O performance.} } @TechReport{nitzberg:sc95tutorial, author = {Bill Nitzberg and Samuel A. Fineberg}, title = {Parallel {I/O} on Highly Parallel Systems--- Supercomputing '95 Tutorial {M6} Notes}, year = {1995}, month = {December}, number = {NAS-95-022}, institution = {NASA Ames Research Center}, earlier = {nitzberg:sc94tutorial}, URL = {http://www.nas.nasa.gov/NAS/TechReports/NASreports/NAS-95-022/NAS-95-022.html}, keyword = {parallel I/O, tutorial, pario-bib}, abstract = {Typical scientific applications require vast amounts of processing power coupled with significant I/O capacity. Highly parallel computer systems provide floating-point processing power at low cost, but efficiently supporting a scientific workload also requires commensurate I/O performance. To achieve high I/O performance, these systems use parallelism in their I/O subsystems, supporting concurrent access to files by multiple nodes of a parallel application and striping files across multiple disks. However, obtaining maximum I/O performance can require significant programming effort. This tutorial will present a comprehensive survey of the state of the art in parallel I/O from basic concepts to recent advances in the research community. Requirements, interfaces, architectures, and performance will be illustrated using concrete examples from commercial offerings (Cray T3D, IBM SP-2, Intel Paragon, Meiko CS-2, and workstation clusters) and academic research projects (MPI-IO, Panda, PASSION, PIOUS, and Vesta). The material covered is roughly 30\% beginner, 60\% intermediate, and 10\% advanced.} } @PhdThesis{nitzberg:thesis, author = {William J. 
Nitzberg}, title = {Collective Parallel {I/O}}, year = {1995}, month = {December}, school = {Department of Computer and Information Science, University of Oregon}, keyword = {parallel I/O, parallel algorithm, file system interface, pario-bib}, abstract = {Parallel I/O, the process of transferring a global data structure distributed among compute nodes to a file striped across storage devices, can be quite complicated and involve a significant amount of data movement. Optimizing parallel I/O with respect to data distribution, file layout, and machine architecture is critical for performance. In this work, we propose a solution to both the performance and portability problems plaguing the wide acceptance of distributed memory parallel computers for scientific computing: a collective parallel I/O interface and efficient algorithms to implement it. A collective interface allows the programmer to specify a file access as a high-level global operation rather than as a series of seeks and writes. This not only provides a more natural interface for the programmer, but also provides the system with both the opportunity and the semantic information necessary to optimize the file operation. \par We attack this problem in three steps: we evaluate an early parallel I/O system, the Intel iPSC/860 Concurrent File System; we design and analyze the performance of two classes of algorithms taking advantage of collective parallel I/O; and we design MPI-IO, a collective parallel I/O interface likely to become the standard for portable parallel I/O. \par The collective I/O algorithms fall into two broad categories: data block scheduling and collective buffering. Data block scheduling algorithms attempt to schedule the individual data transfers to minimize resource contention and to optimize for particular hardware characteristics. We develop and evaluate three data block scheduling algorithms: Grouping, Random, and Sliding Window. 
The data block scheduling algorithms improved performance by as much as a factor of eight. The collective buffering algorithms permute the data before writing or after reading in order to combine small file accesses into large blocks. We design and test a series of four collective buffering algorithms and demonstrate improvement in performance by two orders of magnitude over naive file I/O for the hardest, three-dimensional distributions.}, comment = {See also nitzberg:cfs and corbett:mpi-overview.} } @InProceedings{no:irregular-io, author = {Jaechun No and Sung-soon Park and Jesus Carretero and Alok Choudhary and Pang Chen}, title = {Design and Implementation of a Parallel {I/O} Runtime System for Irregular Applications}, booktitle = {Proceedings of the Joint International Parallel Processing Symposium and IEEE Symposium on Parallel and Distributed Processing}, year = {1998}, month = {March}, publisher = {IEEE Computer Society Press}, note = {To appear}, keyword = {verify pages, parallel I/O, pario-bib} } @Article{no:jirregular, author = {Jaechun No and Jesus Carretero and Sung-soon Park and Alok Choudhary and Pang Chen}, title = {Design and Implementation of a Parallel {I/O} Runtime System for Irregular Applications}, journal = {Journal of Parallel and Distributed Computing}, year = {1998}, URL = {http://www.ece.nwu.edu/~jno/PAPER/jpdc.ps}, keyword = {verify volume pages publisher month number, parallel I/O, pario-bib} } @InProceedings{nodine:deterministic, author = {M. H. Nodine and J. S. Vitter}, title = {Deterministic Distribution Sort in Shared and Distributed Memory Multiprocessors}, booktitle = {Proceedings of the Fifth Symposium on Parallel Algorithms and Architectures}, year = {1993}, pages = {120--129}, address = {Velen, Germany}, abstract = {We present an elegant deterministic load balancing strategy for distribution sort that is applicable to a wide variety of parallel disks and parallel memory hierarchies with both single and parallel processors. 
The simplest application of the strategy is an optimal deterministic algorithm for external sorting with multiple disks and parallel processors. In each input/output (I/O) operation, each of the $D \geq 1$ disks can simultaneously transfer a block of $B$ contiguous records. Our two measures of performance are the number of I/Os and the amount of work done by the CPU(s); our algorithm is simultaneously optimal for both measures. We also show how to sort deterministically in parallel memory hierarchies. When the processors are interconnected by any sort of a PRAM, our algorithms are optimal for all parallel memory hierarchies; when the interconnection network is a hypercube, our algorithms are either optimal or best-known.}, comment = {Short version of nodine:sort2 and nodine:sortdisk.} } @TechReport{nodine:greed, author = {Mark H. Nodine and Jeffrey Scott Vitter}, title = {Greed Sort: An Optimal External Sorting Algorithm for Multiple Disks}, year = {1992}, number = {CS--91--20}, institution = {Brown University}, note = {A summary appears in SPAA~'91}, URL = {http://www.cs.brown.edu/publications/techreports/reports/CS-91-20.html}, keyword = {parallel I/O algorithms, sorting, pario-bib}, abstract = {We present an optimal deterministic algorithm for external sorting on multiple disks. Our measure of performance is the number of input/output (I/O) operations. In each I/O, each disk can simultaneously transfer a block of data. Our algorithm improves upon a recent randomized optimal algorithm and the (non-optimal) commonly used technique of disk striping. The code is simple enough for easy implementation.}, comment = {Summary is nodine:sort. This is revision of CS--91--04.} } @InProceedings{nodine:loadbalance, author = {Mark H. 
Nodine and Jeffrey Vitter}, title = {Load Balancing Paradigms for Optimal Use of Parallel Disks and Parallel Memory Hierarchies}, booktitle = {Proceedings of the 1993 DAGS/PC Symposium}, year = {1993}, month = {June}, pages = {26--39}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, keyword = {parallel I/O algorithm, memory hierarchy, load balance, sorting, pario-bib}, abstract = {We present several load balancing paradigms pertinent to optimizing I/O performance with disk and processor parallelism. We use sorting as our canonical application to illustrate the paradigms, and we survey a wide variety of applications in computational geometry. The use of parallel disks can help overcome the I/O bottleneck in sorting if the records in each read or write are evenly balanced among the disks. There are three known load balancing paradigms that lead to optimal I/O algorithms: using randomness to assign blocks to disks, using the disks predominantly independently, and deterministically balancing the blocks by matching. In this report, we describe all of these techniques in detail and compare their relative advantages. We show how randomized and deterministic balancing can be extended to provide sorting algorithms that are optimal both in terms of the number of I/Os and the internal processing time for parallel-processing machines with scalable I/O subsystems and with parallel memory hierarchies. We also survey results achieving optimal performance in these models for a large range of online and batch problems in computational geometry.}, comment = {Invited speaker: Jeffrey Vitter.} } @InProceedings{nodine:opt-sort, author = {Mark H.
Nodine and Jeffrey Scott Vitter}, title = {Paradigms for Optimal Sorting with Multiple Disks}, booktitle = {Proceedings of the Twenty-Sixth Annual Hawaii International Conference on System Sciences}, year = {1993}, volume = {I}, pages = {50--59}, keyword = {parallel I/O algorithms, sorting, pario-bib}, comment = {They compare three techniques for balancing I/O across parallel disks, using sorting as an example. The three are randomization, using disks independently (as in balance sort), or tricky matching techniques as in balance sort. They also look at parallel memory hierarchies. All in all, it seems to be mostly a survey of techniques in earlier papers.} } @InProceedings{nodine:sort, author = {Mark H. Nodine and Jeffrey Scott Vitter}, title = {Large-Scale Sorting in Parallel Memories}, booktitle = {Proceedings of the Third Symposium on Parallel Algorithms and Architectures}, year = {1991}, pages = {29--39}, keyword = {external sorting, file access pattern, parallel I/O, pario-bib}, comment = {Describes algorithms for external sorting that are optimal in the number of I/Os. Proposes a couple of fairly-realistic memory hierarchy models. See also journal version vitter:uniform.} } @TechReport{nodine:sort2, author = {Mark H. Nodine and Jeffrey Scott Vitter}, title = {Optimal Deterministic Sorting in Parallel Memory Hierarchies}, year = {1992}, month = {August}, number = {CS--92--38}, institution = {Brown University}, URL = {ftp://ftp.cs.brown.edu/pub/techreports/92/cs92-38.ps.Z}, keyword = {parallel I/O algorithms, parallel memory, sorting, pario-bib}, comment = {see nodine:deterministic.} } @TechReport{nodine:sortdisk, author = {Mark H. 
Nodine and Jeffrey Scott Vitter}, title = {Optimal Deterministic Sorting on Parallel Disks}, year = {1992}, month = {August}, number = {CS--92--08}, institution = {Brown University}, URL = {ftp://ftp.cs.brown.edu/pub/techreports/92/cs92-08.ps.Z}, keyword = {parallel I/O algorithms, sorting, pario-bib}, comment = {see nodine:deterministic.} } @InProceedings{nurmi:atm, author = {Marc A. Nurmi and William E. Bejcek and Rod N. Gregoire and K. C. Liu and Mark D. Pohl}, title = {Automatic Management of {CPU} and {I/O} Bottlenecks in Distributed Applications on {ATM} Networks}, booktitle = {Proceedings of the Fifth IEEE International Symposium on High Performance Distributed Computing}, year = {1996}, month = {August}, pages = {481--489}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, ATM, parallel networking, pario-bib}, abstract = {Existing parallel programming environments for networks of workstations improve the performance of computationally intensive applications by using message passing or virtual shared memory to alleviate CPU bottlenecks. This paper describes an approach based on message passing that addresses both CPU and I/O bottlenecks for a specific class of distributed applications on ATM networks. ATM provides the bandwidth required to utilize multiple I/O channels in parallel. This paper also describes an environment based on distributed process management and centralized application management that implements the approach. 
The environment adds processes to a running application when necessary to alleviate CPU and I/O bottlenecks while managing process connections in a manner that is transparent to the application.} } @TechReport{ober:seismic, author = {Curtis Ober and Ron Oldfield and John VanDyke and David Womble}, title = {Seismic Imaging on Massively Parallel Computers}, year = {1996}, month = {April}, number = {SAND96-1112}, institution = {Sandia National Laboratories}, URL = {ftp://ftp.cs.sandia.gov/pub/papers/dewombl/seismic_imaging_mpp.ps.Z}, keyword = {multiprocessor application, scientific computing, seismic data processing, parallel I/O, pario-bib}, abstract = {Fast, accurate imaging of complex, oil-bearing geologies, such as overthrusts and salt domes, is the key to reducing the costs of domestic oil and gas exploration. Geophysicists say that the known oil reserves in the Gulf of Mexico could be significantly increased if accurate seismic imaging beneath salt domes was possible. A range of techniques exist for imaging these regions, but the highly accurate techniques involve the solution of the wave equation and are characterized by large data sets and large computational demands. Massively parallel computers can provide the computational power for these highly accurate imaging techniques. \par A brief introduction to seismic processing will be presented, and the implementation of a seismic-imaging code for distributed memory computers will be discussed. The portable code, Salvo, performs a wave-equation-based, 3-D, prestack, depth imaging and currently runs on the Intel Paragon, the Cray T3D and SGI Challenge series. 
It uses MPI for portability, and has sustained 22 Mflops/sec/proc (compiled FORTRAN) on the Intel Paragon.}, comment = {2 pages about their I/O scheme, mostly regarding a calculation of the optimal balance between compute nodes and I/O nodes to achieve perfect overlap.} } @TechReport{oed:t3d, author = {Wilfried Oed}, title = {The {Cray Research} Massively Parallel Processor System {CRAY T3D}}, year = {1993}, month = {November 15}, institution = {Cray Research GmbH}, address = {M\"unchen, Germany}, keyword = {parallel architecture, shared memory, supercomputer, parallel I/O, pario-bib}, comment = {A MIMD, shared-memory machine, with 2-processor units embedded in a 3-d torus. Each link is bidirectional and runs 300 MB/s. Processors are 150 MHz ALPHA, plus 16--64 MB RAM, plus a memory interface unit. Global physical address space with remote-reference and block-transfer capability. Not clear about cache coherency. Separate tree network for global synchronization. Support for message send and optional interrupt. I/O is all done through interface nodes that hook to the YMP host and to its I/O clusters with 400 MB/s links. I/O is by default serialized, but they do support a ``broadcast'' read operation (but see pase:t3d-fortran). FORTRAN compiler supports the NUMA shared memory; PVM is used for C and message passing.} } @TechReport{ogata:diskarray, author = {Mikito Ogata and Michael J. Flynn}, title = {A Queueing Analysis for Disk Array Systems}, year = {1990}, number = {CSL-TR-90-443}, institution = {Stanford University}, keyword = {disk array, performance analysis, pario-bib}, comment = {Fairly complex analysis of a multiprocessor attached to a disk array system through a central server that is the buffer. Assumes task-oriented model for parallel system, where tasks can be assigned to any CPU; this makes for an easy model. 
Like Reddy, they compare declustering and striping (they call them striped and synchronized disks).} } @InProceedings{okeefe:fibre, author = {Matthew T. O'Keefe}, title = {Shared File Systems and {Fibre Channel}}, booktitle = {Proceedings of the Sixth NASA Goddard Conference on Mass Storage Systems}, year = {1998}, month = {March}, pages = {??}, publisher = {IEEE Computer Society Press}, address = {College Park, MD}, URL = {http://gfs.lcse.umn.edu/pubs/shared_file_systems_1.0.pdf}, keyword = {verify pages, distributed file system, data storage, mass storage, network-attached disks, Fibre Channel, pario-bib}, comment = {position paper} } @TechReport{oldfield:app-pario, author = {Ron Oldfield and David Kotz}, title = {Applications of Parallel {I/O}}, year = {1998}, month = {August}, number = {PCS-TR98-337}, institution = {Dept. of Computer Science, Dartmouth College}, note = {Supplement to PCS-TR96-297.}, earlier = {kotz:app-pario}, URL = {http://www.cs.dartmouth.edu/reports/abstracts/TR98-337/}, keyword = {parallel I/O application, file access patterns, pario-bib} } @Unpublished{oldfield:armada, author = {Ron Oldfield and David Kotz}, title = {The {Armada} Parallel File System}, year = {1998}, month = {November}, note = {Unpublished}, keyword = {parallel I/O, multiprocessor file system, dfk}, abstract = {The rapid improvements in technology have made the challenge of providing a robust, high performance parallel file system incredibly difficult. Hardware advancements have led to faster processors, high bandwidth networks, and larger amounts of primary, secondary and tertiary storage. Although each of these components is advancing at a rapid pace, some components are improving at a faster rate. For example, processor speeds have increased at a rate of 60\% to 80\% per year, while memory and disk access times have decreased by only one-third in the past 10 years. 
This trend, if not addressed, will most certainly lead to an I/O bottleneck for many parallel applications. \par Conventional parallel file systems try to relieve this bottleneck by providing fixed policies that work well for the general case; however, as we gain experience with parallel file systems, it becomes increasingly clear that a single solution does not suit all applications. For example, it appears to be impossible to find a single appropriate interface, caching policy, file structure, or disk-management strategy. Furthermore, the proliferation of file-system interfaces and abstractions make applications difficult to port. \par We propose to allow the application library to have control over virtually all aspects of the parallel file system. Our parallel file system (Armada) is composed of a fixed core that runs on the I/O node, and high-level application libraries that are implemented on top of the core system. The application libraries are responsible for providing an interface and functionality to the application while the core system arbitrates usage of the I/O-node resources. \par This paper discusses some of the problems with conventional file systems and then gives a high level description of the Armada parallel file system.} } @Article{oldfield:seismic, author = {Ron A. Oldfield and David E. Womble and Curtis C. Ober}, title = {Efficient Parallel {I/O} in Seismic Imaging}, journal = {The International Journal of High Performance Computing Applications}, year = {1998}, month = {Fall}, volume = {12}, number = {3}, pages = {333--344}, URL = {http://www.cs.dartmouth.edu/~raoldfi/ijsa97}, keyword = {verify pages, parallel I/O application, pario-bib}, abstract = {While high performance computers tend to be measured by their processor and communications speeds, the bottleneck for many large-scale applications is the I/O performance rather than the computational or communication performance. One such application is the processing of 3D seismic data. 
Seismic data sets, consisting of recorded pressure waves, can be very large, sometimes more than a terabyte in size. Even if the computations can be performed in-core, the time required to read the initial seismic data and velocity model and write images is substantial. This paper will discuss our approach in handling the massive I/O requirements of seismic processing and show the performance of our imaging code (Salvo) on the Intel Paragon.}, comment = {In a Special Issue on I/O in Parallel Applications, volume 12, numbers 3 and 4.} } @Article{olson:random, author = {Thomas M. Olson}, title = {Disk Array Performance in a Random {I/O} Environment}, journal = {Computer Architecture News}, year = {1989}, month = {September}, volume = {17}, number = {5}, pages = {71--77}, keyword = {I/O benchmark, transaction processing, pario-bib}, comment = {See wolman:iobench. Used IOBENCH to compare normal disk configuration with striped disks, RAID level 1, and RAID level 5, under a random I/O workload. Multiple disks with files on different disks gave good performance (high throughput and low response time) when multiple users. Striping ensures balanced load, similar performance. RAID level 1 or level 5 ensures reliability at performance cost over striping, but still good. Especially sensitive to write/read ratio --- performance lost for large number of writes.} } @InProceedings{oyang:m2io, author = {Yen-Jen Oyang}, title = {Architecture, Operating System, and {I/O} Subsystem Design of the {$M^2$} Database Machine}, booktitle = {Proceedings of the Parallel Systems Fair at the International Parallel Processing Symposium}, year = {1993}, pages = {31--38}, keyword = {parallel I/O, multiprocessor file system, parallel database, pario-bib}, comment = {A custom multiprocessor with shared-memory clusters networked together and to shared disks. Runs Mach. Directory-based coherence protocol for the distributed file system.
Background writeback.} } @InProceedings{pahuja:dpio, author = {Neena Pahuja and Gautam M. Shroff}, title = {A Data Parallel {I/O} Library for Workstation Networks}, booktitle = {Proceedings of the 1995 International Conference on High Performance Computing}, year = {1995}, month = {December}, pages = {423--428}, address = {New Delhi, India}, keyword = {disk array, multimedia, parallel I/O, pario-bib} } @InProceedings{paleczny:support, author = {Michael Paleczny and Ken Kennedy and Charles Koelbel}, title = {Compiler Support for Out-of-Core Arrays on Data Parallel Machines}, booktitle = {Proceedings of the Fifth Symposium on the Frontiers of Massively Parallel Computation}, year = {1995}, month = {February}, pages = {110--118}, address = {McLean, VA}, URL = {http://www.cs.rice.edu/~mpal/papers/Frontiers95.ps}, keyword = {compilers, parallel I/O, out-of-core applications, pario-bib}, comment = {They are developing extensions to the FortranD compiler so that it supports I/O-related directives for out-of-core computations. The compiler then analyzes the computation, inserts the necessary I/O calls, and optimizes the I/O. They hand-compile a red-black relaxation program and an LU-factorization program. I/O was much faster than VM, particularly because they were able to make large requests rather than faulting on individual pages. Overlapping I/O and computation was also a big win. See also kennedy:sio, bordawekar:model.} } @InProceedings{panfilov:raid5, author = {Oleg A.
Panfilov}, title = {Performance Analysis of {RAID-5} Disk Arrays}, booktitle = {Proceedings of the Twenty-Eighth Annual Hawaii International Conference on System Sciences}, year = {1995}, month = {January}, volume = {I}, pages = {49--60}, keyword = {RAID, disk array, parallel I/O, pario-bib} } @Article{papadopouli:vbr-streams, author = {Maria Papadopouli and Leana Golubchik}, title = {Support of {VBR} Video Streams Under Disk Bandwidth Limitations}, journal = {ACM SIGMETRICS Performance Evaluation Review}, year = {1997}, month = {December}, volume = {25}, number = {3}, pages = {13--20}, keyword = {multimedia, video on demand, parallel I/O, pario-bib}, comment = {Part of a special issue on parallel and distributed I/O.} } @Article{park:2disk, author = {{Chan-Ik} Park}, title = {Efficient Placement of Parity and Data To Tolerate Two Disk Failures in Disk Array Systems}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1995}, month = {November}, volume = {6}, number = {11}, pages = {1177--1184}, publisher = {IEEE Computer Society Press}, keyword = {parallel I/O, disk array, reliability, fault tolerance, pario-bib}, abstract = {In this paper, we deal with the data/parity placement problem which is described as follows: how to place data and parity evenly across disks in order to tolerate two disk failures, given the number of disks N and the redundancy rate p which represents the amount of disk spaces to store parity information. To begin with, we transform the data/parity placement problem into the problem of constructing an N x N matrix such that the matrix will correspond to a solution to the problem. The method to construct a matrix has been proposed and we have shown how our method works through several illustrative examples. 
It is also shown that any matrix constructed by our proposed method can be mapped into a solution to the placement problem if a certain condition holds between N and p where N is the number of disks and p is a redundancy rate.} } @InProceedings{park:interface, author = {Yoonho Park and Ridgway Scott and Stuart Sechrest}, title = {Virtual Memory Versus File Interfaces for Large, Memory-intensive Scientific Applications}, booktitle = {Proceedings of Supercomputing '96}, year = {1996}, month = {November}, publisher = {ACM Press and IEEE Computer Society Press}, note = {Also available as UH Department of Computer Science Research Report UH-CH-96-7}, URL = {http://www.hpc.uh.edu/cenju/pub/vm_revisit.ps}, keyword = {virtual memory, file interface, scientific applications, out-of-core, parallel I/O, pario-bib}, abstract = {Scientific applications often require some strategy for temporary data storage to do the largest possible simulations. The use of virtual memory for temporary data storage has received criticism because of performance problems. However, modern virtual memory found in recent operating systems such as Cenju-3/DE give application writers control over virtual memory policies. We demonstrate that custom virtual memory policies can dramatically reduce virtual memory overhead and allow applications to run out-of-core efficiently. We also demonstrate that the main advantage of virtual memory, namely programming simplicity, is not lost.}, comment = {Web and CDROM only. They advocate the use of traditional demand-paged virtual memory systems in supporting out-of-core applications. They are implementing an operating system for the NEC Cenju-3/DE, a shared-nothing MIMD multiprocessor with a multistage interconnection network and disks on every node. The operating system is based on Mach, and they have extended Mach to allow user-provided [local] replacement policies. 
Basically, they argue that you can get good performance as long as you write your own replacement policy (even OPT is possible in certain applications), and that this is easier than (re)writing the application with explicit out-of-core file I/O calls. They measure the performance of two applications on their system, with OPT, FIFO, and a new replacement algorithm customized to one of the applications. They show that they can get much better performance with some replacement policies than with others, but despite the paper's title they do not compare with the performance of an equivalent program using file I/O.} } @TechReport{park:pario, author = {Arvin Park and K. Balasubramanian}, title = {Providing Fault Tolerance in Parallel Secondary Storage Systems}, year = {1986}, month = {November}, number = {CS-TR-057-86}, institution = {Department of Computer Science, Princeton University}, keyword = {parallel I/O, reliability, RAID, pario-bib}, comment = {They use ECC with one or more parity drives in bit-interleaved systems, and on-line regeneration of failed drives from spares. More cost-effective than mirrored disks. One of the earliest references to RAID-like concepts. 
Basically, they describe RAID3.} } @InProceedings{parsons:complex, author = {Ian Parsons and Jonathan Schaeffer and Duane Szafron and Ron Unrau}, title = {Using {PI/OT} to Support Complex Parallel {I/O}}, booktitle = {Proceedings of the Joint International Parallel Processing Symposium and IEEE Symposium on Parallel and Distributed Processing}, year = {1998}, month = {March}, publisher = {IEEE Computer Society Press}, note = {To appear}, keyword = {verify pages, parallel I/O, pario-bib} } @Article{parsons:templates, author = {Ian Parsons and Ron Unrau and Jonathan Schaeffer and Duane Szafron}, title = {{PI/OT}: Parallel {I/O} Templates}, journal = {Parallel Computing}, year = {1997}, month = {June}, volume = {23}, number = {4}, pages = {543--570}, publisher = {North-Holland (Elsevier Scientific)}, keyword = {parallel programming, parallel I/O, pario-bib}, abstract = {This paper presents a novel, top-down, high-level approach to parallelizing file I/O. Each parallel file descriptor is annotated with a high-level specification, or template, of the expected parallel behaviour. The annotations are external to and independent of the source code. At run-time, all I/O using a parallel file descriptor adheres to the semantics of the selected template. By separating the parallel I/O specifications from the code, a user can quickly change the I/O behaviour without rewriting code. Templates can be composed hierarchically to construct complex access patterns. \par Two sample parallel programs using these templates are compared against versions implemented in an existing parallel I/O system (PIOUS). The sample programs show that the use of parallel I/O templates are beneficial from both the performance and software engineering points of view.}, comment = {An interesting approach in which they try to separate the description of the parallelism in a file's access from the sequential programming used to access the file. Seems like a good idea. 
It seems to assume that the programmer was porting an existing sequential code, or prefers to write their parallel program with a sequential frame of mind, including the existing fopen/fread/fwrite stdio interface. They retain the traditional stream-of-bytes file structure. See also parsons:complex.} } @TechReport{pase:t3d-fortran, author = {Douglas M. Pase and Tom MacDonald and Andrew Meltzer}, title = {{MPP Fortran} Programming Model}, year = {1993}, month = {October 11}, institution = {Cray Research, Inc.}, URL = {ftp://ftp.cray.com/product-info/program_env/program_model.html}, keyword = {compiler, parallel language, supercomputing, parallel I/O, pario-bib}, abstract = {This report describes the MPP Fortran programming model which will be supported on the first phase MPP systems. Based on existing and proposed standards, it is a work sharing model which combines features from existing models in a way that may be both efficiently implemented and useful.}, comment = {See also oed:t3d for T3D overview. I only read the part about I/O. The only I/O support, apparently, is for each processor to open and access the file independently from all other processors.} } @InProceedings{pasquale:characterization, author = {Barbara K. Pasquale and George C. Polyzos}, title = {Dynamic {I/O} characterization of {I/O} intensive scientific applications}, booktitle = {Proceedings of Supercomputing '94}, year = {1994}, pages = {660--669}, URL = {http://www.acm.org/pubs/citations/proceedings/supercomputing/198354/p660-pasquale/}, keyword = {parallel I/O, pario-bib}, abstract = {Understanding the characteristic I/O behavior of scientific applications is an integral part of the research and development efforts for the improvement of high performance I/O systems. This study focuses on application level I/O behavior with respect to both static and dynamic characteristics. We observed the San Diego Supercomputer Center's Cray C90 workload and isolated the most I/O intensive applications. 
The combination of a low-level description of physical resource usage and the high-level functional composition of applications and scientific disciplines for this set reveals the major sources of I/O demand in the workload. We selected two applications from the I/O intensive set and performed a detailed analysis of their dynamic I/O behavior. These applications exhibited a high degree of regularity in their I/O activity over time and their characteristic I/O behaviors can be precisely described by one and two, respectively, recurring sequences of data accesses and computation periods.} } @InProceedings{pasquale:dynamic, author = {Barbara K. Pasquale and George C. Polyzos}, title = {Dynamic {I/O} Characterization of {I/O} Intensive Scientific Applications}, booktitle = {Proceedings of Supercomputing '94}, year = {1994}, month = {November}, pages = {660--669}, publisher = {IEEE Computer Society Press}, address = {Washington, DC}, keyword = {scientific computing, file access patterns, I/O, pario-bib}, comment = {This paper extends some of their previous results, but the real bottom line here is that some scientific applications do a lot of I/O, the I/O is bursty, and the pattern of bursts is cyclic and regular. Seems like this cyclic nature could be a source of some optimization. Included in the parallel I/O bibliography because it is useful to that community, though they did not trace parallel workload.} } @InProceedings{pasquale:iowork, author = {Barbara K. Pasquale and George C. Polyzos}, title = {A Static Analysis of {I/O} Characteristics of Scientific Applications in a Production Workload}, booktitle = {Proceedings of Supercomputing '93}, year = {1993}, pages = {388--397}, publisher = {IEEE Computer Society Press}, address = {Portland, OR}, keyword = {scientific computing, file access patterns, pario-bib}, comment = {Analyzed one month of accounting records from Cray YMP8/864 in SDSC's production environment.
Their base assumption is that scientific application I/O is regular and predictable, eg, repetitive periodic bursts, with distinct phases, repeating patterns, and sequential access. The goal is to characterize a set of I/O-intensive scientific applications and evaluate regularity of resource usage. They measure volumes and rates of applications and total system. Cumulative and average usage for each distinct non-system application. Most resource usage came from the 5\% of applications that were not system applications. ``Virtual I/O rate'' is the bytes transferred per CPU second, which is IMHO only a rough measure because sometimes I/O overlaps CPU time, and sometimes does not. They picked out long-running applications with a high virtual I/O rate. Top 50 applications had 71\% of bytes transferred and 10\% of CPU time. Of those, 4.66 MB/sec min, 131 MB/sec max. Of those they picked the ones executed most often. Cluster analysis showed only 1-2 clusters. Correlation between I/O and CPU time. Included in the parallel I/O bibliography because it is useful to that community, though they did not trace parallel workload.} } @Article{patt:iosubsystem, author = {Yale N. Patt}, title = {The {I/O} Subsystem: a Candidate for Improvement}, journal = {IEEE Computer}, year = {1994}, month = {March}, volume = {27}, number = {3}, pages = {15--16}, keyword = {I/O, file system, parallel I/O, pario-bib}, comment = {This is the intro to a special issue on I/O.} } @TechReport{patterson:informed-tr, author = {R. Hugo Patterson and Garth A. 
Gibson and Eka Ginting and Daniel Stodolsky and Jim Zelenka}, title = {Informed Prefetching and Caching}, year = {1995}, number = {CMU-CS-95-134}, institution = {School of Computer Science, Carnegie Mellon University}, later = {patterson:informed}, keyword = {caching, prefetching, file system, hints, I/O, resource management, parallel I/O, pario-bib}, abstract = {The underutilization of disk parallelism and file cache buffers by traditional file systems induces I/O stall time that degrades the performance of modern microprocessor-based systems. In this paper, we present aggressive mechanisms that tailor file system resource management to the needs of I/O-intensive applications. In particular, we show how to use application-disclosed access patterns (hints) to expose and exploit I/O parallelism, and to dynamically allocate file buffers among three competing demands: prefetching hinted blocks, caching hinted blocks for reuse, and caching recently used data for unhinted accesses. Our approach estimates the impact of alternative buffer allocations on application execution time and applies a cost-benefit analysis to allocate buffers where they will have the greatest impact. We implemented informed prefetching and caching in DEC's OSF/1 operating system and measured its performance on a 150 MHz Alpha equipped with 15 disks. When running a range of applications including text search, 3D scientific visualization, relational database queries, speech recognition, and computational chemistry, informed prefetching reduces the execution time of four of these applications by 20\% to 87\%. Informed caching reduces the execution time of the fifth application by up to 30\%.} } @InProceedings{patterson:informed, author = {R. Hugo Patterson and Garth A.
Gibson and Eka Ginting and Daniel Stodolsky and Jim Zelenka}, title = {Informed Prefetching and Caching}, booktitle = {Proceedings of the Fifteenth ACM Symposium on Operating Systems Principles}, year = {1995}, month = {December}, pages = {79--95}, publisher = {ACM Press}, address = {Copper Mountain, CO}, earlier = {patterson:informed-tr}, keyword = {caching, prefetching, file system, hints, I/O, resource management, parallel I/O, pario-bib}, abstract = {In this paper, we present aggressive, proactive mechanisms that tailor file system resource management to the needs of I/O-intensive applications. In particular, we show how to use application-disclosed access patterns (hints) to expose and exploit I/O parallelism, and to dynamically allocate file buffers among three competing demands: prefetching hinted blocks, caching hinted blocks for reuse, and caching recently used data for unhinted accesses. Our approach estimates the impact of alternative buffer allocations on application execution time and applies cost-benefit analysis to allocate buffers where they will have the greatest impact. We have implemented informed prefetching and caching in Digital's OSF/1 operating system and measured its performance on a 150 MHz Alpha equipped with 15 disks running a range of applications. Informed prefetching reduces the execution time of text search, scientific visualization, relational database queries, speech recognition, and object linking by 20--83\%. Informed caching reduces the execution time of computational physics by up to 42\% and contributes to the performance improvement of the object linker and the database. Moreover, applied to multiprogrammed, I/O-intensive workloads, informed prefetching and caching increase overall throughput.}, comment = {See patterson:informed-tr for an earlier version. Programs may give hints to the file system about what they will read in the future, and in what order. Hints are used for informed prefetching and informed caching.
Most interesting thing about this paper over the earlier ones is the buffer management. Prefetcher and demand fetcher both want buffers. LRU cache and hinted cache both could supply buffers (thru replacement). Each supplies a cost for giving up buffers and benefit for getting more buffers. These are expressed in a common 'currency', in terms of their expected effect on I/O service time, and a manager takes buffers from one and gives buffers to another when the benefits outweigh the costs. All is based on a simple model, which is further simplified in their implementation within OSF/1. Performance looks good, they can keep more disks busy in a parallel file system. Furthermore, informed caching helps reduce the number of I/Os. Indeed they 'discover' MRU replacement policy automatically.} } @InProceedings{patterson:latency, author = {R. H. Patterson and G. A. Gibson and M. Satyanarayanan}, title = {Using Transparent Informed Prefetching to Reduce File Read Latency}, booktitle = {Proceedings of the 1992 NASA Goddard Conference on Mass Storage Systems}, year = {1992}, month = {September}, pages = {329--342}, later = {patterson:informed}, URL = {http://www.cs.cmu.edu/afs/cs.cmu.edu/project/pdl/ftp/TIP/MSST.ps}, keyword = {parallel I/O, file prefetching, file caching, pario-bib}, comment = {This 'paper' is really an annotated set of slides.} } @InProceedings{patterson:pdis-tip, author = {R. Hugo Patterson and Garth A. 
Gibson}, title = {Exposing {I/O} Concurrency with Informed Prefetching}, booktitle = {Proceedings of the Third International Conference on Parallel and Distributed Information Systems}, year = {1994}, month = {September}, pages = {7--16}, later = {patterson:informed}, URL = {http://www.cs.cmu.edu/afs/cs/project/pdl/ftp/TIP/PDIS.ps}, keyword = {prefetching, parallel I/O, pario-bib}, abstract = {Informed prefetching provides a simple mechanism for I/O-intensive, cache-ineffective applications to efficiently exploit highly-parallel I/O subsystems such as disk arrays. This mechanism, dynamic disclosure of future accesses, yields substantial benefits over sequential readahead mechanisms found in current file systems for non-sequential workloads. This paper reports the performance of the Transparent Informed Prefetching system (TIP), a minimal prototype implemented in a Mach 3.0 system with up to four disks. We measured reductions by factors of up to 1.9 and 3.7 in the execution time of two example applications: multi-file text search and scientific data visualization.}, comment = {Also available in HTML format at http://www.cs.cmu.edu/Web/Groups/PDL/HTML-Papers/PDIS94/final.fm.html.} } @InProceedings{patterson:raid, author = {David Patterson and Garth Gibson and Randy Katz}, title = {A case for redundant arrays of inexpensive disks {(RAID)}}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data}, year = {1988}, month = {June}, pages = {109--116}, publisher = {ACM Press}, address = {Chicago, IL}, keyword = {parallel I/O, RAID, reliability, cost analysis, I/O bottleneck, disk array, OS93W extra, OS92W, pario-bib}, comment = {Make a good case for the upcoming I/O crisis, compare single large expensive disks (SLED) with small cheap disks. Outline five levels of RAID that give different reliabilities, costs, and performances. 
Block-interleaved with a single check disk (level 4) or with check blocks interspersed (level 5) seem to give best performance for supercomputer I/O or database I/O or both. Note: the TR by the same name (UCB/CSD 87/391) is essentially identical.} } @InProceedings{patterson:raid2, author = {David Patterson and Peter Chen and Garth Gibson and Randy H. Katz}, title = {Introduction to Redundant Arrays of Inexpensive Disks {(RAID)}}, booktitle = {Proceedings of IEEE Compcon}, year = {1989}, month = {Spring}, pages = {112--117}, earlier = {patterson:raid}, keyword = {parallel I/O, RAID, reliability, cost analysis, I/O bottleneck, disk array, pario-bib}, comment = {A short version of patterson:raid, with some slight updates.} } @Article{patterson:tip, author = {R. Hugo Patterson and Garth A. Gibson and M. Satyanarayanan}, title = {A Status Report on Research in Transparent Informed Prefetching}, journal = {ACM Operating Systems Review}, year = {1993}, month = {April}, volume = {27}, number = {2}, pages = {21--34}, later = {patterson:informed}, URL = {http://www.cs.cmu.edu/afs/cs/project/pdl/ftp/TIP/OSRev.ps}, keyword = {file system, prefetching, operating system, pario-bib}, abstract = {This paper focuses on extending the power of caching and prefetching to reduce file read latencies by exploiting application level hints about future I/O accesses. We argue that systems that disclose high-level knowledge can transfer optimization information across module boundaries in a manner consistent with sound software engineering principles. Such Transparent Informed Prefetching (TIP) systems provide a technique for converting the high throughput of new technologies such as disk arrays and log-structured file systems into low latency for applications. 
Our preliminary experiments show that even without a high-throughput I/O sub-system TIP yields reduced execution time of up to 30\% for applications obtaining data from a remote file server and up to 13\% for applications obtaining data from a single local disk. These experiments indicate that greater performance benefits will be available when TIP is integrated with low level resource management policies and highly parallel I/O subsystems such as disk arrays.}, comment = {Not much new over previous TIP papers, but does have newer numbers. See patterson:tip1. Also appears in DAGS'93 (patterson:tip2). Previously appeared as TR CMU-CS-93-1.} } @InProceedings{patterson:tip2, author = {R. Hugo Patterson and Garth A. Gibson and M. Satyanarayanan}, title = {Informed Prefetching: Converting High Throughput to Low Latency}, booktitle = {Proceedings of the 1993 DAGS/PC Symposium}, year = {1993}, month = {June}, pages = {41--55}, organization = {Dartmouth Institute for Advanced Graduate Studies}, address = {Hanover, NH}, later = {patterson:informed}, keyword = {file system, prefetching, operating system, pario-bib}, abstract = {This paper focuses on extending the power of caching and prefetching to reduce file read latencies by exploiting application level hints about future I/O accesses. We argue that systems that disclose high-level knowledge can transfer optimization information across module boundaries in a manner consistent with sound software engineering principles. Such Transparent Informed Prefetching (TIP) systems provide a technique for converting the high throughput of new technologies such as disk arrays and log-structured file systems into low latency for applications. Our preliminary experiments show that even without a high-throughput I/O sub-system TIP yields reduced execution time of up to 30\% for applications obtaining data from a remote file server and up to 13\% for applications obtaining data from a single local disk. 
These experiments indicate that greater performance benefits will be available when TIP is integrated with low level resource management policies and highly parallel I/O subsystems such as disk arrays.}, comment = {Invited speaker: Garth Gibson. Similar paper appeared in ACM OSR April 1993 (patterson:tip)} } @Misc{patterson:vterabytes, author = {David Patterson}, title = {Terabytes $\gg$ Teraflops (or Why Work on Processors When {I/O} is Where the Action Is?)}, year = {1993}, howpublished = {Produced by University Video Communications}, note = {Videotape}, URL = {http://www.uvc.com/videos/06Patterson.video.html}, keyword = {videotape, computer architecture, parallel I/O, pario-bib}, abstract = {RISC pioneer and UC, Berkeley Computer Science Professor David Patterson is working to develop input/output systems to match the increasingly higher performance of new processors. Here he describes the results of the RAID (Redundant Arrays of Inexpensive Disks) project, which offers much greater performance, capacity, and reliability from I/O systems. Patterson also discusses a new project, Sequoia 2000, which looks at utilizing small helical scan tapes, such as digital-audiotapes or videotapes, to offer terabytes of storage for the price of a file/server. He believes that a 1000x increase in storage, available on most Ethernets, will have a much greater impact than a 1000x increase in processing speed.}, comment = {See patterson:trends. 
58 minutes.} } @InProceedings{pawlowski:parsort, author = {Markus Pawlowski and Rudolf Bayer}, title = {Parallel Sorting of Large Data Volumes on Distributed Memory Multiprocessors}, booktitle = {Parallel Computer Architectures: Theory, Hardware, Software, Applications}, year = {1993}, series = {Lecture Notes in Computer Science}, volume = {732}, pages = {246--264}, publisher = {Springer-Verlag}, address = {Berlin}, keyword = {sorting, parallel I/O algorithm, pario-bib}, comment = {Main contribution appears to be a new sampling method for initial partition of data set. They approach it from a database point of view.} } @TechReport{pearson:sorting, author = {Matthew D. Pearson}, title = {Fast Out-of-Core Sorting on Parallel Disk Systems}, year = {1999}, month = {June}, number = {PCS-TR99-351}, institution = {Dept. of Computer Science, Dartmouth College}, address = {Hanover, NH}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR99-351.ps.Z}, keyword = {parallel I/O, out of core, sorting, parallel algorithm, pario-bib}, abstract = {This paper discusses our implementation of Rajasekaran's (l,m)-mergesort algorithm (LMM) for sorting on parallel disks. LMM is asymptotically optimal for large problems and has the additional advantage of a low constant in its I/O complexity. Our implementation is written in C using the ViC* I/O API for parallel disk systems. We compare the performance of LMM to that of the C library function qsort on a DEC Alpha server. qsort makes a good benchmark because it is fast and performs comparatively well under demand paging. Since qsort fails when the swap disk fills up, we can only compare these algorithms on a limited range of inputs. Still, on most out-of-core problems, our implementation of LMM runs between 1.5 and 1.9 times faster than qsort, with the gap widening with increasing problem size.}, comment = {Undergraduate Honors Thesis. Advisor: Tom Cormen. Submitted to SC99.} } @TechReport{perez:clfs, author = {F. {P\'erez} and J. Carretero and P. 
de~Miguel and F. {Garc\'{\i}a} and L. Alonso}, title = {{CLFS} Design: A Parallel File Manager for Multicomputers}, year = {1994}, number = {FIM/82.1/DATSI/94}, institution = {Universidad Polit{\'e}cnica de Madrid}, address = {Madrid, Spain}, URL = {http://laurel.datsi.fi.upm.es/~gp/publications/datsi82.1.ps.Z}, keyword = {multiprocessor file system, parallel I/O, pario-bib}, abstract = {This document describes the detailed design of the CLFS, one of the components of the Cache Coherent File System (CCFS). CCFS has three main components: Client File Server (CLFS), Local File Server (LFS), Concurrent Disk System (CDS). The Client File Servers are located on each processing node, to develop file manager functions in a per node basis. The CLFS will interact with the LFSs to provide block services, naming, locking, real input/output and to manage the disk system, partitions, distributed partitions, etc. The CLFS includes a standard POSIX interface (internally parallelized) and some parallel extensions. It will be responsible of maintaining cache consistency, distributing accesses to servers, providing a file system interface to the user, etc.}, comment = {See carretero:*, rosales:cds, perez:clfs.} } @Article{perez:evaluate, author = {F. Perez and J. Carretero and L. Alonso and P. {De Miguel} and F. Garcia}, title = {Evaluating {ParFiSys}: A high-performance parallel and distributed file system}, journal = {Journal of Systems Architecture}, year = {1997}, month = {May}, volume = {43}, number = {8}, pages = {533--542}, keyword = {parallel I/O, multiprocessor file system, pario-bib}, abstract = {We present an overview of ParFiSys, a coherent parallel file system developed at the UPM to provide I/O services to the GPMIMD machine, an MPP built within the ESPRIT project P-5404. Special emphasis is made on the results obtained during ParFiSys evaluation. They were obtained using several I/O benchmarks (PARKBENCH, IOBENCH, etc.) and several MPP platforms (T800, T9000, etc.) 
to cover a big spectrum of the ParFiSys features, being specifically oriented to measure throughput for scientific applications I/O patterns. ParFiSys is specially well suited to provide I/O services to scientific applications requiring high I/O bandwidth, to minimize application porting effort, and to exploit the parallelism of generic message-passing multicomputers.} } @InProceedings{philippsen:triton, author = {Michael Philippsen and Thomas M. Warschko and Walter F. Tichy and Christian G. Herter}, title = {{Project Triton:} Towards improved Programmability of Parallel Machines}, booktitle = {Proceedings of the Twenty-Sixth Annual Hawaii International Conference on System Sciences}, year = {1993}, volume = {I}, pages = {192--201}, keyword = {parallel programming, parallel architecture, parallel I/O, pario-bib}, comment = {A language- and application-driven proposal for parallel architecture, that mixes SIMD and MIMD, high-performance networking, large memory, shared address space, and so forth. Fairly convincing arguments. One disk per node. Little mention of a file system though. Email from student Udo Boehm:``We use in the version of Triton/1 with 256 PE's 72 Disks at the moment (the filesystem is scalable up to 256 Disks). These Disks are divided into 8 Groups with 9 Disks. In each group exists one parity disk. Our implementation of the filesystem is an parallel version of RAID Level 3 with some extensions. We use so called vector files for diskaccess. A file is always distributed over all disks of the diskarray. A vectorfile is divided in logical blocks. A logical block exist of 72 physical blocks, each block is on one of the 72 disks and all these 72 physical blocks have the same blocknumber on each disk. A logical block has 18432 Bytes, where 16384 Bytes are for Data. The filesystem uses these logical blocks to save data. We do not use special PE's for the I/O. All PE's can be (are) used to do I/O ! 
There exists no central which coordinates the PE's.''} } @InProceedings{pierce:pario, author = {Paul Pierce}, title = {A Concurrent File System for a Highly Parallel Mass Storage System}, booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications}, year = {1989}, month = {March}, pages = {155--160}, publisher = {Golden Gate Enterprises, Los Altos, CA}, address = {Monterey, CA}, keyword = {parallel I/O, hypercube, Intel iPSC/2, multiprocessor file system, pario-bib}, comment = {Intel iPSC/2 Concurrent File System. Chose to tailor system for high performance for large files, read in large chunks. Uniform logical file system view, Unix stdio interface. Blocks scattered over all disks, but not striped. Blocksize 4K optimizes message-passing performance without using blocks that are too big. Tree-directory is stored in ONE file and managed by ONE process, so opens are bottlenecked, but that is not their emphasis. File headers, however, are scattered. The file header info contains a list of blocks. File header is managed by disk process on its I/O node. Data caching is done only at the I/O node of the originating disk drive. Read-ahead is used but not detailed here.} } @TechReport{poole:sio-survey, author = {James T. Poole}, title = {Preliminary Survey of {I/O} Intensive Applications}, year = {1994}, number = {CCSF-38}, institution = {Scalable I/O Initiative}, address = {Caltech Concurrent Supercomputing Facilities, Caltech}, URL = {http://www.cacr.caltech.edu/SIO/pubs/SIO_apps.ps}, keyword = {parallel I/O, pario-bib, multiprocessor file system, file access pattern, checkpoint}, comment = {Goal is to collect a set of representative applications from biology, chemistry, earth science, engineering, graphics, and physics, use performance-monitoring tools to analyze them, create templates and benchmarks that represent them, and then later to evaluate the performance of new I/O tools created by rest of the SIO initiative. 
Seem to be four categories of I/O needs: input, output, checkpoint, and virtual memory (``out-of-core'' scratch space). Not all types are significant in all applications. (Two groups mention databases and the need to perform computationally complex queries.) Large input is typically raw data (seismic soundings, astronomical observations, satellite remote sensing, weather information). Sometimes there are real-time constraints. Output is often periodic, e.g., the state of the system every few timesteps; typically the volume would increase along with I/O capacity and bandwidth. Checkpointing is a common request; preferably allowing application to choose what and when to checkpoint, and definitely including the state of files. Many kinds of out-of-core: 1) temp files between passes (often written and read sequentially), 2) regular patterns like FFT, matrix transpose, solvers, and single-pass read/compute/write, 3) random access, e.g., to precomputed tables of integrals. Distinct differences in the ways people choose to divide data into files; sometimes all in one huge file, sometimes many ``small'' files (e.g., one per processor, one per timestep, one per region, etc.). Important: overlap of computation and I/O, independent access by individual processors. Not always important: ordering of records read or written by different processors, exposing the I/O model to the application writer. Units of I/O seem to be either (sub)matrices (1--5 dimensions) or items in a collection of objects (100--10000 bytes each). Data sets varied up to 1~TB; bandwidth needs varied up to 1~GB/s. 
See also bagrodia:sio-character, choudhary:sio-language, bershad:sio-os.} } @InProceedings{poston:hpfs, author = {Alan Poston}, title = {A High Performance File System for {UNIX}}, booktitle = {Proceedings of the USENIX Workshop on UNIX and Supercomputers}, year = {1988}, pages = {215--226}, keyword = {file system, unix, parallel I/O, disk striping, pario-bib}, comment = {A new file system for Unix based on striped files. Better performance for sequential access, better for large-file random access and about the same for small-file random access. Allows full striping track prefetch, or even volume prefetch. Needs a little bit of buffer management change. Talks about buffer management and parity blocks.} } @InProceedings{prabhakar:browsing, author = {Sunil Prabhakar and Divyakant Agrawal and Amr {El Abbadi} and Ambuj Singh and Terence Smith}, title = {Browsing and Placement of Multiresolution Images on Parallel Disks}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {102--113}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {multimedia, parallel I/O, pario-bib}, abstract = {With rapid advances in computer and communication technologies, there is an increasing demand to build and maintain large image repositories. In order to reduce the demands on I/O and network resources, multiresolution representations are being proposed for the storage organization of images. Image decomposition techniques such as {\em wavelets} can be used to provide these multiresolution images. The original image is represented by several coefficients, one of them with visual similarity to the original image, but at a lower resolution. These visually similar coefficients can be thought of as {\em thumbnails} or {\em icons} of the original image. 
This paper addresses the problem of storing these multiresolution coefficients on disks so that thumbnail browsing as well as image reconstruction can be performed efficiently. Several strategies are evaluated to store the image coefficients on parallel disks. These strategies can be classified into two broad classes depending on whether the access pattern of the images is used in the placement. Disk simulation is used to evaluate the performance of these strategies. Simulation results are validated with results from experiments with real disks and are found to be in good agreement. The results indicate that significant performance improvements can be achieved with as few as four disks by placing image coefficients based upon browsing access patterns.}, comment = {They use simulation to study several different placement policies for the thumbnail and varying-resolution versions of images on a disk array.} } @InProceedings{pratt:twofs, author = {Terrence W. Pratt and James C. French and Phillip M. Dickens and Janet, Jr., Stanley A.}, title = {A Comparison of the Architecture and Performance of Two Parallel File Systems}, booktitle = {Proceedings of the Fourth Conference on Hypercube Concurrent Computers and Applications}, year = {1989}, pages = {161--166}, publisher = {Golden Gate Enterprises, Los Altos, CA}, address = {Monterey, CA}, keyword = {parallel I/O, Intel iPSC/2, nCUBE, pario-bib}, comment = {Simple comparison of the iPSC/2 and nCUBE/10 parallel I/O systems. Short description of each system, with simple transfer rate measurements. See also french:ipsc2io-tr.} } @InProceedings{preslan:gfs, author = {Kenneth W. Preslan and Andrew P. Barry and Jonathan E. Brassow and Grant M. Erickson and Erling Nygaard and Christopher J. Sabol and Steven R. Soltis and David C. Teigland and Matthew T. 
O'Keefe}, title = {A 64-bit, Shared Disk File System for {Linux}}, booktitle = {Proceedings of the Seventh NASA Goddard Conference on Mass Storage Systems}, year = {1999}, month = {March}, pages = {??}, publisher = {IEEE Computer Society Press}, address = {San Diego, CA}, note = {To appear}, URL = {http://gfs.lcse.umn.edu/pubs/NASA_GFS_1999.pdf}, keyword = {verify pages, Linux, shared file system, network-attached disks, disk striping, parallel I/O, pario-bib}, comment = {They discuss a shared, serverless, file system for Linux that integrates IP-based network attached storage and Fibre-Channel-based storage area networks. Based on soltis:gfs.} } @TechReport{prost:mpi-io, author = {Jean-Pierre Prost and Marc Snir and Peter Corbett and Dror Feitelson}, title = {{MPI-IO,} A Message-Passing Interface for Concurrent {I/O}}, year = {1994}, month = {August}, number = {RC~19712 (87394)}, institution = {IBM T.J. Watson Research Center}, keyword = {parallel I/O, message-passing, multiprocessor file system interface, pario-bib}, comment = {See newer version mpi-ioc:mpi-io5.} } @Booklet{rab:raidbook, key = {RAB}, title = {The {RAIDBook}: A Source Book for {RAID} Technology}, year = {1993}, month = {June 9}, howpublished = {The RAID Advisory Board}, address = {Lino Lakes, MN}, note = {First Edition}, keyword = {RAID, disk array, parallel I/O, pario-bib}, comment = {Basically, an educational piece about the basics of RAID technology. Helps to define terms across the industry. Written by the RAID advisory board, which is an industry consortium. Overviews RAID, RAID levels, non-Berkeley RAID levels. List of Board members. Bibliography.} } @TechReport{reddy:compiler-tr, author = {A. L. Narasimha Reddy and P. Banerjee and D. K. 
Chen}, title = {Compiler Support for Parallel {I/O} Operations}, year = {1991}, institution = {IBM Yorktown Heights}, note = {Also appeared in ICPP '91}, later = {reddy:compiler}, keyword = {parallel I/O, pario-bib, compilers} } @InProceedings{reddy:compiler, author = {A. L. Narasimha Reddy and P. Banerjee and D. K. Chen}, title = {Compiler Support for Parallel {I/O} Operations}, booktitle = {Proceedings of the 1991 International Conference on Parallel Processing}, year = {1991}, pages = {II:290--II:291}, publisher = {CRC Press}, address = {St. Charles, IL}, earlier = {reddy:compiler-tr}, keyword = {parallel I/O, pario-bib, compilers}, comment = {This version is only 2 pages. reddy:compiler-tr provides the full text. They discuss three primary issues. 1) Overlapping I/O with computation: the compiler's dependency analysis is used to decide when some I/O may be moved up and performed asynchronously with other computation. 2) Parallel execution of I/O statements: {\em if} all sizes are known at compile time, the compiler can insert seeks so that processes can access the file independently. When writing in the presence of conditionals they even propose skipping by the maximum and leaving holes in the file, and they claim that this doesn't hurt (!). 3) Parallel format conversion: again, if there are fixed-width fields the compiler can have processors seek to different locations, read data independently, and do format conversion in parallel. Really all this is saying is that fixed-width fields are good for parallelism, and that compilers could take advantage of them.} } @InProceedings{reddy:hyperio1, author = {A. L. Reddy and P. Banerjee and Santosh G. Abraham}, title = {{I/O} Embedding in Hypercubes}, booktitle = {Proceedings of the 1988 International Conference on Parallel Processing}, year = {1988}, volume = {1}, pages = {331--338}, publisher = {Pennsylvania State Univ. Press}, address = {St. 
Charles, IL}, later = {reddy:hyperio3}, keyword = {parallel I/O, hypercube, pario-bib}, comment = {Emphasis is on adjacency. It also implies (and they assume) that data is distributed well across the disks so no data needs to move beyond the neighbors of an I/O node. Still, the idea of adjacency is good since it allows for good data distribution while not requiring it, and for balancing I/O procs among procs in a good way. Also avoids messing up the hypercube regularity with (embedded) dedicated I/O nodes.} } @InProceedings{reddy:hyperio2, author = {A. L. Reddy and P. Banerjee}, title = {{I/O} issues for hypercubes}, booktitle = {ACM International Conference on Supercomputing}, year = {1989}, pages = {72--81}, later = {reddy:hyperio3}, keyword = {parallel I/O, hypercube, pario-bib}, comment = {See reddy:hyperio3 for extended version.} } @Article{reddy:hyperio3, author = {A. L. Narasimha Reddy and Prithviraj Banerjee}, title = {Design, Analysis, and Simulation of {I/O} Architectures for Hypercube Multiprocessors}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1990}, month = {April}, volume = {1}, number = {2}, pages = {140--151}, publisher = {IEEE Computer Society Press}, earlier = {reddy:hyperio1}, keyword = {parallel I/O, hypercube, pario-bib}, comment = {An overall paper restating their embedding technique from reddy:hyperio1, plus a little bit of evaluation along the lines of reddy:pario2, plus some ideas about matrix layout on the disks. They claim that declustering is important, since synchronized disks do not provide enough parallelism, especially in the communicati