% BibTeX bibliography of the IOPADS '99 papers
% Send corrections to David Kotz (dfk at cs.dartmouth.edu)
%
% Layout notes:
%   - Every entry is a paper in the same proceedings (booktitle = iopads99)
%     and entries are listed in page order.
%   - Field values are brace-delimited; year and month are left bare so
%     that the month macro is expanded by the bibliography style.
%   - "keyword", "earlier", and "abstract" are not standard BibTeX fields;
%     standard styles ignore fields they do not know.

% String macros shared by every entry below.
@string{acmpress = {ACM Press}}
@string{iopads99 = {Proceedings of the Sixth Workshop on Input/Output in Parallel and Distributed Systems}}

@InProceedings{barve:bus,
  author    = {Rakesh Barve and Phillip B. Gibbons and Bruce K. Hillyer
               and Yossi Matias and Elizabeth Shriver
               and Jeffrey Scott Vitter},
  title     = {Round-like Behavior in Multiple Disks on a Bus},
  booktitle = iopads99,
  year      = 1999,
  publisher = acmpress,
  address   = {Atlanta, GA},
  month     = may,
  pages     = {1--9},
  keyword   = {disk, I/O bus, parallel I/O, pario-bib},
  URL       = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Shriver.ps},
  abstract  = {In modern I/O architectures, multiple disk drives are
               attached to each I/O bus. Under I/O-intensive workloads,
               the disk latency for a request can be overlapped with the
               disk latency and data transfers of requests to other
               disks, potentially resulting in an aggregate I/O
               throughput at nearly bus bandwidth. This paper reports on
               a performance impairment that results from a previously
               unknown form of convoy behavior in disk I/O, which we call
               rounds. In rounds, independent requests to distinct disks
               convoy, so that each disk services one request before any
               disk services its next request. We analyze log files to
               describe read performance of multiple Seagate Wren-7 disks
               that share a SCSI bus under a heavy workload,
               demonstrating the rounds behavior and quantifying its
               performance impact.}
}

@InProceedings{arpaci-dusseau:river,
  author    = {Remzi H. Arpaci-Dusseau and Eric Anderson
               and Noah Treuhaft and David E. Culler
               and Joseph M. Hellerstein and David Patterson
               and Kathy Yelick},
  title     = {Cluster {I/O} with {River}: Making the Fast Case Common},
  booktitle = iopads99,
  year      = 1999,
  publisher = acmpress,
  address   = {Atlanta, GA},
  month     = may,
  pages     = {10--22},
  keyword   = {cluster computing, parallel I/O, pario-bib},
  URL       = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Remzi.ps},
  abstract  = {We introduce River, a data-flow programming environment
               and I/O substrate for clusters of computers. River is
               designed to provide maximum performance in the common
               case---even in the face of non-uniformities in hardware,
               software, and workload. River is based on two simple
               design features: a high-performance distributed queue, and
               a storage redundancy mechanism called graduated
               declustering. We have implemented a number of
               data-intensive applications on River, which validate our
               design with near-ideal performance in a variety of
               non-uniform performance scenarios.}
}

@InProceedings{thakur:mpi-io-implement,
  author    = {Rajeev Thakur and William Gropp and Ewing Lusk},
  title     = {On Implementing {MPI-IO} Portably and with High Performance},
  booktitle = iopads99,
  year      = 1999,
  publisher = acmpress,
  address   = {Atlanta, GA},
  month     = may,
  pages     = {23--32},
  earlier   = {thakur:mpi-io-implement-tr},
  keyword   = {parallel I/O, multiprocessor file system interface, pario-bib},
  URL       = {http://www.mcs.anl.gov/~thakur/papers/mpio-impl.ps.gz},
  abstract  = {We discuss the issues involved in implementing MPI-IO
               portably on multiple machines and file systems and also
               achieving high performance. One way to implement MPI-IO
               portably is to implement it on top of the basic Unix I/O
               functions ({\tt open}, {\tt lseek}, {\tt read},
               {\tt write}, and {\tt close}), which are themselves
               portable. We argue that this approach has limitations in
               both functionality and performance. We instead advocate an
               implementation approach that combines a large portion of
               portable code and a small portion of code that is
               optimized separately for different machines and file
               systems. We have used such an approach to develop a
               high-performance, portable MPI-IO implementation, called
               ROMIO.
               \par In addition to basic I/O functionality, we consider
               the issues of supporting other MPI-IO features, such as
               64-bit file sizes, noncontiguous accesses, collective I/O,
               asynchronous I/O, consistency and atomicity semantics,
               user-supplied hints, shared file pointers, portable data
               representation, and file preallocation. We describe how we
               implemented each of these features on various machines and
               file systems. The machines we consider are the HP
               Exemplar, IBM SP, Intel Paragon, NEC SX-4, SGI Origin2000,
               and networks of workstations; and the file systems we
               consider are HP HFS, IBM PIOFS, Intel PFS, NEC SFS, SGI
               XFS, NFS, and any general Unix file system (UFS).
               \par We also present our thoughts on how a file system can
               be designed to better support MPI-IO. We provide a list of
               features desired from a file system that would help in
               implementing MPI-IO correctly and with high performance.}
}

@InProceedings{kuo:efficient,
  author    = {S. Kuo and M. Winslett and Y. Cho and J. Lee and Y. Chen},
  title     = {Efficient Input and Output for Scientific Simulations},
  booktitle = iopads99,
  year      = 1999,
  publisher = acmpress,
  address   = {Atlanta, GA},
  month     = may,
  pages     = {33--44},
  keyword   = {scientific computing, simulation, parallel I/O, pario-bib},
  URL       = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Kuo.ps},
  abstract  = {Large simulations which run for hundreds of hours on
               parallel computers often periodically generate snapshots
               of states, which are later post-processed to visualize the
               simulated physical phenomenon. For many applications, fast
               I/O during post-processing, which is dependent on an
               efficient organization of data on disk, is as important as
               minimizing computation-time I/O. In this paper we propose
               optimizations to support efficient parallel I/O for
               scientific simulations and subsequent visualizations. We
               present an ordering mechanism to linearize data on disk, a
               performance model to help to choose a proper stripe unit
               size, and a scheduling algorithm to minimize communication
               contention. Our experiments on an IBM SP show that the
               combination of these strategies provides a 20--25\%
               performance boost.}
}

@InProceedings{mache:spatial,
  author    = {Jens Mache and Virginia Lo and Marilynn Livingston
               and Sharad Garg},
  title     = {The Impact of Spatial Layout of Jobs on Parallel {I/O} Performance},
  booktitle = iopads99,
  year      = 1999,
  publisher = acmpress,
  address   = {Atlanta, GA},
  month     = may,
  pages     = {45--56},
  keyword   = {parallel I/O, pario-bib},
  URL       = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Mache.ps},
  abstract  = {Input/Output is a big obstacle to effective use of
               teraflops-scale computing systems. Motivated by earlier
               parallel I/O measurements on an Intel TFLOPS machine, we
               conduct studies to determine the sensitivity of parallel
               I/O performance on multi-programmed mesh-connected
               machines with respect to number of I/O nodes, number of
               compute nodes, network link bandwidth, I/O node bandwidth,
               spatial layout of jobs, and read or write demands of
               applications.
               \par Our extensive simulations and analytical modeling
               yield important insights into the limitations on parallel
               I/O performance due to network contention, and into the
               possible gains in parallel I/O performance that can be
               achieved by tuning the spatial layout of jobs.
               \par Applying these results, we devise a new processor
               allocation strategy that is sensitive to parallel I/O
               traffic and the resulting network contention. In
               performance evaluations driven by synthetic workloads and
               by a real workload trace captured at the San Diego
               Supercomputing Center, the new strategy improves the
               average response time of parallel I/O intensive jobs by up
               to a factor of 4.5.}
}

@InProceedings{zhou:threads,
  author    = {Yuanyuan Zhou and Limin Wang and Douglas W. Clark
               and Kai Li},
  title     = {Thread Scheduling for Out-of-Core Applications with Memory Server on Multicomputers},
  booktitle = iopads99,
  year      = 1999,
  publisher = acmpress,
  address   = {Atlanta, GA},
  month     = may,
  pages     = {57--67},
  keyword   = {threads, scheduling, memory, out-of-core application, parallel I/O, pario-bib},
  URL       = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Zhou.ps},
  abstract  = {Out-of-core applications perform poorly in paged virtual
               memory (VM) systems because demand paging involves slow
               disk I/O accesses. Much research has been done on reducing
               the I/O overhead in such applications by either reducing
               the number of I/Os or lowering the cost of each I/O
               operation. In this paper, we investigate a method that
               combines fine-grained threading with a memory server model
               to improve the performance of out-of-core applications on
               multicomputers. The memory server model decreases the
               average cost of I/O operations by paging to remote memory,
               while the fine-grained thread scheduling reduces the
               number of I/O accesses by improving the data locality of
               applications. We have evaluated this method on an Intel
               Paragon with 7 applications. Our results show that the
               memory server system performs better than the VM disk
               paging by a factor of 5 for sequential applications and a
               factor of 1.5 to 2.2 for parallel applications. The
               fine-grained threading alone improves the VM disk paging
               performance by a factor of 10 and 1.2 to 3 respectively
               for sequential and parallel applications. Overall, the
               combination of these two techniques outperforms the VM
               disk paging by more than a factor of 12 for sequential
               applications and a factor of 3 to 6 for parallel
               applications.}
}

@InProceedings{kallahalla:read-once,
  author    = {Mahesh Kallahalla and Peter J. Varman},
  title     = {Optimal Read-Once Parallel Disk Scheduling},
  booktitle = iopads99,
  year      = 1999,
  publisher = acmpress,
  address   = {Atlanta, GA},
  month     = may,
  pages     = {68--77},
  keyword   = {disk scheduling, parallel I/O, pario-bib},
  URL       = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Kallahalla.ps},
  abstract  = {We present an optimal algorithm, L-OPT, for prefetching
               and I/O scheduling in parallel I/O systems using a
               read-once model of block reference. The algorithm uses
               knowledge of the next L block references, L-block
               lookahead, to schedule I/Os in an on-line manner. It uses
               a dynamic priority assignment scheme to decide when blocks
               should be prefetched, so as to minimize the total number
               of I/Os. The parallel disk model of an I/O system is used
               to study the performance of L-OPT. We show that L-OPT is
               comparable to the best on-line algorithm with the same
               amount of lookahead; the ratio of the length of its
               schedule to the length of the optimal schedule is within a
               constant factor of the best possible. Specifically, we
               show that the competitive ratio of L-OPT is
               $\Omega(\sqrt{MD/L})$ which matches the lower bound on the
               competitive ratio of any prefetching algorithm with
               L-block lookahead. In addition we show that when the
               lookahead consists of the entire reference string, L-OPT
               performs the minimum possible number of I/Os; hence L-OPT
               is the optimal off-line algorithm. Finally, using
               synthetic traces we empirically study the performance
               characteristics of L-OPT.}
}

@InProceedings{bester:gass,
  author    = {Joseph Bester and Ian Foster and Carl Kesselman
               and Jean Tedesco and Steven Tuecke},
  title     = {{GASS}: A Data Movement and Access Service for Wide Area Computing Systems},
  booktitle = iopads99,
  year      = 1999,
  publisher = acmpress,
  address   = {Atlanta, GA},
  month     = may,
  pages     = {78--88},
  keyword   = {wide-area network, parallel I/O, pario-bib},
  URL       = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Tedesco.ps},
  abstract  = {In wide area computing, programs frequently execute at
               sites that are distant from their data. Data access
               mechanisms are required that place limited functionality
               demands on an application or host system yet permit
               high-performance implementations. To address these
               requirements, we propose a data movement and access
               service called Global Access to Secondary Storage (GASS).
               This service defines a global name space via Uniform
               Resource Locators and allows applications to access remote
               files via standard I/O interfaces. High performance is
               achieved by incorporating default data movement strategies
               that are specialized for I/O patterns common in wide area
               applications and by providing support for programmer
               management of data movement. GASS forms part of the Globus
               toolkit, a set of services for high-performance
               distributed computing. GASS itself makes use of Globus
               services for security and communication, and other Globus
               components use GASS services for executable staging and
               real-time remote monitoring. Application experiences
               demonstrate that the library has practical utility.}
}

@InProceedings{weissman:smart,
  author    = {Jon B. Weissman},
  title     = {Smart File Objects: A Remote File Access Paradigm},
  booktitle = iopads99,
  year      = 1999,
  publisher = acmpress,
  address   = {Atlanta, GA},
  month     = may,
  pages     = {89--97},
  keyword   = {object, parallel I/O, pario-bib},
  URL       = {http://vibes.cs.uiuc.edu/IOPADS/Accepted/Weissman.ps},
  abstract  = {This paper describes a new scheme for remote file access
               called Smart File Objects (SFO). The SFO is an
               object-oriented application-specific file access paradigm
               designed to attack the bottleneck imposed by high latency,
               low bandwidth networks such as wide-area and wireless
               networks. The SFO uses application and network information
               to adaptively prefetch needed data in parallel with the
               execution of the application. The SFO can offer additional
               advantages such as non-blocking I/O, bulk I/O, improved
               file access APIs, and increased reliability. We describe
               the SFO concept, a prototype implementation in the Mentat
               system, and the results obtained with a distributed gene
               sequence application running across the Internet and vBNS.
               The results show the potential of the SFO approach to
               improve application performance.}
}