% Parallel-I/O bibliography entries; every entry carries the pario-bib keyword.
% Text outside of entries (such as these lines) is ignored by BibTeX.

@Article{arpaci-dusseau:jriver,
  author    = {Arpaci-Dusseau, Remzi H.},
  title     = {Run-Time Adaptation in {River}},
  journal   = {ACM Transactions on Computer Systems},
  year      = {2003},
  month     = {February},
  volume    = {21},
  number    = {1},
  pages     = {36--86},
  publisher = {ACM Press},
  keyword   = {distributed query processing, dataflow, pario-bib},
  comment   = {River is a dataflow programming environment for database query
               processing applications. River is specifically designed for
               clusters of computers with heterogeneous performance
               characteristics. The goal of the River runtime system is to
               adapt to ``performance faults''---portions of the system that
               perform poorly---by dynamically adjusting the transfer of data
               through the dataflow graph. River uses two constructs to build
               applications: a distributed queue that deals with performance
               faults by consumers, and graduated declustering that deals with
               performance faults of producers. A distributed queue pushes data
               through the dataflow graph at a rate proportional to the rate of
               consumption and adapts to changes in consumption rates.
               Graduated declustering deals with producer performance faults by
               reading from replicated producers. Although River is designed
               specifically for query processing, they briefly discuss how one
               might adapt scientific applications to work in their framework.}
}

@Misc{braam:lustre-arch,
  author       = {Peter J. Braam},
  title        = {The {Lustre} Storage Architecture},
  year         = {2002},
  month        = {November},
  howpublished = {Cluster File Systems Inc. Architecture, design, and manual
                  for Lustre},
  note         = {http://www.lustre.org/docs/lustre.pdf},
  URL          = {http://www.lustre.org/docs/lustre.pdf},
  keyword      = {object-based storage, distributed file system, parallel file
                  system, pario-bib},
  comment      = {Describes an open-source project to develop an object-based
                  file system for clusters. Related to the NASD project at CMU
                  (http://www.pdl.cs.cmu.edu/NASD/).}
}

@Misc{cfs:lustre,
  key          = {CFS},
  title        = {{Lustre}: A Scalable, High-Performance File System},
  year         = {2002},
  month        = {November},
  howpublished = {Cluster File Systems Inc. white paper, version 1.0},
  note         = {http://www.lustre.org/docs/whitepaper.pdf},
  URL          = {http://www.lustre.org/docs/whitepaper.pdf},
  keyword      = {object-based storage, distributed file system, parallel file
                  system, pario-bib},
  comment      = {Describes an open-source project to develop an object-based
                  file system for clusters. Related to the NASD project at CMU
                  (http://www.pdl.cs.cmu.edu/NASD/).}
}

@InProceedings{debergalis:dafs,
  author    = {Matt DeBergalis and Peter Corbett and Steve Kleiman and
               Arthur Lent and Dave Noveck and Tom Talpey and Mark Wittle},
  title     = {The Direct Access File System},
  booktitle = {Proceedings of the USENIX FAST '03 Conference on File and
               Storage Technologies},
  year      = {2003},
  month     = {April},
  publisher = {USENIX Association},
  address   = {San Francisco, CA},
  URL       = {http://www.usenix.org/events/fast03/tech/debergalis.html},
  keyword   = {direct access file system, dafs, remote dma, pario-bib}
}

@InProceedings{ghemawat:googlefs,
  author    = {Sanjay Ghemawat and Howard Gobioff and Shun-Tak Leung},
  title     = {The {Google} File System},
  booktitle = {Proceedings of the Nineteenth ACM Symposium on Operating
               Systems Principles},
  year      = {2003},
  month     = {October},
  pages     = {96--108},
  publisher = {ACM Press},
  address   = {Bolton Landing, NY},
  URL       = {http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf},
  keyword   = {distributed file system, pario-bib},
  abstract  = {We have designed and implemented the Google File System, a
               scalable distributed file system for large distributed
               data-intensive applications. It provides fault tolerance while
               running on inexpensive commodity hardware, and it delivers high
               aggregate performance to a large number of clients. While
               sharing many of the same goals as previous distributed file
               systems, our design has been driven by observations of our
               application workloads and technological environment, both
               current and anticipated, that reflect a marked departure from
               some earlier file system assumptions. This has led us to
               re-examine traditional choices and explore radically different
               design points. \par The file system has successfully met our
               storage needs. It is widely deployed within Google as the
               storage platform for the generation and processing of data used
               by our service as well as research and development efforts that
               require large data sets. The largest cluster to date provides
               hundreds of terabytes of storage across thousands of disks on
               over a thousand machines, and it is concurrently accessed by
               hundreds of clients. \par In this paper, we present file system
               interface extensions designed to support distributed
               applications, discuss many aspects of our design, and report
               measurements from both micro-benchmarks and real world use.}
}

@InProceedings{klaskey:data-streaming,
  author    = {Scott Alan Klasky and Stephane Ethier and Zhihong Lin and
               Kevin Martins and Doug McCune and Ravi Samtaney},
  title     = {Grid-Based Parallel Data Streaming implemented for the
               {Gyrokinetic Toroidal Code}},
  booktitle = {Proceedings of SC2003: High Performance Networking and
               Computing},
  year      = {2003},
  month     = {November},
  publisher = {IEEE Computer Society Press},
  address   = {Phoenix, AZ},
  URL       = {http://www.sc-conference.org/sc2003/paperpdfs/pap207.pdf},
  keyword   = {grid, parallel data streams, hydrodynamics, application,
               parallel I/O, pario-app, pario-bib},
  abstract  = {We have developed a threaded parallel data streaming approach
               using Globus to transfer multi-terabyte simulation data from a
               remote supercomputer to the scientist's home
               analysis/visualization cluster, as the simulation executes, with
               negligible overhead. Data transfer experiments show that this
               concurrent data transfer approach is more favorable compared
               with writing to local disk and then transferring this data to be
               post-processed. The present approach is conducive to using the
               grid to pipeline the simulation with post-processing and
               visualization. We have applied this method to the Gyrokinetic
               Toroidal Code (GTC), a 3-dimensional particle-in-cell code used
               to study micro-turbulence in magnetic confinement fusion from
               first principles plasma theory.},
  comment   = {published on the web}
}

@InProceedings{lumb:facade,
  author    = {Christopher R. Lumb and Arif Merchant and Guillermo A. Alvarez},
  title     = {Fa{\c{c}}ade: Virtual Storage Devices with Performance
               Guarantees},
  booktitle = {Proceedings of the USENIX FAST '03 Conference on File and
               Storage Technologies},
  year      = {2003},
  month     = {April},
  publisher = {USENIX Association},
  address   = {San Francisco, CA},
  URL       = {http://www.usenix.org/events/fast03/tech/lumb.html},
  keyword   = {file systems, qos, quality of service, pario-bib},
  abstract  = {High-end storage systems, such as those in large data centers,
               must service multiple independent workloads. Workloads often
               require predictable quality of service, despite the fact that
               they have to compete with other rapidly-changing workloads for
               access to common storage resources. We present a novel approach
               to providing performance guarantees in this highly-volatile
               scenario, in an efficient and cost-effective way.
               Fa{\c{c}}ade, a virtual store controller, sits between hosts and
               storage devices in the network, and throttles individual I/O
               requests from multiple clients so that devices do not saturate.
               We implemented a prototype, and evaluated it using real
               workloads on an enterprise storage system. We also instantiated
               it to the particular case of emulating commercial disk arrays.
               Our results show that Fa{\c{c}}ade satisfies performance
               objectives while making efficient use of the storage
               resources---even in the presence of failures and bursty
               workloads with stringent performance requirements.}
}

@InProceedings{magoutis:direct,
  author    = {Kostas Magoutis and Salimah Addetia and Alexandra Fedorova and
               Margo I. Seltzer},
  title     = {Making the Most Out of Direct-Access Network Attached Storage},
  booktitle = {Proceedings of the USENIX FAST '03 Conference on File and
               Storage Technologies},
  year      = {2003},
  month     = {April},
  publisher = {USENIX Association},
  address   = {San Francisco, CA},
  URL       = {http://www.usenix.org/events/fast03/tech/magoutis.html},
  keyword   = {file systems, rpc optimizations, rdma, multi-client workload,
               small I/O, pario-bib},
  abstract  = {The performance of high-speed network-attached storage
               applications is often limited by end-system overhead, caused
               primarily by memory copying and network protocol processing. In
               this paper, we examine alternative strategies for reducing
               overhead in such systems. We consider optimizations to remote
               procedure call (RPC)-based data transfer using either remote
               direct memory access (RDMA) or network interface support for
               pre-posting of application receive buffers. We demonstrate that
               both mechanisms enable file access throughput that saturates a
               2Gb/s network link when performing large I/Os on relatively
               slow, commodity PCs. However, for multi-client workloads
               dominated by small I/Os, throughput is limited by the per-I/O
               overhead of processing RPCs in the server. For such workloads,
               we propose the use of a new network I/O mechanism, Optimistic
               RDMA (ORDMA). ORDMA is an alternative to RPC that aims to
               improve server throughput and response time for small I/Os. We
               measured performance improvements of up to 32\% in server
               throughput and 36\% in response time with use of ORDMA in our
               prototype.}
}

@InProceedings{ober:seismic2,
  author    = {Curtis Ober and Ron Oldfield and David Womble and
               John VanDyke and Sudip Dosanjh},
  title     = {Seismic imaging on massively parallel computers},
  booktitle = {Proceedings of the 1996 Simulations Multiconference},
  year      = {1996},
  month     = {April},
  URL       = {ftp://ftp.cs.dartmouth.edu/pub/raoldfi/salvo/smc96.ps.gz},
  keyword   = {parallel application, scientific computing, seismic data
               processing, parallel I/O, pario-bib, oldfield}
}

@InProceedings{ober:seismic3,
  author    = {Curtis Ober and Ron Oldfield and David Womble and L. Romero and
               Charles Burch},
  title     = {Practical aspects of prestack depth migration with finite
               differences},
  booktitle = {Proceedings of the 67th Annual International Meeting of the
               Society of Exploration Geophysicists},
  year      = {1997},
  month     = {November},
  pages     = {1758--1761},
  address   = {Dallas, Texas},
  note      = {Expanded Abstracts},
  keyword   = {parallel application, scientific computing, seismic data
               processing, parallel I/O, pario-bib, oldfield}
}

@Misc{panasas:architecture,
  key          = {PA},
  title        = {Object-based Storage Architecture: Defining a new generation
                  of storage systems built on distributed, intelligent storage
                  devices},
  year         = {2003},
  month        = {October},
  howpublished = {Panasas Inc. white paper, version 1.0},
  note         = {http://www.panasas.com/docs/},
  URL          = {http://www.panasas.com/docs/Object_Storage_Architecture_WP.pdf},
  keyword      = {object-based storage, distributed file system, parallel file
                  system, pario-bib},
  comment      = {The paper describes the architecture of proprietary
                  object-based storage system for clusters---an extension of
                  Garth Gibson's NASD work at CMU (see gibson:nasd-tr). Similar
                  to Lustre (cfs:lustre, braam:lustre-arch).}
}

@Misc{pathforward-fs,
  key          = {SGS},
  title        = {Statement of Work: {SGS} File System},
  year         = {2001},
  month        = {April},
  howpublished = {ASCI PathForward Program: {DOE} National Nuclear Security
                  Administration \& the {DOD} National Security Agency},
  URL          = {http://www.llnl.gov/asci/pathforward_trilab/file_system_sow.pdf},
  keyword      = {design, parallel file system, parallel I/O, pario-bib},
  comment      = {Describes the requirements and desired performance features
                  of a parallel file system designed for the DOE ASCI
                  computers.}
}

@InProceedings{spencer:pipeline,
  author    = {M. Spencer and R. Ferreira and M. Beynon and T. Kurc and
               U. Catalyurek and A. Sussman and J. Saltz},
  title     = {Executing multiple pipelined data analysis operations in the
               {Grid}},
  booktitle = {Proceedings of SC2002: High Performance Networking and
               Computing},
  year      = {2002},
  month     = {November},
  address   = {Baltimore, Maryland},
  URL       = {http://citeseer.nj.nec.com/spencer02executing.html},
  keyword   = {DataCutter, pipeline, dataflow, pario-bib},
  abstract  = {Processing of data in many data analysis applications can be
               represented as an acyclic, coarse grain data flow, from data
               sources to the client. This paper is concerned with scheduling
               of multiple data analysis operations, each of which is
               represented as a pipelined chain of processing on data. We
               define the scheduling problem for effectively placing components
               onto Grid resources, and propose two scheduling algorithms.
               Experimental results are presented using a visualization
               application.}
}

@InProceedings{torrellas:PnetCDF,
  author    = {Jianwei Li and Wei-keng Liao and Alok Choudhary and
               Robert Ross and Rajeev Thakur and William Gropp and
               Rob Latham and Andrew Siegel and Brad Gallagher and
               Michael Zingale},
  title     = {Parallel {netCDF}: A High-Performance Scientific {I/O}
               Interface},
  booktitle = {Proceedings of SC2003: High Performance Networking and
               Computing},
  year      = {2003},
  month     = {November},
  publisher = {IEEE Computer Society Press},
  address   = {Phoenix, AZ},
  URL       = {http://www.sc-conference.org/sc2003/paperpdfs/pap258.pdf},
  keyword   = {parallel I/O interface, netCDF, MPI-IO, pario-bib},
  abstract  = {Dataset storage, exchange, and access play a critical role in
               scientific applications. For such purposes netCDF serves as a
               portable, efficient file format and programming interface, which
               is popular in numerous scientific application domains. However,
               the original interface does not provide an efficient mechanism
               for parallel data storage and access. \par In this work, we
               present a new parallel interface for writing and reading netCDF
               datasets. This interface is derived with minimal changes from
               the serial netCDF interface but defines semantics for parallel
               access and is tailored for high performance. The underlying
               parallel I/O is achieved through MPI-IO, allowing for
               substantial performance gains through the use of collective I/O
               optimizations. We compare the implementation strategies and
               performance with HDF5. Our tests indicate programming
               convenience and significant I/O performance improvement with
               this parallel netCDF (PnetCDF) interface.},
  comment   = {published on the web only}
}

@InProceedings{tran:adaptive,
  author    = {Nancy Tran and Daniel A. Reed},
  title     = {{ARIMA} time series modeling and forecasting for adaptive
               {I/O} prefetching},
  booktitle = {Proceedings of the 15th international conference on
               Supercomputing},
  year      = {2001},
  month     = {June},
  pages     = {473--485},
  URL       = {http://doi.acm.org/10.1145/377792.377905},
  keyword   = {pario-bib, access pattern, prefetching, modeling, time-series
               analysis},
  abstract  = {Bursty application I/O patterns, together with transfer limited
               storage devices, combine to create a major I/O bottleneck on
               parallel systems. This paper explores the use of time series
               models to forecast application I/O request times, then
               prefetching I/O requests during computation intervals to hide
               I/O latency. Experimental results with I/O intensive scientific
               codes show performance improvements compared to standard UNIX
               prefetching strategies.}
}

@InProceedings{uysal:mems,
  author    = {Mustafa Uysal and Arif Merchant and Guillermo A. Alvarez},
  title     = {Using {MEMS}-based storage in disk arrays},
  booktitle = {Proceedings of the USENIX FAST '03 Conference on File and
               Storage Technologies},
  year      = {2003},
  month     = {April},
  pages     = {89--101},
  publisher = {USENIX Association},
  address   = {San Francisco, CA},
  URL       = {http://www.usenix.org/events/fast03/tech/uysal.html},
  keyword   = {mems-based storage, disk arrays, pario-bib},
  abstract  = {Current disk arrays, the basic building blocks of
               high-performance storage systems, are built around two memory
               technologies: magnetic disk drives, and non-volatile DRAM
               caches. Disk latencies are higher by six orders of magnitude
               than non-volatile DRAM access times, but cache costs over 1000
               times more per byte. A new storage technology based on
               microelectromechanical systems (MEMS) will soon offer a new set
               of performance and cost characteristics that bridge the gap
               between disk drives and the caches. We evaluate potential gains
               in performance and cost by incorporating MEMS-based storage in
               disk arrays. Our evaluation is based on exploring potential
               placements of MEMS-based storage in a disk array. We used
               detailed disk array simulators to replay I/O traces of real
               applications for the evaluation. We show that replacing disks
               with MEMS-based storage can improve the array performance
               dramatically, with a cost performance ratio several times better
               than conventional arrays even if MEMS storage costs ten times as
               much as disk. We also demonstrate that hybrid MEMS/disk arrays,
               which cost less than purely MEMS-based arrays, can provide
               substantial improvements in performance and cost/performance
               over conventional arrays.},
  comment   = {Best paper in fast2003.}
}

@InProceedings{worringen:non-contiguous,
  author    = {Joachim Worringen and Jesper Larsson Tr{\"a}ff and
               Hubert Ritzdorf},
  title     = {Fast Parallel Non-Contiguous File Access},
  booktitle = {Proceedings of SC2003: High Performance Networking and
               Computing},
  year      = {2003},
  month     = {November},
  publisher = {IEEE Computer Society Press},
  address   = {Phoenix, AZ},
  URL       = {http://www.sc-conference.org/sc2003/paperpdfs/pap319.pdf},
  keyword   = {parallel I/O interface, file access patterns, pario-bib},
  abstract  = {Many applications of parallel I/O perform non-contiguous file
               accesses, but only few file system interfaces support
               non-contiguous access. In contrast, the most commonly used
               parallel programming interface, MPI, supports parallel I/O
               through its MPI-IO interface. Within this interface,
               non-contiguous accesses are supported by the use of derived MPI
               datatypes. Unfortunately, current MPI-IO implementations suffer
               from low performance of such non-contiguous accesses when
               compared to the performance of the storage system for contiguous
               accesses although a considerable amount of work has been done in
               this area. In this paper we analyze an important bottleneck in
               current implementations of MPI-IO, and present a new technique
               termed listless i/o to perform non-contiguous access with
               MPI-IO. On the NEC SX-series of parallel vector computers,
               listless i/o is able to increase the bandwidth for
               non-contiguous file access by sometimes more than a factor of
               500 when compared to the traditional approach.},
  comment   = {published on the web}
}