% Parallel-I/O bibliography entries (every entry carries the pario-bib keyword).
%
% Conventions used below:
%   - one field per line, brace-delimited values, authors in "von Last, First" form;
%   - month uses the standard three-letter macros (jan .. dec), unquoted;
%   - earlier / later hold the citation key of a previous / subsequent version
%     of the same work (see smirni:workload and smirni:lessons);
%   - keyword, abstract, comment, earlier and later are not standard BibTeX
%     fields; standard styles ignore them.

% NOTE(review): retyped from Article to InProceedings. The venue is a volume of
% Lecture Notes in Computer Science, so the proceedings title belongs in
% booktitle, where series/volume/publisher are actually used by styles.
@InProceedings{brezany:irregular,
  author = {Brezany, P. and Choudhary, A. and Dang, M.},
  title = {Parallelization of irregular out-of-core applications for distributed-memory systems},
  booktitle = {High-Performance Computing and Networking},
  year = {1997},
  series = {Lecture Notes in Computer Science},
  volume = {1225},
  pages = {811--820},
  publisher = {Springer-Verlag},
  earlier = {brezany:irregular-tr},
  keyword = {parallel I/O, out of core, compiler, library, pario-bib},
  abstract = {Large scale irregular applications involve data arrays and other data structures that are too large to fit in main memory and hence reside on disks; such applications are called out-of-core applications. This paper presents techniques for implementing this kind of applications. In particular we present a design for a runtime system to efficiently support parallel execution of irregular out-of-core codes on distributed-memory systems. Furthermore, we describe the appropriate program transformations required to reduce the I/O overheads for staging data as well as for communication while maintaining load balance. The proposed techniques can be used by a parallelizing compiler or by users writing programs in node + message passing style. We have done a preliminary implementation of the techniques presented here. We introduce experimental results from a template CFD code to demonstrate the efficacy of the presented techniques.},
  comment = {The authors present techniques for implementing large scale irregular out-of-core applications. The techniques they describe can be used by a parallel compiler (e.g., HPF and its extensions) or by users using message passing. The objectives of the proposed techniques are ``to minimize I/O accesses in all steps while maintaining load balance and minimal communication''. They demonstrate the effectiveness of their techniques by showing results from a Computational Fluid Dynamics (CFD) code.}
}

% NOTE(review): year changed from 1997 to 1998. Within this file, Parallel
% Computing volume 23 no. 12 is December 1997 (jadav:media) and volume 24 no. 3
% is May 1998 (kandemir:ooc), so volume 24 no. 5--6 cannot be June 1997.
% Confirm against the publisher's table of contents.
@Article{ceron:dna,
  author = {Ceron, C. and Dopazo, J. and Zapata, E. L. and Carazo, J. M. and Trelles, O.},
  title = {Parallel implementation of {DNAml} program on message-passing architectures},
  journal = {Parallel Computing},
  year = {1998},
  month = jun,
  volume = {24},
  number = {5--6},
  pages = {701--716},
  publisher = {Elsevier Science},
  keyword = {parallel computers, run-time analysis, phylogenetic trees, DNAml program, source code, parallel I/O, pario-bib},
  abstract = {We present a new computing approach for the parallelization on message-passing computer architectures of the DNAml algorithm, one of the most powerful tools available for constructing phylogenetic trees from DNA sequences. An analysis of the data dependencies of the method gave little chances to develop an efficient parallel approach. However, a careful run-time analysis of the behaviour of the algorithm allowed us to propose a very efficient parallel implementation based on the combination of advanced dynamic scheduling strategies, speculative running-time execution decisions and I/O buffering. In this work, we discuss specific Parallel Virtual Machine (PVM)-based implementations for a cluster of workstations and for Distributed Memory multiprocessors, with high performance results. The code can be obtained from our public-domain sites.},
  comment = {They discuss the parallelization on message-passing computers of the {DNA}ml algorithm, a tool used to construct phylogenetic trees from {DNA} sequences. By performing a run-time analysis of the behavior of the algorithm they came up with an efficient parallel implementation based on dynamic scheduling strategies, speculative run-time execution decisions and I/O buffering. They use I/O buffering (prefetching) to fetch tasks that need to be processed. The parallel code was written in C using PVM for message passing and is available via anonymous ftp at ftp.ac.uma.es.}
}

@TechReport{dazevedo:edonio,
  author = {D'Azevedo, E.~F. and Romine, C.~H.},
  title = {{EDONIO}: Extended distributed object network {I/O} library},
  year = {1995},
  number = {ORNL/TM-12934},
  institution = {Oak Ridge National Laboratory},
  keyword = {parallel I/O, pario-bib}
}

@InProceedings{ferreira:microscope,
  author = {Ferreira, Renato and Moon, Bongki and Humphries, Jim and Sussman, Alan and Saltz, Joel and Miller, Robert and Demarzo, Angelo},
  title = {The Virtual Microscope},
  booktitle = {American Medical Informatics Association, 1997 Annual Fall Symposium},
  year = {1997},
  month = oct,
  pages = {449--453},
  address = {Nashville, TN},
  url = {http://www.cs.arizona.edu/~bkmoon/papers/amia97.ps},
  keyword = {pario-bib, application},
  comment = {Best Application Paper award. \par This paper describes a client/server application that emulates a high power light microscope. They use wavelet compression to reduce the size of each of the electronic slides and they use a parallel data server much like the ones used for satellite image data (see chang:titan) to service data requests.}
}

@Article{ghandeharizadeh:mitra,
  author = {Ghandeharizadeh, Shahram and Zimmermann, Roger and Shi, Weifeng and Rejaie, Reza and Ierardi, Doug and Li, Ta-Wei},
  title = {{Mitra}--- A Continuous Media Server},
  journal = {Multimedia Tools and Applications},
  year = {1998},
  month = jul,
  volume = {5},
  number = {1},
  pages = {79--108},
  publisher = {Kluwer Academic Publishers},
  url = {http://perspolis.usc.edu/Users/zimmerma/mitra.html},
  keyword = {multimedia, parallel I/O, pario-bib},
  abstract = {Mitra is a scalable storage manager that supports the display of continuous media data types, e.g., audio and video clips. It is a software based system that employs off-the-shelf hardware components. Its present hardware platform is a cluster of multi-disk workstations, connected using an ATM switch. Mitra supports the display of a mix of media types. To reduce the cost of storage, it supports a hierarchical organization of storage devices and stages the frequently accessed objects on the magnetic disks. For the number of displays to scale as a function of additional disks, Mitra employs staggered striping. It implements three strategies to maximize the number of simultaneous displays supported by each disk. First, the EVEREST file system allows different files (corresponding to objects of different media types) to be retrieved at different block size granularities. Second, the FIXB algorithm recognizes the different zones of a disk and guarantees a continuous display while harnessing the average disk transfer rate. Third, Mitra implements the Grouped Sweeping Scheme (GSS) to minimize the impact of disk seeks on the available disk bandwidth. \par In addition to reporting on implementation details of Mitra, we present performance results that demonstrate the scalability characteristics of the system. We compare the obtained results with theoretical expectations based on the bandwidth of participating disks. Mitra attains between 65\% to 100\% of the theoretical expectations.},
  comment = {This paper describes the continuous media server Mitra. Mitra runs on a cluster of multi-disk HP 9000/735 workstations. Each workstation consists of 80 Mbytes of memory and four disks. They implement ``staggered striping'' of the data in which disks are clustered based on media type and treated as a single logical unit. Data is then striped across the logical disk cluster in a round-robin fashion. They present performance results as a function of total number of disks and the number of disks in a cluster.}
}

@InProceedings{hua:annealing,
  author = {Hua, Kien A. and Lang, S. D. and Lee, Wen K.},
  title = {A decomposition-based simulated annealing technique for data clustering},
  booktitle = {Proceedings of the Thirteenth ACM Symposium on Principles of Database Systems},
  year = {1994},
  pages = {117--128},
  publisher = {ACM Press},
  url = {http://www.acm.org/pubs/citations/proceedings/pods/182591/p117-hua},
  keyword = {out of core, information retrieval, parallel I/O, pario-bib},
  abstract = {It has been demonstrated that simulated annealing provides high-quality results for the data clustering problem. However, existing simulated annealing schemes are memory-based algorithms; they are not suited for solving large problems such as data clustering which typically are too big to fit in the memory space in its entirety. Various buffer replacement policies, assuming either temporal or spatial locality, are not useful in this case since simulated annealing is based on a randomized search process. Poor locality of references will cause the memory to thrash because too many replacements are required. This phenomenon will incur excessive disk accesses and force the machine to run at the speed of the I/O subsystem. In this paper, we formulate the data clustering problem as a graph partition problem (GPP), and propose a decomposition-based approach to address the issue of excessive disk accesses during annealing. We apply the statistical sampling technique to randomly select subgraphs of the GPP into memory for annealing. Both the analytical and experimental studies indicate that the decomposition-based approach can dramatically reduce the costly disk I/O activities while obtaining excellent optimized results.}
}

@Article{jadav:media,
  author = {Jadav, D. and Srinilta, C. and Choudhary, A.},
  title = {Batching and dynamic allocation techniques for increasing the stream capacity of an on-demand media server},
  journal = {Parallel Computing},
  year = {1997},
  month = dec,
  volume = {23},
  number = {12},
  pages = {1727--1742},
  publisher = {Elsevier Science},
  keyword = {multimedia, parallel I/O, pario-bib},
  abstract = {A server for an interactive distributed multimedia system may require thousands of gigabytes of storage space and high I/O bandwidth. In order to maximize system utilization, and thus minimize cost, the load must be balanced among the server's disks, interconnection network and scheduler. Many algorithms for maximizing retrieval capacity from the storage system have been proposed. This paper presents techniques for improving server capacity by assigning media requests to the nodes of a server so as to balance the load on the interconnection network and the scheduling nodes. Five policies for dynamic request assignment are developed. An important factor that affects data retrieval in a high-performance continuous media server is the degree of parallelism of data retrieval. The performance of the dynamic policies on an implementation of a server model developed earlier is presented for two values of the degree of parallelism.}
}

@Article{kandemir:ooc,
  author = {Kandemir, M. and Choudhary, A. and Ramanujam, J. and Bordawekar, R.},
  title = {Compilation techniques for out-of-core parallel computations},
  journal = {Parallel Computing},
  year = {1998},
  month = may,
  volume = {24},
  number = {3},
  pages = {597--628},
  keyword = {parallel I/O, compiler, out of core, pario-bib}
}

@Article{lepper:cfd,
  author = {Lepper, J. and Schnell, U. and Hein, K. R. G.},
  title = {Parallelization of a simulation code for reactive flows on the {Intel Paragon}},
  journal = {Computers and Mathematics with Applications},
  year = {1998},
  month = apr,
  volume = {35},
  number = {7},
  pages = {101--109},
  publisher = {Pergamon-Elsevier Science Ltd},
  keyword = {parallel I/O, application, pario-bib},
  abstract = {The paper shows the implementation of a 3D simulation code for turbulent flow and combustion processes in full-scale utility boilers on an Intel Paragon XP/S computer. For the portable parallelization, an explicit approach is chosen using a domain decomposition method for the static subdivision of the numerical grid together with the SPMD programming model. The measured speedup for the presented case using a coarse grid is good, although some numerical requirements restrict the implemented message passing to strongly synchronized communication. On the Paragon, the NX message passing library is used for the computations. Furthermore, MPI and PVM are applied and their pros and cons on this computer are described. In addition to the basic message passing techniques for local and global communication, other possibilities are investigated. Besides the applicability of the vectorizing capability of the compiler, the influence of the I/O performance during computations is demonstrated. The scalability of the parallel application is presented for a refined discretization.}
}

% NOTE(review): booktitle was missing (required for InProceedings). The venue
% is presumed to be SC97 from the URL path (sc97), the November 1997 date, the
% San Jose address and the publisher -- confirm the exact proceedings title.
@InProceedings{lyster:geos-das,
  author = {Lyster, P. M. and Ekers, K. and Guo, J. and Harber, M. and Lamich, D. and Larson, J. W. and Lucchesi, R. and Rood, R. and Schubert, S. and Sawyer, W. and Sienkiewicz, M. and da Silva, A. and Stobie, J. and Takacs, L. L. and Todling, R. and Zero, J.},
  title = {Parallel Computing at the {NASA} Data Assimilation Office ({DAO})},
  booktitle = {Proceedings of SC97: High Performance Networking and Computing},
  year = {1997},
  month = nov,
  publisher = {IEEE Computer Society Press},
  address = {San Jose, CA},
  url = {http://dao.gsfc.nasa.gov/DAO_people/lys/sc97/sc97/INDEX.HTML},
  keyword = {parallel I/O, pario-bib},
  comment = {This paper is about a NASA project GEOS-DAS (Goddard Earth Observing System-Data Assimilation System). The goal of the project is to produce ``accurate gridded datasets of atmospheric fields''. The data will be used by meteorologists for weather analysis and forecasts as well as being a tool for climate research. This paper discusses their plans to parallelize the core code of the system. They include a section on parallel I/O.}
}

% The keyword list of the next entry doubles as a to-do list: volume, pages,
% publisher, month and number are still to be verified and filled in.
@Article{no:jirregular,
  author = {No, Jaechun and Carretero, Jesus and Park, Sung-soon and Choudhary, Alok and Chen, Pang},
  title = {Design and Implementation of a Parallel {I/O} Runtime System for Irregular Applications},
  journal = {Journal of Parallel and Distributed Computing},
  year = {1998},
  url = {http://www.ece.nwu.edu/~jno/PAPER/jpdc.ps},
  keyword = {verify volume pages publisher month number, parallel I/O, pario-bib}
}

@InProceedings{pasquale:characterization,
  author = {Pasquale, Barbara K. and Polyzos, George C.},
  title = {Dynamic {I/O} characterization of {I/O} intensive scientific applications},
  booktitle = {Proceedings of Supercomputing '94},
  year = {1994},
  pages = {660--669},
  url = {http://www.acm.org/pubs/citations/proceedings/supercomputing/198354/p660-pasquale/},
  keyword = {parallel I/O, pario-bib},
  abstract = {Understanding the characteristic I/O behavior of scientific applications is an integral part of the research and development efforts for the improvement of high performance I/O systems. This study focuses on application level I/O behavior with respect to both static and dynamic characteristics. We observed the San Diego Supercomputer Center's Cray C90 workload and isolated the most I/O intensive applications. The combination of a low-level description of physical resource usage and the high-level functional composition of applications and scientific disciplines for this set reveals the major sources of I/O demand in the workload. We selected two applications from the I/O intensive set and performed a detailed analysis of their dynamic I/O behavior. These applications exhibited a high degree of regularity in their I/O activity over time and their characteristic I/O behaviors can be precisely described by one and two, respectively, recurring sequences of data accesses and computation periods.}
}

@Article{smirni:lessons,
  author = {Smirni, E. and Reed, D. A.},
  title = {Lessons from characterizing the input/output behavior of parallel scientific applications},
  journal = {Performance Evaluation: An International Journal},
  year = {1998},
  month = jun,
  volume = {33},
  number = {1},
  pages = {27--44},
  publisher = {Elsevier Science},
  earlier = {smirni:workload},
  url = {http://vibes.cs.uiuc.edu/Publications/Papers/PerfEval98.ps.gz},
  keyword = {workload characterization, parallel I/O, scientific applications, pario-bib},
  abstract = {As both processor and interprocessor communication hardware is evolving rapidly with only moderate improvements to file system performance in parallel systems, it is becoming increasingly difficult to provide sufficient input/output (I/O) performance to parallel applications. I/O hardware and file system parallelism are the key to bridging this performance gap. Prerequisite to the development of efficient parallel file systems is the detailed characterization of the I/O demands of parallel applications. In the paper, we present a comparative study of parallel I/O access patterns, commonly found in I/O intensive scientific applications. The Pablo performance analysis tool and its I/O extensions is a valuable resource in capturing and analyzing the I/O access attributes and their interactions with extant parallel I/O systems. This analysis is instrumental in guiding the development of new application programming interfaces (APIs) for parallel file systems and effective file system policies that respond to complex application I/O requirements.},
  comment = {This paper compares the I/O performance of five scientific applications from the scalable I/O initiative (SIO) suite of applications. Their goals are to collect detailed performance data on applications characteristics and access patterns and to use that information to design and evaluate parallel file system policies and parallel file system APIs. The related work section gives a nice overview of recent I/O characterization studies. They use the Pablo \cite{reed:pablo} performance analysis environment to analyze the performance of their five applications. The applications they chose to evaluate include: MESSKIT and NWChem, two implementations of the Hartree-Fock method for computational chemistry applications; QCRD, a quantum chemical reaction dynamics application; PRISM, a parallel 3D numerical simulation of the Navier-Stokes equations that models high speed turbulent flow that is periodic in one direction; ECAT, a parallel implementation of the Schwinger multichannel method used to calculate low-energy electron molecule collisions. \par The results showed that applications use a combination of both sequential and interleaved access patterns, which shows that there is a clear need for a more complex API than what is given by the standard UNIX API. In addition, when applications required concurrent accesses, they commonly channeled all I/O requests through a single node. Some form of collective I/O would have helped in these cases. They also made an observation that despite the existence of several parallel I/O APIs, programmers of scientific applications preferred to use standard UNIX. This is mostly due to the lack of an established portable standard. Their study was ``instrumental in the design and implementation of MPI-IO''. \par Their section on emerging I/O APIs is particularly interesting. They comment that ``the diversity of I/O request sizes and patterns suggests that achieving high performance is unlikely with a single file system policy.'' Their solution is to have a file system in which the user can give ``hints'' to the file system expressing expected access patterns or to have a file system that automatically classifies access patterns. The file system can then choose policies to deal with the access patterns.}
}

@InProceedings{smirni:workload,
  author = {Smirni, E. and Reed, D. A.},
  title = {Workload characterization of input/output intensive parallel applications},
  booktitle = {Proceedings of the Conference on Modelling Techniques and Tools for Computer Performance Evaluation},
  year = {1997},
  month = jun,
  series = {Lecture Notes in Computer Science},
  volume = {1245},
  pages = {169--180},
  publisher = {Springer-Verlag},
  later = {smirni:lessons},
  url = {http://vibes.cs.uiuc.edu/Publications/Papers/Tools97.ps.gz},
  keyword = {parallel I/O, pario-bib},
  abstract = {The broadening disparity in the performance of input/output (I/O) devices and the performance of processors and communication links on parallel systems is a major obstacle to achieving high performance for a wide range of parallel applications. I/O hardware and file system parallelism are the keys to bridging this performance gap. A prerequisite to the development of efficient parallel file systems is detailed characterization of the I/O demands of parallel applications. In this paper, we present a comparative study of the I/O access patterns commonly found in I/O intensive parallel applications. Using the Pablo performance analysis environment and its I/O extensions we captured application I/O access patterns and analyzed their interactions with current parallel I/O systems. This analysis has proven instrumental in guiding the development of new application programming interfaces (APIs) for parallel file systems and in developing effective file system policies that can adaptively respond to complex application I/O requirements.},
  comment = {see smirni:lessons}
}

@TechReport{thakur:mpi-tr,
  author = {Thakur, Rajeev and Gropp, William and Lusk, Ewing},
  title = {A Case for Using {MPI's} Derived Datatypes to Improve {I/O} Performance},
  year = {1998},
  month = may,
  number = {ANL/MCS-P717-0598},
  institution = {Mathematics and Computer Science Division, Argonne National Laboratory},
  later = {thakur:mpi},
  url = {http://www.mcs.anl.gov/~thakur/dtype/},
  keyword = {MPI, parallel I/O, pario-bib},
  abstract = {MPI-IO, the I/O part of the MPI-2 standard, is a promising new interface for parallel I/O. A key feature of MPI-IO is that it allows users to access several noncontiguous pieces of data from a file with a single I/O function call by defining file views with derived datatypes. We explain how critical this feature is for high performance, why users must create and use derived datatypes whenever possible, and how it enables implementations to perform optimizations. In particular, we describe two optimizations our MPI-IO implementation, ROMIO, performs: data sieving and collective I/O. We present performance results on five different parallel machines: HP Exemplar, IBM SP, Intel Paragon, NEC SX-4, and SGI Origin2000.}
}

% NOTE(review): booktitle was missing (required for InProceedings). The venue
% is presumed to be SC97 (November 1997, IEEE Computer Society Press, same as
% lyster:geos-das) -- confirm the exact proceedings title.
@InProceedings{tobis:foam,
  author = {Tobis, Michael and Schafer, Chad and Foster, Ian and Jacob, Robert and Anderson, John},
  title = {{FOAM}: Expanding the Horizons of Climate Modeling},
  booktitle = {Proceedings of SC97: High Performance Networking and Computing},
  year = {1997},
  month = nov,
  publisher = {IEEE Computer Society Press},
  keyword = {parallel I/O, scientific application, pario-bib},
  abstract = {We report here on a project that expands the applicability of dynamic climate modeling to very long time scales. The Fast Ocean-Atmosphere Model (FOAM) is a coupled ocean-atmosphere model that incorporates physics of interest in understanding decade to century time scale variability. It addresses the high computational cost of this endeavor with a combination of improved ocean model formulation, low atmosphere resolution, and efficient coupling. It also uses message-passing parallel processing techniques, allowing for the use of cost-effective distributed memory platforms. The resulting model runs over 6000 times faster than real time with good fidelity and has yielded significant results.},
  comment = {This paper is about the Fast Ocean-Atmosphere Model (FOAM), a climate model that uses ``a combination of new model formulation and parallel computing to expand the time horizon that may be addressed by explicit fluid dynamical representations of the climate system.'' Their model uses message passing on massively parallel distributed-memory computer systems. They are in the process of investigating using parallel I/O to further increase their efficiency.}
}