% Parallel-I/O bibliography entries (all tagged with the pario-bib keyword).
% Text outside an entry, like these lines, is ignored by BibTeX.
% Non-standard fields (institution on papers, copyright, keywords, abstract,
% comment) are kept as annotations; standard styles do not print them.

@InProceedings{cormen:fg,
  author      = {Thomas H. Cormen and Elena R. Davidson},
  title       = {{FG}: A framework generator for hiding latency in parallel programs running on clusters},
  booktitle   = {Proceedings of the 17th IASTED International Conference on Parallel and Distributed Computing and Systems},
  editor      = {Bader, D. A. and Khokhar, A. A.},
  year        = {2004},
  month       = sep,
  pages       = {137--144},
  institution = {Dartmouth Coll, Dept Comp Sci, 6211 Sudikoff Lab, Hanover, NH 03755 USA; Dartmouth Coll, Dept Comp Sci, Hanover, NH 03755 USA},
  publisher   = {International Society for Computers and Their Applications (ISCA)},
  copyright   = {(c)2005 The Thomson Corporation},
  address     = {San Francisco, CA},
  URL         = {http://www.cs.dartmouth.edu/FG/},
  keywords    = {asynchronous I/O, pipelined I/O, pario-bib},
  abstract    = {FG is a programming environment for asynchronous programs
                 that run on clusters and fit into a pipeline framework. It
                 enables the programmer to write a series of synchronous
                 functions and represents them as stages of an asynchronous
                 pipeline. FG mitigates the high latency inherent in
                 interprocessor communication and accessing the outer levels
                 of the memory hierarchy. It overlaps separate pipeline
                 stages that perform communication, computation, and I/O by
                 running the stages asynchronously. Each stage maps to a
                 thread. Buffers, whose sizes correspond to block sizes in
                 the memory hierarchy, traverse the pipeline. FG makes such
                 pipeline-structured parallel programs easier to write,
                 smaller, and faster. FG offers several advantages over
                 statically scheduled overlapping and dynamically scheduled
                 overlapping via explicit calls to thread functions. First,
                 it reduces coding and debugging time. Second, we find that
                 it reduces code size by approximately 15--26\%. Third,
                 according to experimental results, it improves performance.
                 Compared with programs that use static scheduling,
                 FG-generated programs run approximately 61--69\% faster on
                 a 16-node Beowulf cluster. Compared with programs that make
                 explicit calls for dynamically scheduled threads,
                 FG-generated programs run slightly faster. Fourth, FG
                 offers various design options and makes it easy for the
                 programmer to explore different pipeline configurations.}
}

@Article{feng:performance,
  author      = {Dan Feng and Hong Jiang and Yi-Feng Zhu},
  title       = {{I/O} performance of an {RAID-10} style parallel file system},
  journal     = {Journal of Computer Science and Technology},
  year        = {2004},
  month       = nov,
  volume      = {19},
  number      = {6},
  pages       = {965--972},
  institution = {Huazhong Univ Sci \& Technol, Dept Comp Sci \& Engn, Natl Storage Syst Lab, Wuhan 430074, Peoples R China; Huazhong Univ Sci \& Technol, Dept Comp Sci \& Engn, Natl Storage Syst Lab, Wuhan 430074, Peoples R China; Univ Nebraska, Dept Comp Sci \& Engn, Lincoln, NE USA},
  publisher   = {Science China Press},
  copyright   = {(c)2005 The Thomson Corporation},
  URL         = {http://jcst.ict.ac.cn/cone/cone46.html#paper29},
  keywords    = {PVFS, parallel I/O, I/O response time, pario-bib},
  abstract    = {Without any additional cost, all the disks on the nodes of
                 a cluster can be connected together through CEFT-PVFS, an
                 RAID-10 style parallel file system, to provide a multi-GB/s
                 parallel I/O performance. I/O response time is one of the
                 most important measures of quality of service for a client.
                 When multiple clients submit data-intensive jobs at the
                 same time, the response time experienced by the user is an
                 indicator of the power of the cluster. In this paper, a
                 queuing model is used to analyze in detail the average
                 response time when multiple clients access CEFT-PVFS. The
                 results reveal that response time is with a function of
                 several operational parameters. The results show that I/O
                 response time decreases with the increases in I/O buffer
                 hit rate for read requests, write buffer size for write
                 requests and the number of server nodes in the parallel
                 file system, while the higher the I/O requests arrival
                 rate, the longer the I/O response time. On the other hand,
                 the collective power of a large cluster supported by
                 CEFT-PVFS is shown to be able to sustain a steady and
                 stable I/O response time for a relatively large range of
                 the request arrival rate.}
}

% The issue below is dated by season, so month stays a literal string:
% there is no predefined month macro for it.
@Article{hey:parkbench,
  author   = {Tony Hey and David Lancaster},
  title    = {The Development of {Parkbench} and Performance Prediction},
  journal  = {The International Journal of High Performance Computing Applications},
  year     = {2000},
  month    = {Fall},
  volume   = {14},
  number   = {3},
  pages    = {205--215},
  URL      = {http://www.ecs.soton.ac.uk/~djl/GENESIS/peis.ps.gz},
  keywords = {parallel I/O benchmarks, MPI-IO, pario-app, pario-bib}
}

@InCollection{rabenseifner:benchmark,
  author    = {Rolf Rabenseifner and Alice E. Koniges and Jean-Pierre Prost and Richard Hedges},
  title     = {The Parallel Effective {I/O} Bandwidth Benchmark: {b\_eff\_io}},
  booktitle = {Parallel I/O for Cluster Computing},
  chapter   = {4},
  editor    = {Christophe Cerin and Hai Jin},
  year      = {2004},
  month     = feb,
  pages     = {107--132},
  publisher = {Kogan Page Ltd.},
  URL       = {http://www.hlrs.de/people/rabenseifner/publ/cpj_b_eff_io_nov19.pdf},
  keywords  = {parallel I/O benchmarks, MPI-IO, pario-bib}
}

@InProceedings{tsujita:stampi2,
  author       = {Yuichi Tsujita},
  title        = {Implementation of an {MPI-I/O} mechanism using {PVFS} in remote {I/O} to a {PC} cluster},
  booktitle    = {Seventh International Conference on High Performance Computing and Grid in Asia Pacific Region},
  year         = {2004},
  month        = jul,
  pages        = {136--139},
  organization = {Kinki University, Japan},
  publisher    = {IEEE Computer Society Press},
  copyright    = {(c)2005 IEE},
  address      = {Tokyo, Japan},
  URL          = {http://csdl.computer.org/comp/proceedings/hpcasia/2004/2138/00/21380136abs.htm},
  keywords     = {MPI-IO, PVFS, remote I/O, grid, pario-bib},
  abstract     = {A flexible intermediate library named Stampi realizes
                  seamless MPI operations on a heterogeneous computing
                  environment. With the help of a flexible communication
                  mechanism of this library, users can execute MPI functions
                  without awareness of underlying communication mechanism.
                  Although Stampi supports MPI-I/O among different
                  platforms, UNIX I/O functions are used when a
                  vendor-supplied MPI-I/O library is not available. To
                  realize distributed I/O operations, a parallel virtual
                  file system (PVFS) has been implemented in the MPI-I/O
                  mechanism. Primitive MPI-I/O functions of Stampi have been
                  evaluated and sufficient performance has been achieved.},
  comment      = {also see tsujita:stampi.}
}

@InProceedings{vydyanathan:pipeline,
  author      = {Naga Vydyanathan and Gaurav Khanna and Tahsin M. Kurc and Umit V. Catalyurek and Pete Wyckoff and Joel H. Saltz and P. Sadayappan},
  title       = {Use of {PVFS} for efficient execution of jobs with pipeline-shared {I/O}},
  booktitle   = {Proceedings of the 5th IEEE/ACM International Workshop on Grid Computing},
  editor      = {Buyya, R.},
  year        = {2004},
  month       = nov,
  pages       = {235--242},
  institution = {Ohio State Univ, Dept Comp Sci \& Engn, Columbus, OH 43210 USA},
  publisher   = {IEEE Computer Society Press},
  copyright   = {(c)2005 The Thomson Corporation},
  address     = {Pittsburgh, PA},
  URL         = {http://csdl.computer.org/comp/proceedings/grid/2004/2256/00/22560235abs.htm},
  keywords    = {PVFS, pipelined-shared I/O, grid computing, pario-bib},
  abstract    = {This paper is concerned with efficient execution of
                 applications that are composed of chain of sequential data
                 processes, which exchange data through a file system. We
                 focus on pipeline-shared I/O behavior within a single
                 pipeline of processes running on a cluster. We examine
                 several scheduling strategies and experimentally evaluate
                 them for efficient use of the Parallel Virtual File System
                 (PVFS) as a common storage pool.}
}

% The second author's surname is braced so that BibTeX treats the
% multi-word family name as a single unit instead of splitting it
% into first/von/last parts.
@TechReport{wong:benchmarks,
  author      = {Parkson Wong and {Van der Wijngaart}, Rob F.},
  title       = {{NAS} Parallel Benchmarks {I/O} Version 2.4},
  year        = {2003},
  month       = jan,
  number      = {NAS-03-002},
  institution = {Computer Sciences Corporation, NASA Advanced Supercomputing (NAS) Division},
  address     = {NASA Ames Research Center, Moffett Field, CA 94035-1000},
  URL         = {http://www.nas.nasa.gov/News/Techreports/2003/PDF/nas-03-002.pdf},
  keywords    = {parallel I/O benchmarks, block tridiagonal, pario-app, pario-bib},
  abstract    = {We describe a benchmark problem, based on the
                 Block-Tridiagonal (BT) problem of the NAS Parallel
                 Benchmarks (NPB), which is used to test the output
                 capabilities of high-performance computing systems,
                 especially parallel systems. We also present a source code
                 implementation of the benchmark, called NPBIO2.4-MPI, based
                 on the MPI implementation of the NPB, using a variety of
                 ways to write the computed solutions to file.}
}

@InProceedings{yamamoto:astronomical,
  author      = {Naotaka Yamamoto and Osamu Tatebe and Satoshi Sekiguchi},
  title       = {Parallel and distributed astronomical data analysis on {Grid Datafarm}},
  booktitle   = {5th International Workshop on Grid Computing},
  editor      = {Buyya, R.},
  year        = {2004},
  month       = nov,
  pages       = {461--466},
  institution = {AIST, Grid Technol Res Ctr, Tsukuba Cent 2, Umezono 1-1-1, Tsukuba, Ibaraki, Japan; AIST, Grid Technol Res Ctr, Tsukuba, Ibaraki, Japan},
  publisher   = {IEEE Computer Society Press},
  copyright   = {(c)2005 The Thomson Corporation},
  address     = {Pittsburgh, PA},
  URL         = {http://datafarm.apgrid.org/pdf/Grid2004-yamamoto.pdf},
  keywords    = {grid, grid datafarm, astronomical data, pario-app, pario-bib},
  abstract    = {A comprehensive study of the whole petabyte-scale archival
                 data of astronomical observatories has a possibility of new
                 science and new knowledge in the field, while it was not
                 feasible so far due to lack of enough data analysis
                 environment. The Grid Datafarm architecture is designed for
                 global petabyte-scale data-intensive computing, which
                 provides a Grid file system with file replica management
                 for fault tolerance and load balancing, and parallel and
                 distributed data computing support for a set of files, to
                 meet with the requirements of the comprehensive study of
                 the whole archival data. In the paper, we discuss about
                 worldwide parallel and distributed data analysis in the
                 observational astronomical field. The archival data is
                 stored, replicated and dispersed in a Gfarm file system.
                 All the astronomical data analysis tools successfully
                 access files in Gfarm file system without any code
                 modification, using a syscall hooking library regardless of
                 file replica locations. Performance evaluation of the
                 parallel data analysis in several ways shows file-affinity
                 process scheduling plays an essential role for scalable and
                 efficient parallel file I/O performance. A data calibration
                 tools shows scalable file I/O performance, and achieved the
                 file I/O performance of 5.9 GB/sec and 4.0 GB/sec for
                 reading and writing FITS files, respectively, using 30
                 cluster nodes (60 CPUs). On-demand file replica creation
                 mitigates the overhead of access concentration. Another
                 tool shows the performance improvement at a factor of six
                 for reading a shared file by creating file replicas.}
}