@InProceedings{barve:competitive2, author = {Rakesh Barve and Mahesh Kallahalla and Peter J. Varman and Jeffrey Scott Vitter}, title = {Competitive Parallel Disk Prefetching and Buffer Management}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {47--56}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {disk prefetching, file caching, parallel I/O, pario-bib}, abstract = {We provide a competitive analysis framework for online prefetching and buffer management algorithms in parallel I/O systems, using a read-once model of block references. This has widespread applicability to key I/O-bound applications such as external merging and concurrent playback of multiple video streams. Two realistic lookahead models, global lookahead and local lookahead, are defined. Algorithms NOM and GREED based on these two forms of lookahead are analyzed for shared buffer and distributed buffer configurations, both of which occur frequently in existing systems. An important aspect of our work is that we show how to implement both the models of lookahead in practice using the simple techniques of forecasting and flushing. \par Given a D-disk parallel I/O system and a globally shared I/O buffer that can hold upto M disk blocks, we derive a lower bound of $\Omega(\sqrt{D}$) on the competitive ratio of any deterministic online prefetching algorithm with O(M) lookahead. NOM is shown to match the lower bound using global M-block lookahead. In contrast, using only local lookahead results in an $\Omega(D)$ competitive ratio. When the buffer is distributed into D portions of M/D blocks each, the algorithm GREED based on local lookahead is shown to be optimal, and NOM is within a constant factor of optimal. Thus we provide a theoretical basis for the intuition that global lookahead is more valuable for prefetching in the case of a shared buffer configuration whereas it is enough to provide local lookahead in case of the distributed configuration. Finally, we analyze the performance of these algorithms for reference strings generated by a uniformly-random stochastic process and we show that they achieve the minimal expected number of I/Os. These results also give bounds on the worst-case expected performance of algorithms which employ randomization in the data layout.}, comment = {See barve:competitive.} } @InProceedings{chang:reuse, author = {Tai-Sheng Chang and Sangyup Shim and David H.~C. Du}, title = {The Scalability of Spatial Reuse Based Serial Storage Interfaces}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {93--101}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {I/O interface, I/O network, I/O architecture, parallel I/O, pario-bib}, abstract = {Due to the growing popularity of emerging applications such as digital libraries, Video-On Demand, distance learning, and Internet World-Wide Web, multimedia servers with a large capacity and high performance storage subsystem are in high demand. Serial storage interfaces are emerging technologies designed to improve the performance of such storage subsystems. They provide high bandwidth, fault tolerance, fair bandwidth sharing and long distance connection capability. All of these issues are critical in designing a scalable and high performance storage subsystem. Some of the serial storage interfaces provide the spatial reuse feature which allows multiple concurrent transmissions. That is, multiple hosts can access disks concurrently with full link bandwidth if their access paths are disjoint. Spatial reuse provides a way to build a storage subsystem whose aggregate bandwidth may be scaled up with the number of hosts. However, it is not clear how much the performance of a storage subsystem could be improved by the spatial reuse with different configurations and traffic scenarios. Both limitation and capability of this scalability need to be investigated. To understand their fundamental performance characteristics, we derive an analytic model for the serial storage interfaces with the spatial reuse feature. Based on this model, we investigate the maximum aggregate throughput from different system configurations and load distributions. We show how the number of disks needed to saturate a loop varies with different number of hosts and different load scenarios. We also show how the load balancing by uniformly distributing the load to all the disks on a loop may incur high overhead. This is because the accesses to far away disks need to go through many links and consume the bandwidth of each link it goes through. The results show the achievable throughput may be reduced by more than half in some cases.} } @InProceedings{cho:local, author = {Yong Cho and Marianne Winslett and Mahesh Subramaniam and Ying Chen and Szu-wen Kuo and Kent E. Seamons}, title = {Exploiting Local Data in Parallel Array {I/O} on a Practical Network of Workstations}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {1--13}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {parallel I/O, distributed system, pario-bib}, abstract = {A cost-effective way to run a parallel application is to use existing workstations connected by a local area network such as Ethernet or FDDI. In this paper, we present an approach for parallel I/O of multidimensional arrays on small networks of workstations with a shared-media interconnect, using the Panda I/O library. \par In such an environment, the message passing throughput per node is lower than the throughput obtainable from a fast disk and it is not easy for users to determine the configuration which will yield the best I/O performance. \par We introduce an I/O strategy that exploits local data to reduce the amount of data that must be shipped across the network, present experimental results, and analyze the results using an analytical performance model and predict the best choice of I/O parameters. \par Our experiments show that the new strategy results in a factor of 1.2--2.1 speedup in response time compared to the Panda version originally developed for the IBM SP2, depending on the array sizes, distributions and compute and I/O node meshes. Further, the performance model predicts the results within a 13\% margin of error.} } @InProceedings{cormen:fft3, author = {Thomas H. Cormen and Jake Wegmann and David M. Nicol}, title = {Multiprocessor Out-of-Core {FFTs} with Distributed Memory and Parallel Disks}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {68--78}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {out of core, parallel I/O, pario-bib}, abstract = {This paper extends an earlier out-of-core Fast Fourier Transform (FFT) method for a uniprocessor with the Parallel Disk Model (PDM) to use multiple processors. Four out-of-core multiprocessor methods are examined. Operationally, these methods differ in the size of "mini-butterfly" computed in memory and how the data are organized on the disks and in the distributed memory of the multiprocessor. The methods also perform differing amounts of I/O and communication. Two of them have the remarkable property that even though they are computing the FFT on a multiprocessor, all interprocessor communication occurs outside the mini-butterfly computations; communication that ordinarily occurs in a butterfly is folded into other data-movement operations. An analysis program shows that the two methods that use no butterfly communication usually use less communication overall than the other methods. The analysis program is fast enough that it can be invoked at run time to determine which of the four methods uses the least communication. One set of performance results on a small workstation cluster indicates that the methods without butterfly communication are approximately 9.5\% faster. Moreover, they are much easier to implement.}, comment = {See also cormen:fft and cormen:fft2.} } @InProceedings{cortes:cooperative, author = {Toni Cortes and Sergi Girona and Jes\'us Labarta}, title = {Design Issues of a Cooperative Cache with no Coherence Problems}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {37--46}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {cooperative caching, distributed file system, parallel I/O, pario-bib}, abstract = {In this paper, we examine some of the important problems observed in the design of cooperative caches. Solutions to the coherence, load-balancing and fault-tolerance problems are presented. These solutions have been implemented as a part of PAFS, a parallel/distributed file system, and its performance has been compared to the one achieved by xFS. Using the comparison results, we have observed that the proposed ideas not only solve the main problems of cooperative caches, but also increase the overall system performance. Although the solutions presented in this paper were targeted to a parallel machine, reasonable good results have also been obtained for networks of workstations.} } @InProceedings{foster:remote-io, author = {Ian Foster and David {Kohr, Jr.} and Rakesh Krishnaiyer and Jace Mogill}, title = {Remote {I/O}: Fast Access to Distant Storage}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {14--25}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {parallel I/O, distributed file system, pario-bib}, abstract = {As high-speed networks make it easier to use distributed resources, it becomes increasingly common that applications and their data are not colocated. Users have traditionally addressed this problem by manually staging data to and from remote computers. We argue instead for a remote I/O paradigm in which programs use familiar parallel I/O interfaces to access remote filesystems. In addition to simplifying remote execution, remote I/O can improve performance relative to staging by overlapping computation and data transfer or by reducing communication requirements. However, remote I/O also introduces new technical challenges in the areas of portability, performance, and integration with distributed computing systems. We propose techniques designed to address these challenges and describe a remote I/O library called RIO that we are developing to evaluate the effectiveness of these techniques. RIO addresses issues of portability by adopting the quasi-standard MPI-IO interface and by defining a RIO device and RIO server within the ADIO abstract I/O device architecture. It addresses performance issues by providing traditional I/O optimizations such as asynchronous operations and through implementation techniques such as buffering and message forwarding to offload communication overheads. Microbenchmarks and application experiments demonstrate that our techniques can improve turnaround time relative to staging.} } @InProceedings{kandemir:locality, author = {M. Kandemir and A. Choudhary and J. Ramanujam and M. Kandaswamy}, title = {A Unified Compiler Algorithm for Optimizing Locality, Parallelism, and Communication in Out-of-Core Computations}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {79--92}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {compiler, out of core, parallel I/O, pario-bib}, abstract = {This paper presents compiler algorithms to optimize out-of-core programs. These algorithms consider loop and data layout transformations in a unified framework. The performance of an out-of-core loop nest containing many references can be improved by a combination of restructuring the loops and file layouts. This approach considers array references one-by-one and attempts to optimize each reference for parallelism and locality. When there are references for which parallelism optimizations do not work, communication is vectorized so that data transfer can be performed before the innermost tiling loop. Preliminary results from hand-compiles on IBM SP-2 and Intel Paragon show that this approach reduces the execution time, improves the bandwidth speedup and overall speedup. In addition, we extend the base algorithm to work with file layout constraints and show how it can be used for optimizing programs consisting of multiple loop nests.} } @Article{kandemir:optimizing, author = {M. Kandemir and A. Choudhary and J. Ramanujam and R. Bordawekar}, title = {Optimizing Out-of-core Computations in Uniprocessors}, journal = {Newsletter of the Technical Committee on Computer Architecture (TCCA)}, year = {1997}, month = {June}, pages = {25--27}, keyword = {out of core, parallel I/O, pario-bib} } @InProceedings{madhyastha:classification, author = {Tara M. Madhyastha and Daniel A. Reed}, title = {Input/Output Access Pattern Classification Using Hidden {Markov} Models}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {57--67}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {workload characterization, file access pattern, parallel I/O, pario-bib}, abstract = {Input/output performance on current parallel file systems is sensitive to a good match of application access pattern to file system capabilities. Automatic input/output access classification can determine application access patterns at execution time, guiding adaptive file system policies. In this paper we examine a new method for access pattern classification that uses hidden Markov models, trained on access patterns from previous executions, to create a probabilistic model of input/output accesses. We compare this approach to a neural network classification framework, presenting performance results from parallel and sequential benchmarks and applications.} } @InBook{mpi2-io, author = {{Message-Passing Interface Forum}}, title = {{MPI-2.0}: Extensions to the Message-Passing Interface}, chapter = {9}, year = {1997}, month = {June}, publisher = {MPI Forum}, URL = {http://www.mpi-forum.org/docs/docs.html}, keyword = {MPI, message passing, parallel computing, library, parallel I/O, pario-bib}, comment = {Chapter 9 is about I/O extensions.} } @InProceedings{prabhakar:browsing, author = {Sunil Prabhakar and Divyakant Agrawal and Amr {El Abbadi} and Ambuj Singh and Terence Smith}, title = {Browsing and Placement of Multiresolution Images on Parallel Disks}, booktitle = {Proceedings of the Fifth Workshop on Input/Output in Parallel and Distributed Systems}, year = {1997}, month = {November}, pages = {102--113}, publisher = {ACM Press}, address = {San Jose, CA}, keyword = {multimedia, parallel I/O, pario-bib}, abstract = {With rapid advances in computer and communication technologies, there is an increasing demand to build and maintain large image repositories. In order to reduce the demands on I/O and network resources, multiresolution representations are being proposed for the storage organization of images. Image decomposition techniques such as {\em wavelets} can be used to provide these multiresolution images. The original image is represented by several coefficients, one of them with visual similarity to the original image, but at a lower resolution. These visually similar coefficients can be thought of as {\em thumbnails} or {\em icons} of the original image. This paper addresses the problem of storing these multiresolution coefficients on disks so that thumbnail browsing as well as image reconstruction can be performed efficiently. Several strategies are evaluated to store the image coefficients on parallel disks. These strategies can be classified into two broad classes depending on whether the access pattern of the images is used in the placement. Disk simulation is used to evaluate the performance of these strategies. Simulation results are validated with results from experiments with real disks and are found to be in good agreement. The results indicate that significant performance improvements can be achieved with as few as four disks by placing image coefficients based upon browsing access patterns.} }