% Papers from the Third IEEE/ACM International Symposium on Cluster Computing
% and the Grid (2003) and its Workshop on Parallel I/O in Cluster Computing
% and Computational Grids.  One field per line; months use the standard macros.

@InProceedings{abawajy:scheduling,
  author       = {J. H. Abawajy},
  title        = {Performance Analysis of Parallel {I/O} Scheduling Approaches on Cluster Computing Systems},
  booktitle    = {Workshop on Parallel I/O in Cluster Computing and Computational Grids},
  year         = {2003},
  month        = may,
  pages        = {724--729},
  organization = {Carleton University},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  note         = {Organized at the IEEE/ACM International Symposium on Cluster Computing and the Grid 2003},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190724abs.htm},
  keywords     = {parallel I/O, I/O scheduling algorithms, pario-bib},
  abstract     = {As computation and communication hardware performance
    continue to rapidly increase, I/O represents a growing fraction of
    application execution time. This gap between the I/O subsystem and others
    is expected to increase in future since I/O performance is limited by
    physical motion. Therefore, it is imperative that novel techniques for
    improving I/O performance be developed. Parallel I/O is a promising
    approach to alleviating this bottleneck. However, very little work exist
    with respect to scheduling parallel I/O operations explicitly. In this
    paper, we address the problem of effective management of parallel I/O in
    cluster computing systems by using appropriate I/O scheduling strategies.
    We propose two new I/O scheduling algorithms and compare them with two
    existing scheduling approaches. The preliminary results show that the
    proposed policies outperform existing policies substantially.}
}

@InProceedings{cerin:sorting,
  author       = {Christophe C{\'e}rin and Hazem Fkaier and Mohamed Jemni},
  title        = {A Synthesis of Parallel Out-of-core Sorting Programs on Heterogeneous Clusters},
  booktitle    = {Proceedings of the Third IEEE/ACM International Symposium on Cluster Computing and the Grid},
  year         = {2003},
  month        = may,
  pages        = {78--85},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190078abs.htm},
  keywords     = {out-of-core, sorting, parallel I/O, load balancing, data distribution, pario-app, pario-bib},
  abstract     = {The paper considers the problem of parallel external sorting
    in the context of a form of heterogeneous clusters. We introduce two
    algorithms and we compare them to another one that we have previously
    developed. Since most common sort algorithms assume high-speed random
    access to all intermediate memory, they are unsuitable if the values to be
    sorted don't fit in main memory. This is the case for cluster computing
    platforms which are made of standard, cheap and scarce components. For
    that class of computing resources a good use of I/O operations compatible
    with the requirements of load balancing and computational complexity are
    the key to success. We explore three techniques and show how they can be
    deployed for clusters with processor performances related by a
    multiplicative factor. We validate the approaches in showing experimental
    results for the load balancing factor.}
}

@InProceedings{ching:noncontiguous,
  author       = {Avery Ching and Alok Choudhary and Kenin Coloma and Wei-keng Liao and Robert Ross and William Gropp},
  title        = {Noncontiguous {I/O} Accesses Through {MPI-IO}},
  booktitle    = {Proceedings of the Third IEEE/ACM International Symposium on Cluster Computing and the Grid},
  year         = {2003},
  month        = may,
  pages        = {104--111},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190104abs.htm},
  keywords     = {parallel I/O, MPI-IO, ROMIO, list I/O, noncontiguous access, pario-bib},
  abstract     = {I/O performance remains a weakness of parallel computing
    systems today. While this weakness is partly attributed to rapid advances
    in other system components, I/O interfaces available to programmers and
    the I/O methods supported by file systems have traditionally not matched
    efficiently with the types of I/O operations that scientific applications
    perform, particularly noncontiguous accesses. The MPI-IO interface allows
    for rich descriptions of the I/O patterns desired for scientific
    applications and implementations such as ROMIO have taken advantage of
    this ability while remaining limited by underlying file system methods.
    \par A method of noncontiguous data access, list I/O, was recently
    implemented in the Parallel Virtual File System (PVFS). We implement
    support for this interface in the ROMIO MPI-IO implementation. Through a
    suite of non-contiguous I/O tests we compared ROMIO list I/O to current
    methods of ROMIO noncontiguous access and found that the list I/O
    interface provides performance benefits in many noncontiguous cases.}
}

@InProceedings{cozette:read2,
  author       = {Olivier Cozette and Cyril Randriamaro},
  title        = {{READ$^2$}: Put disks at network level},
  booktitle    = {Workshop on Parallel I/O in Cluster Computing and Computational Grids},
  year         = {2003},
  month        = may,
  pages        = {698--704},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  note         = {Organized at the IEEE/ACM International Symposium on Cluster Computing and the Grid 2003},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190698abs.htm},
  keywords     = {parallel I/O, pario-bib},
  abstract     = {Grand challenge applications have to process large amounts
    of data, and then require high performance IO systems. Cluster computing
    is a good alternative to proprietary system for building cost effective IO
    intensive platform: some cluster architectures won sorting benchmark
    (MinuteSort, Datamation)! Recent advances in IO component technologies
    (disk, controller and network) let us expect higher IO performance for
    data intensive applications on cluster. The counterpart of this evolution
    is that much stress is put on the different buses (memory, IO) of each
    node which cannot be scaled. In this paper we investigate a strategy we
    called READ$^2$ (Remote Efficient Access to Distant Device) to reduce the
    stress. With READ$^2$ any cluster node accesses directly to remote disk:
    the remote processor and the remote memory are removed from the control
    and data path: Inputs/Outputs don't interfere with the host processor and
    the host memory activity. With READ$^2$ strategy, a cluster can be
    considered as a shared disk architecture instead of a shared nothing one.
    This paper describes an implementation of READ$^2$ on Myrinet Networks.
    First experimental results show IO performance improvement.}
}

@InProceedings{ho:reorganization,
  author       = {T. K. Ho and Jack Y. B. Lee},
  title        = {A Row-Permutated Data Reorganization Algorithm for Growing Server-less Video-on-Demand Systems},
  booktitle    = {Proceedings of the Third IEEE/ACM International Symposium on Cluster Computing and the Grid},
  year         = {2003},
  month        = may,
  pages        = {44--51},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190044abs.htm},
  keywords     = {data reorganization, video on demand, video streaming, pario-bib},
  abstract     = {Recently, a new server-less architecture is proposed for
    building low-cost yet scalable video streaming systems. Compare to
    conventional client-server-based video streaming systems, this server-less
    architecture does not need any dedicated video server and yet is highly
    scalable. Video data are distributed among user hosts and these hosts
    cooperate to stream video data to one another. Thus as new hosts join the
    system, they also add streaming and storage capacity to absorb the added
    streaming load. This study investigates the data reorganization problem
    when growing a server-less video streaming system. Specifically, as video
    data are distributed among user hosts, these data will need to be
    redistributed to newly joined hosts to utilize their storage and streaming
    capacity. This study presents a new data reorganization algorithm that
    allows controllable tradeoff between data reorganization overhead and
    streaming load balance.}
}

@InProceedings{mayr:query,
  author       = {Tobias Mayr and Philippe Bonnet and Johannes Gehrke and Praveen Seshadri},
  title        = {Leveraging Non-Uniform Resources for Parallel Query Processing},
  booktitle    = {Proceedings of the Third IEEE/ACM International Symposium on Cluster Computing and the Grid},
  year         = {2003},
  month        = may,
  pages        = {120--129},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190120abs.htm},
  keywords     = {parallel query processing, load balancing, parallel I/O, pario-bib},
  abstract     = {Modular clusters are now composed of non-uniform nodes with
    different CPUs, disks or network cards so that customers can adapt the
    cluster configuration to the changing technologies and to their changing
    needs. This challenges dataflow parallelism as the primary load balancing
    technique of existing parallel database systems. We show in this paper
    that dataflow parallelism alone is ill suited for modular clusters because
    running the same operation on different subsets of the data can not fully
    utilize non-uniform hardware resources. We propose and evaluate new load
    balancing techniques that blend pipeline parallelism with data
    parallelism. We consider relational operators as pipelines of fine-grained
    operations that can be located on different cluster nodes and executed in
    parallel on different data subsets to best exploit non-uniform resources.
    We present an experimental study that confirms the feasibility and
    effectiveness of the new techniques in a parallel execution engine
    prototype based on the open-source DBMS Predator.}
}

@InProceedings{perez:allocation,
  author       = {Jose Maria Perez and Felix Garcia and Jesus Carretero and Alejandro Calderon and Luis Miguel Sanchez},
  title        = {Data Allocation and Load Balancing for Heterogeneous Cluster Storage Systems},
  booktitle    = {Workshop on Parallel I/O in Cluster Computing and Computational Grids},
  year         = {2003},
  month        = may,
  pages        = {718--723},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  note         = {Organized at the IEEE/ACM International Symposium on Cluster Computing and the Grid 2003},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190718abs.htm},
  keywords     = {parallel I/O, load balancing, pario-bib},
  abstract     = {Distributed filesystems are a typical solution in networked
    environments as clusters and grids. Parallel filesystems are a typical
    solution in order to reach high performance I/O distributed environment,
    but those filesystems have some limitations in heterogeneous storage
    systems. Usually in distributed systems, load balancing is used as a
    solution to improve the performance, but typically the distribution is
    made between peer-to-peer computational resources and from the processor
    point of view. In heterogeneous systems, like heterogeneous clusters of
    workstations, the existing solutions do not work so well. However, the
    utilization of those systems is more extended every day, having an extreme
    example in the grid environment. In this paper we bring attention to those
    aspects of heterogeneous distributed data systems presenting a parallel
    file system that take into account heterogeneity of storage nodes, the
    dynamic addition of new storage nodes, and an algorithm to group requests
    in heterogeneous systems.}
}

@InProceedings{segawa:pvfs-pm,
  author       = {Koji Segawa and Osamu Tatebe and Yuetsu Kodama and Tomohiro Kudoh and Toshiyuki Shimizu},
  title        = {Design and implementation of {PVFS-PM}: a cluster file system on {SCore}},
  booktitle    = {Workshop on Parallel I/O in Cluster Computing and Computational Grids},
  year         = {2003},
  month        = may,
  pages        = {705--711},
  organization = {National Institute of Advanced Industrial Science and Technology (AIST)},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  note         = {Organized at the IEEE/ACM International Symposium on Cluster Computing and the Grid 2003},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190705abs.htm},
  keywords     = {parallel I/O, pario-bib},
  abstract     = {This paper discusses the design and implementation of a
    cluster file system, called PVFS-PM, on the SCore cluster system software.
    This is the first attempt to implement a cluster file system on the SCore
    system. It is based on the PVFS cluster file system but replaces TCP with
    the PMv2 communication library supported by SCore to provide a scalable,
    high-performance cluster file system. PVFS-PM improves the performance by
    factors of 1.07 and 1.93 for writing and reading, respectively, with 8 I/O
    nodes, compared with the original PVFS on TCP on a Gigabit
    Ethernet-connected SCore cluster.}
}

@InProceedings{uk:protein-folding,
  author       = {B. Uk and M. Taufer and T. Stricker and G. Settanni and A. Cavalli and A. Caflisch},
  title        = {Combining Task- and Data Parallelism to Speed up Protein Folding on a Desktop Grid Platform},
  booktitle    = {Proceedings of the Third IEEE/ACM International Symposium on Cluster Computing and the Grid},
  year         = {2003},
  month        = may,
  pages        = {240--249},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190240abs.htm},
  keywords     = {protein folding, grid application, parallel I/O, pario-app, pario-bib},
  abstract     = {The steady increase of computing power at lower and lower
    cost enables molecular dynamics simulations to investigate the process of
    protein folding with an explicit treatment of water molecules. Such
    simulations are typically done with well known computational chemistry
    codes like CHARMM. Desktop grids such as the United Devices MetaProcessor
    are highly attractive platforms, since scavenging for unused machines on
    Intra- and Internet delivers compute power that is almost free. However,
    the predominant programming paradigm for current desktop grids is pure
    task parallelism and might not fit the needs for protein folding
    simulations with explicit water molecules. A short overall turn-around
    time of a simulation remains highly important for research productivity,
    but the need for an accurate model and long simulation time-scales leads
    to tasks that are too large for optimal scheduling on a desktop grid. To
    address this problem, we introduce a combination of task- and data
    parallelism as a well suitable computing paradigm for protein folding
    investigations on grid platforms. As a proof of concept, we design and
    implement a simple system for protein folding simulations based on the
    notion of combined task and data parallelism with clustered workers.
    Clustered workers are machines grouped into small clusters according to
    network and CPU performance criteria and act as super-nodes within a
    desktop grid, permitting the utilization of data parallelism in addition
    to the task parallelism. We integrate our new paradigm into the existing
    software environment of the United Devices MetaProcessor. For a test
    protein, we reach a better quality of the folding calculations than we
    reached using just task parallelism on distributed systems.}
}

@InProceedings{vilayannur:caching,
  author       = {Murali Vilayannur and Anand Sivasubramaniam and Mahmut Kandemir and Rajeev Thakur and Robert Ross},
  title        = {Discretionary Caching for {I/O} on Clusters},
  booktitle    = {Proceedings of the Third IEEE/ACM International Symposium on Cluster Computing and the Grid},
  year         = {2003},
  month        = may,
  pages        = {96--103},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190096abs.htm},
  keywords     = {caching, parallel I/O, pario-bib},
  abstract     = {I/O bottlenecks are already a problem in many large-scale
    applications that manipulate huge datasets. This problem is expected to
    get worse as applications get larger, and the I/O subsystem performance
    lags behind processor and memory speed improvements. Caching I/O blocks is
    one effective way of alleviating disk latencies, and there can be multiple
    levels of caching on a cluster of workstations. Previous studies have
    shown the benefits of caching - whether it be local to a particular node,
    or a shared global cache across the cluster - for certain applications.
    However, we show that while caching is useful in some situations, it can
    hurt performance if we are not careful about what to cache and when to
    bypass the cache. This paper presents compilation techniques and runtime
    support to address this problem. These techniques are implemented and
    evaluated on an experimental Linux/Pentium cluster running a parallel file
    system. Our results using a diverse set of applications (scientific and
    commercial) demonstrate the benefits of a discretionary approach to
    caching for I/O subsystems on clusters, providing as much as 33\% savings
    over indiscriminately caching everything in some applications.}
}

@InProceedings{wiebalck:enbd,
  author       = {Arne Wiebalck and Peter T. Breuer and Volker Lindenstruth and Timm M. Steinbeck},
  title        = {Fault-Tolerant Distributed Mass Storage for {LHC} Computing},
  booktitle    = {Proceedings of the Third IEEE/ACM International Symposium on Cluster Computing and the Grid},
  year         = {2003},
  month        = may,
  pages        = {266--275},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190266abs.htm},
  keywords     = {RAID, fault-tolerance, high-energy physics, parallel I/O, pario-app, pario-bib},
  abstract     = {In this paper we present the concept and first prototyping
    results of a modular fault-tolerant distributed mass storage architecture
    for large Linux PC clusters as they are deployed by the upcoming particle
    physics experiments. The device masquerading technique using an Enhanced
    Network Block Device (ENBD) enables local RAID over remote disks as the
    key concept of the ClusterRAID system. The block level interface to remote
    files, partitions or disks provided by the ENBD makes it possible to use
    the standard Linux software RAID to add fault-tolerance to the system.
    Preliminary performance measurements indicate that the latency is
    comparable to a local hard drive. With four disks throughput rates of up
    to 55MB/s were achieved with first prototypes for a RAID0 setup, and about
    40MB/s for a RAID5 setup.}
}

@InProceedings{zhang:n-spek,
  author       = {Ming Zhang and Qing Yang},
  title        = {Performability Evaluation of Networked Storage Systems Using {N-SPEK}},
  booktitle    = {Workshop on Parallel I/O in Cluster Computing and Computational Grids},
  year         = {2003},
  month        = may,
  pages        = {736--741},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  note         = {Organized at the IEEE/ACM International Symposium on Cluster Computing and the Grid 2003},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190736abs.htm},
  keywords     = {benchmarking, performance, block-level access, pario-bib},
  abstract     = {This paper introduces a new benchmark tool for evaluating
    performance and availability (performability) of networked storage
    systems, specifically storage area network (SAN) that is intended for
    providing block-level data storage with high performance and availability.
    The new benchmark tool, named N-SPEK (Networked-Storage Performability
    Evaluation Kernel module), consists of a controller, several workers, one
    or more probers, and several fault injection modules. N-SPEK is highly
    accurate and efficient since it runs at kernel level and eliminates skews
    and overheads caused by file systems. It allows a SAN architect to
    generate configurable storage workloads to the SAN under test and to
    inject different faults into various SAN components such as network
    devices, storage devices, and controllers. Available performances under
    different workloads and failure conditions are dynamically collected and
    recorded in the N-SPEK over a spectrum of time. To demonstrate its
    functionality, we apply N-SPEK to evaluate the performability of a
    specific iSCSI-based SAN under Linux environment. Our experiments show
    that N-SPEK not only efficiently generates quantitative performability
    results but also reveals a few optimization opportunities for future iSCSI
    implementations.}
}

@InProceedings{zhou:greedy,
  author       = {Xinrong Zhou and Tong Wei},
  title        = {A Greedy {I/O} Scheduling Method in the Storage System of Clusters},
  booktitle    = {Workshop on Parallel I/O in Cluster Computing and Computational Grids},
  year         = {2003},
  month        = may,
  pages        = {712--717},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  note         = {Organized at the IEEE/ACM International Symposium on Cluster Computing and the Grid 2003},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190712abs.htm},
  keywords     = {parallel I/O, disk scheduling, pario-bib},
  abstract     = {As the size of cluster becomes larger, the process ability
    of a cluster increases rapidly. Users will exploit this increased power to
    run scientific, physical and multimedia applications. These kinds of
    data-intensive applications require high performance storage subsystem.
    Parallel storage system such as RAID is widely used in today's clusters.
    In this paper, we bring out a ``greedy'' I/O scheduling method that
    utilizes Scatter and Gather operations inside the PCI-SCSI adapter to
    combine as many I/O operations within the same disk as possible. In this
    way we reduce the numbers of I/O operations and improve the performance of
    the whole storage system. After analyzing RAID control strategy, we find
    out that I/O commands' combination may also bring up data movement in
    memory and this kind of movement will increase the system's overhead. The
    experiment results in our real time operating environment show that a
    better performance can be achieved. The longer the data length is, the
    better improvement we can get, in some case, we can even get over 40\%
    enhancement.}
}

@InProceedings{zhu:ceft-pvfs,
  author       = {Yifeng Zhu and Hong Jiang and Xiao Qin and Dan Feng and David R. Swanson},
  title        = {Improved Read Performance in a Cost-Effective, Fault-Tolerant Parallel Virtual File System {(CEFT-PVFS)}},
  booktitle    = {Workshop on Parallel I/O in Cluster Computing and Computational Grids},
  year         = {2003},
  month        = may,
  pages        = {730--735},
  publisher    = {IEEE Computer Society Press},
  address      = {Tokyo, Japan},
  note         = {Organized at the IEEE/ACM International Symposium on Cluster Computing and the Grid 2003},
  URL          = {http://csdl.computer.org/comp/proceedings/ccgrid/2003/1919/00/19190730abs.htm},
  keywords     = {parallel I/O, fault-tolerance, read performance, parallel file system, PVFS, pario-bib},
  abstract     = {Due to the ever-widening performance gap between processors
    and disks, I/O operations tend to become the major performance bottleneck
    of data-intensive applications on modern clusters. If all the existing
    disks on the nodes of a cluster are connected together to establish high
    performance parallel storage systems, the cluster's overall performance
    can be boosted at no additional cost. CEFT-PVFS (a RAID 10 style parallel
    file system that extends the original PVFS), as one such system, divides
    the cluster nodes into two groups, stripes the data across one group in a
    round-robin fashion, and then duplicates the same data to the other group
    to provide storage service of high performance and high reliability.
    Previous research has shown that the system reliability is improved by a
    factor of more than 40 with mirroring while maintaining a comparable write
    performance. This paper presents another benefit of CEFT-PVFS in which the
    aggregate peak read performance can be improved by as much as 100\% over
    that of the original PVFS by exploiting the increased parallelism. \par
    Additionally, when the data servers, which typically are also
    computational nodes in a cluster environment, are loaded in an unbalanced
    way by applications running in the cluster, the read performance of PVFS
    will be degraded significantly. On the contrary, in the CEFT-PVFS, a
    heavily loaded data server can be skipped and all the desired data is read
    from its mirroring node. Thus the performance will not be affected unless
    both the server node and its mirroring node are heavily loaded.}
}