@Article{arge:jsegments, author = {Lars Arge and Darren Erik Vengroff and Jeffrey Scott Vitter}, title = {External-Memory Algorithms for Processing Line Segments in Geographic Information Systems}, journal = {Algorithmica}, year = {1997}, note = {To appear}, earlier = {arge:segments}, URL = {ftp://cs.duke.edu/pub/jsv/Papers/AVV97.SegmentGIS.ps.gz}, keyword = {verify, out-of-core algorithm, computational geometry, pario-bib}, abstract = {We present a set of algorithms designed to solve large-scale geometric problems involving collections of line segments in the plane. Geographical information systems (GIS) handle large amounts of spatial data, and at some level the data is often manipulated as collections of line segments. NASA's EOS project is an example of a GIS that is expected to store and manipulate petabytes (thousands of terabytes, or millions of gigabytes) of data! In the design of algorithms for this type of large-scale application, it is essential to consider the problem of minimizing I/O communication, which is the bottleneck. \par In this paper we develop efficient new external-memory algorithms for a number of important problems involving line segments in the plane, including trapezoid decomposition, batched planar point location, triangulation, red-blue line segment intersection reporting, and general line segment intersection reporting. In GIS systems, the first three problems are useful for rendering and modeling, and the latter two are frequently used for overlaying maps and extracting information from them. To solve these problems, we combine and modify in novel ways several of the previously known techniques for designing efficient algorithms for external memory. We also develop a powerful new technique that can be regarded as a practical external memory version of fractional cascading. Except for the batched planar point location problem, no algorithms specifically designed for external memory were previously known for these problems. Our algorithms for triangulation and line segment intersection partially answer previously posed open problems, while the batched planar point location algorithm improves on the previously known solution, which applied only to monotone decompositions. Our algorithm for the red-blue line segment intersection problem is provably optimal.}, comment = {Special issue on cartography and geographic information systems.} } @InProceedings{arge:segments, author = {Lars Arge and Darren Erik Vengroff and Jeffrey Scott Vitter}, title = {External-Memory Algorithms for Processing Line Segments in Geographic Information Systems}, booktitle = {Proceedings of the Third European Symposium on Algorithms}, year = {1995}, month = {September}, pages = {295--310}, address = {Corfu, Greece}, note = {Available as Springer-Verlag LNCS 979}, later = {arge:jsegments}, URL = {ftp://cs.duke.edu/pub/jsv/Papers/AVV95.SegmentGIS.ps.Z}, keyword = {out-of-core algorithm, computational geometry, pario-bib}, abstract = {In the design of algorithms for large-scale applications it is essential to consider the problem of minimizing I/O communication. Geographical information systems (GIS) are good examples of such large-scale applications as they frequently handle huge amounts of spatial data. In this paper we develop efficient new external-memory algorithms for a number of important problems involving line segments in the plane, including trapezoid decomposition, batched planar point location, triangulation, red-blue line segment intersection reporting, and general line segment intersection reporting. In GIS systems, the first three problems are useful for rendering and modeling, and the latter two are frequently used for overlaying maps and extracting information from them.}, comment = {Does deal with parallel disks, though not in great detail.} } @Article{barve:jmergesort, author = {Rakesh D. Barve and Edward F. Grove and Jeffrey S. Vitter}, title = {Simple Randomized Mergesort on Parallel Disks}, journal = {Parallel Computing}, year = {1997}, volume = {23}, number = {4}, publisher = {North-Holland (Elsevier Scientific)}, note = {To appear}, earlier = {barve:mergesort}, URL = {file://cs.duke.edu/pub/jsv/Papers/BGV96.Simple_Mergesort.ps.gz}, keyword = {verify, parallel I/O algorithm, sorting, pario-bib}, abstract = {We consider the problem of sorting a file of N records on the D-disk model of parallel I/O in which there are two sources of parallelism. Records are transferred to and from disk concurrently in blocks of B contiguous records. In each I/O operation, up to one block can be transferred to or from each of the D disks in parallel. We propose a simple, efficient, randomized mergesort algorithm called SRM that uses a forecast-and-flush approach to overcome the inherent difficulties of simple merging on parallel disks. SRM exhibits a limited use of randomization and also has a useful deterministic version. Generalizing the technique of forecasting, our algorithm is able to read in, at any time, the ``right'' block from any disk, and using the technique of flushing, our algorithm evicts, without any I/O overhead, just the ``right'' blocks from memory to make space for new ones to be read in. The disk layout of SRM is such that it enjoys perfect write parallelism, avoiding fundamental inefficiencies of previous mergesort algorithms. By analysis of generalized maximum occupancy problems we are able to derive an analytical upper bound on SRM's expected overhead valid for arbitrary inputs. \par The upper bound derived on expected I/O performance of SRM indicates that SRM is provably better than disk-striped mergesort (DSM) for realistic parameter values D, M, and B. Average-case simulations show further improvement on the analytical upper bound. Unlike previously proposed optimal sorting algorithms, SRM outperforms DSM even when the number D of parallel disks is small.}, comment = {This paper formerly called barve:mergesort; I discovered that the paper had appeared in SPAA96, so the SPAA96 paper is now called barve:mergesort.} } @Article{gibson:sdcr, author = {Garth A. Gibson and Jeffrey Scott Vitter and John Wilkes}, title = {Report of the Working Group on Storage {I/O} Issues in Large-Scale Computing}, journal = {ACM Computing Surveys}, year = {1996}, month = {December}, volume = {28}, number = {4}, URL = {file://cs.duke.edu/pub/jsv/Papers/GVW96.storage_IO.ps.gz}, abstract = {We discuss the strategic directions and challenges in the management and use of storage systems--those components of computer systems responsible for the storage and retrieval of data. The performance gap between main and secondary memories shows no imminent sign of vanishing, and thus continuing research into storage I/O will be essential to reap the full benefit from the advances occurring in many other areas of computer science. In this report we identify a few strategic research goals and possible thrusts to meet those goals.} } @TechReport{khanna:group, author = {Sanjay Khanna and David Kotz}, title = {A Split-Phase Interface for Parallel File Systems}, year = {1997}, month = {March}, number = {PCS-TR97-312}, institution = {Dept. of Computer Science, Dartmouth College}, URL = {ftp://ftp.cs.dartmouth.edu/TR/TR97-312.ps.Z}, keyword = {multiprocessor file system interface, run-time library, parallel file system, parallel I/O, pario-bib}, abstract = {We describe the effects of a new user-level library for the Galley Parallel File System. This library allows some pre-existing sequential programs to make use of the Galley Parallel File System with minimal modification. It permits programs to efficiently use the parallel file system because the user-level library groups accesses together. We examine the performance of our library, and we show how code needs to be modified to use the library.} }