@InProceedings{arge:sorting-strings, author = {Lars Arge and Paolo Ferragina and Roberto Grossi and Jeffrey S. Vitter}, title = {On sorting strings in external memory}, booktitle = {Proceedings of the 29th Annual ACM Symposium on Theory of Computing}, year = {1997}, month = {May}, pages = {540--548}, publisher = {ACM Press}, address = {El Paso}, URL = {file://ftp.cs.duke.edu/pub/jsv/Papers/AFG97.stringsorting.ps.gz}, keyword = {verify authors and month, out-of-core algorithm, sorting, parallel I/O, pario-bib}, abstract = {In this paper we address for the first time the I/O complexity of the problem of sorting strings in external memory, which is a fundamental component of many large-scale text applications. In the standard unit-cost RAM comparison model, the complexity of sorting K strings of total length N is theta(K log K + N). By analogy, in the external memory (or I/O) model, where the internal memory has size M and the block transfer size is B, it would be natural to guess that the I/O complexity of sorting strings is theta(K/B log_(M/B) (K/B) + N/B), but the known algorithms do not come even close to achieving this bound. Our results show, somewhat counterintuitively, that the I/O complexity of string sorting depends upon the length of the strings relative to the block size. We first consider a simple comparison I/O model, where one is not allowed to break the strings into their characters, and we show that the I/O complexity of string sorting in this model is theta(N_1/B log_(M/B) (N_1/B) + K_2 log_(M/B) K_2 + N/B), where N_1 is the total length of all strings shorter than B and K_2 is the number of strings longer than B. We then consider two more general I/O comparison models in which string breaking is allowed. We obtain improved algorithms and in several cases lower bounds that match their I/O bounds. Finally, we develop more practical algorithms without assuming the comparison model.} } @InProceedings{ferragina:soda96, author = {Paolo Ferragina and Roberto Grossi}, title = {Fast String Searching in Secondary Storage: Theoretical Developments and Experimental Results}, booktitle = {Proceedings of the ACM-SIAM Symposium on Discrete Algorithms (SODA `96)}, year = {1996}, pages = {373--382}, publisher = {ACM Press}, address = {Atlanta}, URL = {http://www.di.unipi.it/~ferragin/Latex/jsoda96.ps.gz}, keyword = {verify month and authors, out-of-core algorithm, parallel I/O, pario-bib}, abstract = {In a previous work [Ferragina-Grossi, ACM STOC 95], we proposed a text indexing data structure for secondary storage, which we called SB-tree, that combines the best of B-trees and suffix arrays, overcoming the limitations of inverted files, suffix arrays, suffix trees, and prefix B-trees. In this paper we study the performance of SB-trees in a practical setting, performing a set of searching and updating experiments. Improved performance was obtained by a new space efficient and alphabet-independent organization of the internal nodes of the SB-tree, and a new batch insertion procedure that avoids thrashing.} } @InProceedings{ferragina:stoc95, author = {Paolo Ferragina and Roberto Grossi}, title = {A Fully-dynamic data structure for external substring search}, booktitle = {Proceedings of the 27th Annual ACM Symposium on Theory of Computing}, year = {1995}, pages = {693--702}, address = {Las Vegas}, keyword = {verify authors and month and URL, out-of-core algorithm, parallel I/O, pario-bib} }