@phdthesis{brewington:thesis,
author = {Brian Brewington},
title = {Observation of changing information sources},
year = {2000},
month = {June},
school = {Thayer School of Engineering, Dartmouth College},
copyright = {Brian Brewington},
group = {agents, actcomm},
url = {http://actcomm.dartmouth.edu/papers/brewington:thesis.ps.gz},
keyword = {world-wide web, search engine, modeling},
abstract = {Many modern information management tasks consist of an observer
that must maintain current knowledge of a collection of changing information.
The goal of this observer is to maintain acceptably accurate state estimates
given limited observation resources, such as bandwidth, time, and storage.
Good examples of such ``observation problems'' are found in any situation
where bandwidth is limited and old observations become less useful over time.
Two such examples are maintaining a search engine's index of the World Wide
Web (WWW) and automated monitoring of multiple sensors. This thesis addresses
the general observation problem by (1) devising a formal framework of what it
means to be ``up-to-date'', (2) gathering empirical data about the web that
allows us to apply this framework to an important setting, and (3) presenting
algorithms for scheduling revisits to optimize formal performance measures.
One year's worth of web page observations are analyzed to show how quickly
and in what ways web pages change. The observations are used to estimate the
distribution of web page change rates. Using this data as a model of the web
we present and benchmark an algorithm for optimizing the observation of a set
of web documents such that fewer changes go unnoticed. Our experiments on
real search engines show that between 40 and 50% of results returned have
been altered at least once since the search engine last observed them. Our
algorithm can theoretically reduce these figures to between 36 and 44%,
assuming that existing search engines currently use simple round-robin
re-indexing strategies. These methods benefit the ``overworked'' observer
much more than the one with ample bandwidth, and are general enough to be of
use in a wide variety of monitoring tasks.},
}