BibTeX for papers by David Kotz; for complete/updated list see https://www.cs.dartmouth.edu/~kotz/research/papers.html

Conventions used below (text outside an entry is ignored by BibTeX, so these lines act as comments):
  - one field per line, in the same order for every entry of a given type;
  - month uses the predefined three-letter macros (jan ... dec), unquoted, so each style can format it;
  - acronyms and coined names in titles are brace-protected as whole words so styles that downcase titles keep their capitalization;
  - abstract, copyright, and ISBN13 are non-standard fields that the standard styles ignore; ISBN repeats the ISBN13 value under the standard field name.

@InProceedings{fazio:sampling,
  author      = {Phillip A. Fazio and Keren Tan and David Kotz},
  title       = {Effects of network trace sampling methods on privacy and utility metrics},
  booktitle   = {Proceedings of the Annual Workshop on Wireless Systems: Advanced Research and Development (WISARD)},
  year        = 2012,
  month       = jan,
  pages       = {1--8},
  publisher   = {IEEE},
  copyright   = {IEEE},
  DOI         = {10.1109/COMSNETS.2012.6151387},
  URL         = {https://www.cs.dartmouth.edu/~kotz/research/fazio-sampling/index.html},
  abstract    = {Researchers choosing to share wireless-network traces with colleagues must first anonymize sensitive information, trading off the removal of information in the interest of identity protection and the preservation of useful data within the trace. While several metrics exist to quantify this privacy-utility tradeoff, they are often computationally expensive. Computing these metrics using a \emph{sample} of the trace could potentially save precious time. In this paper, we examine several sampling methods to discover their effects on measurement of the privacy-utility tradeoff when anonymizing network traces. We tested the relative accuracy of several packet and flow-sampling methods on existing privacy and utility metrics. We concluded that, for our test trace, no single sampling method we examined allowed us to accurately measure the tradeoff, and that some sampling methods can produce grossly inaccurate estimates of those values. We call for further research to develop sampling methods that maintain relevant privacy and utility properties.},
}

@InProceedings{fazio:netsani,
  author      = {Phil Fazio and Keren Tan and Jihwang Yeo and David Kotz},
  title       = {Short Paper: The {NetSANI} Framework for Analysis and Fine-tuning of Network Trace Sanitization},
  booktitle   = {Proceedings of the ACM Conference on Wireless Network Security (WiSec)},
  year        = 2011,
  month       = jun,
  pages       = {5--10},
  publisher   = {ACM},
  copyright   = {ACM},
  DOI         = {10.1145/1998412.1998416},
  URL         = {https://www.cs.dartmouth.edu/~kotz/research/fazio-netsani/index.html},
  abstract    = {Anonymization is critical prior to sharing wireless-network traces within the research community, to protect both personal and organizational sensitive information from disclosure. One difficulty in anonymization, or more generally, sanitization, is that users lack information about the quality of a sanitization result, such as how much privacy risk a sanitized trace may expose, and how much research utility the sanitized trace may retain. We propose a framework, NetSANI, that allows users to analyze and control the privacy/utility tradeoff in network sanitization. NetSANI can accommodate most of the currently available privacy and utility metrics for network trace sanitization. This framework provides a set of APIs for analyzing the privacy/utility tradeoff by comparing the changes in privacy and utility levels of a trace for a sanitization operation. We demonstrate the framework with a quantitative evaluation on wireless-network traces.},
}

@TechReport{tan:crf-tr,
  author      = {Keren Tan and Guanhua Yan and Jihwang Yeo and David Kotz},
  title       = {Privacy Analysis of User Association Logs in a Large-scale Wireless {LAN}},
  institution = {Dartmouth Computer Science},
  year        = 2011,
  month       = jan,
  number      = {TR2011-679},
  copyright   = {the authors},
  URL         = {https://www.cs.dartmouth.edu/~kotz/research/tan-crf-tr/index.html},
  abstract    = {User association logs collected from a large-scale wireless LAN record where and when a user has used the network. Such information plays an important role in wireless network research. One concern of sharing these data with other researchers, however, is that the logs pose potential privacy risks for the network users. Today, the common practice in sanitizing these data before releasing them to the public is to anonymize users' sensitive information, such as their devices' MAC addresses and their exact association locations. In this work, we demonstrate that such sanitization measures are insufficient to protect user privacy because the differences between user association behaviors can be modeled and many are distinguishable. By simulating an adversary's role, we propose a novel type of correlation attack in which the adversary uses the anonymized association log to build signatures against each user, and when combined with auxiliary information, such signatures can help to identify users within the anonymized log. On a user association log that contains more than four thousand users and millions of association records, we demonstrate that this attack technique is able to pinpoint the victim's identity exactly with a probability as high as 70\%, and narrow it down to a set of 20 candidates with a probability close to 100\%. We further evaluate the effectiveness of standard anonymization techniques, including generalization and perturbation, in mitigating this correlation attack; our experimental results reveal only limited success of these methods, suggesting that more thorough treatment is needed when anonymizing wireless user association logs before public release.},
}

@InProceedings{tan:crf,
  author      = {Keren Tan and Guanhua Yan and Jihwang Yeo and David Kotz},
  title       = {Privacy analysis of user association logs in a large-scale wireless {LAN}},
  booktitle   = {Proceedings of the Annual Joint Conference of the IEEE Computer and Communications Societies (INFOCOM) mini-conference},
  year        = 2011,
  month       = apr,
  pages       = {31--35},
  publisher   = {IEEE},
  copyright   = {IEEE},
  DOI         = {10.1109/INFCOM.2011.5935168},
  URL         = {https://www.cs.dartmouth.edu/~kotz/research/tan-crf/index.html},
  abstract    = {User association logs collected from a large-scale wireless LAN record where and when a user has used the network. Such information plays an important role in wireless network research. One concern of sharing these data with other researchers, however, is that the logs pose potential privacy risks for the network users. Today, the common practice in sanitizing these data before releasing them to the public is to anonymize users' sensitive information, such as their devices' MAC addresses and their exact association locations. In this work, we aim to study whether such sanitization measures are sufficient to protect user privacy. By simulating an adversary's role, we propose a novel type of correlation attack in which the adversary uses the anonymized association log to build signatures against each user, and when combined with auxiliary information, such signatures can help to identify users within the anonymized log. Using a user association log that contains more than four thousand users and millions of association records, we demonstrate that this attack technique, under certain circumstances, is able to pinpoint the victim's identity exactly with a probability as high as 70\%, or narrow it down to a set of 20 candidates with a probability close to 100\%. We further evaluate the effectiveness of standard anonymization techniques, including generalization and perturbation, in mitigating correlation attacks; our experimental results reveal only limited success of these methods, suggesting that more thorough treatment is needed when anonymizing wireless user association logs before public release.},
}

@InCollection{tan:survey,
  author      = {Keren Tan and Jihwang Yeo and Michael E. Locasto and David Kotz},
  title       = {Catch, Clean, and Release: A Survey of Obstacles and Opportunities for Network Trace Sanitization},
  booktitle   = {Privacy-Aware Knowledge Discovery: Novel Applications and New Techniques},
  editor      = {Francesco Bonchi and Elena Ferrari},
  year        = 2011,
  month       = jan,
  chapter     = 5,
  pages       = {111--141},
  publisher   = {Chapman and Hall/CRC Press},
  copyright   = {Chapman and Hall/CRC Press},
  ISBN13      = 9781439803653,
  ISBN        = {9781439803653},
  URL         = {https://www.cs.dartmouth.edu/~kotz/research/tan-survey/index.html},
  abstract    = {Network researchers benefit tremendously from access to traces of production networks, and several repositories of such network traces exist. By their very nature, these traces capture sensitive business and personal activity. Furthermore, network traces contain significant operational information about the target network, such as its structure, identity of the network provider, or addresses of important servers. To protect private or proprietary information, researchers must ``sanitize'' a trace before sharing it. \par In this chapter, we survey the growing body of research that addresses the risks, methods, and evaluation of network trace sanitization. Research on the risks of network trace sanitization attempts to extract information from published network traces, while research on sanitization methods investigates approaches that may protect against such attacks. Although researchers have recently proposed both quantitative and qualitative methods to evaluate the effectiveness of sanitization methods, such work has several shortcomings, some of which we highlight in a discussion of open problems. Sanitizing a network trace, however challenging, remains an important method for advancing network-based research.},
}

@TechReport{fazio:thesis,
  author      = {Phillip A. Fazio},
  title       = {Effects of network trace sampling methods on privacy and utility metrics},
  institution = {Dartmouth College, Computer Science},
  year        = 2011,
  month       = jun,
  number      = {TR2011-697},
  copyright   = {the author},
  address     = {Hanover, NH},
  URL         = {https://www.cs.dartmouth.edu/~kotz/research/fazio-thesis/index.html},
  abstract    = {Researchers studying computer networks rely on the availability of traffic trace data collected from live production networks. Those choosing to share trace data with colleagues must first remove or otherwise anonymize sensitive information. This process, called sanitization, represents a tradeoff between the removal of information in the interest of identity protection and the preservation of data within the trace that is most relevant to researchers. While several metrics exist to quantify this privacy-utility tradeoff, they are often computationally expensive. Computing these metrics using a sample of the trace, rather than the entire input trace, could potentially save precious time and space resources, provided the accuracy of these values does not suffer. In this paper, we examine several simple sampling methods to discover their effects on measurement of the privacy-utility tradeoff when anonymizing network traces prior to their sharing or publication. After sanitizing a small sample trace collected from the Dartmouth College wireless network, we tested the relative accuracy of a variety of previously implemented packet and flow-sampling methods on a few existing privacy and utility metrics. This analysis led us to conclude that, for our test trace, no single sampling method we examined allowed us to accurately measure the trade-off, and that some sampling methods can produce grossly inaccurate estimates of those values. We were unable to draw conclusions on the use of packet versus flow sampling in these instances.},
}

@PhdThesis{tan:thesis,
  author      = {Keren Tan},
  title       = {Large-scale Wireless Local-area Network Measurement and Privacy Analysis},
  school      = {Dartmouth College Computer Science},
  year        = 2011,
  month       = aug,
  copyright   = {Keren Tan},
  address     = {Hanover, NH},
  URL         = {https://www.cs.dartmouth.edu/~kotz/research/tan-thesis/index.html},
  note        = {Available as Dartmouth Computer Science Technical Report TR2011-703},
  abstract    = {The edge of the Internet is increasingly becoming wireless. Understanding the wireless edge is therefore important for understanding the performance and security aspects of the Internet experience. This need is especially necessary for enterprise-wide wireless local-area networks (WLANs) as organizations increasingly depend on WLANs for mission-critical tasks. To study a live production WLAN, especially a large-scale network, is a difficult undertaking. Two fundamental difficulties involved are (1) building a scalable network measurement infrastructure to collect traces from a large-scale production WLAN, and (2) preserving user privacy while sharing these collected traces to the network research community. In this dissertation, we present our experience in designing and implementing one of the largest distributed WLAN measurement systems in the United States, the Dartmouth Internet Security Testbed (DIST), with a particular focus on our solutions to the challenges of efficiency, scalability, and security. We also present an extensive evaluation of the DIST system. To understand the severity of some potential trace-sharing risks for an enterprise-wide large-scale wireless network, we conduct privacy analysis on one kind of wireless network traces, a user-association log, collected from a large-scale WLAN. We introduce a machine-learning based approach that can extract and quantify sensitive information from a user-association log, even though it is sanitized. Finally, we present a case study that evaluates the tradeoff between utility and privacy on WLAN trace sanitization.},
}

@InProceedings{tan:crf-s3,
  author      = {Keren Tan and Guanhua Yan and Jihwang Yeo and David Kotz},
  title       = {A Correlation Attack Against User Mobility Privacy in a Large-scale {WLAN} network},
  booktitle   = {Proceedings of the ACM MobiCom S3 workshop},
  year        = 2010,
  month       = sep,
  pages       = {33--35},
  publisher   = {ACM},
  copyright   = {ACM},
  DOI         = {10.1145/1860039.1860050},
  URL         = {https://www.cs.dartmouth.edu/~kotz/research/tan-crf-s3/index.html},
  abstract    = {User association logs collected from real-world wireless LANs have facilitated wireless network research greatly. To protect user privacy, the common practice in sanitizing these data before releasing them to the public is to anonymize users' sensitive information such as the MAC addresses of their devices and their exact association locations. In this work, we demonstrate that these sanitization measures are insufficient in protecting user privacy from a novel type of correlation attack that is based on CRF (Conditional Random Field). In such a correlation attack, the adversary observes the victim's AP (Access Point) association activities for a short period of time and then infers her corresponding identity in a released user association dataset. Using a user association log that contains more than three thousand users and millions of AP association records, we demonstrate that the CRF-based technique is able to pinpoint the victim's identity exactly with a probability as high as 70\%.},
}

@TechReport{yeo:poll-tr,
  author      = {Jihwang Yeo and Keren Tan and David Kotz},
  title       = {User survey regarding the needs of network researchers in trace-anonymization tools},
  institution = {Dartmouth Computer Science},
  year        = 2009,
  month       = nov,
  number      = {TR2009-658},
  copyright   = {the authors},
  address     = {Hanover, NH},
  URL         = {https://www.cs.dartmouth.edu/~kotz/research/yeo-poll-tr/index.html},
  abstract    = {To understand the needs of network researchers in an anonymization tool, we conducted a survey on the network researchers. We invited network researchers world-wide to the survey by sending invitation emails to well-known mailing lists whose subscribers may be interested in network research with collecting, sharing and sanitizing network traces.},
}