import java.util.*; /** * Identifies unique words in a document, via a set * * @author Haris Baig and Chris Bailey-Kellogg, Dartmouth CS 10, Fall 2012 * @author Tim Pierson, CS 10, Winter 2017 -- added comments */ public class UniqueWords { public static void main(String[] args) { String page = "Pretend that this string was loaded from a web page. We won't go to all that trouble here. This string contains multiple words. And multiple copies of multiple words. And multiple words with multiple copies. It is to be used as a test to demonstrate how sets work in removing redundancy by keeping only one copy of each thing. Is it very very redundant in having more than one copy of some words?"; String[] allWords = page.split("[ .,?!]+"); // split on punctuation and white space // Declare new Set to hold unique words Set uniqueWords = new TreeSet(); // Loop over all the words split out of the string, adding to set for (String s: allWords) { uniqueWords.add(s.toLowerCase()); // Calling add() method for duplicate words just overwrites existing entries } System.out.println(allWords.length + " words"); //note: this is not the set, this is the String array of words after parsing String page System.out.println(uniqueWords.size() + " unique words"); //this is the set, size returns how many elements (e.g., the unique words) are present System.out.println(uniqueWords); //print the unique words } }