/** Filtered Search Skeleton ** ** by Latanya Sweeney ** Copyright (c) 2003 Carnegie Mellon University. ** Limited license provided for the exclusive use of students ** enrolled in the Privacy and Anonymity in Data Course at Carnegie Mellon University ** while students are enrolled in the course. Students may modify as needed ** for lab assignments and course projects. ** ** This program provides a skeleton for filtered searching. Based on search criteria, ** URL pages are fetched and stored locally. Multiple searches can be automatically ** perfomed. Searches can be limited to a particular site. Google is used as the ** basis for the search. ** ** Within the code, places where filtering might occur are noted. ** Alternatively, the cached (.htm, .html and .txt) files ** may be filtered after saving when program ends. ** To use this program, do the following: ** 1. Download the Google Java API located at http://www.google.com/apis/ ** This will place the following subdirectories in your default directory: ** com, org, googleapi, javax, META-INF ** 2. Register for a key from Google and assign your key to the KEY variable below. ** 3. Make any changes to FILE_DIR_DELIMIT or END_OF_LINE as needed ** to match the kind of machine you are using. ** 4. Make a directory named 'cache' as a subdirectory to the default directory ** in which this program runs. ** 5. During runtime you can specify search strings and site. ** 6. Retrieved files will be stored in 'cache' directory with an index stored ** as 'index.txt' in the default directory. **/ import java.awt.event.WindowListener; import java.awt.event.WindowEvent; import java.awt.event.WindowAdapter; import java.awt.Frame; import java.awt.TextArea; import java.awt.TextField; import java.awt.Label; import java.awt.Button; import java.awt.event.ActionListener; import java.awt.event.ActionEvent; import java.awt.FlowLayout; import java.net.URL; import java.io.*; import com.google.soap.search.*; // Google API public class FilteredSearch extends Frame implements ActionListener { String FILE_DIR_DELIMIT = "\\"; // Directory delimiter for computer "\\" for Windows // "/" for Mac and Unix String END_OF_LINE = "\r\n"; // End of line char(s) for computer. "\r\n" for PC. String KEY = "Put your Google key here"; // Google key String DEFAULTsite = "cmu.edu"; String DEFAULTmaxtry = "100"; String DEFAULTkeywords = "resume SSN"; long MAXLINES = 10000; // maximum number of lines in a file to copy int MAXURLS = 1000; // maximum number of URLs to store String URLS[] = new String[MAXURLS]; int urlcount = 0; // number of URLs stored in URLS TextArea results; TextField msg; TextField sitespec; TextField maxtry; TextArea searchspec; Button fetch; public FilteredSearch() { setSize(600, 420); setLayout(new FlowLayout() ); results = new TextArea( "Ready to begin Step 1 of 3." + "\n\n" +"Step 1 estimates number of candidate URLs using a search engine." + "\nStep 2 fetches candidate URLs using a search engine." + "\nStep 3 copy URLs to disk." + "\n\n\t" + "Site\t\t(optional) limit the search to the location specified." + "\n\t" + "Maximum tries\tMaximum number of URLs to try for each search string." + "\n\t" + "Search strings\tKeywords to use with search engine. " + "\n\t" + "\t\tEach line contains a distinct keyword search.", 12, 80); results.setEditable(false); add(results); msg = new TextField("Make selections below then click on button.", 80); add(msg); add( new Label("Site")); sitespec = new TextField(DEFAULTsite, 25); add(sitespec); add( new Label("Maximum tries")); maxtry = new TextField(DEFAULTmaxtry, 5); add(maxtry); add( new Label("Search strings")); searchspec = new TextArea(DEFAULTkeywords, 5, 30); add(searchspec); add( new Label(" ")); // move search strings over to the left fetch = new Button(); fetch.setLabel("1. Estimate hits"); fetch.setActionCommand("fetch"); fetch.addActionListener(this); add(fetch); addWindowListener( new WindowAdapter() { public void windowClosing(WindowEvent e) { System.out.println("Bye"); System.exit(0); } }); } public void done() { msg.setText("Exiting ..."); fetch.setLabel("Exiting..."); System.out.println("Bye"); System.exit(0); } public void actionPerformed(ActionEvent e) { if( fetch.getLabel().compareTo("1. Estimate hits") == 0) doStep1(); else if( fetch.getLabel().compareTo("2. Fetch URLs") == 0) doStep2(); else if( fetch.getLabel().compareTo("3. Copy URLs") == 0) doStep3(); else if( fetch.getLabel().compareTo("Done") == 0) done(); } public void doStep1() { results.setText(""); msg.setText("Estimating numbers of hits ..."); fetch.setLabel("1 Estimating..."); // Create Google Search & Results objects, set authorization key GoogleSearchResult googr; GoogleSearch googs = new GoogleSearch(); googs.setKey(KEY); googs.setMaxResults(1); // max chunk retrieved at a time, not max URLs String searchstrings = searchspec.getText(); String sskey = ""; int ssbegin=0, ssend=-1; try { do { ssbegin = ssend+1; ssend = searchstrings.indexOf('\n', ssbegin); if( ssend < 0) ssend = searchstrings.length(); if( ssbegin < ssend) { sskey = searchstrings.substring(ssbegin, ssend); msg.setText("Estimating hits for search: " + sskey); googs.setQueryString(sskey + " " + "site:" + sitespec.getText()); int linkstotal; googs.setStartResult(0); googr = googs.doSearch(); linkstotal = googr.getEstimatedTotalResultsCount(); System.out.println("-----------------------------"); System.out.println(sitespec.getText()); System.out.println(sskey); System.out.println(linkstotal); System.out.println("-----------------------------"); results.append("\n" + sskey + ", Estimated hits = " + linkstotal); } } while(ssbegin < ssend); } catch (Exception f) { msg.setText("Problem encountered. "); System.out.println("The call to the Google Web APIs failed:"); System.out.println(f.toString()); } fetch.setLabel("2. Fetch URLs"); msg.setText("Done estimating number of hits. Step 2: click to fetch URLs."); } public void doStep2() { results.setText(""); msg.setText("Fetching URLs ..."); fetch.setLabel("2 Fetching..."); // Create Google Search & Results objects, set authorization key GoogleSearchResult googr; GoogleSearch googs = new GoogleSearch(); googs.setKey(KEY); googs.setMaxResults(10); // max chunk retrieved at a time, not max URLs String searchstrings = searchspec.getText(); String sskey = ""; int ssbegin=0, ssend=-1; try { do { ssbegin = ssend+1; ssend = searchstrings.indexOf('\n', ssbegin); if( ssend < 0) ssend = searchstrings.length(); if( ssbegin < ssend) { sskey = searchstrings.substring(ssbegin, ssend); msg.setText("Fetching URLS on search... " + sskey); results.append("\n" + sskey); googs.setQueryString(sskey + " " + "site:" + sitespec.getText()); int linkstotal=0, count = 0, timesindex=1, lastindex= -1; GoogleSearchResultElement[] links = (GoogleSearchResultElement []) null; do { googs.setStartResult(count); googr = googs.doSearch(); linkstotal = googr.getEstimatedTotalResultsCount(); if( linkstotal > 0) { links = googr.getResultElements(); if( googr.getStartIndex() > count ) { for(int i= googr.getStartIndex()-count-1; i < googr.getEndIndex()-count; i++) { if( links[i] != (GoogleSearchResultElement) null) { if( (urlcount < MAXURLS) && (links[i].getURL().endsWith(".htm") || links[i].getURL().endsWith(".html") || links[i].getURL().endsWith(".txt") )) { // save URL, but could filter here URLS[urlcount++] = links[i].getURL(); } results.append("\n\t" + links[i].getURL()); } } count += 10; // maxresults returned } } if( googr.getStartIndex() == lastindex) timesindex++; else timesindex = 1; lastindex = googr.getStartIndex(); } while( (timesindex < 2) && (count < Integer.parseInt(maxtry.getText())) ); } } while(ssbegin < ssend); } catch (Exception f) { msg.setText("Problem encountered. "); System.out.println("The call to the Google Web APIs failed:"); System.out.println(f.toString()); } fetch.setLabel("3. Copy URLs"); msg.setText("Done fetching URLs."); } public void doStep3() { results.setText(""); msg.setText("Copying URLs ..."); fetch.setLabel("3 Copying..."); URL u; BufferedReader in; String s, urlstring; int flipflop = 0; try { long cachefile = 1; for(int i=0; i < urlcount; i++) { try { urlstring = URLS[i]; msg.setText("Copying ... " + urlstring); results.append("\nCopying to " + cachefile + ".txt from URL... " + urlstring); // Copies contents of URL to local disk. Could filter here. u = new URL(urlstring); in = new BufferedReader( new InputStreamReader(u.openStream()) ); long numlines = 0, numchars = 0; FileOutputStream f = new FileOutputStream( "cache" + FILE_DIR_DELIMIT + cachefile + ".txt"); while( ((s = in.readLine()) != null) && (numlines++ < MAXLINES) && ((numchars = numchars + s.length()) < MAXLINES * 80) ) { if( (flipflop++ % 6) == 0) msg.setText("Copying lines (" + numlines + ")... " + s); writeString(f, s + END_OF_LINE); } in.close(); f.close(); results.append("\n\tLines= " + numlines ); cachefile++; } catch (Exception ex) { msg.setText("Error encountered." ); System.out.println("Error encountered copying URL: " + ex.toString() ); } } if( urlcount > 0) { FileOutputStream f = new FileOutputStream("index.txt"); for(int i = 0; i < urlcount; i++) writeString(f, "[" + i + "] " + URLS[i] + END_OF_LINE); f.close(); } } catch (Exception ex) { msg.setText("Error encountered." ); System.out.println("Error encountered: " + ex.toString() ); } fetch.setLabel("Done"); msg.setText("Done copying URL contents."); } public void writeString(FileOutputStream fout, String s) throws IOException { for(int pos=0; pos < s.length(); pos++) fout.write( s.charAt(pos) ); } public String readLine(FileInputStream in) { int c= -1; String s = ""; try { do { if( (c = in.read()) >= 0) if((c != (int) '\r') && (c != (int) '\n') ) s = s + (char) c; } while( (c > 0) && (c != (int) '\n') ); } catch(Exception e) { ; } if( (c == -1) && (s == "")) return (String) null; return s; } public boolean readLine(FileInputStream in, StringBuffer s) { int c= -1; s.delete(0, s.length()); try { do { if( (c = in.read()) >= 0) if((c != (int) '\r') && (c != (int) '\n') ) s.append((char) c); } while( (c > 0) && (c != (int) '\n') ); } catch(Exception e) { ; } if( (c == -1) && (s.length() <= 0)) return false; return true; } public static void main(String args []) { FilteredSearch w = new FilteredSearch(); w.setTitle("Filtered Search v1.0"); w.setVisible(true); } }