// Uses Internet Explorer to programmatically retrieve // the contents of a URL, retrieves all links on that URL, // and follows them down recursively, until a specified level. import iexplorer.InternetExplorer; import iexplorer.IWebBrowserApp; import java.util.Hashtable; import java.util.Vector; import mshtml.IHTMLDocument2; import mshtml.IHTMLElementCollection; import mshtml.IHTMLElement; import ezjcom.JComObject; import ezjcom.JComVariant; import ezjcom.JComException; import java.io.BufferedReader; import java.io.InputStreamReader; public class Spider { IWebBrowserApp app = null; WebLoadListener webLoadListener = new WebLoadListener(); Hashtable seenURLs = new Hashtable(); // Spider the given URL to a given depth. void spiderURL( String url, int depth ) { try { // Instantiate an InternetExplorer InternetExplorer ie = new InternetExplorer(); // Get IWebBrowserApp app = ie.getIWebBrowserApp(); // HINT: While developing, make the browser visible!! // app.setVisible( true ); // NOTE: When browser is not visible, the following workaround must // be added for an Internet Explorer bug (http://support.microsoft.com/kb/259935) // Remove it or comment it out if setting the browser visible. app.setLeft( - app.getWidth()); // If IE is trying to load any default pages, stop it. app.Stop(); // Attach the events listener. ie.addJComEventListener( webLoadListener ); // Do the spidering operation recursively. recursiveSpider( url, depth ); } catch (Exception ex) { ex.printStackTrace(); } finally { // Make sure to exit the running IE instance. if ( app != null ) try { app.Quit(); } catch (Exception ex) {} } } // Recursive component for spidering void recursiveSpider( String url, int depth ) throws JComException, InterruptedException { // If already seen this URL, return if ( seenURLs.containsKey( url )) return; // If already processed to the required depth, return if ( depth < 1 ) return; depth--; System.out.println(); System.out.println( "Loading URL " + url ); // Tell browser to load the URL, and wait for load completion. webLoadListener.initialize(); synchronized ( webLoadListener ) { app.Navigate( url ); webLoadListener.wait(); } url = webLoadListener.getFullURL(); if ( seenURLs.containsKey( url )) { // After URL expansion, do we now recognize it as seen? System.out.println( "URL is already seen" ); return; } System.out.println( "Examining URL " + url ); seenURLs.put( url, url ); // Now retrieve the links in the website. IHTMLDocument2 doc = (IHTMLDocument2) app.getDocument().JComCoerceObjectToAnotherType( IHTMLDocument2.class ); mshtml.IHTMLElementCollection links = doc.getLinks(); if ( links.getLength() <= 0 ) return; System.out.println( "Links:" ); Vector toVisit = new Vector(); for ( int i = 0; i < links.getLength(); i++ ) { JComObject obj = links.item( new JComVariant( i )); IHTMLElement elem = (IHTMLElement) obj.JComCoerceObjectToAnotherType( IHTMLElement.class ); String hRef = elem.getAttribute( "HREF", 0 ).getString(); if ( hRef == null || hRef.trim().equals( "" )) continue; System.out.print( " " + hRef ); String innerText = elem.getInnerText(); if ( innerText != null ) System.out.print( " " + innerText ); if ( seenURLs.containsKey( hRef )) { System.out.println( " (seen)" ); continue; } System.out.println(); toVisit.addElement( hRef ); } if ( depth < 1 ) return; // Recursively spider the links. for ( int i = 0; i < toVisit.size(); i++ ) { recursiveSpider( (String) toVisit.elementAt(i), depth ); } } // The main program just gets the URL and depth, and starts things off. public static void main(String args[]) { try { // Get a buffered reader to read user input. BufferedReader in = new BufferedReader( new InputStreamReader( System.in )); System.out.print( "Enter URL to Spider: " ); String url = in.readLine().trim(); if ( url.equals( "" )) return; System.out.print( "Enter depth to spider (between 1 and 10): " ); int depth = Integer.parseInt( in.readLine().trim()); if ( depth < 1 || depth > 10 ) { System.out.println( "Invalid depth " + depth ); return; } new Spider().spiderURL( url, depth ); } catch (Exception ex) { ex.printStackTrace(); } System.exit(0); } }