1   /*
2    *  HtmlLinksExtractor.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  16/Nov/2001
12   *
13   *  $Id: HtmlLinksExtractor.java,v 1.3 2001/12/04 15:35:15 hamish Exp $
14   */
15  
16  package gate.util;
17  
18  import javax.swing.text.html.*;
19  import javax.swing.text.html.parser.*;
20  import javax.swing.text.html.HTMLEditorKit.*;
21  import javax.swing.text.*;
22  import java.util.*;
23  import java.io.*;
24  
25  /**
26   * This class extracts links from HTML files.
27   * <B>It has been hacked</B> to build the contents of
28   * <A HREF="http://gate.ac.uk/sitemap.html">http://gate.ac.uk/sitemap.html</A>;
29   * you <B>probably don't want to use it</B> for anything else!
30   * <P>
31   * Implements the behaviour of the HTML reader.
32   * Methods of an object of this class are called by the HTML parser when
33   * events will appear.
34   */
35  public class HtmlLinksExtractor extends ParserCallback {
36  
37    /** Debug flag */
38    private static final boolean DEBUG = false;
39  
40    /** The tag currently being processed */
41    private HTML.Tag currentTag = null;
42  
43    /** whether we've done a title before */
44    static boolean firstTitle = true;
45  
46    /** will contain &lt;/UL&gt; after first title */
47    static String endUl = "";
48  
49    /** Name of the file we're currently processing */
50    static String currFile = "";
51  
52    /** Path to the file we're currently processing */
53    static String currPath = "";
54  
55    /** This method is called when the HTML parser encounts the beginning
56      * of a tag that means that the tag is paired by an end tag and it's
57      * not an empty one.
58      */
59    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
60  
61      currentTag = t;
62      if (HTML.Tag.A == t){
63        Out.pr("<LI><" + t);
64        String href = "";
65        Enumeration e = a.getAttributeNames();
66        while(e.hasMoreElements()) {
67          HTML.Attribute name = (HTML.Attribute) e.nextElement();
68          String value = (String) a.getAttribute(name);
69  
70          if(name == HTML.Attribute.HREF) {
71            if(
72              value.startsWith("http:") || value.startsWith("HTTP:") ||
73              value.startsWith("file:") || value.startsWith("FILE:") ||
74              value.startsWith("mailto:") || value.startsWith("MAILTO:") ||
75              value.startsWith("ftp:") || value.startsWith("FTP:")
76            )
77              Out.pr(" HREF=\"" + value + "\"");
78            else { // if it is a relative path....
79              Out.pr(" HREF=\"" + currPath + "/" + value + "\"");
80            }
81          }
82        } // while
83  
84        Out.pr(">");
85      }// End if
86  
87      if (HTML.Tag.TITLE == t){
88        Out.pr(endUl + "<H3>");
89        if(firstTitle) { firstTitle = false; endUl = "</UL>"; }
90      }// End if
91  
92    }//handleStartTag
93  
94    private void printAttributes(MutableAttributeSet a){
95      if (a == null) return;
96      // Take all the attributes an put them into the feature map
97      if (0 != a.getAttributeCount()){
98        Enumeration enum = a.getAttributeNames();
99        while (enum.hasMoreElements()){
100         Object attribute = enum.nextElement();
101         Out.pr(" "+ attribute.toString() + "=\"" +
102                                   a.getAttribute(attribute).toString()+"\"");
103       }// End while
104     }// End if
105   }// printAttributes();
106 
107    /** This method is called when the HTML parser encounts the end of a tag
108      * that means that the tag is paired by a beginning tag
109      */
110   public void handleEndTag(HTML.Tag t, int pos){
111     currentTag = null;
112 
113     if (HTML.Tag.A == t)
114       Out.pr("</"+t+">\n");
115     if (HTML.Tag.TITLE == t)
116       Out.pr(
117         "</H3></A>\n\n<P>Links in: <A HREF=\"" + currFile +
118         "\">" + currFile + "</A>:\n<UL>\n"
119       );
120 
121   }//handleEndTag
122 
123   /** This method is called when the HTML parser encounts an empty tag
124     */
125   public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
126     if (HTML.Tag.A == t){
127       Out.pr("<"+t);
128       printAttributes(a);
129       Out.pr("/>\n");
130     }// End if
131 
132     if (HTML.Tag.TITLE == t){
133       Out.pr("<"+t);
134       printAttributes(a);
135       Out.pr("/>\n");
136     }// End if
137   } // handleSimpleTag
138 
139   /** This method is called when the HTML parser encounts text (PCDATA)*/
140   public void handleText(char[] text, int pos){
141 
142     if(HTML.Tag.A == currentTag){
143       //text of tag A
144       String tagText = new String(text);
145       Out.pr(tagText);
146     }// End if
147 
148     if(HTML.Tag.TITLE == currentTag){
149       //text of tag A
150       String tagText = new String(text);
151       Out.pr(tagText);
152     }// End if
153 
154   }// end handleText();
155 
156   /**
157     * This method is called when the HTML parser encounts an error
158     * it depends on the programmer if he wants to deal with that error
159     */
160   public void handleError(String errorMsg, int pos) {
161     //Out.println ("ERROR CALLED : " + errorMsg);
162   }
163 
164   /** This method is called once, when the HTML parser reaches the end
165     * of its input streamin order to notify the parserCallback that there
166     * is nothing more to parse.
167     */
168   public void flush() throws BadLocationException{
169   }// flush
170 
171   /** This method is called when the HTML parser encounts a comment
172     */
173   public void handleComment(char[] text, int pos) {
174   }
175 
176   /**
177    * Given a certain folder it lists recursively all the files contained
178    * in that folder. It returns a list of strings representing the file
179    * names
180    */
181   private static List listAllFiles(File aFile, Set foldersToIgnore){
182     java.util.List sgmlFileNames = new ArrayList();
183     java.util.List foldersToExplore = new ArrayList();
184     if (!aFile.isDirectory()){
185       // add the file to the file list
186       sgmlFileNames.add(aFile.getPath());
187       return sgmlFileNames;
188     }// End if
189     listFilesRec(aFile,sgmlFileNames,foldersToExplore, foldersToIgnore);
190     return sgmlFileNames;
191   } // listAllFiles();
192 
193   /** Helper method for listAllFiles */
194   private static void listFilesRec(File aFile,
195                                   java.util.List fileNames,
196                                   java.util.List foldersToExplore,
197                                   Set foldersToIgnore){
198 
199     String[] fileList = aFile.list();
200     for (int i=0; i< fileList.length; i++){
201       File tmpFile = new File(aFile.getPath()+"\\"+fileList[i]);
202       if (tmpFile.isDirectory()){
203         // If the file is not included
204         if (!foldersToIgnore.contains(tmpFile.getName())) {  //fileList[i])) {
205           if(DEBUG) {
206             Err.prln("adding dir: " + tmpFile);
207             Err.prln("  name: " + tmpFile.getName());
208           }
209           foldersToExplore.add(tmpFile);
210         }
211       }else{
212         // only process .html files
213         if(
214           ( fileList[i].toLowerCase().endsWith(".html") ) ||
215           ( fileList[i].toLowerCase().endsWith(".htm") )
216         ) fileNames.add(tmpFile.getPath());
217       }// End if
218     }// End for
219 
220     while(!foldersToExplore.isEmpty()){
221       File folder = (File)foldersToExplore.get(0);
222       foldersToExplore.remove(0);
223       listFilesRec(folder,fileNames,foldersToExplore,foldersToIgnore);
224     }//End while
225 
226   } // listFilesRec();
227 
228   /** Extract links from all .html files below a directory */
229   public static void main(String[] args){
230     HTMLEditorKit.Parser  parser = new ParserDelegator();
231     // create a new Htmldocument handler
232     HtmlLinksExtractor htmlDocHandler = new HtmlLinksExtractor();
233 
234     if (args.length == 0){
235       Out.prln(
236         "Eg: java HtmlLinksExtractor g:\\tmp\\relative javadoc img > results.txt"
237       );
238       return;
239     }
240     // Create a folder file File
241     File htmlFolder = new File(args[0]);
242     Set foldersToIgnore = new HashSet();
243     for(int i = 1; i<args.length; i++)
244       foldersToIgnore.add(args[i]);
245 
246     List htmlFileNames = listAllFiles(htmlFolder,foldersToIgnore);
247     //Collections.sort(htmlFileNames);
248     while (!htmlFileNames.isEmpty()){
249       try{
250         String htmlFileName = (String) htmlFileNames.get(0);
251         currFile = htmlFileName;
252         currPath = new File(currFile).getParent().toString();
253         htmlFileNames.remove(0);
254 
255         Out.prln("\n\n<A HREF=\"file://" + htmlFileName + "\">");
256         Reader reader = new FileReader(htmlFileName);
257         // parse the HTML document
258         parser.parse(reader, htmlDocHandler, true);
259       } catch (IOException e){
260         e.printStackTrace(System.out);
261       }// End try
262     }// End while
263     System.err.println("done.");
264   }// main
265 
266 }//End class HtmlLinksExtractor
267 
268 
269 
270