1   /*
2    *  HtmlDocumentHandler.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  12/June/2000
12   *
13   *  $Id: HtmlDocumentHandler.java,v 1.31 2002/02/28 15:11:13 nasso Exp $
14   */
15  
16  package gate.html;
17  
18  import javax.swing.text.html.*;
19  import javax.swing.text.html.parser.*;
20  import javax.swing.text.html.HTMLEditorKit.*;
21  import javax.swing.text.BadLocationException;
22  import javax.swing.text.MutableAttributeSet;
23  
24  import java.util.*;
25  
26  import gate.corpora.*;
27  import gate.util.*;
28  import gate.*;
29  import gate.event.*;
30  
31  
32  /** Implements the behaviour of the HTML reader.
33    * Methods of an object of this class are called by the HTML parser when
34    * events will appear.
35    * The idea is to parse the HTML document and construct Gate annotations
36    * objects.
37    * This class also will replace the content of the Gate document with a
38    * new one containing anly text from the HTML document.
39    */
40  public class HtmlDocumentHandler extends ParserCallback {
41  
42    /** Debug flag */
43    private static final boolean DEBUG = false;
44  
45    /** Constructor initialises all the private memeber data.
46      * This will use the default annotation set taken from the gate document.
47      * @param aDocument The gate document that will be processed
48      * @param aMarkupElementsMap The map containing the elements that will
49      * transform into annotations
50      */
51    public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) {
52      this(aDocument,aMarkupElementsMap,null);
53    }
54  
55    /** Constructor initialises all the private memeber data
56      * @param aDocument The gate document that will be processed
57      * @param aMarkupElementsMap The map containing the elements that will
58      * transform into annotations
59      * @param anAnnoatationSet The annotation set that will contain annotations
60      * resulted from the processing of the gate document
61      */
62    public HtmlDocumentHandler(gate.Document       aDocument,
63                               Map                 aMarkupElementsMap,
64                               gate.AnnotationSet  anAnnotationSet) {
65      // init stack
66      stack = new java.util.Stack();
67  
68      // this string contains the plain text (the text without markup)
69      tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
70  
71      // colector is used later to transform all custom objects into
72      // annotation objects
73      colector = new LinkedList();
74  
75      // the Gate document
76      doc = aDocument;
77  
78      // this map contains the elements name that we want to create
79      // if it's null all the elements from the XML documents will be transformed
80      // into Gate annotation objects
81      markupElementsMap = aMarkupElementsMap;
82  
83      // init an annotation set for this gate document
84      basicAS = anAnnotationSet;
85  
86      customObjectsId = 0;
87    }//HtmlDocumentHandler
88  
89    /** Keep the refference to this structure */
90    private RepositioningInfo reposInfo = null;
91  
92    /** Keep the refference to this structure */
93    private RepositioningInfo ampCodingInfo = null;
94  
95    /** Set repositioning information structure refference. If you set this
96     *  refference to <B>null</B> information wouldn't be collected.
97     */
98    public void setRepositioningInfo(RepositioningInfo info) {
99      reposInfo = info;
100   } // setRepositioningInfo
101 
102   /** Return current RepositioningInfo object */
103   public RepositioningInfo getRepositioningInfo() {
104     return reposInfo;
105   } // getRepositioningInfo
106 
107   /** Set repositioning information structure refference for ampersand coding.
108    *  If you set this refference to <B>null</B> information wouldn't be used.
109    */
110   public void setAmpCodingInfo(RepositioningInfo info) {
111     ampCodingInfo = info;
112   } // setRepositioningInfo
113 
114   /** Return current RepositioningInfo object for ampersand coding. */
115   public RepositioningInfo getAmpCodingInfo() {
116     return ampCodingInfo;
117   } // getRepositioningInfo
118 
119   /** The text inside the STYLE tag is processed with <code>handleText()</code>.
120    *  We should skip inserting of this text in the document. */
121   private boolean isInsideStyleTag = false;
122 
123   /** This method is called when the HTML parser encounts the beginning
124     * of a tag that means that the tag is paired by an end tag and it's
125     * not an empty one.
126     */
127   public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
128     // Fire the status listener if the elements processed exceded the rate
129     if (0 == (++elements % ELEMENTS_RATE))
130       fireStatusChangedEvent("Processed elements : " + elements);
131 
132     // Start of STYLE tag
133     if(HTML.Tag.STYLE.equals(t)) {
134       isInsideStyleTag = true;
135     } // if
136 
137     // Construct a feature map from the attributes list
138     FeatureMap fm = Factory.newFeatureMap();
139 
140     // Take all the attributes an put them into the feature map
141     if (0 != a.getAttributeCount()){
142       Enumeration enum = a.getAttributeNames();
143       while (enum.hasMoreElements()){
144         Object attribute = enum.nextElement();
145         fm.put(attribute.toString(),(a.getAttribute(attribute)).toString());
146       }// while
147     }// if
148 
149     // Just analize the tag t and add some\n chars and spaces to the
150     // tmpDocContent.The reason behind is that we need to have a readable form
151     // for the final document.
152     customizeAppearanceOfDocumentWithStartTag(t);
153 
154     // If until here the "tmpDocContent" ends with a NON whitespace char,
155     // then we add a space char before calculating the START index of this
156     // tag.
157     // This is done in order not to concatenate the content of two separate tags
158     // and obtain a different NEW word.
159     int tmpDocContentSize = tmpDocContent.length();
160     if ( tmpDocContentSize != 0 &&
161          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))
162        ) tmpDocContent.append(" ");
163 
164     // create the start index of the annotation
165     Long startIndex = new Long(tmpDocContent.length());
166 
167     // initialy the start index is equal with the End index
168     CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
169 
170     // put it into the stack
171     stack.push (obj);
172 
173   }//handleStartTag
174 
175    /** This method is called when the HTML parser encounts the end of a tag
176      * that means that the tag is paired by a beginning tag
177      */
178   public void handleEndTag(HTML.Tag t, int pos){
179     // obj is for internal use
180     CustomObject obj = null;
181 
182     // end of STYLE tag
183     if(HTML.Tag.STYLE.equals(t)) {
184       isInsideStyleTag = false;
185     } // if
186 
187     // If the stack is not empty then we get the object from the stack
188     if (!stack.isEmpty()){
189       obj = (CustomObject) stack.pop();
190       // Before adding it to the colector, we need to check if is an
191       // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
192       if (obj.getStart().equals(obj.getEnd())){
193         // The element had an end tag and its start was equal to its end. Hence
194         // it is anEmptyAndSpan one.
195         obj.getFM().put("isEmptyAndSpan","true");
196       }// End iff
197       // we add it to the colector
198       colector.add(obj);
199     }// End if
200 
201     // If element has text between, then customize its apearance
202     if ( obj != null &&
203          obj.getStart().longValue() != obj.getEnd().longValue()
204        )
205       // Customize the appearance of the document
206       customizeAppearanceOfDocumentWithEndTag(t);
207 
208     // if t is the </HTML> tag then we reached the end of theHTMLdocument
209     if (t == HTML.Tag.HTML){
210       // replace the old content with the new one
211       doc.setContent (new DocumentContentImpl(tmpDocContent.toString()));
212 
213       // If basicAs is null then get the default annotation
214       // set from this gate document
215       if (basicAS == null)
216         basicAS = doc.getAnnotations(
217                                 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
218 
219       // sort colector ascending on its id
220       Collections.sort(colector);
221       // iterate through colector and construct annotations
222       while (!colector.isEmpty()){
223         obj = (CustomObject) colector.getFirst();
224         colector.remove(obj);
225           // Construct an annotation from this obj
226           try{
227             if (markupElementsMap == null){
228                basicAS.add( obj.getStart(),
229                             obj.getEnd(),
230                             obj.getElemName(),
231                             obj.getFM()
232                            );
233             }else{
234               String annotationType =
235                      (String) markupElementsMap.get(obj.getElemName());
236               if (annotationType != null)
237                  basicAS.add( obj.getStart(),
238                               obj.getEnd(),
239                               annotationType,
240                               obj.getFM()
241                              );
242             }
243           }catch (InvalidOffsetException e){
244               Err.prln("Error creating an annot :" + obj + " Discarded...");
245           }// end try
246 //        }// end if
247       }//while
248 
249       // notify the listener about the total amount of elements that
250       // has been processed
251       fireStatusChangedEvent("Total elements : " + elements);
252 
253     }//else
254 
255   }//handleEndTag
256 
257   /** This method is called when the HTML parser encounts an empty tag
258     */
259   public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
260     // fire the status listener if the elements processed exceded the rate
261     if ((++elements % ELEMENTS_RATE) == 0)
262       fireStatusChangedEvent("Processed elements : " + elements);
263 
264     // construct a feature map from the attributes list
265     // these are empty elements
266     FeatureMap fm = Factory.newFeatureMap();
267 
268     // take all the attributes an put them into the feature map
269     if (0 != a.getAttributeCount ()){
270 
271        // Out.println("HAS  attributes = " + a.getAttributeCount ());
272         Enumeration enum = a.getAttributeNames ();
273         while (enum.hasMoreElements ()){
274           Object attribute = enum.nextElement ();
275           fm.put ( attribute.toString(),(a.getAttribute(attribute)).toString());
276 
277         }//while
278 
279     }//if
280 
281     // create the start index of the annotation
282     Long startIndex = new Long(tmpDocContent.length());
283 
284     // initialy the start index is equal with the End index
285     CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
286 
287     // we add the object directly into the colector
288     // we don't add it to the stack because this is an empty tag
289     colector.add(obj);
290 
291     // Just analize the tag t and add some\n chars and spaces to the
292     // tmpDocContent.The reason behind is that we need to have a readable form
293     // for the final document.
294     customizeAppearanceOfDocumentWithSimpleTag(t);
295 
296   } // handleSimpleTag
297 
298   /** This method is called when the HTML parser encounts text (PCDATA)
299     */
300   public void handleText(char[] text, int pos){
301 
302     // Skip the STYLE tag content
303     if(isInsideStyleTag) return;
304 
305     // create a string object based on the reported text
306     String content = new String(text);
307 
308     // remove the difference between JDK 1.3 and JDK 1.4
309     String trimContent = content.trim();
310     if(trimContent.length() == 0) {
311       return;
312     } // if
313 
314     int trimCorrection = content.indexOf(trimContent.charAt(0));
315     content = trimContent;
316 
317     StringBuffer contentBuffer = new StringBuffer("");
318     int tmpDocContentSize = tmpDocContent.length();
319     boolean incrementStartIndex = false;
320     // If the first char of the text just read "text[0]" is NOT whitespace AND
321     // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then
322     // concatenation "tmpDocContent + content" will result into a new different
323     // word... and we want to avoid that...
324     if ( tmpDocContentSize != 0 &&
325          content.length() != 0 &&
326          !Character.isWhitespace(content.charAt(0)) &&
327          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){
328 
329             contentBuffer.append(" ");
330             incrementStartIndex = true;
331     }// End if
332     // update the document content
333 
334     // put the repositioning information
335     if(reposInfo != null) {
336       int extractedPos = tmpDocContent.length()+contentBuffer.length()
337                     +trimCorrection;
338       addRepositioningInfo(content, pos, extractedPos);
339     } // if
340 
341     contentBuffer.append(content);
342     // calculate the End index for all the elements of the stack
343     // the expression is : End index = Current doc length + text length
344     Long end = new Long(tmpDocContent.length() + contentBuffer.length());
345 
346     CustomObject obj = null;
347     // Iterate through stack to modify the End index of the existing elements
348 
349     java.util.Iterator anIterator = stack.iterator();
350     while (anIterator.hasNext ()){
351       // get the object and move to the next one
352       obj = (CustomObject) anIterator.next ();
353       if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
354         obj.setStart(new Long(obj.getStart().longValue() + 1));
355       }// End if
356       // sets its End index
357       obj.setEnd(end);
358     }// End while
359 
360     tmpDocContent.append(contentBuffer.toString());
361   }// end handleText();
362 
363   /** For given content the list with shrink position information is searched
364    *  and on the corresponding positions the correct repositioning information
365    *  is calculated and generated.
366    */
367   public void addRepositioningInfo(String content, int pos, int extractedPos) {
368     int contentLength = content.length();
369 
370     // wrong way (without correction and analysing)
371    //reposInfo.addPositionInfo(pos, contentLength, extractedPos, contentLength);
372 
373     RepositioningInfo.PositionInfo pi = null;
374     long startPos = pos;
375     long correction = 0;
376     long substituteStart;
377     long remainingLen;
378     long offsetInExtracted;
379 
380     for(int i = 0; i < ampCodingInfo.size(); ++i) {
381       pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i);
382       substituteStart = pi.getOriginalPosition();
383 
384       if(substituteStart >= startPos) {
385         if(substituteStart > pos + contentLength + correction) {
386           break; // outside the current text
387         } // if
388 
389         // should create two repositioning information records
390         remainingLen = substituteStart - (startPos + correction);
391         offsetInExtracted = startPos - pos;
392         if(remainingLen > 0) {
393           reposInfo.addPositionInfo(startPos + correction, remainingLen,
394                             extractedPos + offsetInExtracted, remainingLen);
395         } // if
396         // record for shrank text
397         reposInfo.addPositionInfo(substituteStart, pi.getOriginalLength(),
398                           extractedPos + offsetInExtracted + remainingLen,
399                           pi.getCurrentLength());
400         startPos = startPos + remainingLen + pi.getCurrentLength();
401         correction += pi.getOriginalLength() - pi.getCurrentLength();
402       } // if
403     } // for
404 
405     // there is some text remaining for repositioning
406     offsetInExtracted = startPos - pos;
407     remainingLen = contentLength - offsetInExtracted;
408     if(remainingLen > 0) {
409       reposInfo.addPositionInfo(startPos + correction, remainingLen,
410                         extractedPos + offsetInExtracted, remainingLen);
411     } // if
412   } // addRepositioningInfo
413 
414   /** This method analizes the tag t and adds some \n chars and spaces to the
415     * tmpDocContent.The reason behind is that we need to have a readable form
416     * for the final document. This method modifies the content of tmpDocContent.
417     * @param t the Html tag encounted by the HTML parser
418     */
419   protected void customizeAppearanceOfDocumentWithSimpleTag(HTML.Tag t){
420     boolean modification = false;
421     // if the HTML tag is BR then we add a new line character to the document
422     if (HTML.Tag.BR == t){
423       tmpDocContent.append("\n");
424       modification = true;
425     }// End if
426     if (modification == true){
427       Long end = new Long (tmpDocContent.length());
428       java.util.Iterator anIterator = stack.iterator();
429       while (anIterator.hasNext ()){
430         // get the object and move to the next one
431         CustomObject obj = (CustomObject) anIterator.next();
432         // sets its End index
433         obj.setEnd(end);
434       }// End while
435     }//End if
436   }// customizeAppearanceOfDocumentWithSimpleTag
437 
438   /** This method analizes the tag t and adds some \n chars and spaces to the
439     * tmpDocContent.The reason behind is that we need to have a readable form
440     * for the final document. This method modifies the content of tmpDocContent.
441     * @param t the Html tag encounted by the HTML parser
442     */
443   protected void customizeAppearanceOfDocumentWithStartTag(HTML.Tag t){
444     boolean modification = false;
445     if (HTML.Tag.P == t){
446       int tmpDocContentSize = tmpDocContent.length();
447       if ( tmpDocContentSize >= 2 &&
448            '\n' != tmpDocContent.charAt(tmpDocContentSize - 2)
449          ) { tmpDocContent.append("\n"); modification = true;}
450     }// End if
451     if (modification == true){
452       Long end = new Long (tmpDocContent.length());
453       java.util.Iterator anIterator = stack.iterator();
454       while (anIterator.hasNext ()){
455         // get the object and move to the next one
456         CustomObject obj = (CustomObject) anIterator.next();
457         // sets its End index
458         obj.setEnd(end);
459       }// End while
460     }//End if
461   }// customizeAppearanceOfDocumentWithStartTag
462 
463   /** This method analizes the tag t and adds some \n chars and spaces to the
464     * tmpDocContent.The reason behind is that we need to have a readable form
465     * for the final document. This method modifies the content of tmpDocContent.
466     * @param t the Html tag encounted by the HTML parser
467     */
468   protected void customizeAppearanceOfDocumentWithEndTag(HTML.Tag t){
469     boolean modification = false;
470     // if the HTML tag is BR then we add a new line character to the document
471     if ( (HTML.Tag.P == t) ||
472 
473          (HTML.Tag.H1 == t) ||
474          (HTML.Tag.H2 == t) ||
475          (HTML.Tag.H3 == t) ||
476          (HTML.Tag.H4 == t) ||
477          (HTML.Tag.H5 == t) ||
478          (HTML.Tag.H6 == t) ||
479          (HTML.Tag.TR == t) ||
480          (HTML.Tag.CENTER == t) ||
481          (HTML.Tag.LI == t)
482        ){ tmpDocContent.append("\n"); modification = true;}
483 
484     if (HTML.Tag.TITLE == t){
485       tmpDocContent.append("\n\n");
486       modification = true;
487     }// End if
488 
489     if (modification == true){
490       Long end = new Long (tmpDocContent.length());
491       java.util.Iterator anIterator = stack.iterator();
492       while (anIterator.hasNext ()){
493         // get the object and move to the next one
494         CustomObject obj = (CustomObject) anIterator.next();
495         // sets its End index
496         obj.setEnd(end);
497       }// End while
498     }//End if
499   }// customizeAppearanceOfDocumentWithEndTag
500 
501   /**
502     * This method is called when the HTML parser encounts an error
503     * it depends on the programmer if he wants to deal with that error
504     */
505   public void handleError(String errorMsg, int pos) {
506     //Out.println ("ERROR CALLED : " + errorMsg);
507   }
508 
509   /** This method is called once, when the HTML parser reaches the end
510     * of its input streamin order to notify the parserCallback that there
511     * is nothing more to parse.
512     */
513   public void flush() throws BadLocationException{
514   }// flush
515 
516   /** This method is called when the HTML parser encounts a comment
517     */
518   public void handleComment(char[] text, int pos) {
519   }
520 
521   //StatusReporter Implementation
522 
523   public void addStatusListener(StatusListener listener) {
524     myStatusListeners.add(listener);
525   }
526 
527   public void removeStatusListener(StatusListener listener) {
528     myStatusListeners.remove(listener);
529   }
530 
531   protected void fireStatusChangedEvent(String text) {
532     Iterator listenersIter = myStatusListeners.iterator();
533     while(listenersIter.hasNext())
534       ((StatusListener)listenersIter.next()).statusChanged(text);
535   }
536 
537   /**
538     * This method verifies if data contained by the CustomObject can be used
539     * to create a GATE annotation.
540     */
541 /*  private boolean canCreateAnnotation(CustomObject aCustomObject){
542     long start            = aCustomObject.getStart().longValue();
543     long end              = aCustomObject.getEnd().longValue();
544     long gateDocumentSize = doc.getContent().size().longValue();
545 
546     if (start < 0 || end < 0 ) return false;
547     if (start > end ) return false;
548     if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false;
549     return true;
550   }// canCreateAnnotation
551 */
552 
553   // HtmlDocumentHandler member data
554 
555   // this constant indicates when to fire the status listener
556   // this listener will add an overhead and we don't want a big overhead
557   // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE
558   final static  int ELEMENTS_RATE = 128;
559 
560   // this map contains the elements name that we want to create
561   // if it's null all the elements from the HTML documents will be transformed
562   // into Gate annotation objects otherwise only the elements it contains will
563   // be transformed
564   private Map markupElementsMap = null;
565 
566   // the content of the HTML document, without any tag
567   // for internal use
568   private StringBuffer tmpDocContent = null;
569 
570   // a stack used to remember elements and to keep the order
571   private java.util.Stack stack = null;
572 
573   // a gate document
574   private gate.Document doc = null;
575 
576   // an annotation set used for creating annotation reffering the doc
577   private gate.AnnotationSet basicAS;
578 
579   // listeners for status report
580   protected List myStatusListeners = new LinkedList();
581 
582   // this reports the the number of elements that have beed processed so far
583   private int elements = 0;
584 
585   protected  long customObjectsId = 0;
586   // we need a colection to retain all the CustomObjects that will be
587   // transformed into annotation over the gate document...
588   // the transformation will take place inside onDocumentEnd() method
589   private LinkedList colector = null;
590 
591   // Inner class
592   /**
593     * The objects belonging to this class are used inside the stack.
594     * This class is for internal needs
595     */
596   class  CustomObject implements Comparable {
597 
598     // constructor
599     public CustomObject(String anElemName, FeatureMap aFm,
600                            Long aStart, Long anEnd) {
601       elemName = anElemName;
602       fm = aFm;
603       start = aStart;
604       end = anEnd;
605       id = new Long(customObjectsId ++);
606     }// End CustomObject()
607 
608     // Methos implemented as required by Comparable interface
609     public int compareTo(Object o){
610       CustomObject obj = (CustomObject) o;
611       return this.id.compareTo(obj.getId());
612     }// compareTo();
613 
614     // accesor
615     public String getElemName() {
616       return elemName;
617     }// getElemName()
618 
619     public FeatureMap getFM() {
620       return fm;
621     }// getFM()
622 
623     public Long getStart() {
624       return start;
625     }// getStart()
626 
627     public Long getEnd() {
628       return end;
629     }// getEnd()
630 
631     public Long getId(){ return id;}
632 
633     // mutator
634     public void setElemName(String anElemName) {
635       elemName = anElemName;
636     }// getElemName()
637 
638     public void setFM(FeatureMap aFm) {
639       fm = aFm;
640     }// setFM();
641 
642     public void setStart(Long aStart) {
643       start = aStart;
644     }// setStart();
645 
646     public void setEnd(Long anEnd) {
647       end = anEnd;
648     }// setEnd();
649 
650     // data fields
651     private String elemName = null;
652     private FeatureMap fm = null;
653     private Long start = null;
654     private Long end  = null;
655     private Long id = null;
656 
657   } // End inner class CustomObject
658 
659 }//End class HtmlDocumentHandler
660 
661 
662 
663