gate.creole.orthomatcher
Class OrthoMatcher

java.lang.Object
  |
  +--gate.util.AbstractFeatureBearer
        |
        +--gate.creole.AbstractResource
              |
              +--gate.creole.AbstractProcessingResource
                    |
                    +--gate.creole.AbstractLanguageAnalyser
                          |
                          +--gate.creole.orthomatcher.OrthoMatcher
All Implemented Interfaces:
ANNIEConstants, Executable, FeatureBearer, LanguageAnalyser, NameBearer, ProcessingResource, Resource, Serializable

public class OrthoMatcher
extends AbstractLanguageAnalyser
implements ANNIEConstants

See Also:
Serialized Form

Inner Class Summary
private static class OrthoMatcher.Class1
           
 
Inner classes inherited from class gate.creole.AbstractProcessingResource
AbstractProcessingResource.InternalStatusListener, AbstractProcessingResource.IntervalProgressListener
 
Field Summary
protected  HashMap alias
           
protected static String ALIASLISTNAME
           
private  boolean allMatchingNeeded
          This is an internal variable indicating whether we matched using a rule that requires the newly matched annotation to match all the others. This is needed because organizations can share first/last tokens, like News, and still be different.
protected  String annotationSetName
          the name of the annotation set
protected  List annotationTypes
          the types of the annotation
protected  HashMap annots2Remove
           
protected static String ARTLISTNAME
           
private static int BUFF_SIZE
          the size of the buffer
protected  boolean caseSensitive
           
private  char[] cbuffer
          a buffer in order to read an array of char
protected  HashSet cdg
           
protected static String CDGLISTNAME
           
protected  HashMap connector
           
protected static String CONNECTORLISTNAME
           
protected  HashMap def_art
           
protected  boolean extLists
          internal or external list
protected  Annotation longAnnot
           
protected  List matchesDocFeature
           
protected  boolean matchingUnknowns
           
protected  AnnotationSet nameAllAnnots
           
static String OM_ANN_SET_PARAMETER_NAME
           
static String OM_ANN_TYPES_PARAMETER_NAME
           
static String OM_CASE_SENSITIVE_PARAMETER_NAME
           
static String OM_DOCUMENT_PARAMETER_NAME
           
static String OM_EXT_LISTS_PARAMETER_NAME
           
static String OM_ORG_TYPE_PARAMETER_NAME
           
static String OM_PERSON_TYPE_PARAMETER_NAME
           
protected  String organizationType
          the organization type
protected  String personType
          the person type
protected static String PREPLISTNAME
           
protected  HashMap prepos
           
protected  HashMap processedAnnots
           
protected static String PUNCTUATION_VALUE
           
protected  FeatureMap queryFM
           
protected  Annotation shortAnnot
           
protected  HashMap spur_match
           
protected static String SPURLISTNAME
           
protected  FeatureMap tempMap
          a feature map to be used when retrieving annotations; declared here so that it can be reused for efficiency. clear() before each use.
protected static String THE_VALUE
           
protected  ArrayList tokensLongAnnot
           
protected  HashMap tokensMap
           
protected  ArrayList tokensShortAnnot
           
protected  String unknownType
           
 
Fields inherited from class gate.creole.AbstractLanguageAnalyser
corpus, document
 
Fields inherited from class gate.creole.AbstractProcessingResource
interrupted, progressListeners, statusListeners
 
Fields inherited from class gate.creole.AbstractResource
name, serialVersionUID
 
Fields inherited from class gate.util.AbstractFeatureBearer
features
 
Fields inherited from interface gate.creole.ANNIEConstants
ANNOTATION_COREF_FEATURE_NAME, DATE_ANNOTATION_TYPE, DOCUMENT_COREF_FEATURE_NAME, LOCATION_ANNOTATION_TYPE, LOOKUP_ANNOTATION_TYPE, LOOKUP_MAJOR_TYPE_FEATURE_NAME, LOOKUP_MINOR_TYPE_FEATURE_NAME, MONEY_ANNOTATION_TYPE, ORGANIZATION_ANNOTATION_TYPE, PERSON_ANNOTATION_TYPE, PERSON_GENDER_FEATURE_NAME, PR_NAMES, SENTENCE_ANNOTATION_TYPE, SPACE_TOKEN_ANNOTATION_TYPE, TOKEN_ANNOTATION_TYPE, TOKEN_CATEGORY_FEATURE_NAME, TOKEN_KIND_FEATURE_NAME, TOKEN_LENGTH_FEATURE_NAME, TOKEN_ORTH_FEATURE_NAME, TOKEN_STRING_FEATURE_NAME
 
Constructor Summary
OrthoMatcher()
           
 
Method Summary
private  boolean apply_rules_namematch(String annotationType, String shortName, String longName)
          apply_rules_namematch: apply rules similarly to lasie1.5's namematch
private  void buildTables(AnnotationSet nameAllAnnots)
          Tables for namematch info (used by the namematch rules)
protected  String containTitle(String annotString, Annotation annot)
          return a person name without title
protected  void createAnnotList(String nameFile, String nameList)
          creates the lookup tables
protected  void createLists()
          if (extLists == false) then reads the names of files in order to create the lookup tables
protected  void docCleanup()
           
 void execute()
          Run the resource.
 String getAnnotationSetName()
          get the name of the annotation set
 List getAnnotationTypes()
          get the types of the annotation
 Boolean getCaseSensitive()
           
 Boolean getExtLists()
           
 String getOrganizationType()
           
 String getPersonType()
           
 Resource init()
          Initialise this resource, and return it.
protected  boolean isUnknownGender(String gender)
           
protected  boolean matchAnnotations(Annotation newAnnot, String annotString, Annotation prevAnnot)
           
protected  boolean matchedAlready(Annotation annot1, Annotation annot2)
           
private  boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString, Iterator iter)
           
protected  void matchNameAnnotations()
           
protected  boolean matchOtherAnnots(List toMatchList, Annotation newAnnot, String annotString)
          This method checks whether the new annotation matches all annotations given in the toMatchList (it contains ids). The idea is that the new annotation needs to match all of those, because assuming transitivity does not always work when two different entities share a common token: e.g., BT Cellnet and BT and British Telecom.
 boolean matchRule0(String s1, String s2)
          RULE #0: If the two names are listed in the table of spurious matches then they do NOT match. Condition(s): - Applied to: all name annotations
 boolean matchRule1(String s1, String s2, boolean matchCase)
          RULE #1: If the two names are identical then they are the same. No longer used, because I do the check for same string via the hash table of previous annotations. Condition(s): depend on case. Applied to: all name annotations
 boolean matchRule10(String s1, String s2)
          RULE #10: is one name the reverse of the other reversing around prepositions only? e.g.
 boolean matchRule11(String s1, String s2)
          RULE #11: does one name consist of contractions of the first two tokens of the other name? e.g.
 boolean matchRule12(String s1, String s2)
          RULE #12: do the first and last tokens of one name match the first and last tokens of the other? Condition(s): case-sensitive match Applied to: organisation annotations only
 boolean matchRule13(String s1, String s2)
          RULE #13: do multi-word names match except for one token e.g.
 boolean matchRule14(String s1, String s2)
          RULE #14: if the last token of one name matches the second name e.g.
 boolean matchRule15(String s1, String s2)
          RULE #15: does one token from a Person name appear as the other token? Note that this rule has NOT been used in LaSIE's 1.5 namematcher; added for ACE by Di's request.
 boolean matchRule2(String s1, String s2)
          RULE #2: if the two names are listed as equivalent in the lookup table (alias) then they match Condition(s): - Applied to: all name annotations
 boolean matchRule3(String s1, String s2)
          RULE #3: adding a possessive at the end of one name causes a match e.g.
 boolean matchRule4(String s1, String s2)
          RULE #4: Do all tokens other than the punctuation marks , and .
 boolean matchRule5(String s1, String s2)
          RULE #5: if the 1st token of one name matches the second name e.g.
 boolean matchRule6(String s1, String s2)
          RULE #6: if one name is the acronym of the other e.g.
 boolean matchRule7(String s1, String s2)
          RULE #7: if one of the tokens in one of the names is in the list of separators e.g.
 boolean matchRule8(String s1, String s2)
          This rule is now obsolete, as The and the trailing CDG are stripped before matching.
 boolean matchRule9(String s1, String s2)
          RULE #9: does one of the names match the token just before a trailing company designator in the other name? The company designator has already been chopped off, so the token before it, is in fact the last token e.g.
protected  void matchUnknown()
           
protected  void matchWithPrevious(Annotation nameAnnot, String annotString)
           
 String regularExpressions(String text, String replacement, String regEx)
          substitute all multiple spaces, tabs and newlines with a single space
 void setAnnotationSetName(String newAnnotationSetName)
          set the annotation set name
 void setAnnotationTypes(List newType)
          set the types of the annotations
 void setCaseSensitive(Boolean newCase)
          set the caseSensitive flag
 void setExtLists(Boolean newExtLists)
          set the extLists flag
 void setOrganizationType(String newOrganizationType)
           
 void setPersonType(String newPersonType)
           
protected  String stripCDG(String annotString, Annotation annot)
          return an organization name without a company designator and without a leading "The"
protected  void updateMatches(Annotation newAnnot, Annotation prevAnnot)
           
protected  Annotation updateMatches(Annotation newAnnot, String annotString)
           
 
Methods inherited from class gate.creole.AbstractLanguageAnalyser
getCorpus, getDocument, setCorpus, setDocument
 
Methods inherited from class gate.creole.AbstractProcessingResource
addProgressListener, addStatusListener, cleanup, fireProcessFinished, fireProgressChanged, fireStatusChanged, interrupt, isInterrupted, reInit, removeProgressListener, removeStatusListener
 
Methods inherited from class gate.creole.AbstractResource
checkParameterValues, getName, getParameterValue, getParameterValue, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners
 
Methods inherited from class gate.util.AbstractFeatureBearer
getFeatures, setFeatures
 
Methods inherited from class java.lang.Object
<clinit>, clone, equals, finalize, getClass, hashCode, notify, notifyAll, registerNatives, toString, wait, wait, wait
 
Methods inherited from interface gate.ProcessingResource
interrupt, isInterrupted, reInit
 
Methods inherited from interface gate.Resource
cleanup, getParameterValue, setParameterValue, setParameterValues
 
Methods inherited from interface gate.util.FeatureBearer
getFeatures, setFeatures
 
Methods inherited from interface gate.util.NameBearer
getName, setName
 

Field Detail

OM_DOCUMENT_PARAMETER_NAME

public static final String OM_DOCUMENT_PARAMETER_NAME

OM_ANN_SET_PARAMETER_NAME

public static final String OM_ANN_SET_PARAMETER_NAME

OM_CASE_SENSITIVE_PARAMETER_NAME

public static final String OM_CASE_SENSITIVE_PARAMETER_NAME

OM_ANN_TYPES_PARAMETER_NAME

public static final String OM_ANN_TYPES_PARAMETER_NAME

OM_ORG_TYPE_PARAMETER_NAME

public static final String OM_ORG_TYPE_PARAMETER_NAME

OM_PERSON_TYPE_PARAMETER_NAME

public static final String OM_PERSON_TYPE_PARAMETER_NAME

OM_EXT_LISTS_PARAMETER_NAME

public static final String OM_EXT_LISTS_PARAMETER_NAME

CDGLISTNAME

protected static final String CDGLISTNAME

ALIASLISTNAME

protected static final String ALIASLISTNAME

ARTLISTNAME

protected static final String ARTLISTNAME

PREPLISTNAME

protected static final String PREPLISTNAME

CONNECTORLISTNAME

protected static final String CONNECTORLISTNAME

SPURLISTNAME

protected static final String SPURLISTNAME

PUNCTUATION_VALUE

protected static final String PUNCTUATION_VALUE

THE_VALUE

protected static final String THE_VALUE

annotationSetName

protected String annotationSetName
the name of the annotation set

annotationTypes

protected List annotationTypes
the types of the annotation

organizationType

protected String organizationType
the organization type

personType

protected String personType
the person type

unknownType

protected String unknownType

extLists

protected boolean extLists
internal or external list

matchingUnknowns

protected boolean matchingUnknowns

allMatchingNeeded

private boolean allMatchingNeeded
This is an internal variable indicating whether we matched using a rule that requires the newly matched annotation to match all the others. This is needed because organizations can share first/last tokens, like News, and still be different.

caseSensitive

protected boolean caseSensitive

queryFM

protected FeatureMap queryFM

alias

protected HashMap alias

cdg

protected HashSet cdg

spur_match

protected HashMap spur_match

def_art

protected HashMap def_art

connector

protected HashMap connector

prepos

protected HashMap prepos

nameAllAnnots

protected AnnotationSet nameAllAnnots

processedAnnots

protected HashMap processedAnnots

annots2Remove

protected HashMap annots2Remove

matchesDocFeature

protected List matchesDocFeature

tokensMap

protected HashMap tokensMap

shortAnnot

protected Annotation shortAnnot

longAnnot

protected Annotation longAnnot

tokensLongAnnot

protected ArrayList tokensLongAnnot

tokensShortAnnot

protected ArrayList tokensShortAnnot

tempMap

protected FeatureMap tempMap
a feature map to be used when retrieving annotations; declared here so that it can be reused for efficiency. clear() before each use.

cbuffer

private char[] cbuffer
a buffer in order to read an array of char

BUFF_SIZE

private static final int BUFF_SIZE
the size of the buffer
Constructor Detail

OrthoMatcher

public OrthoMatcher()
Method Detail

init

public Resource init()
              throws ResourceInstantiationException
Initialise this resource, and return it.
Overrides:
init in class AbstractProcessingResource

execute

public void execute()
             throws ExecutionException
Run the resource. It doesn't make sense not to override this in subclasses so the default implementation signals an exception.
Overrides:
execute in class AbstractProcessingResource

matchNameAnnotations

protected void matchNameAnnotations()
                             throws ExecutionException

matchUnknown

protected void matchUnknown()
                     throws ExecutionException

matchHyphenatedUnknowns

private boolean matchHyphenatedUnknowns(Annotation unknown,
                                        String unknownString,
                                        Iterator iter)

matchWithPrevious

protected void matchWithPrevious(Annotation nameAnnot,
                                 String annotString)

matchAnnotations

protected boolean matchAnnotations(Annotation newAnnot,
                                   String annotString,
                                   Annotation prevAnnot)

matchOtherAnnots

protected boolean matchOtherAnnots(List toMatchList,
                                   Annotation newAnnot,
                                   String annotString)
This method checks whether the new annotation matches all annotations given in the toMatchList (it contains ids). The idea is that the new annotation needs to match all of those, because assuming transitivity does not always work when two different entities share a common token: e.g., BT Cellnet and BT and British Telecom.

matchedAlready

protected boolean matchedAlready(Annotation annot1,
                                 Annotation annot2)

updateMatches

protected Annotation updateMatches(Annotation newAnnot,
                                   String annotString)

updateMatches

protected void updateMatches(Annotation newAnnot,
                             Annotation prevAnnot)

docCleanup

protected void docCleanup()

containTitle

protected String containTitle(String annotString,
                              Annotation annot)
                       throws ExecutionException
return a person name without title

stripCDG

protected String stripCDG(String annotString,
                          Annotation annot)
return an organization name without a company designator and without a leading "The"

createLists

protected void createLists()
                    throws IOException
if (extLists == false) then reads the names of files in order to create the lookup tables

createAnnotList

protected void createAnnotList(String nameFile,
                               String nameList)
                        throws IOException
creates the lookup tables

apply_rules_namematch

private boolean apply_rules_namematch(String annotationType,
                                      String shortName,
                                      String longName)
apply_rules_namematch: apply rules similarly to lasie1.5's namematch

setExtLists

public void setExtLists(Boolean newExtLists)
set the extLists flag

setCaseSensitive

public void setCaseSensitive(Boolean newCase)
set the caseSensitive flag

setAnnotationSetName

public void setAnnotationSetName(String newAnnotationSetName)
set the annotation set name

setAnnotationTypes

public void setAnnotationTypes(List newType)
set the types of the annotations

setOrganizationType

public void setOrganizationType(String newOrganizationType)

setPersonType

public void setPersonType(String newPersonType)

getAnnotationSetName

public String getAnnotationSetName()
get the name of the annotation set

getAnnotationTypes

public List getAnnotationTypes()
get the types of the annotation

getOrganizationType

public String getOrganizationType()

getPersonType

public String getPersonType()

getExtLists

public Boolean getExtLists()

getCaseSensitive

public Boolean getCaseSensitive()

isUnknownGender

protected boolean isUnknownGender(String gender)

matchRule0

public boolean matchRule0(String s1,
                          String s2)
RULE #0: If the two names are listed in the table of spurious matches then they do NOT match. Condition(s): - Applied to: all name annotations

matchRule1

public boolean matchRule1(String s1,
                          String s2,
                          boolean matchCase)
RULE #1: If the two names are identical then they are the same. No longer used, because I do the check for same string via the hash table of previous annotations. Condition(s): depend on case. Applied to: all name annotations

matchRule2

public boolean matchRule2(String s1,
                          String s2)
RULE #2: if the two names are listed as equivalent in the lookup table (alias) then they match Condition(s): - Applied to: all name annotations

matchRule3

public boolean matchRule3(String s1,
                          String s2)
RULE #3: adding a possessive at the end of one name causes a match e.g. "Standard and Poor" == "Standard and Poor's" and also "Standard and Poor" == "Standard's" Condition(s): case-insensitive match Applied to: all name annotations

matchRule4

public boolean matchRule4(String s1,
                          String s2)
RULE #4: Do all tokens other than the punctuation marks , and . match? e.g. "Smith, Jones" == "Smith Jones" Condition(s): case-insensitive match Applied to: organisation and person annotations

matchRule5

public boolean matchRule5(String s1,
                          String s2)
RULE #5: if the 1st token of one name matches the second name e.g. "Pepsi Cola" == "Pepsi" Condition(s): case-insensitive match Applied to: all name annotations

matchRule6

public boolean matchRule6(String s1,
                          String s2)
RULE #6: if one name is the acronym of the other e.g. "Imperial Chemical Industries" == "ICI" Applied to: organisation annotations only

matchRule7

public boolean matchRule7(String s1,
                          String s2)
RULE #7: if one of the tokens in one of the names is in the list of separators, e.g. "&", then check if the token before the separator matches the other name e.g. "R.H. Macy & Co." == "Macy" Condition(s): case-sensitive match Applied to: organisation annotations only

matchRule8

public boolean matchRule8(String s1,
                          String s2)
This rule is now obsolete, as The and the trailing CDG are stripped before matching. DO NOT CALL!!! RULE #8: if the names match, ignoring The and the trailing company designator (which have already been stripped) e.g. "The Magic Tricks Co." == "Magic Tricks" Condition(s): case-sensitive match Applied to: organisation annotations only

matchRule9

public boolean matchRule9(String s1,
                          String s2)
RULE #9: does one of the names match the token just before a trailing company designator in the other name? The company designator has already been chopped off, so the token before it, is in fact the last token e.g. "R.H. Macy Co." == "Macy" Applied to: organisation annotations only

matchRule10

public boolean matchRule10(String s1,
                           String s2)
RULE #10: is one name the reverse of the other reversing around prepositions only? e.g. "Department of Defence" == "Defence Department" Condition(s): case-sensitive match Applied to: organisation annotations only

matchRule11

public boolean matchRule11(String s1,
                           String s2)
RULE #11: does one name consist of contractions of the first two tokens of the other name? e.g. "Communications Satellite" == "ComSat" and "Pan American" == "Pan Am" Condition(s): case-sensitive match Applied to: organisation annotations only

matchRule12

public boolean matchRule12(String s1,
                           String s2)
RULE #12: do the first and last tokens of one name match the first and last tokens of the other? Condition(s): case-sensitive match Applied to: organisation annotations only

matchRule13

public boolean matchRule13(String s1,
                           String s2)
RULE #13: do multi-word names match except for one token? e.g. "Second Force Recon Company" == "Force Recon Company". Note that this rule has NOT been used in LaSIE's 1.5 namematcher. Restrictions: - remove cdg first - shortest name should be 2 words or more - if N is the number of tokens of the longest name, then N-1 tokens should be matched. Condition(s): case-sensitive match. Applied to: organisation or person annotations only

matchRule14

public boolean matchRule14(String s1,
                           String s2)
RULE #14: if the last token of one name matches the second name e.g. "Hamish Cunningham" == "Cunningham" Condition(s): case-insensitive match Applied to: all person annotations

matchRule15

public boolean matchRule15(String s1,
                           String s2)
RULE #15: does one token from a Person name appear as the other token? Note that this rule has NOT been used in LaSIE's 1.5 namematcher; added for ACE by Di's request.

buildTables

private void buildTables(AnnotationSet nameAllAnnots)
Tables for namematch info (used by the namematch rules)

regularExpressions

public String regularExpressions(String text,
                                 String replacement,
                                 String regEx)
substitute all multiple spaces, tabs and newlines with a single space