|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: INNER | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object | +--gate.util.AbstractFeatureBearer | +--gate.creole.AbstractResource | +--gate.creole.AbstractProcessingResource | +--gate.creole.AbstractLanguageAnalyser | +--gate.creole.orthomatcher.OrthoMatcher
Inner Class Summary | |
private static class |
OrthoMatcher.Class1
|
Inner classes inherited from class gate.creole.AbstractProcessingResource |
AbstractProcessingResource.InternalStatusListener, AbstractProcessingResource.IntervalProgressListener |
Field Summary | |
protected HashMap |
alias
|
protected static String |
ALIASLISTNAME
|
private boolean |
allMatchingNeeded
This is an internal variable to indicate whether we matched using a rule that requires that the newly matched annotation matches all the others. This is needed because organizations can share first/last tokens (like News) and still be different. |
protected String |
annotationSetName
the name of the annotation set |
protected List |
annotationTypes
the types of the annotation |
protected HashMap |
annots2Remove
|
protected static String |
ARTLISTNAME
|
private static int |
BUFF_SIZE
the size of the buffer |
protected boolean |
caseSensitive
|
private char[] |
cbuffer
a buffer in order to read an array of char |
protected HashSet |
cdg
|
protected static String |
CDGLISTNAME
|
protected HashMap |
connector
|
protected static String |
CONNECTORLISTNAME
|
protected HashMap |
def_art
|
protected boolean |
extLists
internal or external list |
protected Annotation |
longAnnot
|
protected List |
matchesDocFeature
|
protected boolean |
matchingUnknowns
|
protected AnnotationSet |
nameAllAnnots
|
static String |
OM_ANN_SET_PARAMETER_NAME
|
static String |
OM_ANN_TYPES_PARAMETER_NAME
|
static String |
OM_CASE_SENSITIVE_PARAMETER_NAME
|
static String |
OM_DOCUMENT_PARAMETER_NAME
|
static String |
OM_EXT_LISTS_PARAMETER_NAME
|
static String |
OM_ORG_TYPE_PARAMETER_NAME
|
static String |
OM_PERSON_TYPE_PARAMETER_NAME
|
protected String |
organizationType
the organization type |
protected String |
personType
the person type |
protected static String |
PREPLISTNAME
|
protected HashMap |
prepos
|
protected HashMap |
processedAnnots
|
protected static String |
PUNCTUATION_VALUE
|
protected FeatureMap |
queryFM
|
protected Annotation |
shortAnnot
|
protected HashMap |
spur_match
|
protected static String |
SPURLISTNAME
|
protected FeatureMap |
tempMap
a feature map to be used when retrieving annotations; declared here so it can be reused for efficiency. clear() before each use |
protected static String |
THE_VALUE
|
protected ArrayList |
tokensLongAnnot
|
protected HashMap |
tokensMap
|
protected ArrayList |
tokensShortAnnot
|
protected String |
unknownType
|
Fields inherited from class gate.creole.AbstractLanguageAnalyser |
corpus, document |
Fields inherited from class gate.creole.AbstractProcessingResource |
interrupted, progressListeners, statusListeners |
Fields inherited from class gate.creole.AbstractResource |
name, serialVersionUID |
Fields inherited from class gate.util.AbstractFeatureBearer |
features |
Constructor Summary | |
OrthoMatcher()
|
Method Summary | |
private boolean |
apply_rules_namematch(String annotationType,
String shortName,
String longName)
apply_rules_namematch: apply rules similarly to lasie1.5's namematch |
private void |
buildTables(AnnotationSet nameAllAnnots)
Tables for namematch info (used by the namematch rules) |
protected String |
containTitle(String annotString,
Annotation annot)
return a person name without title |
protected void |
createAnnotList(String nameFile,
String nameList)
creates the lookup tables |
protected void |
createLists()
if (extLists == false) then reads the names of files in order to create the lookup tables |
protected void |
docCleanup()
|
void |
execute()
Run the resource. |
String |
getAnnotationSetName()
get the name of the annotation set |
List |
getAnnotationTypes()
get the types of the annotation |
Boolean |
getCaseSensitive()
|
Boolean |
getExtLists()
|
String |
getOrganizationType()
|
String |
getPersonType()
|
Resource |
init()
Initialise this resource, and return it. |
protected boolean |
isUnknownGender(String gender)
|
protected boolean |
matchAnnotations(Annotation newAnnot,
String annotString,
Annotation prevAnnot)
|
protected boolean |
matchedAlready(Annotation annot1,
Annotation annot2)
|
private boolean |
matchHyphenatedUnknowns(Annotation unknown,
String unknownString,
Iterator iter)
|
protected void |
matchNameAnnotations()
|
protected boolean |
matchOtherAnnots(List toMatchList,
Annotation newAnnot,
String annotString)
This method checks whether the new annotation matches all annotations given in the toMatchList (it contains ids). The idea is that the new annotation needs to match all those, because assuming transitivity does not always work when two different entities share a common token: e.g., BT Cellnet and BT and British Telecom. |
boolean |
matchRule0(String s1,
String s2)
RULE #0: If the two names are listed in the table of spurious matches then they do NOT match. Condition(s): - Applied to: all name annotations |
boolean |
matchRule1(String s1,
String s2,
boolean matchCase)
RULE #1: If the two names are identical then they are the same. No longer used, because I do the check for the same string via the hash table of previous annotations. Condition(s): depend on case. Applied to: all name annotations |
boolean |
matchRule10(String s1,
String s2)
RULE #10: is one name the reverse of the other reversing around prepositions only? e.g. |
boolean |
matchRule11(String s1,
String s2)
RULE #11: does one name consist of contractions of the first two tokens of the other name? e.g. |
boolean |
matchRule12(String s1,
String s2)
RULE #12: do the first and last tokens of one name match the first and last tokens of the other? Condition(s): case-sensitive match Applied to: organisation annotations only |
boolean |
matchRule13(String s1,
String s2)
RULE #13: do multi-word names match except for one token e.g. |
boolean |
matchRule14(String s1,
String s2)
RULE #14: if the last token of one name matches the second name e.g. |
boolean |
matchRule15(String s1,
String s2)
RULE #15: does one token from a Person name appear as the other token? Note that this rule has NOT been used in LaSIE's 1.5 namematcher; added for ACE by Di's request |
boolean |
matchRule2(String s1,
String s2)
RULE #2: if the two names are listed as equivalent in the lookup table (alias) then they match Condition(s): - Applied to: all name annotations |
boolean |
matchRule3(String s1,
String s2)
RULE #3: adding a possessive at the end of one name causes a match e.g. |
boolean |
matchRule4(String s1,
String s2)
RULE #4: Do all tokens other than the punctuation marks , and . |
boolean |
matchRule5(String s1,
String s2)
RULE #5: if the 1st token of one name matches the second name e.g. |
boolean |
matchRule6(String s1,
String s2)
RULE #6: if one name is the acronym of the other e.g. |
boolean |
matchRule7(String s1,
String s2)
RULE #7: if one of the tokens in one of the names is in the list of separators eg. |
boolean |
matchRule8(String s1,
String s2)
This rule is now obsolete, as The and the trailing CDG are stripped before matching. |
boolean |
matchRule9(String s1,
String s2)
RULE #9: does one of the names match the token just before a trailing company designator in the other name? The company designator has already been chopped off, so the token before it is in fact the last token e.g. |
protected void |
matchUnknown()
|
protected void |
matchWithPrevious(Annotation nameAnnot,
String annotString)
|
String |
regularExpressions(String text,
String replacement,
String regEx)
substitute all multiple spaces, tabs and newlines with a single space |
void |
setAnnotationSetName(String newAnnotationSetName)
set the annotation set name |
void |
setAnnotationTypes(List newType)
set the types of the annotations |
void |
setCaseSensitive(Boolean newCase)
set the caseSensitive flag |
void |
setExtLists(Boolean newExtLists)
set the extLists flag |
void |
setOrganizationType(String newOrganizationType)
|
void |
setPersonType(String newPersonType)
|
protected String |
stripCDG(String annotString,
Annotation annot)
return an organization name without its company designator and without a leading "The" |
protected void |
updateMatches(Annotation newAnnot,
Annotation prevAnnot)
|
protected Annotation |
updateMatches(Annotation newAnnot,
String annotString)
|
Methods inherited from class gate.creole.AbstractLanguageAnalyser |
getCorpus, getDocument, setCorpus, setDocument |
Methods inherited from class gate.creole.AbstractProcessingResource |
addProgressListener, addStatusListener, cleanup, fireProcessFinished, fireProgressChanged, fireStatusChanged, interrupt, isInterrupted, reInit, removeProgressListener, removeStatusListener |
Methods inherited from class gate.creole.AbstractResource |
checkParameterValues, getName, getParameterValue, getParameterValue, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners |
Methods inherited from class gate.util.AbstractFeatureBearer |
getFeatures, setFeatures |
Methods inherited from class java.lang.Object |
|
Methods inherited from interface gate.ProcessingResource |
interrupt, isInterrupted, reInit |
Methods inherited from interface gate.Resource |
cleanup, getParameterValue, setParameterValue, setParameterValues |
Methods inherited from interface gate.util.FeatureBearer |
getFeatures, setFeatures |
Methods inherited from interface gate.util.NameBearer |
getName, setName |
Field Detail |
public static final String OM_DOCUMENT_PARAMETER_NAME
public static final String OM_ANN_SET_PARAMETER_NAME
public static final String OM_CASE_SENSITIVE_PARAMETER_NAME
public static final String OM_ANN_TYPES_PARAMETER_NAME
public static final String OM_ORG_TYPE_PARAMETER_NAME
public static final String OM_PERSON_TYPE_PARAMETER_NAME
public static final String OM_EXT_LISTS_PARAMETER_NAME
protected static final String CDGLISTNAME
protected static final String ALIASLISTNAME
protected static final String ARTLISTNAME
protected static final String PREPLISTNAME
protected static final String CONNECTORLISTNAME
protected static final String SPURLISTNAME
protected static final String PUNCTUATION_VALUE
protected static final String THE_VALUE
protected String annotationSetName
protected List annotationTypes
protected String organizationType
protected String personType
protected String unknownType
protected boolean extLists
protected boolean matchingUnknowns
private boolean allMatchingNeeded
protected boolean caseSensitive
protected FeatureMap queryFM
protected HashMap alias
protected HashSet cdg
protected HashMap spur_match
protected HashMap def_art
protected HashMap connector
protected HashMap prepos
protected AnnotationSet nameAllAnnots
protected HashMap processedAnnots
protected HashMap annots2Remove
protected List matchesDocFeature
protected HashMap tokensMap
protected Annotation shortAnnot
protected Annotation longAnnot
protected ArrayList tokensLongAnnot
protected ArrayList tokensShortAnnot
protected FeatureMap tempMap
private char[] cbuffer
private static final int BUFF_SIZE
Constructor Detail |
public OrthoMatcher()
Method Detail |
public Resource init() throws ResourceInstantiationException
init
in class AbstractProcessingResource
public void execute() throws ExecutionException
execute
in class AbstractProcessingResource
protected void matchNameAnnotations() throws ExecutionException
protected void matchUnknown() throws ExecutionException
private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString, Iterator iter)
protected void matchWithPrevious(Annotation nameAnnot, String annotString)
protected boolean matchAnnotations(Annotation newAnnot, String annotString, Annotation prevAnnot)
protected boolean matchOtherAnnots(List toMatchList, Annotation newAnnot, String annotString)
protected boolean matchedAlready(Annotation annot1, Annotation annot2)
protected Annotation updateMatches(Annotation newAnnot, String annotString)
protected void updateMatches(Annotation newAnnot, Annotation prevAnnot)
protected void docCleanup()
protected String containTitle(String annotString, Annotation annot) throws ExecutionException
protected String stripCDG(String annotString, Annotation annot)
protected void createLists() throws IOException
protected void createAnnotList(String nameFile, String nameList) throws IOException
private boolean apply_rules_namematch(String annotationType, String shortName, String longName)
public void setExtLists(Boolean newExtLists)
public void setCaseSensitive(Boolean newCase)
public void setAnnotationSetName(String newAnnotationSetName)
public void setAnnotationTypes(List newType)
public void setOrganizationType(String newOrganizationType)
public void setPersonType(String newPersonType)
public String getAnnotationSetName()
public List getAnnotationTypes()
public String getOrganizationType()
public String getPersonType()
public Boolean getExtLists()
public Boolean getCaseSensitive()
protected boolean isUnknownGender(String gender)
public boolean matchRule0(String s1, String s2)
public boolean matchRule1(String s1, String s2, boolean matchCase)
public boolean matchRule2(String s1, String s2)
public boolean matchRule3(String s1, String s2)
public boolean matchRule4(String s1, String s2)
public boolean matchRule5(String s1, String s2)
public boolean matchRule6(String s1, String s2)
public boolean matchRule7(String s1, String s2)
public boolean matchRule8(String s1, String s2)
public boolean matchRule9(String s1, String s2)
public boolean matchRule10(String s1, String s2)
public boolean matchRule11(String s1, String s2)
public boolean matchRule12(String s1, String s2)
public boolean matchRule13(String s1, String s2)
public boolean matchRule14(String s1, String s2)
public boolean matchRule15(String s1, String s2)
private void buildTables(AnnotationSet nameAllAnnots)
public String regularExpressions(String text, String replacement, String regEx)
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: INNER | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |