1
13
14 package gate.creole.coref;
15
16 import java.util.*;
17
18 import gate.*;
19 import gate.creole.*;
20 import gate.util.*;
21
22 public class NominalCoref extends AbstractCoreferencer
23 implements ProcessingResource, ANNIEConstants {
24
25 public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";
26
27 public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";
28
29
30 private static final boolean DEBUG = false;
31
32 private static final String PERSON_CATEGORY = "Person";
34 private static final String JOBTITLE_CATEGORY = "JobTitle";
35 private static final String ORGANIZATION_CATEGORY = "Organization";
36 private static final String LOOKUP_CATEGORY = "Lookup";
37 private static final String ORGANIZATION_NOUN_CATEGORY = "organization_noun";
38
39
40
42
44 private String annotationSetName;
45
46 private AnnotationSet defaultAnnotations;
47
48 private HashMap anaphor2antecedent;
49
50
53
54
55 public NominalCoref() {
56 super("NOMINAL");
57 this.anaphor2antecedent = new HashMap();
58 }
59
60
61 public Resource init() throws ResourceInstantiationException {
62 return super.init();
63 }
65
73 public void reInit() throws ResourceInstantiationException {
74 this.anaphor2antecedent = new HashMap();
75 init();
76 }
78
79
80 public void setDocument(Document newDocument) {
81
82
85 super.setDocument(newDocument);
86 }
87
88
89 public void setAnnotationSetName(String annotationSetName) {
90 this.annotationSetName = annotationSetName;
91 }
92
93
94 public String getAnnotationSetName() {
95 return annotationSetName;
96 }
97
98
110 public void execute() throws ExecutionException{
111
112 HashMap anaphorToAntecedent = new HashMap();
113 Object[] nominalArray;
114
115 if (null == this.document) {
117 throw new ExecutionException("[coreference] Document is not set!");
118 }
119
120 preprocess();
122
123
125 Object[] tokens = defaultAnnotations.get(TOKEN_ANNOTATION_TYPE).toArray();
130 java.util.Arrays.sort(tokens, new OffsetComparator());
131
132 int currentToken = 0;
134
135 HashSet personConstraint = new HashSet();
140 personConstraint.add(PERSON_CATEGORY);
141 AnnotationSet people =
142 this.defaultAnnotations.get(personConstraint);
143
144 HashSet jobTitleConstraint = new HashSet();
148 jobTitleConstraint.add(JOBTITLE_CATEGORY);
149
150 AnnotationSet jobTitles =
151 this.defaultAnnotations.get(jobTitleConstraint);
152
153 FeatureMap orgNounConstraint = new SimpleFeatureMapImpl();
154 orgNounConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
155 ORGANIZATION_NOUN_CATEGORY);
156 AnnotationSet orgNouns =
157 this.defaultAnnotations.get(LOOKUP_CATEGORY, orgNounConstraint);
158
159 HashSet orgConstraint = new HashSet();
160 orgConstraint.add(ORGANIZATION_CATEGORY);
161
162 AnnotationSet organizations =
163 this.defaultAnnotations.get(orgConstraint);
164
165 Set nominals = new HashSet();
167 if (people != null) {
168 nominals.addAll(people);
169 }
170 if (jobTitles != null) {
171 nominals.addAll(jobTitles);
172 }
173 if (orgNouns != null) {
174 nominals.addAll(orgNouns);
175 }
176 if (organizations != null) {
177 nominals.addAll(organizations);
178 }
179
180
182 nominalArray = nominals.toArray();
184 java.util.Arrays.sort(nominalArray, new OffsetComparator());
185
186 ArrayList previousPeople = new ArrayList();
187 ArrayList previousOrgs = new ArrayList();
188
189
190 for (int i=0; i<nominalArray.length; i++) {
192 Annotation nominal = (Annotation)nominalArray[i];
193
194 currentToken = advanceTokenPosition(nominal, currentToken, tokens);
196
197
199 if (nominal.getType().equals(PERSON_CATEGORY)) {
200 Object[] personTokens = getSortedTokens(nominal);
203
204 if (personTokens.length == 1) {
205 Annotation personToken = (Annotation) personTokens[0];
206
207 String personCategory = (String)
208 personToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
209 if (personCategory.equals("PP") ||
210 personCategory.equals("PRP") ||
211 personCategory.equals("PRP$") ||
212 personCategory.equals("PRPR$")) {
213 continue;
215 }
216 }
217
218 previousPeople.add(0, nominal);
219 }
221 else if (nominal.getType().equals(JOBTITLE_CATEGORY)) {
222
223 Object[] jobTitleTokens = getSortedTokens(nominal);
225
226 Annotation lastToken = (Annotation)
227 jobTitleTokens[jobTitleTokens.length - 1];
228
229 String tokenCategory = (String)
231 lastToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
232 if (! tokenCategory.equals("NN")) {
236 continue;
238 }
239
240 if (overlapsAnnotations(nominal, people)) {
242 continue;
244 }
245
246 Annotation previousToken;
247 String previousValue;
248
249 if (currentToken != 0) {
251 previousToken = (Annotation) tokens[currentToken - 1];
252 previousValue = (String)
253 previousToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
254 if (previousValue.equalsIgnoreCase("a") ||
255 previousValue.equalsIgnoreCase("an") ||
256 previousValue.equalsIgnoreCase("other") ||
257 previousValue.equalsIgnoreCase("another")) {
258 continue;
260 }
261 }
262
263
272 if (i < nominalArray.length - 1) {
274 Annotation nextAnnotation = (Annotation) nominalArray[i+1];
275 if (nextAnnotation.getType().equals(PERSON_CATEGORY)) {
276 previousToken = (Annotation) tokens[currentToken - 1];
278 previousValue = (String)
279 previousToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
280
281 int interveningTokens =
283 countInterveningTokens(nominal, nextAnnotation,
284 currentToken, tokens);
285 if (interveningTokens == 0 &&
286 ! previousValue.equalsIgnoreCase("the")) {
287
288 continue;
292 }
293 else if (interveningTokens == 1) {
294 String tokenString =
295 (String) getFollowingToken(nominal,
296 currentToken, tokens)
297 .getFeatures().get(TOKEN_STRING_FEATURE_NAME);
298 if (! tokenString.equals(",") &&
300 ! tokenString.equals("-")) {
301 continue;
303 }
304 }
305
306
309 anaphor2antecedent.put(nominal, nextAnnotation);
310 continue;
313
314 }
315 }
316
317 if (previousPeople.size() == 0) {
320 FeatureMap personFeatures = new SimpleFeatureMapImpl();
321 personFeatures.put("ENTITY_MENTION_TYPE", "NOMINAL");
322 this.defaultAnnotations.add(nominal.getStartNode(),
323 nominal.getEndNode(),
324 PERSON_CATEGORY,
325 personFeatures);
326 continue;
328 }
329
330 int personIndex = 0;
332
333 Annotation previousPerson =
334 (Annotation) previousPeople.get(personIndex);
335
336 String personGender = (String)
338 previousPerson.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
339 String jobTitleGender = (String)
340 nominal.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
341 if (personGender != null && jobTitleGender != null) {
342 if (! personGender.equals(jobTitleGender)) {
343 continue;
346 }
347 }
348
349
353 anaphor2antecedent.put(nominal, previousPerson);
354 }
355 else if (nominal.getType().equals(ORGANIZATION_CATEGORY)) {
356 previousOrgs.add(0, nominal);
359 }
361 else if (nominal.getType().equals(LOOKUP_CATEGORY)) {
362 if (previousOrgs.size() == 0) {
364 continue;
366 }
367
368 Object[] orgNounTokens =
370 this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,
371 nominal.getStartNode().getOffset(),
372 nominal.getEndNode().getOffset()).toArray();
373 java.util.Arrays.sort(orgNounTokens, new OffsetComparator());
374 Annotation lastToken = (Annotation)
375 orgNounTokens[orgNounTokens.length - 1];
376
377 if (! lastToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME)
379 .equals("NN")) {
380 continue;
382 }
383
384 anaphor2antecedent.put(nominal, previousOrgs.get(0));
387 }
388 }
389
390 generateCorefChains(anaphor2antecedent);
393 }
394
395
401 private boolean overlapsAnnotations(Annotation a,
402 AnnotationSet annotations) {
403 Iterator iter = annotations.iterator();
404 while (iter.hasNext()) {
405 Annotation current = (Annotation) iter.next();
406 if (a.overlaps(current)) {
407 return true;
408 }
409 }
410
411 return false;
412 }
413
414
416 private int advanceTokenPosition(Annotation target, int currentPosition,
417 Object[] tokens) {
418 long targetOffset = target.getStartNode().getOffset().longValue();
419 long currentOffset = ((Annotation) tokens[currentPosition])
420 .getStartNode().getOffset().longValue();
421
422 if (targetOffset > currentOffset) {
423 while (targetOffset > currentOffset) {
424 currentPosition++;
425 currentOffset = ((Annotation) tokens[currentPosition])
426 .getStartNode().getOffset().longValue();
427 }
428 }
429 else if (targetOffset < currentOffset) {
430 while (targetOffset < currentOffset) {
431 currentPosition--;
432 currentOffset = ((Annotation) tokens[currentPosition])
433 .getStartNode().getOffset().longValue();
434 }
435 }
436
437 return currentPosition;
438 }
439
440
442 private int countInterveningTokens(Annotation first, Annotation second,
443 int currentPosition, Object[] tokens) {
444 int interveningTokens = 0;
445
446 long startOffset = first.getEndNode().getOffset().longValue();
447 long endOffset = second.getStartNode().getOffset().longValue();
448
449 long currentOffset = ((Annotation) tokens[currentPosition])
450 .getStartNode().getOffset().longValue();
451
452 while (currentOffset < endOffset) {
453 if (currentOffset >= startOffset) {
454 interveningTokens++;
455 }
456 currentPosition++;
457 currentOffset = ((Annotation) tokens[currentPosition])
458 .getStartNode().getOffset().longValue();
459 }
460 return interveningTokens;
461 }
462
463
464 private Annotation getFollowingToken(Annotation current, int currentPosition,
465 Object[] tokens) {
466 long endOffset = current.getEndNode().getOffset().longValue();
467 long currentOffset = ((Annotation) tokens[currentPosition])
468 .getStartNode().getOffset().longValue();
469 while (currentOffset < endOffset) {
470 currentPosition++;
471 currentOffset = ((Annotation) tokens[currentPosition])
472 .getStartNode().getOffset().longValue();
473 }
474 return (Annotation) tokens[currentPosition];
475 }
476
477
478 private String stringValue(Annotation ann) {
479 Object[] tokens = getSortedTokens(ann);
480
481 StringBuffer output = new StringBuffer();
482 for (int i=0;i<tokens.length;i++) {
483 Annotation token = (Annotation) tokens[i];
484 output.append(token.getFeatures().get(TOKEN_STRING_FEATURE_NAME));
485 if (i < tokens.length - 1) {
486 output.append(" ");
487 }
488 }
489 return output.toString();
490 }
491
492
493 private Object[] getSortedTokens(Annotation a) {
494 Object[] annotationTokens =
495 this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,
496 a.getStartNode().getOffset(),
497 a.getEndNode().getOffset()).toArray();
498 java.util.Arrays.sort(annotationTokens, new OffsetComparator());
499
500 return annotationTokens;
501 }
502
503
504 public HashMap getResolvedAnaphora() {
505 return this.anaphor2antecedent;
506 }
507
508
509 private void preprocess() throws ExecutionException {
510
511 this.anaphor2antecedent.clear();
513
514 if ( this.annotationSetName == null || this.annotationSetName.equals("")) {
516 this.defaultAnnotations = this.document.getAnnotations();
517 }
518 else {
519 this.defaultAnnotations = this.document.getAnnotations(annotationSetName);
520 }
521
522 if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
524 Err.prln("Coref Warning: No annotations found for processing!");
525 return;
526 }
527
528
547 }
548
549 }
550