Connect the cleanxml annotator to the tokenizer

AngledLuffa · AngledLuffa · commit 17428496a70c · 2022-03-16T15:15:34.000-07:00
diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java b/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
@@ -255,6 +255,10 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
       this.properties.setProperty("annotators", newAnnotators);
     }
 
+    // if cleanxml is requested and tokenize is here,
+    // make it part of tokenize rather than its own annotator
+    unifyCleanXML(this.properties);
+
     // cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
     this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());
 
@@ -303,6 +307,36 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
     this.pipelineSetupTime = tim.report();
   }
 
+  /**
+   * Folds a separately requested cleanxml annotator into the tokenize annotator.
+   *<br>
+   * For backwards compatibility, annotator lists such as tokenize,cleanxml are
+   * still accepted: when both names appear as whole entries (separated by commas
+   * and/or whitespace), cleanxml is removed from "annotators" and the
+   * tokenize.cleanxml option is set to "true".  Otherwise nothing is changed.
+   *
+   * @param properties pipeline properties; updated in place
+   */
+  static void unifyCleanXML(Properties properties) {
+    final String requested = properties.getProperty("annotators", "");
+    // an entry starts after / ends before a separator or the end of the list, so
+    // names that merely contain "tokenize" or "cleanxml" are not mistaken for them
+    final String start = "(?<![^,\\s])";
+    final String end = "(?![^,\\s])";
+    final String clean = start + java.util.regex.Pattern.quote(STANFORD_CLEAN_XML);
+    final String tokenize = start + java.util.regex.Pattern.quote(STANFORD_TOKENIZE) + end;
+    if (!requested.matches("(?s).*" + tokenize + ".*") || !requested.matches("(?s).*" + clean + end + ".*")) {
+      return;
+    }
+    properties.setProperty(STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML, "true");
+    // cleanxml followed by another entry: drop it with the separators after it;
+    // cleanxml as the last entry: drop it with the separators around it
+    String annotators = requested.replaceAll(clean + "[,\\s]+(?=[^,\\s])", "")
+                                 .replaceAll("[,\\s]*" + clean + "[,\\s]*$", "");
+    logger.debug("cleanxml can now be triggered as an option to tokenize rather than a separate annotator via tokenize.cleanxml=true  Updating annotators from " + requested + " to " + annotators);
+    properties.setProperty("annotators", annotators);
+  }
+
   //
   // @Override-able methods to change pipeline behavior
   //
diff --git a/src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java b/src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
@@ -133,6 +133,7 @@ public static TokenizerType getTokenizerType(Properties props) {
   /** new segmenter properties **/
   private final boolean useSegmenter;
   private final Annotator segmenterAnnotator;
+  private final CleanXmlAnnotator cleanxmlAnnotator;
 
   /** run a custom post processor after the lexer **/
   private final List<CoreLabelProcessor> postProcessors;
@@ -243,6 +244,12 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
     if (VERBOSE) {
       log.info("Initialized tokenizer factory: " + factory);
     }
+
+    if (PropertiesUtils.getBool(props, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML)) {
+      this.cleanxmlAnnotator = new CleanXmlAnnotator(props);
+    } else {
+      this.cleanxmlAnnotator = null;
+    }
   }
 
   /**
@@ -378,10 +385,7 @@ public void annotate(Annotation annotation) {
       // set indexes into document wide tokens list
       setTokenBeginTokenEnd(annotation.get(CoreAnnotations.TokensAnnotation.class));
       setNewlineStatus(annotation.get(CoreAnnotations.TokensAnnotation.class));
-      return;
-    }
-
-    if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
+    } else if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
       // TODO: This is a huge hack.  jflex does not have a lookahead operation which can match EOF
       // Because of this, the PTBTokenizer has a few productions which can't operate at EOF.
       // For example,
@@ -422,6 +426,9 @@ public void annotate(Annotation annotation) {
       throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
     }
 
+    if (this.cleanxmlAnnotator != null) {
+      this.cleanxmlAnnotator.annotate(annotation);
+    }
   }
 
   @Override
diff --git a/test/src/edu/stanford/nlp/pipeline/StanfordCoreNLPTest.java b/test/src/edu/stanford/nlp/pipeline/StanfordCoreNLPTest.java
@@ -6,6 +6,8 @@
 
 import static org.junit.Assert.*;
 
+import edu.stanford.nlp.util.PropertiesUtils;
+
 /**
  * Test some of the utility functions in {@link StanfordCoreNLP}.
  *
@@ -73,4 +75,36 @@ public void testPrereqAnnotatorsCorefWithParse() {
     assertEquals("__empty__", props.getProperty("coref.md.type", "__empty__"));
   }
 
+  /**
+   * Exercises {@link StanfordCoreNLP#unifyCleanXML} on several annotator lists:
+   * cleanxml should be dropped from the list, and tokenize.cleanxml turned on,
+   * only when tokenize is requested alongside it.
+   */
+  @Test
+  public void testUnifyTokenizer() {
+    // each row: requested annotators, annotators expected after unification,
+    // and whether tokenize.cleanxml is expected to be set afterwards
+    String[][] cases = {
+      {"tokenize,cleanxml",          "tokenize",        "true"},
+      {"tokenize",                   "tokenize",        "false"},
+      {"tokenize,cleanxml,pos",      "tokenize,pos",    "true"},
+      {"tokenize,cleanxml  ,pos",    "tokenize,pos",    "true"},
+      {"tokenize,   cleanxml  ,pos", "tokenize,   pos", "true"},
+      {"cleanxml,pos",               "cleanxml,pos",    "false"},
+    };
+    for (String[] row : cases) {
+      String requested = row[0];
+      String expectedAnnotators = row[1];
+      boolean expectedOption = Boolean.parseBoolean(row[2]);
+
+      Properties props = new Properties();
+      props.setProperty("annotators", requested);
+      StanfordCoreNLP.unifyCleanXML(props);
+
+      // the rewritten annotator list ...
+      assertEquals(expectedAnnotators, props.getProperty("annotators"));
+      // ... and the tokenizer option that replaces the removed annotator
+      assertEquals(expectedOption, PropertiesUtils.getBool(props, "tokenize.cleanxml", false));
+    }
+  }
 }