Trim the word column instead of splitting on all whitespace - this allows tokens with internal whitespace, which gives us a chance of getting VI right

AngledLuffa · AngledLuffa · commit 0d9e9c829bfa · 2022-07-13T23:15:05.000-07:00
diff --git a/src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java b/src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java
@@ -13,8 +13,9 @@
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.objectbank.DelimitRegExIterator;
 import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
-import java.util.function.Function;
+import edu.stanford.nlp.util.ArrayUtils;
 import edu.stanford.nlp.util.StringUtils;
+import java.util.function.Function;
 
 
 /**
@@ -34,6 +35,7 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<Co
   //map can be something like "word=0,tag=1,answer=2"
   @SuppressWarnings("rawtypes")
   private Class[] map; // = null;
+  private int wordColumn = -1;
   private IteratorFromReaderFactory<List<CoreLabel>> factory;
 
 //  public void init(SeqClassifierFlags flags) {
@@ -51,6 +53,7 @@ public void init(SeqClassifierFlags flags) {
   public void init(String map) {
     // this.flags = null;
     this.map = CoreLabel.parseStringKeys(StringUtils.mapStringToArray(map));
+    this.wordColumn = ArrayUtils.indexOf(this.map, CoreAnnotations.TextAnnotation.class);
     factory = DelimitRegExIterator.getFactory("\n(?:\\s*\n)+", new ColumnDocParser());
   }
 
@@ -87,6 +90,13 @@ public List<CoreLabel> apply(String doc) {
         if (info.length == 1) {
           info = whitePattern.split(line);
         }
+        // Trimming later rather than splitting on all whitespace
+        // gives us the possibility of tokens with whitespace in them
+        // although obviously not at the start or end...
+        // doesn't slow the classifier down too much
+        if (wordColumn >= 0) {
+          info[wordColumn] = info[wordColumn].trim();
+        }
         CoreLabel wi;
         try {
           wi = new CoreLabel(map, info);