Skip to content

Commit 0d9e9c8

Browse files
committed
Trim words: trimming the word column afterwards, instead of splitting on all whitespace, allows tokens that contain internal whitespace and gives us a chance of getting VI right
1 parent 6193934 commit 0d9e9c8

File tree

1 file changed

+11 -1 lines changed

1 file changed

+11 -1 lines changed

src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java

+11 -1
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,9 @@
1313
import edu.stanford.nlp.ling.CoreAnnotations;
1414
import edu.stanford.nlp.objectbank.DelimitRegExIterator;
1515
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
16-
import java.util.function.Function;
16+
import edu.stanford.nlp.util.ArrayUtils;
1717
import edu.stanford.nlp.util.StringUtils;
18+
import java.util.function.Function;
1819

1920

2021
/**
@@ -34,6 +35,7 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<Co
3435
//map can be something like "word=0,tag=1,answer=2"
3536
@SuppressWarnings("rawtypes")
3637
private Class[] map; // = null;
38+
private int wordColumn = -1;
3739
private IteratorFromReaderFactory<List<CoreLabel>> factory;
3840

3941
// public void init(SeqClassifierFlags flags) {
@@ -51,6 +53,7 @@ public void init(SeqClassifierFlags flags) {
5153
public void init(String map) {
5254
// this.flags = null;
5355
this.map = CoreLabel.parseStringKeys(StringUtils.mapStringToArray(map));
56+
this.wordColumn = ArrayUtils.indexOf(this.map, CoreAnnotations.TextAnnotation.class);
5457
factory = DelimitRegExIterator.getFactory("\n(?:\\s*\n)+", new ColumnDocParser());
5558
}
5659

@@ -87,6 +90,13 @@ public List<CoreLabel> apply(String doc) {
8790
if (info.length == 1) {
8891
info = whitePattern.split(line);
8992
}
93+
// Trimming later rather than splitting on all whitespace
94+
// gives us the possibility of tokens with whitespace in them
95+
// although obviously not at the start or end...
96+
// doesn't slow the classifier down too much
97+
if (wordColumn >= 0) {
98+
info[wordColumn] = info[wordColumn].trim();
99+
}
90100
CoreLabel wi;
91101
try {
92102
wi = new CoreLabel(map, info);

0 commit comments

Comments
 (0)