13
13
import edu .stanford .nlp .ling .CoreAnnotations ;
14
14
import edu .stanford .nlp .objectbank .DelimitRegExIterator ;
15
15
import edu .stanford .nlp .objectbank .IteratorFromReaderFactory ;
16
- import java . util . function . Function ;
16
+ import edu . stanford . nlp . util . ArrayUtils ;
17
17
import edu .stanford .nlp .util .StringUtils ;
18
+ import java .util .function .Function ;
18
19
19
20
20
21
/**
@@ -34,6 +35,7 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<Co
34
35
// Column-to-annotation-key mapping. It is built in init(String) from a
// specification string such as "word=0,tag=1,answer=2".
35
36
@ SuppressWarnings ("rawtypes" )
36
37
private Class [] map ; // = null;
38
// Position of CoreAnnotations.TextAnnotation within map, as returned by
// ArrayUtils.indexOf in init(String). Starts at -1; the column parser only
// trims the word column when this value is >= 0.
+ private int wordColumn = -1 ;
37
39
// Factory for iterating over documents; assigned in init(String) from
// DelimitRegExIterator.getFactory.
private IteratorFromReaderFactory <List <CoreLabel >> factory ;
38
40
39
41
// public void init(SeqClassifierFlags flags) {
@@ -51,6 +53,7 @@ public void init(SeqClassifierFlags flags) {
51
53
public void init (String map ) {
52
54
// this.flags = null;
53
55
this .map = CoreLabel .parseStringKeys (StringUtils .mapStringToArray (map ));
56
+ this .wordColumn = ArrayUtils .indexOf (this .map , CoreAnnotations .TextAnnotation .class );
54
57
factory = DelimitRegExIterator .getFactory ("\n (?:\\ s*\n )+" , new ColumnDocParser ());
55
58
}
56
59
@@ -87,6 +90,13 @@ public List<CoreLabel> apply(String doc) {
87
90
if (info .length == 1 ) {
88
91
info = whitePattern .split (line );
89
92
}
93
+ // Trimming later rather than splitting on all whitespace
94
+ // gives us the possibility of tokens with whitespace in them
95
+ // although obviously not at the start or end...
96
+ // doesn't slow the classifier down too much
97
+ if (wordColumn >= 0 ) {
98
+ info [wordColumn ] = info [wordColumn ].trim ();
99
+ }
90
100
CoreLabel wi ;
91
101
try {
92
102
wi = new CoreLabel (map , info );
0 commit comments