Skip to content

Commit 1742849

Browse files
committed
Connect the cleanxml annotator to the tokenizer
1 parent e3c9a6d commit 1742849

File tree

3 files changed

+79
-4
lines changed

3 files changed

+79
-4
lines changed

src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java

+34
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,10 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
255255
this.properties.setProperty("annotators", newAnnotators);
256256
}
257257

258+
// if cleanxml is requested and tokenize is here,
259+
// make it part of tokenize rather than its own annotator
260+
unifyCleanXML(this.properties);
261+
258262
// cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
259263
this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());
260264

@@ -303,6 +307,36 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
303307
this.pipelineSetupTime = tim.report();
304308
}
305309

310+
/**
311+
* The cleanxml annotator can now be invoked as part of the tokenize annotator.
312+
*<br>
313+
* To ensure backwards compatibility with previous usage of the pipeline,
314+
* we allow annotators to be specified tokenize,cleanxml.
315+
* In such a case, we remove the cleanxml from the annotators and set
316+
* the tokenize.cleanxml option instead
317+
*/
318+
static void unifyCleanXML(Properties properties) {
319+
String annotators = properties.getProperty("annotators", "");
320+
int tokenize = annotators.indexOf(STANFORD_TOKENIZE);
321+
int clean = annotators.indexOf(STANFORD_CLEAN_XML);
322+
323+
if (clean >= 0 && tokenize >= 0) {
324+
properties.setProperty(STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML, "true");
325+
int comma = annotators.indexOf(",", clean);
326+
if (comma >= 0 && comma+1 < annotators.length()) {
327+
annotators = annotators.substring(0, clean) + annotators.substring(comma+1);
328+
} else {
329+
comma = annotators.lastIndexOf(",");
330+
if (comma < 0) {
331+
throw new IllegalArgumentException("Unable to process annotators " + annotators);
332+
}
333+
annotators = annotators.substring(0, comma);
334+
}
335+
logger.debug("cleanxml can now be triggered as an option to tokenize rather than a separate annotator via tokenize.cleanxml=true Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
336+
properties.setProperty("annotators", annotators);
337+
}
338+
}
339+
306340
//
307341
// @Override-able methods to change pipeline behavior
308342
//

src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java

+11-4
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ public static TokenizerType getTokenizerType(Properties props) {
133133
/** new segmenter properties **/
134134
private final boolean useSegmenter;
135135
private final Annotator segmenterAnnotator;
136+
private final CleanXmlAnnotator cleanxmlAnnotator;
136137

137138
/** run a custom post processor after the lexer **/
138139
private final List<CoreLabelProcessor> postProcessors;
@@ -243,6 +244,12 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
243244
if (VERBOSE) {
244245
log.info("Initialized tokenizer factory: " + factory);
245246
}
247+
248+
if (PropertiesUtils.getBool(props, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML)) {
249+
this.cleanxmlAnnotator = new CleanXmlAnnotator(props);
250+
} else {
251+
this.cleanxmlAnnotator = null;
252+
}
246253
}
247254

248255
/**
@@ -378,10 +385,7 @@ public void annotate(Annotation annotation) {
378385
// set indexes into document wide tokens list
379386
setTokenBeginTokenEnd(annotation.get(CoreAnnotations.TokensAnnotation.class));
380387
setNewlineStatus(annotation.get(CoreAnnotations.TokensAnnotation.class));
381-
return;
382-
}
383-
384-
if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
388+
} else if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
385389
// TODO: This is a huge hack. jflex does not have a lookahead operation which can match EOF
386390
// Because of this, the PTBTokenizer has a few productions which can't operate at EOF.
387391
// For example,
@@ -422,6 +426,9 @@ public void annotate(Annotation annotation) {
422426
throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
423427
}
424428

429+
if (this.cleanxmlAnnotator != null) {
430+
this.cleanxmlAnnotator.annotate(annotation);
431+
}
425432
}
426433

427434
@Override

test/src/edu/stanford/nlp/pipeline/StanfordCoreNLPTest.java

+34
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import static org.junit.Assert.*;
88

9+
import edu.stanford.nlp.util.PropertiesUtils;
10+
911
/**
1012
* Test some of the utility functions in {@link StanfordCoreNLP}.
1113
*
@@ -73,4 +75,36 @@ public void testPrereqAnnotatorsCorefWithParse() {
7375
assertEquals("__empty__", props.getProperty("coref.md.type", "__empty__"));
7476
}
7577

78+
// Test a couple use cases of removing the cleanxml annotator from
79+
// requested annotator lists
80+
@Test
81+
public void testUnifyTokenizer() {
82+
String[] inputs = {"tokenize,cleanxml",
83+
"tokenize",
84+
"tokenize,cleanxml,pos",
85+
"tokenize,cleanxml ,pos",
86+
"tokenize, cleanxml ,pos",
87+
"cleanxml,pos"};
88+
String[] expected = {"tokenize",
89+
"tokenize",
90+
"tokenize,pos",
91+
"tokenize,pos",
92+
"tokenize, pos",
93+
"cleanxml,pos"};
94+
boolean[] option = {true,
95+
false,
96+
true,
97+
true,
98+
true,
99+
false};
100+
assertEquals(inputs.length, expected.length);
101+
assertEquals(inputs.length, option.length);
102+
for (int i = 0; i < inputs.length; ++i) {
103+
Properties props = new Properties();
104+
props.setProperty("annotators", inputs[i]);
105+
StanfordCoreNLP.unifyCleanXML(props);
106+
assertEquals(expected[i], props.getProperty("annotators"));
107+
assertEquals(option[i], PropertiesUtils.getBool(props, "tokenize.cleanxml", false));
108+
}
109+
}
76110
}

0 commit comments

Comments
 (0)