Skip to content

Commit f1b929e

Browse files
committed
Add an mwt misc field to the wire format, including the protobuf annotation serializer
Uses a single string instead of a map for mwtMisc. This is not ideal, since it would be nice to search through the individual pieces. However, different treebanks use different standards for separating the pieces, so for now we follow the same convention as for words and keep the whole thing as one string.
1 parent 9805fb9 commit f1b929e

File tree

5 files changed

+597
-302
lines changed

5 files changed

+597
-302
lines changed

src/edu/stanford/nlp/ling/AnnotationLookup.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ private enum KeyLookup {
9898
// MWT specific annotations
9999
MWT_TEXT_KEY(CoreAnnotations.MWTTokenTextAnnotation.class, "mwt_text"),
100100
IS_MWT_KEY(CoreAnnotations.IsMultiWordTokenAnnotation.class, "is_mwt"),
101-
IS_FIRST_MWT_KEY(CoreAnnotations.IsFirstWordOfMWTAnnotation.class, "is_first_mwt");
101+
IS_FIRST_MWT_KEY(CoreAnnotations.IsFirstWordOfMWTAnnotation.class, "is_first_mwt"),
102+
MWT_MISC_KEY(CoreAnnotations.MWTTokenMiscAnnotation.class, "mwt_misc");
102103

103104
private final Class<? extends CoreAnnotation<?>> coreKey;
104105
private final String oldKey;

src/edu/stanford/nlp/ling/CoreAnnotations.java

+9-1
Original file line numberDiff line numberDiff line change
@@ -2217,5 +2217,13 @@ public Class<Boolean> getType() {
22172217
}
22182218
}
22192219

2220-
2220+
/**
2221+
* The CoNLL-U MISC field attached to the multi-word token (MWT) itself rather than to an individual word; the value is typed as a single String.
2222+
*/
2223+
public static class MWTTokenMiscAnnotation implements CoreAnnotation<String> {
2224+
@Override
2225+
public Class<String> getType() {
2226+
return String.class;
2227+
}
2228+
}
22212229
}

src/edu/stanford/nlp/pipeline/CoreNLP.proto

+5
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,11 @@ message Token {
222222
optional bool isMWT = 67;
223223
optional bool isFirstMWT = 68;
224224
optional string mwtText = 69;
225+
// setting this to a map might be nice, but there are a couple of issues:
226+
// for one, there can be values with no key
227+
// for another, it's a pain to correctly parse, since different treebanks
228+
// can have different standards for how to write out the misc field
229+
optional string mwtMisc = 78;
225230

226231
// number info
227232
optional uint64 numericValue = 70;

0 commit comments

Comments
 (0)