Skip to content

Commit 76b5a6b

Browse files
committed
Fix the tokenization of 'email' and other quoted words that start with 'em (or with another apostrophe-initial contraction such as 'til or 'cause), while hopefully not affecting the tokenization of other words. Addresses #1316
1 parent a971209 commit 76b5a6b

File tree

3 files changed

+59891
-59578
lines changed

3 files changed

+59891
-59578
lines changed

src/edu/stanford/nlp/process/PTBLexer.flex

+20-1
Original file line numberDiff line numberDiff line change
@@ -656,9 +656,13 @@ REDAUX_NOT = n{APOSETCETERA}ts?
656656
Now only apostrophe-initial or apostrophe-final words need to be listed here. */
657657
/* Single letters are for French borrowings. */
658658
/* Arguably, c'mon should be split to "c'm" + "on" - split later in ASSIMILATIONS2 */
659-
APOWORD = {WORD}({APOSETCETERA}{WORD})+|\p{Script=Latin}{APOSETCETERA}[A-Z]\.([A-Z]\.)+|{APOS}n{APOS}?|([lLdDjJ]|Dunkin|somethin|ol){APOS}|{APOS}(em|till?|cause|twixt|[1-9]0s)|[1-9]0{APOS}s
659+
APOWORD = {WORD}({APOSETCETERA}{WORD})+|\p{Script=Latin}{APOSETCETERA}[A-Z]\.([A-Z]\.)+|{APOS}n{APOS}?|([lLdDjJ]|Dunkin|somethin|ol){APOS}|{APOS}[1-9]0s|[1-9]0{APOS}s
660660
/* APOWORD2 is things we will strip at beginning of word: th' shortening "the" (Th'enchanting) and y' shortening "you" (y'know, y'all) */
661661
APOWORD2 = (th|y){APOS}
662+
/* APOWORD3 covers words that might be a contraction, like the 'em in "screw 'em", or might be the start of a short quoted word, like 'email'.
663+
If these were part of APOWORD, then 'email' or 'tilling' etc. would be chopped up unnecessarily. */
664+
APOWORD3_TAIL = (em|till?|cause|twixt)
665+
APOWORD3 = {APOSETCETERA}{APOWORD3_TAIL}
662666
/* Some Wired URLs end in + or = so omit that too. Some quoting with '[' and ']' so disallow. */
663667
FULLURL = (ftp|svn|svn\+ssh|http|https|mailto):\/\/[^ \t\n\f\r<>|`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+[^ \t\n\f\r<>|.!?¡¿,·;:&`\"\'\*\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-]
664668
LIKELYURL = ((www\.([^ \t\n\f\r`<>|.!?,\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+\.)+[a-zA-Z]{2,4})|(([^ \t\n\f\r`<>|.!?,:\/$\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+\.)+(com|net|org|edu)))(\/[^ \t\n\f\r`<>|]+[^ \t\n\f\r`<>|.!?,;:&\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-])?
@@ -956,6 +960,21 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
956960
"; quoteStyle=" + quoteStyle + "; probablyLeft=" + false); }
957961
return getNext(norm, tok);
958962
}
963+
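/* Keeping this rule separate prevents improper tokenization of quoted words such as 'email': when an APOWORD3_TAIL is followed by more word characters, the quote character is emitted as a token on its own. */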
964+
{APOSETCETERA}/{APOWORD3_TAIL}{WORD} {
965+
// Lookahead rule: only the part before the '/' (the apostrophe-like character) becomes the token text;
// the {APOWORD3_TAIL}{WORD} part (e.g. the "email" in 'email') is only required to follow.
String tok = yytext();
966+
// NOTE(review): the literal false appears to be the probablyLeft flag echoed in the debug message below.
// In 'email' the apostrophe opens a quotation, so confirm against LexerUtils.handleQuotes that false is intended.
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
967+
if (DEBUG) { logger.info("Used {APOSETCETERA}/{APOWORD3_TAIL}{WORD} to recognize " + tok + " as " + norm +
968+
"; quoteStyle=" + quoteStyle + "; probablyLeft=" + false); }
969+
// Hand both the normalized and the original text to getNext and return its result.
return getNext(norm, tok);
970+
}
971+
{APOWORD3} { String tok = yytext();
972+
// tok is an apostrophe-like character plus one of the APOWORD3_TAIL words (em, til/till, cause, twixt),
// e.g. the 'em in "screw 'em".
// NOTE(review): the literal false appears to be the probablyLeft flag echoed in the debug message below — confirm.
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
973+
// The quote-normalized text is additionally passed through LexerUtils.removeSoftHyphens.
norm = LexerUtils.removeSoftHyphens(norm);
974+
if (DEBUG) { logger.info("Used {APOWORD3} to recognize " + tok + " as " + norm +
975+
"; quoteStyle=" + quoteStyle + "; probablyLeft=" + false); }
976+
// Hand both the normalized and the original text to getNext and return its result.
return getNext(norm, tok);
977+
}
959978
{FULLURL} { String txt = yytext();
960979
String norm = txt;
961980
if (escapeForwardSlashAsterisk) {

0 commit comments

Comments
 (0)