Skip to content

Commit 3c40ba3

Browse files
committed
Start refactoring a couple things which should be common to all language tokenizers, such as space characters and filenames
1 parent 613887a commit 3c40ba3

File tree

3 files changed

+20
-8
lines changed

3 files changed

+20
-8
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/* \u3000 is ideographic space; \u205F is medium math space */
2+
/* Horizontal whitespace: space, tab, NBSP (\u00A0), \u2000-\u200A, narrow NBSP (\u202F), medium math space (\u205F), ideographic space (\u3000). */
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u3000]
3+
SPACES = {SPACE}+
4+
NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
5+
SPACENL = ({SPACE}|{NEWLINE})
6+
7+
FILENAME_EXT = 3gp|avi|bat|bmp|bz2|c|class|cgi|cpp|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|ps|py|sql|tar|txt|wav|x|xml|zip|wm[va]
8+
FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}

src/edu/stanford/nlp/process/PTBLexer.flex

+5-7
Original file line numberDiff line numberDiff line change
@@ -577,11 +577,9 @@ SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
577577
SPAMP = &
578578
SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
579579
SPLET = &[aeiouAEIOU](acute|grave|uml);
580-
/* \u3000 is ideographic space; \u205F is medium math space */
581-
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u20F5\u3000]
582-
SPACES = {SPACE}+
583-
NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
584-
SPACENL = ({SPACE}|{NEWLINE})
580+
581+
%include LexCommon.tokens
582+
585583
SPACENLS = {SPACENL}+
586584
/* These next ones are useful to get a fixed length trailing context. */
587585
SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
@@ -608,8 +606,6 @@ DOLSIGN = ([A-Z]*\$|#)
608606
DOLSIGN2 = [\u00A2-\u00A5\u0080\u20A0-\u20BF\u058F\u060B\u09F2\u09F3\u0AF1\u0BF9\u0E3F\u17DB\uFF04\uFFE0\uFFE1\uFFE5\uFFE6]
609607
/* not used DOLLAR {DOLSIGN}[ \t]*{NUMBER} */
610608
/* |\( ?{NUMBER} ?\)) # is for pound signs */
611-
FILENAME_EXT = 3gp|avi|bat|bmp|bz2|c|class|cgi|cpp|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|ps|py|sql|tar|txt|wav|x|xml|zip|wm[va]
612-
FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
613609
/* Curse of intelligent tokenization, here we come. To model what LDC does, we separate out some \p{Digit}+\p{Alpha}+ tokens as 2 words */
614610
/* Go with just the top 20 currencies. */
615611
SEP_CURRENCY = (USD|EUR|JPY|GBP|AUD|CAD|CHF|CNY|SEK|NZD|MXN|SGD|HKD|NOK|KRW|TRY|RUB|INR|BRL|ZAR)
@@ -1118,6 +1114,8 @@ RM/{NUM} { String txt = yytext();
11181114
{ISO8601DATETIME} { return getNext(); }
11191115
//{ISO8601DATE} { return getNext(); }
11201116
{DEGREES} { return getNext(); }
1117+
/* Ideally would factor this out for use in other tokenizers,
1118+
 * but the other tokenizers don't have TokenizePerLine options */
11211119
<YyNotTokenizePerLine>{FILENAME}/({SPACENL}|[.?!,\"'<()]) { return getNext(); }
11221120
<YyTokenizePerLine>{FILENAME}/({SPACE}|[.?!,\"'<()]) { return getNext(); }
11231121
{WORD}\./{INSENTP} { String origTok = yytext();

test/src/edu/stanford/nlp/process/PTBTokenizerTest.java

+7-1
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ public class PTBTokenizerTest {
9696
"i got (89.2%) in my exams",
9797
"Dial 908-333-4444 to unban mox opal",
9898
"The jerk who banned mox opal has social security number 555-55-5555.",
99+
"What do you suppose is in the file thicc_antennae.jpg?",
100+
"What do you suppose is in the file thicc_antennae.asdf?",
99101
};
100102

101103
private final String[][] ptbGold = {
@@ -188,6 +190,9 @@ public class PTBTokenizerTest {
188190
{ "i", "got", "-LRB-", "89.2", "%", "-RRB-", "in", "my", "exams" },
189191
{ "Dial", "908-333-4444", "to", "unban", "mox", "opal" },
190192
{ "The", "jerk", "who", "banned", "mox", "opal", "has", "social", "security", "number", "555-55-5555", "." },
193+
// test that filename extensions trigger something being a single word
194+
{ "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae.jpg", "?" },
195+
{ "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae", ".", "asdf", "?" },
191196
};
192197

193198
private final String[][] ptbGoldSplitHyphenated = {
@@ -289,7 +294,8 @@ public class PTBTokenizerTest {
289294
{ "i", "got", "(", "89.2", "%", ")", "in", "my", "exams" },
290295
{ "Dial", "908-333-4444", "to", "unban", "mox", "opal" },
291296
{ "The", "jerk", "who", "banned", "mox", "opal", "has", "social", "security", "number", "555-55-5555", "." },
292-
297+
{ "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae.jpg", "?" },
298+
{ "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae", ".", "asdf", "?" },
293299
};
294300

295301
@Test

0 commit comments

Comments
 (0)