Start refactoring a couple of things that should be common to all language tokenizers, such as the space-character and filename macros, into a shared LexCommon.tokens include

AngledLuffa · AngledLuffa · commit 3c40ba32ca51 · 2022-04-14T12:53:44.000-07:00
diff --git a/src/edu/stanford/nlp/process/LexCommon.tokens b/src/edu/stanford/nlp/process/LexCommon.tokens
@@ -0,0 +1,8 @@
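+/* \u3000 is ideographic space; \u205F is medium math space. NOTE(review): SPACE below lists \u20F5, not \u205F -- probably a transposed typo; confirm before changing. */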
+SPACE = [ \t\u00A0\u2000-\u200A\u202F\u20F5\u3000]
+SPACES = {SPACE}+
+NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
+SPACENL = ({SPACE}|{NEWLINE})
+
+FILENAME_EXT = 3gp|avi|bat|bmp|bz2|c|class|cgi|cpp|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|ps|py|sql|tar|txt|wav|x|xml|zip|wm[va]
+FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
diff --git a/src/edu/stanford/nlp/process/PTBLexer.flex b/src/edu/stanford/nlp/process/PTBLexer.flex
@@ -577,11 +577,9 @@ SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
 SPAMP = &amp;
 SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
 SPLET = &[aeiouAEIOU](acute|grave|uml);
-/* \u3000 is ideographic space; \u205F is medium math space */
-SPACE = [ \t\u00A0\u2000-\u200A\u202F\u20F5\u3000]
-SPACES = {SPACE}+
-NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
-SPACENL = ({SPACE}|{NEWLINE})
+
+%include LexCommon.tokens
+
 SPACENLS = {SPACENL}+
 /* These next ones are useful to get a fixed length trailing context. */
 SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
@@ -608,8 +606,6 @@ DOLSIGN = ([A-Z]*\$|#)
 DOLSIGN2 = [\u00A2-\u00A5\u0080\u20A0-\u20BF\u058F\u060B\u09F2\u09F3\u0AF1\u0BF9\u0E3F\u17DB\uFF04\uFFE0\uFFE1\uFFE5\uFFE6]
 /* not used DOLLAR      {DOLSIGN}[ \t]*{NUMBER}  */
 /* |\( ?{NUMBER} ?\))    # is for pound signs */
-FILENAME_EXT = 3gp|avi|bat|bmp|bz2|c|class|cgi|cpp|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|ps|py|sql|tar|txt|wav|x|xml|zip|wm[va]
-FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
 /* Curse of intelligent tokenization, here we come. To model what LDC does, we separate out some \p{Digit}+\p{Alpha}+ tokens as 2 words */
 /* Go with just the top 20 currencies. */
 SEP_CURRENCY = (USD|EUR|JPY|GBP|AUD|CAD|CHF|CNY|SEK|NZD|MXN|SGD|HKD|NOK|KRW|TRY|RUB|INR|BRL|ZAR)
@@ -1118,6 +1114,8 @@ RM/{NUM}        { String txt = yytext();
 {ISO8601DATETIME}       { return getNext(); }
 //{ISO8601DATE}           { return getNext(); }
 {DEGREES}               { return getNext(); }
+/* Ideally the FILENAME rules below would also be factored out for use in other tokenizers,
+ * but the other tokenizers don't have the YyTokenizePerLine / YyNotTokenizePerLine states */
 <YyNotTokenizePerLine>{FILENAME}/({SPACENL}|[.?!,\"'<()])      { return getNext(); }
 <YyTokenizePerLine>{FILENAME}/({SPACE}|[.?!,\"'<()])      { return getNext(); }
 {WORD}\./{INSENTP}      { String origTok = yytext();
diff --git a/test/src/edu/stanford/nlp/process/PTBTokenizerTest.java b/test/src/edu/stanford/nlp/process/PTBTokenizerTest.java
@@ -96,6 +96,8 @@ public class PTBTokenizerTest {
       "i got (89.2%) in my exams",
       "Dial 908-333-4444 to unban mox opal",
       "The jerk who banned mox opal has social security number 555-55-5555.",
+      "What do you suppose is in the file thicc_antennae.jpg?",
+      "What do you suppose is in the file thicc_antennae.asdf?",
   };
 
   private final String[][] ptbGold = {
@@ -188,6 +190,9 @@ public class PTBTokenizerTest {
       { "i", "got", "-LRB-", "89.2", "%", "-RRB-", "in", "my", "exams" },
       { "Dial", "908-333-4444", "to", "unban", "mox", "opal" },
       { "The", "jerk", "who", "banned", "mox", "opal", "has", "social", "security", "number", "555-55-5555", "." },
+      // a known filename extension (.jpg) keeps the filename as a single token; an unknown one (.asdf) is split at the period
+      { "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae.jpg", "?" },
+      { "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae", ".", "asdf", "?" },
   };
 
   private final String[][] ptbGoldSplitHyphenated = {
@@ -289,7 +294,8 @@ public class PTBTokenizerTest {
       { "i", "got", "(", "89.2", "%", ")", "in", "my", "exams" },
       { "Dial", "908-333-4444", "to", "unban", "mox", "opal" },
       { "The", "jerk", "who", "banned", "mox", "opal", "has", "social", "security", "number", "555-55-5555", "." },
-
+      { "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae.jpg", "?" },
+      { "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae", ".", "asdf", "?" },
   };
 
   @Test