@@ -96,6 +96,8 @@ public class PTBTokenizerTest {
96
96
"i got (89.2%) in my exams" ,
97
97
"Dial 908-333-4444 to unban mox opal" ,
98
98
"The jerk who banned mox opal has social security number 555-55-5555." ,
99
+ "What do you suppose is in the file thicc_antennae.jpg?" ,
100
+ "What do you suppose is in the file thicc_antennae.asdf?" ,
99
101
};
100
102
101
103
private final String [][] ptbGold = {
@@ -188,6 +190,9 @@ public class PTBTokenizerTest {
188
190
{ "i" , "got" , "-LRB-" , "89.2" , "%" , "-RRB-" , "in" , "my" , "exams" },
189
191
{ "Dial" , "908-333-4444" , "to" , "unban" , "mox" , "opal" },
190
192
{ "The" , "jerk" , "who" , "banned" , "mox" , "opal" , "has" , "social" , "security" , "number" , "555-55-5555" , "." },
193
+ // Test filename handling: a name with a recognized extension (.jpg) stays a single token, while one with an unrecognized extension (.asdf) is split at the period.
194
+ { "What" , "do" , "you" , "suppose" , "is" , "in" , "the" , "file" , "thicc_antennae.jpg" , "?" },
195
+ { "What" , "do" , "you" , "suppose" , "is" , "in" , "the" , "file" , "thicc_antennae" , "." , "asdf" , "?" },
191
196
};
192
197
193
198
private final String [][] ptbGoldSplitHyphenated = {
@@ -289,7 +294,8 @@ public class PTBTokenizerTest {
289
294
{ "i" , "got" , "(" , "89.2" , "%" , ")" , "in" , "my" , "exams" },
290
295
{ "Dial" , "908-333-4444" , "to" , "unban" , "mox" , "opal" },
291
296
{ "The" , "jerk" , "who" , "banned" , "mox" , "opal" , "has" , "social" , "security" , "number" , "555-55-5555" , "." },
292
-
297
+ { "What" , "do" , "you" , "suppose" , "is" , "in" , "the" , "file" , "thicc_antennae.jpg" , "?" },
298
+ { "What" , "do" , "you" , "suppose" , "is" , "in" , "the" , "file" , "thicc_antennae" , "." , "asdf" , "?" },
293
299
};
294
300
295
301
@ Test
0 commit comments