@@ -656,9 +656,13 @@ REDAUX_NOT = n{APOSETCETERA}ts?
656
656
Now only apostrophe-initial or apostrophe-final words need to be listed here. */
657
657
/* Single letters are for French borrowings. */
658
658
/* Arguably, c'mon should be split to "c'm" + "on" - split later in ASSIMILATIONS2 */
659
- APOWORD = {WORD} ( {APOSETCETERA}{WORD} )+|\p{Script=Latin} {APOSETCETERA} [ A- Z] \. ( [ A- Z] \. )+| {APOS} n{APOS} ?|( [ lLdDjJ] | Dunkin| somethin| ol) {APOS} | {APOS} ( em | till ?| cause | twixt | [ 1- 9] 0s) |[ 1- 9] 0{APOS} s
659
+ APOWORD = {WORD} ( {APOSETCETERA}{WORD} )+|\p{Script=Latin} {APOSETCETERA} [ A- Z] \. ( [ A- Z] \. )+| {APOS} n{APOS} ?|( [ lLdDjJ] | Dunkin| somethin| ol) {APOS} | {APOS} [ 1- 9] 0s| [ 1- 9] 0{APOS} s
660
660
/* APOWORD2 is things we will strip at beginning of word: th' shortening "the" (Th'enchanting) and y' shortening "you" (y'know, y'all) */
661
661
APOWORD2 = ( th| y) {APOS}
662
+ /* APOWORD3 is specifically words that might be a contraction, like "screw 'em", or might be part of a short quote, like 'email'.
663
+ If these were part of APOWORD, then 'email' or 'tilling' etc. would be chopped up unnecessarily. */
664
+ APOWORD3_TAIL = ( em| till?| cause| twixt)
665
+ APOWORD3 = {APOSETCETERA}{APOWORD3_TAIL}
662
666
/* Some Wired URLs end in + or = so omit that too. Some quoting with '[' and ']' so disallow. */
663
667
FULLURL = ( ftp| svn| svn\+ ssh| http| https| mailto) :\/\/ [^ \t\n\f\r <>|`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}] + [^ \t\n\f\r <>|.!?¡¿,·;:&`\"\'\* \p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-]
664
668
LIKELYURL = (( www\. ( [^ \t\n\f\r `<>|.!?,\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}] + \. )+ [ a- zA- Z] {2,4})|(( [^ \t\n\f\r `<>|.!?,:\/ $\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}] + \. )+( com| net| org| edu)))( \/ [^ \t\n\f\r `<>|] + [^ \t\n\f\r `<>|.!?,;:&\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-] )?
@@ -956,6 +960,21 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
956
960
" ; quoteStyle=" + quoteStyle + " ; probablyLeft=" + false ); }
957
961
return getNext(norm, tok);
958
962
}
963
+ /* Having this rule separate prevents improper tokenization of 'email': it uses trailing context, so when a quote mark is followed by an APOWORD3_TAIL that continues into a longer word, only the quote mark itself is matched here and the word is left for later rules. */
964
+ {APOSETCETERA} / {APOWORD3_TAIL}{WORD} {
965
+ String tok = yytext(); /* just the quote mark; the trailing-context text is not consumed */
966
+ String norm = LexerUtils . handleQuotes(tok, false , quoteStyle); /* NOTE(review): the false is logged below as probablyLeft - presumably that flag; confirm against LexerUtils */
967
+ if (DEBUG ) { logger. info(" Used {APOSETCETERA}/{APOWORD3_TAIL}{WORD} to recognize " + tok + " as " + norm +
968
+ " ; quoteStyle=" + quoteStyle + " ; probablyLeft=" + false ); }
969
+ return getNext(norm, tok); /* normalized form first, original text second */
970
+ }
971
+ {APOWORD3} { /* quote mark plus em / til / till / cause / twixt, matched together as a single piece of text */ String tok = yytext();
972
+ String norm = LexerUtils . handleQuotes(tok, false , quoteStyle); /* NOTE(review): the false is logged below as probablyLeft - presumably that flag; confirm against LexerUtils */
973
+ norm = LexerUtils . removeSoftHyphens(norm); /* applied to the normalized form only; tok keeps the matched text unchanged */
974
+ if (DEBUG ) { logger. info(" Used {APOWORD3} to recognize " + tok + " as " + norm +
975
+ " ; quoteStyle=" + quoteStyle + " ; probablyLeft=" + false ); }
976
+ return getNext(norm, tok); /* normalized form first, original text second */
977
+ }
959
978
{FULLURL} { String txt = yytext();
960
979
String norm = txt;
961
980
if (escapeForwardSlashAsterisk) {
0 commit comments