@@ -769,7 +769,8 @@ HYPHEN = [-\u058A\u2010\u2011\u2012]
769
769
/* HYPHENS: one or more consecutive characters from the HYPHEN class. */
HYPHENS = {HYPHEN} +
770
770
/* SSN: three digits, a HYPHEN, two digits, a HYPHEN, four digits (presumably a US Social Security number, going by the macro name). */
SSN = [ 0- 9] {3} {HYPHEN} [ 0- 9] {2} {HYPHEN} [ 0- 9] {4}
771
771
/* Phone numbers. Keep the multi-dot pattern separate so that it is not confused with decimal numbers. Also handles numbers like 346-8792 for the new treebank tokenization. The first digit can't be 0 or 1 in NANP (North American Numbering Plan). */
772
- PHONE = ( \( [ 0- 9] {2,3} \) [ \u00A0\u2007 ] ?|( \+\+ ?)?( [ 0- 9] {1,4} [ \- \u00A0\u2007\u2012 ] )? [ 0- 9] {2,4} [ \- \u00A0\u2007\u2012 /] ) [ 0- 9] {3,4} [ \- \u00A0\u2007\u2012 ] ? [ 0- 9] {3,5}|(( \+\+ ?)? [ 0- 9] {1,4} \. )? [ 0- 9] {2,4} \. [ 0- 9] {3,4} \. [ 0- 9] {3,5}| [ 2- 9][ 0- 9] {2} [- \u2012 ][ 0- 9] {4}
772
+ /* 2022: Also allow a hyphen between area code and number; allow French numbers like 47-42-17-11 */
/* The added PHONE definition below has four top-level alternatives:
   (1) either a parenthesized 2-3 digit group with an optional separator (whose class now also contains "-"),
       or an optional "+"/"++" prefix, an optional 1-4 digit group plus separator, and a 2-4 digit group plus
       separator (this last separator class also contains "/"); followed by 3-4 digits, an optional separator
       and 3-5 digits;
   (2) dot-separated groups: an optional (prefix, 1-4 digits, ".") unit, then 2-4, 2-4 and 2-5 digits
       (the removed definition required 3-4 and 3-5 digits for the last two groups);
   (3) the same shape as (2) joined with "-" instead of "." -- this alternative is new in the added definition;
   (4) a digit 2-9, two more digits, one character from the class containing "-" and U+2012, then four digits. */
773
+ PHONE = ( \( [ 0- 9] {2,3} \) [- \u00A0\u2007 ] ?|( \+\+ ?)?( [ 0- 9] {1,4} [- \u00A0\u2007\u2012 ] )? [ 0- 9] {2,4} [- \u00A0\u2007\u2012 /] ) [ 0- 9] {3,4} [- \u00A0\u2007\u2012 ] ? [ 0- 9] {3,5}|(( \+\+ ?)? [ 0- 9] {1,4} \. )? [ 0- 9] {2,4} \. [ 0- 9] {2,4} \. [ 0- 9] {2,5}|(( \+\+ ?)? [ 0- 9] {1,4} -)? [ 0- 9] {2,4} -[ 0- 9] {2,4} -[ 0- 9] {2,5}| [ 2- 9][ 0- 9] {2} [- \u2012 ][ 0- 9] {4}
773
774
/* Fake duck feet appear sometimes in WSJ, and aren't likely to be SGML, less than, etc., so group them: a doubled angle bracket ("<<" or ">>") is matched as a single unit. */
774
775
FAKEDUCKFEET = <<| >>
775
776
/* NOTE(review): both alternatives below read as "<" in this view; the second was presumably an HTML entity such as "&lt;" that was decoded when the page was extracted -- confirm against the original file. */
LESSTHAN = <| <
@@ -1356,8 +1357,8 @@ RM/{NUM} { String txt = yytext();
1356
1357
if (escapeForwardSlashAsterisk) {
1357
1358
String normTok = LexerUtils . escapeChar(yytext(), ' *' );
1358
1359
if (DEBUG ) { logger. info(" Used {ASTS} to recognize " + txt + " as " + normTok); }
1359
- return getNext(normTok, yytext()); }
1360
- else {
1360
+ return getNext(normTok, txt);
1361
+ } else {
1361
1362
if (DEBUG ) { logger. info(" Used {ASTS} to recognize " + txt); }
1362
1363
return getNext(txt, txt);
1363
1364
}
@@ -1382,20 +1383,55 @@ RM/{NUM} { String txt = yytext();
1382
1383
if (DEBUG ) { logger. info(" Used {=} to recognize " + txt); }
1383
1384
return getNext(txt, txt);
1384
1385
}
1385
- \/ { if (escapeForwardSlashAsterisk) {
1386
- return getNext(LexerUtils . escapeChar(yytext(), ' /' ), yytext()); }
1387
- else {
1388
- return getNext();
1386
/* Rule for a lone forward slash. When escapeForwardSlashAsterisk is set, the match is passed through
   LexerUtils.escapeChar and that result is the first getNext argument, with the raw match as the second;
   otherwise the raw match is passed for both arguments. The DEBUG branches only log which rule fired.
   The removed version (lines marked "-") called the no-argument getNext() in the else branch and did no logging. */
+ \/ {
1387
+ String txt = yytext();
1388
+ if (escapeForwardSlashAsterisk) {
1389
/* NOTE(review): txt already holds yytext(); the second yytext() call below looks redundant -- confirm and consider reusing txt. */
+ String normTok = LexerUtils . escapeChar(yytext(), ' /' );
1390
+ if (DEBUG ) { logger. info(" Used {/} to recognize " + txt + " as " + normTok); }
1391
+ return getNext(normTok, txt);
1392
+ } else {
1393
+ if (DEBUG ) { logger. info(" Used {/} to recognize " + txt); }
1394
+ return getNext(txt, txt);
1389
1395
}
1390
1396
}
1391
1397
/* {HTHING}/[^\p{Alpha}\p{Digit}.+] { return getNext(LexerUtils.removeSoftHyphens(yytext()),
1392
1398
yytext()); } */
1393
- {HTHINGEXCEPTIONWHOLE} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1394
- {HTHINGEXCEPTIONWHOLE} \. / {INSENTP} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1395
- {HTHINGEXCEPTIONPREFIXED} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1396
- {HTHINGEXCEPTIONPREFIXED} \. / {INSENTP} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1397
- {HTHINGEXCEPTIONSUFFIXED} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1398
- {HTHINGEXCEPTIONSUFFIXED} \. / {INSENTP} { return getNext(LexerUtils . removeSoftHyphens(yytext()), yytext());}
1399
/* Six rules for the HTHINGEXCEPTIONWHOLE, HTHINGEXCEPTIONPREFIXED and HTHINGEXCEPTIONSUFFIXED macros.
   Every rule does the same thing: the first getNext argument is the match with LexerUtils.removeSoftHyphens
   applied, the second is the raw match. Each macro has a plain rule and a second rule that also matches a
   following period, using {INSENTP} as trailing context (the JFlex "/" lookahead operator).
   The removed one-line versions (marked "-") did the same without locals or logging; the added versions
   bind the two values to tok and norm and log them under DEBUG. */
+ {HTHINGEXCEPTIONWHOLE} {
1400
+ String tok = yytext();
1401
+ String norm = LexerUtils . removeSoftHyphens(tok);
1402
+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONWHOLE} to recognize " + tok + " as " + norm); }
1403
+ return getNext(norm, tok);
1404
+ }
1405
/* Trailing-period variant; the "(2)" in the log message distinguishes it from the plain rule above. */
+ {HTHINGEXCEPTIONWHOLE} \. / {INSENTP} {
1406
+ String tok = yytext();
1407
+ String norm = LexerUtils . removeSoftHyphens(tok);
1408
+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONWHOLE} (2) to recognize " + tok + " as " + norm); }
1409
+ return getNext(norm, tok);
1410
+ }
1411
+ {HTHINGEXCEPTIONPREFIXED} {
1412
+ String tok = yytext();
1413
+ String norm = LexerUtils . removeSoftHyphens(tok);
1414
+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONPREFIXED} to recognize " + tok + " as " + norm); }
1415
+ return getNext(norm, tok);
1416
+ }
1417
/* Trailing-period variant of the PREFIXED rule. */
+ {HTHINGEXCEPTIONPREFIXED} \. / {INSENTP} {
1418
+ String tok = yytext();
1419
+ String norm = LexerUtils . removeSoftHyphens(tok);
1420
+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONPREFIXED} (2) to recognize " + tok + " as " + norm); }
1421
+ return getNext(norm, tok);
1422
+ }
1423
+ {HTHINGEXCEPTIONSUFFIXED} {
1424
+ String tok = yytext();
1425
+ String norm = LexerUtils . removeSoftHyphens(tok);
1426
+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONSUFFIXED} to recognize " + tok + " as " + norm); }
1427
+ return getNext(norm, tok);
1428
+ }
1429
/* Trailing-period variant of the SUFFIXED rule. */
+ {HTHINGEXCEPTIONSUFFIXED} \. / {INSENTP} {
1430
+ String tok = yytext();
1431
+ String norm = LexerUtils . removeSoftHyphens(tok);
1432
+ if (DEBUG ) { logger. info(" Used {HTHINGEXCEPTIONSUFFIXED} (2) to recognize " + tok + " as " + norm); }
1433
+ return getNext(norm, tok);
1434
+ }
1399
1435
{HTHING} { String tok = yytext();
1400
1436
breakByHyphensSlashes(tok);
1401
1437
tok = yytext();
@@ -1492,7 +1528,11 @@ RM/{NUM} { String txt = yytext();
1492
1528
return getNext(norm, tok);
1493
1529
}
1494
1530
1495
- {FAKEDUCKFEET} { return getNext(); }
1531
/* Rule for the FAKEDUCKFEET macro: the match is returned unchanged, with the same string passed for both
   getNext arguments. The removed line (marked "-") called the no-argument getNext(); the added version
   passes the text explicitly and logs the match under DEBUG. */
+ {FAKEDUCKFEET} {
1532
+ String tok = yytext();
1533
+ if (DEBUG ) { logger. info(" Used {FAKEDUCKFEET} to recognize " + tok); }
1534
+ return getNext(tok, tok);
1535
+ }
1496
1536
{MISCSYMBOL} {
1497
1537
String tok = yytext();
1498
1538
if (DEBUG ) { logger. info(" Used {MISCSYMBOL} to recognize " + tok); }
0 commit comments