Skip to content

Commit afb1ea8

Browse files
committed
Better French phone numbers and W-L-D scores
1 parent 4b129c0 commit afb1ea8

File tree

3 files changed

+47304
-46839
lines changed

3 files changed

+47304
-46839
lines changed

src/edu/stanford/nlp/process/PTBLexer.flex

+54-14
Original file line numberDiff line numberDiff line change
@@ -769,7 +769,8 @@ HYPHEN = [-\u058A\u2010\u2011\u2012]
769769
HYPHENS = {HYPHEN}+
770770
SSN = [0-9]{3}{HYPHEN}[0-9]{2}{HYPHEN}[0-9]{4}
771771
/* Phone numbers. Keep the multi-dot pattern separate so that it is not confused with decimal numbers. Also match bare local numbers such as 346-8792 for the new treebank tokenization; the 1st digit can't be 0 or 1 in NANP. */
772-
PHONE = (\([0-9]{2,3}\)[ \u00A0\u2007]?|(\+\+?)?([0-9]{1,4}[\- \u00A0\u2007\u2012])?[0-9]{2,4}[\- \u00A0\u2007\u2012/])[0-9]{3,4}[\- \u00A0\u2007\u2012]?[0-9]{3,5}|((\+\+?)?[0-9]{1,4}\.)?[0-9]{2,4}\.[0-9]{3,4}\.[0-9]{3,5}|[2-9][0-9]{2}[-\u2012][0-9]{4}
772+
/* 2022: Also allow a hyphen between the area code and the number; allow French numbers like 47-42-17-11. */
773+
PHONE = (\([0-9]{2,3}\)[- \u00A0\u2007]?|(\+\+?)?([0-9]{1,4}[- \u00A0\u2007\u2012])?[0-9]{2,4}[- \u00A0\u2007\u2012/])[0-9]{3,4}[- \u00A0\u2007\u2012]?[0-9]{3,5}|((\+\+?)?[0-9]{1,4}\.)?[0-9]{2,4}\.[0-9]{2,4}\.[0-9]{2,5}|((\+\+?)?[0-9]{1,4}-)?[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,5}|[2-9][0-9]{2}[-\u2012][0-9]{4}
773774
/* Fake duck feet (<< and >>) sometimes appear in WSJ and are unlikely to be SGML, less-than signs, etc., so group each pair into one token. */
774775
FAKEDUCKFEET = <<|>>
775776
LESSTHAN = <|&lt;
@@ -1356,8 +1357,8 @@ RM/{NUM} { String txt = yytext();
13561357
if (escapeForwardSlashAsterisk) {
13571358
String normTok = LexerUtils.escapeChar(yytext(), '*');
13581359
if (DEBUG) { logger.info("Used {ASTS} to recognize " + txt + " as " + normTok); }
1359-
return getNext(normTok, yytext()); }
1360-
else {
1360+
return getNext(normTok, txt);
1361+
} else {
13611362
if (DEBUG) { logger.info("Used {ASTS} to recognize " + txt); }
13621363
return getNext(txt, txt);
13631364
}
@@ -1382,20 +1383,55 @@ RM/{NUM} { String txt = yytext();
13821383
if (DEBUG) { logger.info("Used {=} to recognize " + txt); }
13831384
return getNext(txt, txt);
13841385
}
1385-
\/ { if (escapeForwardSlashAsterisk) {
1386-
return getNext(LexerUtils.escapeChar(yytext(), '/'), yytext()); }
1387-
else {
1388-
return getNext();
1386+
\/ {
1387+
String txt = yytext();
1388+
if (escapeForwardSlashAsterisk) {
1389+
String normTok = LexerUtils.escapeChar(yytext(), '/');
1390+
if (DEBUG) { logger.info("Used {/} to recognize " + txt + " as " + normTok); }
1391+
return getNext(normTok, txt);
1392+
} else {
1393+
if (DEBUG) { logger.info("Used {/} to recognize " + txt); }
1394+
return getNext(txt, txt);
13891395
}
13901396
}
13911397
/* {HTHING}/[^\p{Alpha}\p{Digit}.+] { return getNext(LexerUtils.removeSoftHyphens(yytext()),
13921398
yytext()); } */
1393-
{HTHINGEXCEPTIONWHOLE} {return getNext(LexerUtils.removeSoftHyphens(yytext()), yytext());}
1394-
{HTHINGEXCEPTIONWHOLE}\./{INSENTP} {return getNext(LexerUtils.removeSoftHyphens(yytext()), yytext());}
1395-
{HTHINGEXCEPTIONPREFIXED} {return getNext(LexerUtils.removeSoftHyphens(yytext()), yytext());}
1396-
{HTHINGEXCEPTIONPREFIXED}\./{INSENTP} {return getNext(LexerUtils.removeSoftHyphens(yytext()), yytext());}
1397-
{HTHINGEXCEPTIONSUFFIXED} {return getNext(LexerUtils.removeSoftHyphens(yytext()), yytext());}
1398-
{HTHINGEXCEPTIONSUFFIXED}\./{INSENTP} {return getNext(LexerUtils.removeSoftHyphens(yytext()), yytext());}
1399+
{HTHINGEXCEPTIONWHOLE} {
1400+
String tok = yytext();
1401+
String norm = LexerUtils.removeSoftHyphens(tok);
1402+
if (DEBUG) { logger.info("Used {HTHINGEXCEPTIONWHOLE} to recognize " + tok + " as " + norm); }
1403+
return getNext(norm, tok);
1404+
}
1405+
{HTHINGEXCEPTIONWHOLE}\./{INSENTP} {
1406+
String tok = yytext();
1407+
String norm = LexerUtils.removeSoftHyphens(tok);
1408+
if (DEBUG) { logger.info("Used {HTHINGEXCEPTIONWHOLE} (2) to recognize " + tok + " as " + norm); }
1409+
return getNext(norm, tok);
1410+
}
1411+
{HTHINGEXCEPTIONPREFIXED} {
1412+
String tok = yytext();
1413+
String norm = LexerUtils.removeSoftHyphens(tok);
1414+
if (DEBUG) { logger.info("Used {HTHINGEXCEPTIONPREFIXED} to recognize " + tok + " as " + norm); }
1415+
return getNext(norm, tok);
1416+
}
1417+
{HTHINGEXCEPTIONPREFIXED}\./{INSENTP} {
1418+
String tok = yytext();
1419+
String norm = LexerUtils.removeSoftHyphens(tok);
1420+
if (DEBUG) { logger.info("Used {HTHINGEXCEPTIONPREFIXED} (2) to recognize " + tok + " as " + norm); }
1421+
return getNext(norm, tok);
1422+
}
1423+
{HTHINGEXCEPTIONSUFFIXED} {
1424+
String tok = yytext();
1425+
String norm = LexerUtils.removeSoftHyphens(tok);
1426+
if (DEBUG) { logger.info("Used {HTHINGEXCEPTIONSUFFIXED} to recognize " + tok + " as " + norm); }
1427+
return getNext(norm, tok);
1428+
}
1429+
{HTHINGEXCEPTIONSUFFIXED}\./{INSENTP} {
1430+
String tok = yytext();
1431+
String norm = LexerUtils.removeSoftHyphens(tok);
1432+
if (DEBUG) { logger.info("Used {HTHINGEXCEPTIONSUFFIXED} (2) to recognize " + tok + " as " + norm); }
1433+
return getNext(norm, tok);
1434+
}
13991435
{HTHING} { String tok = yytext();
14001436
breakByHyphensSlashes(tok);
14011437
tok = yytext();
@@ -1492,7 +1528,11 @@ RM/{NUM} { String txt = yytext();
14921528
return getNext(norm, tok);
14931529
}
14941530

1495-
{FAKEDUCKFEET} { return getNext(); }
1531+
{FAKEDUCKFEET} {
1532+
String tok = yytext();
1533+
if (DEBUG) { logger.info("Used {FAKEDUCKFEET} to recognize " + tok); }
1534+
return getNext(tok, tok);
1535+
}
14961536
{MISCSYMBOL} {
14971537
String tok = yytext();
14981538
if (DEBUG) { logger.info("Used {MISCSYMBOL} to recognize " + tok); }

0 commit comments

Comments
 (0)