@@ -1207,7 +1207,7 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
1207
1207
}
1208
1208
1209
1209
const static uint32_t ZeroWidthC[] = {
1210
- // Characters which don't appear to be visible (sic) follow.
1210
+ // "Zero-Width" characters which don't appear to be visible follow.
1211
1211
0x0001 , 0x0002 , 0x0003 , 0x0004 , 0x0005 , 0x0006 , 0x0007 , 0x0008 ,
1212
1212
0x000b , 0x000e , 0x000f , 0x0010 , 0x0011 , 0x0012 , 0x0013 , 0x0014 ,
1213
1213
0x0015 , 0x0016 , 0x0017 , 0x0018 , 0x0019 , 0x001a , 0x001b , 0x001c ,
@@ -1316,10 +1316,11 @@ const static uint32_t ZeroWidthC[] = {
1316
1316
0x1daa8 , 0x1daa9 , 0x1daaa , 0x1daab , 0x1daac , 0x1daad , 0x1daae , 0x1daaf ,
1317
1317
};
1318
1318
1319
- // / diagnoseZeroWidth - Check for and error zero-width characters in delimiters.
1319
+ // / diagnoseZeroWidthMatchAndAdvance - Report zero-width characters in delimiters.
1320
1320
// / A non-visible character in the middle of a delimiter can be used to extend
1321
1321
// / the literal beyond where it appears to end, creating potential security bugs.
1322
- static bool diagnoseZeroWidth (const char *&CurPtr, DiagnosticEngine *Diags) {
1322
+ static bool diagnoseZeroWidthMatchAndAdvance (char Target, const char *&CurPtr,
1323
+ DiagnosticEngine *Diags) {
1323
1324
// A way needs to be found to find the complete set of zero width chars or
1324
1325
// this security mitigation will be in vain. Current list is generated using
1325
1326
// the display width of attributed strings checking when it does not change.
@@ -1332,44 +1333,46 @@ static bool diagnoseZeroWidth(const char *&CurPtr, DiagnosticEngine *Diags) {
1332
1333
const char *TmpPtr = CurPtr;
1333
1334
while (true ) {
1334
1335
uint32_t NextChar = validateUTF8CharacterAndAdvance (TmpPtr, TmpPtr + 6 );
1335
- if (NextChar != ' " ' && NextChar != ' # ' &&
1336
+ if (NextChar != ( uint32_t )Target &&
1336
1337
(NextChar == ~0U || (NextChar >= 0xe0000 && NextChar <= 0xe0fff ) ||
1337
1338
std::binary_search (ZeroWidthV.begin (), ZeroWidthV.end (), NextChar))) {
1338
- if (Diags)
1339
+ if (Diags && *TmpPtr == Target )
1339
1340
Diags->diagnose (Lexer::getSourceLoc (CurPtr),
1340
1341
diag::lex_zerowidth_in_string_delimiter)
1341
1342
.fixItRemoveChars (Lexer::getSourceLoc (CurPtr),
1342
1343
Lexer::getSourceLoc (TmpPtr));
1343
1344
CurPtr = TmpPtr;
1344
1345
continue ;
1345
1346
}
1346
- return true ;
1347
+
1348
+ return *CurPtr == Target && CurPtr++;
1347
1349
}
1348
1350
}
1349
1351
1350
1352
/// advanceIfMultilineDelimiter - Centralized check for multiline delimiter.
///
/// Reading guide for this diff view: lines marked with a leading minus are the
/// removed implementation, lines marked with a leading plus are its
/// replacement, and unmarked lines are shared by both.
///
/// Replacement behaviour: the byte just before CurPtr must be a double-quote,
/// and diagnoseZeroWidthMatchAndAdvance must then match a double-quote twice
/// in a row starting at CurPtr. All scanning is done on the local TmpPtr, so
/// CurPtr is only moved (to TmpPtr) when the whole check succeeds and true is
/// returned; on a false return CurPtr is left where the caller put it.
///
/// Removed behaviour: scanning started at CurPtr - 1 and each double-quote was
/// compared directly, with diagnoseZeroWidth called between the comparisons.
///
/// Diags is not used here other than being forwarded to the helper.
/// NOTE(review): the helper receives Diags before the overall match is known,
/// so it may report something even when this function ends up returning
/// false - confirm that is acceptable to callers.
1351
1353
static bool advanceIfMultilineDelimiter (const char *&CurPtr,
1352
1354
DiagnosticEngine *Diags) {
1353
- const char *TmpPtr = CurPtr - 1 ;
1354
- if (*TmpPtr++ == ' "' && diagnoseZeroWidth (TmpPtr, Diags) &&
1355
- *TmpPtr++ == ' "' && diagnoseZeroWidth ( TmpPtr, Diags) &&
1356
- *TmpPtr++ == ' "' ) {
1355
+ const char *TmpPtr = CurPtr;
1356
+ if (*( TmpPtr - 1 ) == ' "' &&
1357
+ diagnoseZeroWidthMatchAndAdvance ( ' "' , TmpPtr, Diags) &&
1358
+ diagnoseZeroWidthMatchAndAdvance ( ' "' , TmpPtr, Diags) ) {
1357
1359
CurPtr = TmpPtr;
1358
1360
return true ;
1359
1361
}
1360
1362
return false ;
1361
1363
}
1362
1364
1363
- // / advanceIfCustomDelimiterLen - Extracts/detects any custom delimiter on
1364
- // / opening a string literal and advances CurPtr if a delimiter is found and
1365
+ // / advanceIfCustomDelimiter - Extracts/detects any custom delimiter on
1366
+ // / opening a string literal, advances CurPtr if a delimiter is found and
1365
1367
// / returns a non-zero delimiter length. CurPtr[-1] is generally '#' when called.
1366
- static unsigned advanceIfCustomDelimiterLen (const char *&CurPtr) {
1367
- const char *Lookahead = CurPtr;
1368
- while (*Lookahead == ' #' )
1369
- Lookahead++;
1370
- if (*Lookahead++ == ' "' ) {
1371
- unsigned CustomDelimiterLen = Lookahead - CurPtr;
1372
- CurPtr = Lookahead;
1368
+ static unsigned advanceIfCustomDelimiter (const char *&CurPtr,
1369
+ DiagnosticEngine *Diags) {
1370
+ const char *TmpPtr = CurPtr;
1371
+ unsigned CustomDelimiterLen = 1 ;
1372
+ while (diagnoseZeroWidthMatchAndAdvance (' #' , TmpPtr, Diags))
1373
+ CustomDelimiterLen++;
1374
+ if (diagnoseZeroWidthMatchAndAdvance (' "' , TmpPtr, Diags)) {
1375
+ CurPtr = TmpPtr;
1373
1376
return CustomDelimiterLen;
1374
1377
}
1375
1378
return 0 ;
@@ -1380,15 +1383,22 @@ static unsigned advanceIfCustomDelimiterLen(const char *&CurPtr) {
1380
1383
// / interpolation inside a "raw" string. Normal/cooked string processing is
1381
1384
// / the degenerate case of there being no # characters surrounding the quotes.
1382
1385
// / If delimiter matches, advances byte pointer passed in and returns true.
1386
+ // / Also used to detect the final delimiter of a string when IsClosing == true.
1383
1387
/// Reading guide for this diff view: lines marked with a leading minus are the
/// removed implementation, lines marked with a leading plus are its
/// replacement, and unmarked lines are shared by both.
///
/// Parameters, as the code below uses them:
///   CustomDelimiterLen - how many hash characters must be matched. Zero
///       returns true immediately without touching BytesPtr.
///   BytesPtr - scan position; assigned the advanced local pointer only after
///       every required hash has been matched.
///   Diags - forwarded to the zero-width helper; tested for null before the
///       surplus-hash diagnostic is emitted.
///   IsClosing - selects lex_invalid_closing_delimiter instead of
///       lex_invalid_escape_delimiter for the surplus-hash diagnostic.
///
/// Returns false as soon as one required hash fails to match (BytesPtr is
/// then unchanged), true otherwise.
///
/// Replacement-only addition: after a successful match, if the next byte is
/// one more hash, a diagnostic is emitted with a fix-it removing that single
/// byte. The function still returns true in that case.
///
/// NOTE(review): the replacement loop counts down the by-value
/// CustomDelimiterLen, so it no longer holds the delimiter length once the
/// loop ends; nothing after the loop reads it.
static bool delimiterMatches (unsigned CustomDelimiterLen, const char *&BytesPtr,
1384
- DiagnosticEngine *Diags) {
1388
+ DiagnosticEngine *Diags, bool IsClosing = false ) {
1385
1389
if (!CustomDelimiterLen)
1386
1390
return true ;
1387
1391
const char *TmpPtr = BytesPtr;
1388
- for ( unsigned i = 0 ; i < CustomDelimiterLen; i++ )
1389
- if (diagnoseZeroWidth ( TmpPtr, Diags) && *TmpPtr++ != ' # ' )
1392
+ while ( CustomDelimiterLen-- )
1393
+ if (! diagnoseZeroWidthMatchAndAdvance ( ' # ' , TmpPtr, Diags))
1390
1394
return false ;
1391
1395
BytesPtr = TmpPtr;
1396
+ if (*BytesPtr == ' #' && Diags)
1397
+ Diags->diagnose (Lexer::getSourceLoc (BytesPtr), IsClosing ?
1398
+ diag::lex_invalid_closing_delimiter :
1399
+ diag::lex_invalid_escape_delimiter)
1400
+ .fixItRemoveChars (Lexer::getSourceLoc (BytesPtr),
1401
+ Lexer::getSourceLoc (BytesPtr + 1 ));
1392
1402
return true ;
1393
1403
}
1394
1404
@@ -1459,18 +1469,8 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
1459
1469
unsigned CharValue = 0 ;
1460
1470
// Escape processing. We already ate the "\".
1461
1471
switch (*CurPtr) {
1462
- case ' ' : case ' \t ' : case ' \n ' : case ' \r ' : case ' #' :
1463
- if (*CurPtr == ' #' ) {
1464
- if (CustomDelimiterLen) {
1465
- if (EmitDiagnostics)
1466
- diagnose (CurPtr, diag::lex_invalid_delimiter_escape)
1467
- .fixItRemoveChars (Lexer::getSourceLoc (CurPtr),
1468
- Lexer::getSourceLoc (CurPtr + 1 ));
1469
- CurPtr++;
1470
- return ~1U ;
1471
- }
1472
- }
1473
- else if (IsMultilineString && maybeConsumeNewlineEscape (CurPtr, 0 ))
1472
+ case ' ' : case ' \t ' : case ' \n ' : case ' \r ' :
1473
+ if (IsMultilineString && maybeConsumeNewlineEscape (CurPtr, 0 ))
1474
1474
return ' \n ' ;
1475
1475
LLVM_FALLTHROUGH;
1476
1476
default : // Invalid escape.
@@ -1560,16 +1560,15 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
1560
1560
1561
1561
case ' #' :
1562
1562
if (inStringLiteral () ||
1563
- !(CustomDelimiterLen = advanceIfCustomDelimiterLen (CurPtr)))
1563
+ !(CustomDelimiterLen = advanceIfCustomDelimiter (CurPtr, Diags )))
1564
1564
continue ;
1565
1565
LLVM_FALLTHROUGH;
1566
1566
1567
1567
case ' "' :
1568
1568
case ' \' ' : {
1569
1569
if (!AllowNewline.back () && inStringLiteral ()) {
1570
- unsigned InnerDelimiter = CustomDelimiter.back ();
1571
1570
if (OpenDelimiters.back () == CurPtr[-1 ] &&
1572
- delimiterMatches (InnerDelimiter , CurPtr, Diags)) {
1571
+ delimiterMatches (CustomDelimiter. back () , CurPtr, Diags, true )) {
1573
1572
// Closing single line string literal.
1574
1573
OpenDelimiters.pop_back ();
1575
1574
AllowNewline.pop_back ();
@@ -1592,7 +1591,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
1592
1591
// We are in multiline string literal.
1593
1592
assert (AllowNewline.back () && " other cases must be handled above" );
1594
1593
if (isMultilineQuote &&
1595
- delimiterMatches (CustomDelimiter.back (), CurPtr, Diags)) {
1594
+ delimiterMatches (CustomDelimiter.back (), CurPtr, Diags, true )) {
1596
1595
// Close multiline string literal.
1597
1596
OpenDelimiters.pop_back ();
1598
1597
AllowNewline.pop_back ();
@@ -1868,10 +1867,10 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
1868
1867
// NOTE: We only allow single-quote string literals so we can emit useful
1869
1868
// diagnostics about changing them to double quotes.
1870
1869
1871
- bool wasErroneous = false , MultilineString = false ;
1870
+ bool wasErroneous = false , IsMultilineString = false ;
1872
1871
1873
1872
// Is this the start of a multiline string literal?
1874
- if ((MultilineString = advanceIfMultilineDelimiter (CurPtr, Diags))) {
1873
+ if ((IsMultilineString = advanceIfMultilineDelimiter (CurPtr, Diags))) {
1875
1874
if (*CurPtr != ' \n ' && *CurPtr != ' \r ' )
1876
1875
diagnose (CurPtr, diag::lex_illegal_multiline_string_start)
1877
1876
.fixItInsert (Lexer::getSourceLoc (CurPtr), " \n " );
@@ -1885,7 +1884,7 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
1885
1884
CurPtr = TmpPtr + 1 ;
1886
1885
const char *EndPtr =
1887
1886
skipToEndOfInterpolatedExpression (CurPtr, BufferEnd,
1888
- Diags, MultilineString );
1887
+ Diags, IsMultilineString );
1889
1888
1890
1889
if (*EndPtr == ' )' ) {
1891
1890
// Successfully scanned the body of the expression literal.
@@ -1898,15 +1897,15 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
1898
1897
}
1899
1898
1900
1899
// String literals cannot have \n or \r in them (unless multiline).
1901
- if (((*CurPtr == ' \r ' || *CurPtr == ' \n ' ) && !MultilineString )
1900
+ if (((*CurPtr == ' \r ' || *CurPtr == ' \n ' ) && !IsMultilineString )
1902
1901
|| CurPtr == BufferEnd) {
1903
1902
TokStart -= CustomDelimiterLen;
1904
1903
diagnose (TokStart, diag::lex_unterminated_string);
1905
1904
return formToken (tok::unknown, TokStart);
1906
1905
}
1907
1906
1908
1907
unsigned CharValue = lexCharacter (CurPtr, *TokStart, true ,
1909
- MultilineString , CustomDelimiterLen);
1908
+ IsMultilineString , CustomDelimiterLen);
1910
1909
wasErroneous |= CharValue == ~1U ;
1911
1910
1912
1911
// If this is the end of string, we are done. If it is a normal character
@@ -1949,15 +1948,15 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
1949
1948
}
1950
1949
1951
1950
// Is this the end of multiline/custom-delimited string literal?
1952
- if ((!MultilineString || advanceIfMultilineDelimiter (CurPtr, Diags)) &&
1953
- delimiterMatches (CustomDelimiterLen, CurPtr, Diags)) {
1951
+ if ((!IsMultilineString || advanceIfMultilineDelimiter (CurPtr, Diags)) &&
1952
+ delimiterMatches (CustomDelimiterLen, CurPtr, Diags, true )) {
1954
1953
TokStart -= CustomDelimiterLen;
1955
1954
if (wasErroneous)
1956
1955
return formToken (tok::unknown, TokStart);
1957
1956
1958
1957
formToken (tok::string_literal, TokStart,
1959
- MultilineString , CustomDelimiterLen);
1960
- if (MultilineString && Diags)
1958
+ IsMultilineString , CustomDelimiterLen);
1959
+ if (IsMultilineString && Diags)
1961
1960
validateMultilineIndents (NextToken, Diags);
1962
1961
return ;
1963
1962
}
@@ -2493,7 +2492,7 @@ void Lexer::lexImpl() {
2493
2492
case ' \\ ' : return formToken (tok::backslash, TokStart);
2494
2493
2495
2494
case ' #' :
2496
- if (unsigned CustomDelimiterLen = advanceIfCustomDelimiterLen (CurPtr))
2495
+ if (unsigned CustomDelimiterLen = advanceIfCustomDelimiter (CurPtr, Diags ))
2497
2496
return lexStringLiteral (CustomDelimiterLen);
2498
2497
return lexHash ();
2499
2498
0 commit comments