Skip to content

Commit 999bb40

Browse files
committed
New diagnostic for closing delimiter
1 parent 02f7cd5 commit 999bb40

File tree

3 files changed

+51
-50
lines changed

3 files changed

+51
-50
lines changed

include/swift/AST/DiagnosticsParse.def

+3-1
Original file line numberDiff line numberDiff line change
@@ -138,8 +138,10 @@ ERROR(lex_invalid_u_escape,none,
138138
"\\u{...} escape sequence expects between 1 and 8 hex digits", ())
139139
ERROR(lex_invalid_u_escape_rbrace,none,
140140
"expected '}' in \\u{...} escape sequence", ())
141-
ERROR(lex_invalid_delimiter_escape,none,
141+
ERROR(lex_invalid_escape_delimiter,none,
142142
"too many '#' characters in delimited escape", ())
143+
ERROR(lex_invalid_closing_delimiter,none,
144+
"too many '#' characters in closing delimiter", ())
143145
ERROR(lex_zerowidth_in_string_delimiter,none,
144146
"zero-width character detected in string delimiter", ())
145147

include/swift/Parse/Token.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ class Token {
277277
this->MultilineString = IsMultilineString;
278278
this->CustomDelimiterLen = CustomDelimiterLen;
279279
assert(this->CustomDelimiterLen == CustomDelimiterLen &&
280-
"string custom delimiter too long");
280+
"custom string delimiter length > 255");
281281
}
282282

283283
bool isMultilineString() const {

lib/Parse/Lexer.cpp

+47-48
Original file line numberDiff line numberDiff line change
@@ -1207,7 +1207,7 @@ static bool maybeConsumeNewlineEscape(const char *&CurPtr, ssize_t Offset) {
12071207
}
12081208

12091209
const static uint32_t ZeroWidthC[] = {
1210-
// Characters which don't appear to be visible (sic) follow.
1210+
// "Zero-Width" characters which don't appear to be visible follow.
12111211
0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
12121212
0x000b, 0x000e, 0x000f, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014,
12131213
0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c,
@@ -1316,10 +1316,11 @@ const static uint32_t ZeroWidthC[] = {
13161316
0x1daa8, 0x1daa9, 0x1daaa, 0x1daab, 0x1daac, 0x1daad, 0x1daae, 0x1daaf,
13171317
};
13181318

1319-
/// diagnoseZeroWidth - Check for and error zero-width characters in delimiters.
1319+
/// diagnoseZeroWidthMatchAndAdvance - Error zerowidth characters in delimiters.
13201320
/// A non-visible character in the middle of a delimiter can be used to extend
13211321
/// the literal beyond where it appears to end, creating potential security bugs.
1322-
static bool diagnoseZeroWidth(const char *&CurPtr, DiagnosticEngine *Diags) {
1322+
static bool diagnoseZeroWidthMatchAndAdvance(char Target, const char *&CurPtr,
1323+
DiagnosticEngine *Diags) {
13231324
// A way needs to be found to find the complete set of zero width chars or
13241325
// this security mitigation will be in vain. Current list is generated using
13251326
// the display width of attributed strings checking when it does not change.
@@ -1332,44 +1333,46 @@ static bool diagnoseZeroWidth(const char *&CurPtr, DiagnosticEngine *Diags) {
13321333
const char *TmpPtr = CurPtr;
13331334
while (true) {
13341335
uint32_t NextChar = validateUTF8CharacterAndAdvance(TmpPtr, TmpPtr + 6);
1335-
if (NextChar != '"' && NextChar != '#' &&
1336+
if (NextChar != (uint32_t)Target &&
13361337
(NextChar == ~0U || (NextChar >= 0xe0000 && NextChar <= 0xe0fff) ||
13371338
std::binary_search(ZeroWidthV.begin(), ZeroWidthV.end(), NextChar))) {
1338-
if (Diags)
1339+
if (Diags && *TmpPtr == Target)
13391340
Diags->diagnose(Lexer::getSourceLoc(CurPtr),
13401341
diag::lex_zerowidth_in_string_delimiter)
13411342
.fixItRemoveChars(Lexer::getSourceLoc(CurPtr),
13421343
Lexer::getSourceLoc(TmpPtr));
13431344
CurPtr = TmpPtr;
13441345
continue;
13451346
}
1346-
return true;
1347+
1348+
return *CurPtr == Target && CurPtr++;
13471349
}
13481350
}
13491351

13501352
/// advanceIfMultilineDelimiter - Centralized check for multiline delimiter.
13511353
static bool advanceIfMultilineDelimiter(const char *&CurPtr,
13521354
DiagnosticEngine *Diags) {
1353-
const char *TmpPtr = CurPtr - 1;
1354-
if (*TmpPtr++ == '"' && diagnoseZeroWidth(TmpPtr, Diags) &&
1355-
*TmpPtr++ == '"' && diagnoseZeroWidth(TmpPtr, Diags) &&
1356-
*TmpPtr++ == '"') {
1355+
const char *TmpPtr = CurPtr;
1356+
if (*(TmpPtr - 1) == '"' &&
1357+
diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags) &&
1358+
diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags)) {
13571359
CurPtr = TmpPtr;
13581360
return true;
13591361
}
13601362
return false;
13611363
}
13621364

1363-
/// advanceIfCustomDelimiterLen - Extracts/detects any custom delimiter on
1364-
/// opening a string literal and advances CurPtr if a delimiter is found and
1365+
/// advanceIfCustomDelimiter - Extracts/detects any custom delimiter on
1366+
/// opening a string literal, advances CurPtr if a delimiter is found and
13651367
/// returns a non-zero delimiter length. CurPtr[-1] is generally '#' when called.
1366-
static unsigned advanceIfCustomDelimiterLen(const char *&CurPtr) {
1367-
const char *Lookahead = CurPtr;
1368-
while (*Lookahead == '#')
1369-
Lookahead++;
1370-
if (*Lookahead++ == '"') {
1371-
unsigned CustomDelimiterLen = Lookahead - CurPtr;
1372-
CurPtr = Lookahead;
1368+
static unsigned advanceIfCustomDelimiter(const char *&CurPtr,
1369+
DiagnosticEngine *Diags) {
1370+
const char *TmpPtr = CurPtr;
1371+
unsigned CustomDelimiterLen = 1;
1372+
while (diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags))
1373+
CustomDelimiterLen++;
1374+
if (diagnoseZeroWidthMatchAndAdvance('"', TmpPtr, Diags)) {
1375+
CurPtr = TmpPtr;
13731376
return CustomDelimiterLen;
13741377
}
13751378
return 0;
@@ -1380,15 +1383,22 @@ static unsigned advanceIfCustomDelimiterLen(const char *&CurPtr) {
13801383
/// interpolation inside a "raw" string. Normal/cooked string processing is
13811384
/// the degenerate case of there being no # characters surrounding the quotes.
13821385
/// If delimiter matches, advances byte pointer passed in and returns true.
1386+
/// Also used to detect the final delimiter of a string when IsClosing == true.
13831387
static bool delimiterMatches(unsigned CustomDelimiterLen, const char *&BytesPtr,
1384-
DiagnosticEngine *Diags) {
1388+
DiagnosticEngine *Diags, bool IsClosing = false) {
13851389
if (!CustomDelimiterLen)
13861390
return true;
13871391
const char *TmpPtr = BytesPtr;
1388-
for (unsigned i = 0; i < CustomDelimiterLen; i++)
1389-
if (diagnoseZeroWidth(TmpPtr, Diags) && *TmpPtr++ != '#')
1392+
while (CustomDelimiterLen--)
1393+
if (!diagnoseZeroWidthMatchAndAdvance('#', TmpPtr, Diags))
13901394
return false;
13911395
BytesPtr = TmpPtr;
1396+
if (*BytesPtr == '#' && Diags)
1397+
Diags->diagnose(Lexer::getSourceLoc(BytesPtr), IsClosing ?
1398+
diag::lex_invalid_closing_delimiter :
1399+
diag::lex_invalid_escape_delimiter)
1400+
.fixItRemoveChars(Lexer::getSourceLoc(BytesPtr),
1401+
Lexer::getSourceLoc(BytesPtr + 1));
13921402
return true;
13931403
}
13941404

@@ -1459,18 +1469,8 @@ unsigned Lexer::lexCharacter(const char *&CurPtr, char StopQuote,
14591469
unsigned CharValue = 0;
14601470
// Escape processing. We already ate the "\".
14611471
switch (*CurPtr) {
1462-
case ' ': case '\t': case '\n': case '\r': case '#':
1463-
if (*CurPtr == '#') {
1464-
if (CustomDelimiterLen) {
1465-
if (EmitDiagnostics)
1466-
diagnose(CurPtr, diag::lex_invalid_delimiter_escape)
1467-
.fixItRemoveChars(Lexer::getSourceLoc(CurPtr),
1468-
Lexer::getSourceLoc(CurPtr + 1));
1469-
CurPtr++;
1470-
return ~1U;
1471-
}
1472-
}
1473-
else if (IsMultilineString && maybeConsumeNewlineEscape(CurPtr, 0))
1472+
case ' ': case '\t': case '\n': case '\r':
1473+
if (IsMultilineString && maybeConsumeNewlineEscape(CurPtr, 0))
14741474
return '\n';
14751475
LLVM_FALLTHROUGH;
14761476
default: // Invalid escape.
@@ -1560,16 +1560,15 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
15601560

15611561
case '#':
15621562
if (inStringLiteral() ||
1563-
!(CustomDelimiterLen = advanceIfCustomDelimiterLen(CurPtr)))
1563+
!(CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags)))
15641564
continue;
15651565
LLVM_FALLTHROUGH;
15661566

15671567
case '"':
15681568
case '\'': {
15691569
if (!AllowNewline.back() && inStringLiteral()) {
1570-
unsigned InnerDelimiter = CustomDelimiter.back();
15711570
if (OpenDelimiters.back() == CurPtr[-1] &&
1572-
delimiterMatches(InnerDelimiter, CurPtr, Diags)) {
1571+
delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) {
15731572
// Closing single line string literal.
15741573
OpenDelimiters.pop_back();
15751574
AllowNewline.pop_back();
@@ -1592,7 +1591,7 @@ static const char *skipToEndOfInterpolatedExpression(const char *CurPtr,
15921591
// We are in multiline string literal.
15931592
assert(AllowNewline.back() && "other cases must be handled above");
15941593
if (isMultilineQuote &&
1595-
delimiterMatches(CustomDelimiter.back(), CurPtr, Diags)) {
1594+
delimiterMatches(CustomDelimiter.back(), CurPtr, Diags, true)) {
15961595
// Close multiline string literal.
15971596
OpenDelimiters.pop_back();
15981597
AllowNewline.pop_back();
@@ -1868,10 +1867,10 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
18681867
// NOTE: We only allow single-quote string literals so we can emit useful
18691868
// diagnostics about changing them to double quotes.
18701869

1871-
bool wasErroneous = false, MultilineString = false;
1870+
bool wasErroneous = false, IsMultilineString = false;
18721871

18731872
// Is this the start of a multiline string literal?
1874-
if ((MultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) {
1873+
if ((IsMultilineString = advanceIfMultilineDelimiter(CurPtr, Diags))) {
18751874
if (*CurPtr != '\n' && *CurPtr != '\r')
18761875
diagnose(CurPtr, diag::lex_illegal_multiline_string_start)
18771876
.fixItInsert(Lexer::getSourceLoc(CurPtr), "\n");
@@ -1885,7 +1884,7 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
18851884
CurPtr = TmpPtr + 1;
18861885
const char *EndPtr =
18871886
skipToEndOfInterpolatedExpression(CurPtr, BufferEnd,
1888-
Diags, MultilineString);
1887+
Diags, IsMultilineString);
18891888

18901889
if (*EndPtr == ')') {
18911890
// Successfully scanned the body of the expression literal.
@@ -1898,15 +1897,15 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
18981897
}
18991898

19001899
// String literals cannot have \n or \r in them (unless multiline).
1901-
if (((*CurPtr == '\r' || *CurPtr == '\n') && !MultilineString)
1900+
if (((*CurPtr == '\r' || *CurPtr == '\n') && !IsMultilineString)
19021901
|| CurPtr == BufferEnd) {
19031902
TokStart -= CustomDelimiterLen;
19041903
diagnose(TokStart, diag::lex_unterminated_string);
19051904
return formToken(tok::unknown, TokStart);
19061905
}
19071906

19081907
unsigned CharValue = lexCharacter(CurPtr, *TokStart, true,
1909-
MultilineString, CustomDelimiterLen);
1908+
IsMultilineString, CustomDelimiterLen);
19101909
wasErroneous |= CharValue == ~1U;
19111910

19121911
// If this is the end of string, we are done. If it is a normal character
@@ -1949,15 +1948,15 @@ void Lexer::lexStringLiteral(unsigned CustomDelimiterLen) {
19491948
}
19501949

19511950
// Is this the end of multiline/custom-delimited string literal?
1952-
if ((!MultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) &&
1953-
delimiterMatches(CustomDelimiterLen, CurPtr, Diags)) {
1951+
if ((!IsMultilineString || advanceIfMultilineDelimiter(CurPtr, Diags)) &&
1952+
delimiterMatches(CustomDelimiterLen, CurPtr, Diags, true)) {
19541953
TokStart -= CustomDelimiterLen;
19551954
if (wasErroneous)
19561955
return formToken(tok::unknown, TokStart);
19571956

19581957
formToken(tok::string_literal, TokStart,
1959-
MultilineString, CustomDelimiterLen);
1960-
if (MultilineString && Diags)
1958+
IsMultilineString, CustomDelimiterLen);
1959+
if (IsMultilineString && Diags)
19611960
validateMultilineIndents(NextToken, Diags);
19621961
return;
19631962
}
@@ -2493,7 +2492,7 @@ void Lexer::lexImpl() {
24932492
case '\\': return formToken(tok::backslash, TokStart);
24942493

24952494
case '#':
2496-
if (unsigned CustomDelimiterLen = advanceIfCustomDelimiterLen(CurPtr))
2495+
if (unsigned CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags))
24972496
return lexStringLiteral(CustomDelimiterLen);
24982497
return lexHash();
24992498

0 commit comments

Comments
 (0)