Skip to content

Commit 441d9c9

Browse files
tlively authored and radekdoulik committed
[Parser] Support string-style identifiers (WebAssembly#6278)
In addition to normal identifiers, support parsing identifiers of the format `$"..."`. This format is not yet allowed by the standard, but it is a popular proposed extension (see WebAssembly/spec#617 and WebAssembly/annotations#21). Binaryen has historically allowed a similar format and has supported arbitrary non-standard identifier characters, so it's much easier to support this extended syntax than to fix everything to use the restricted standard syntax.
1 parent f1875b2 commit 441d9c9

File tree

4 files changed

+103
-32
lines changed

4 files changed

+103
-32
lines changed

src/parser/lexer.cpp

+60-21
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,25 @@ struct LexStrCtx : LexCtx {
329329
}
330330
};
331331

332+
struct LexIdResult : LexResult {
333+
bool isStr = false;
334+
std::optional<std::string> str;
335+
};
336+
337+
struct LexIdCtx : LexCtx {
338+
bool isStr = false;
339+
std::optional<std::string> str;
340+
341+
LexIdCtx(std::string_view in) : LexCtx(in) {}
342+
343+
std::optional<LexIdResult> lexed() {
344+
if (auto basic = LexCtx::lexed()) {
345+
return LexIdResult{*basic, isStr, str};
346+
}
347+
return {};
348+
}
349+
};
350+
332351
std::optional<LexResult> lparen(std::string_view in) {
333352
LexCtx ctx(in);
334353
ctx.takePrefix("("sv);
@@ -647,26 +666,6 @@ std::optional<LexResult> idchar(std::string_view in) {
647666
return ctx.lexed();
648667
}
649668

650-
// id ::= '$' idchar+
651-
std::optional<LexResult> ident(std::string_view in) {
652-
LexCtx ctx(in);
653-
if (!ctx.takePrefix("$"sv)) {
654-
return {};
655-
}
656-
if (auto lexed = idchar(ctx.next())) {
657-
ctx.take(*lexed);
658-
} else {
659-
return {};
660-
}
661-
while (auto lexed = idchar(ctx.next())) {
662-
ctx.take(*lexed);
663-
}
664-
if (ctx.canFinish()) {
665-
return ctx.lexed();
666-
}
667-
return {};
668-
}
669-
670669
// string ::= '"' (b*:stringelem)* '"' => concat((b*)*)
671670
// (if |concat((b*)*)| < 2^32)
672671
// stringelem ::= c:stringchar => utf8(c)
@@ -741,6 +740,30 @@ std::optional<LexStrResult> str(std::string_view in) {
741740
return ctx.lexed();
742741
}
743742

743+
// id ::= '$' idchar+ | '$' str
744+
std::optional<LexIdResult> ident(std::string_view in) {
745+
LexIdCtx ctx(in);
746+
if (!ctx.takePrefix("$"sv)) {
747+
return {};
748+
}
749+
if (auto s = str(ctx.next())) {
750+
ctx.isStr = true;
751+
ctx.str = s->str;
752+
ctx.take(*s);
753+
} else if (auto lexed = idchar(ctx.next())) {
754+
ctx.take(*lexed);
755+
while (auto lexed = idchar(ctx.next())) {
756+
ctx.take(*lexed);
757+
}
758+
} else {
759+
return {};
760+
}
761+
if (ctx.canFinish()) {
762+
return ctx.lexed();
763+
}
764+
return {};
765+
}
766+
744767
// keyword ::= ( 'a' | ... | 'z' ) idchar* (if literal terminal in grammar)
745768
// reserved ::= idchar+
746769
//
@@ -889,11 +912,27 @@ std::optional<std::string_view> Token::getString() const {
889912
if (tok->str) {
890913
return std::string_view(*tok->str);
891914
}
915+
// Remove quotes.
892916
return span.substr(1, span.size() - 2);
893917
}
894918
return {};
895919
}
896920

921+
std::optional<std::string_view> Token::getID() const {
922+
if (auto* tok = std::get_if<IdTok>(&data)) {
923+
if (tok->str) {
924+
return std::string_view(*tok->str);
925+
}
926+
if (tok->isStr) {
927+
// Remove '$' and quotes.
928+
return span.substr(2, span.size() - 3);
929+
}
930+
// Remove '$'.
931+
return span.substr(1);
932+
}
933+
return {};
934+
}
935+
897936
void Lexer::skipSpace() {
898937
if (auto ctx = space(next())) {
899938
index += ctx->span.size();
@@ -908,7 +947,7 @@ void Lexer::lexToken() {
908947
} else if (auto t = rparen(next())) {
909948
tok = Token{t->span, RParenTok{}};
910949
} else if (auto t = ident(next())) {
911-
tok = Token{t->span, IdTok{}};
950+
tok = Token{t->span, IdTok{t->isStr, t->str}};
912951
} else if (auto t = integer(next())) {
913952
tok = Token{t->span, IntTok{t->n, t->sign}};
914953
} else if (auto t = float_(next())) {

src/parser/lexer.h

+8-8
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,12 @@ struct RParenTok {
5353
};
5454

5555
struct IdTok {
56+
// Whether this ID has `$"..."` format
57+
bool isStr;
58+
59+
// If the ID is a string ID and contains escapes, this is its contents.
60+
std::optional<std::string> str;
61+
5662
bool operator==(const IdTok&) const { return true; }
5763
friend std::ostream& operator<<(std::ostream&, const IdTok&);
5864
};
@@ -81,6 +87,7 @@ struct FloatTok {
8187
};
8288

8389
struct StringTok {
90+
// If the string contains escapes, this is its contents.
8491
std::optional<std::string> str;
8592

8693
bool operator==(const StringTok& other) const { return str == other.str; }
@@ -111,14 +118,6 @@ struct Token {
111118

112119
bool isRParen() const { return std::get_if<RParenTok>(&data); }
113120

114-
std::optional<std::string_view> getID() const {
115-
if (std::get_if<IdTok>(&data)) {
116-
// Drop leading '$'.
117-
return span.substr(1);
118-
}
119-
return {};
120-
}
121-
122121
std::optional<std::string_view> getKeyword() const {
123122
if (std::get_if<KeywordTok>(&data)) {
124123
return span;
@@ -132,6 +131,7 @@ struct Token {
132131
std::optional<double> getF64() const;
133132
std::optional<float> getF32() const;
134133
std::optional<std::string_view> getString() const;
134+
std::optional<std::string_view> getID() const;
135135

136136
bool operator==(const Token&) const;
137137
friend std::ostream& operator<<(std::ostream& os, const Token&);

test/gtest/wat-lexer.cpp

+27
Original file line numberDiff line numberDiff line change
@@ -1377,6 +1377,33 @@ TEST(LexerTest, LexIdent) {
13771377
Lexer lexer("$"sv);
13781378
EXPECT_TRUE(lexer.empty());
13791379
}
1380+
1381+
// String IDs
1382+
{
1383+
Lexer lexer("$\"\"");
1384+
ASSERT_FALSE(lexer.empty());
1385+
Token expected{"$\"\""sv, IdTok{true, std::nullopt}};
1386+
EXPECT_EQ(*lexer, expected);
1387+
EXPECT_TRUE(lexer->getID());
1388+
EXPECT_EQ(*lexer->getID(), ""sv);
1389+
}
1390+
{
1391+
Lexer lexer("$\"hello\"");
1392+
ASSERT_FALSE(lexer.empty());
1393+
Token expected{"$\"hello\""sv, IdTok{true, std::nullopt}};
1394+
EXPECT_EQ(*lexer, expected);
1395+
EXPECT_TRUE(lexer->getID());
1396+
EXPECT_EQ(*lexer->getID(), "hello"sv);
1397+
}
1398+
{
1399+
// _$_£_€_𐍈_
1400+
auto unicode = "$\"_\\u{24}_\\u{00a3}_\\u{20AC}_\\u{10348}_\""sv;
1401+
Lexer lexer(unicode);
1402+
ASSERT_FALSE(lexer.empty());
1403+
std::string escaped{"_$_\xC2\xA3_\xE2\x82\xAC_\xF0\x90\x8D\x88_"};
1404+
Token expected{unicode, IdTok{true, {escaped}}};
1405+
EXPECT_EQ(*lexer, expected);
1406+
}
13801407
}
13811408

13821409
TEST(LexerTest, LexString) {

test/lit/wat-kitchen-sink.wast

+8-3
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@
380380
;; CHECK: (elem $passive-2 anyref (struct.new_default $s0) (struct.new_default $s0))
381381
(elem $passive-2 anyref (item struct.new $s0) (struct.new $s0))
382382

383-
;; CHECK: (elem declare func $ref-func $ref-is-null $table-fill $table-grow $table-set)
383+
;; CHECK: (elem declare func $ref-func $table-fill $table-grow $table-set)
384384
(elem declare func 0 1 2 3)
385385

386386
(elem $declare-2 declare funcref (item ref.func 0) (ref.func 1) (item (ref.func 2)))
@@ -467,6 +467,11 @@
467467
;; CHECK-NEXT: )
468468
(func $f4 (type 18) (local i32 i64) (local $l f32))
469469

470+
;; CHECK: (func $"[quoted_name]" (type $void)
471+
;; CHECK-NEXT: (nop)
472+
;; CHECK-NEXT: )
473+
(func $"[quoted_name]")
474+
470475
;; CHECK: (func $nop-skate (type $void)
471476
;; CHECK-NEXT: (nop)
472477
;; CHECK-NEXT: (nop)
@@ -3622,13 +3627,13 @@
36223627
;; CHECK-NEXT: (ref.func $ref-func)
36233628
;; CHECK-NEXT: )
36243629
;; CHECK-NEXT: (drop
3625-
;; CHECK-NEXT: (ref.func $ref-is-null)
3630+
;; CHECK-NEXT: (ref.func $ref-func)
36263631
;; CHECK-NEXT: )
36273632
;; CHECK-NEXT: )
36283633
(func $ref-func
36293634
ref.func $ref-func
36303635
drop
3631-
ref.func 154
3636+
ref.func 156
36323637
drop
36333638
)
36343639

0 commit comments

Comments
 (0)