Skip to content

Commit 04f5d7b

Browse files
committed
syntax: loosen ASCII compatible rules
Previously, patterns like `(?-u:☃)` were banned under the logic that Unicode scalar values shouldn't be available unless Unicode mode is enabled. But since patterns are required to be UTF-8, there really isn't any difficulty in just interpreting Unicode literals as their corresponding UTF-8 encoding. Note though that Unicode character classes, even things like `(?-u:[☃])`, remain banned. We probably could make character classes work too, but it's unclear how that plays with ASCII compatible mode requiring that a single byte is the fundamental atom of matching (whereas Unicode mode requires that Unicode scalar values are the fundamental atom of matching).
1 parent cfd0ca2 commit 04f5d7b

File tree

2 files changed

+12
-38
lines changed

2 files changed

+12
-38
lines changed

regex-syntax/src/hir/translate.rs

+10-36
Original file line numberDiff line numberDiff line change
@@ -388,17 +388,10 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
388388
}
389389
Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
390390
Either::Right(byte) => self.push_byte(byte),
391-
Either::Left(ch) => {
392-
if !self.flags().unicode() && ch.len_utf8() > 1 {
393-
return Err(
394-
self.error(x.span, ErrorKind::UnicodeNotAllowed)
395-
);
396-
}
397-
match self.case_fold_char(x.span, ch)? {
398-
None => self.push_char(ch),
399-
Some(expr) => self.push(HirFrame::Expr(expr)),
400-
}
401-
}
391+
Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
392+
None => self.push_char(ch),
393+
Some(expr) => self.push(HirFrame::Expr(expr)),
394+
},
402395
},
403396
Ast::Dot(ref span) => {
404397
self.push(HirFrame::Expr(self.hir_dot(**span)?));
@@ -872,8 +865,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
872865
})?;
873866
Ok(Some(Hir::class(hir::Class::Unicode(cls))))
874867
} else {
875-
if c.len_utf8() > 1 {
876-
return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
868+
if !c.is_ascii() {
869+
return Ok(None);
877870
}
878871
// If case folding won't do anything, then don't bother trying.
879872
match c {
@@ -1211,9 +1204,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
12111204
match self.ast_literal_to_scalar(ast)? {
12121205
Either::Right(byte) => Ok(byte),
12131206
Either::Left(ch) => {
1214-
let cp = u32::from(ch);
1215-
if cp <= 0x7F {
1216-
Ok(u8::try_from(cp).unwrap())
1207+
if ch.is_ascii() {
1208+
Ok(u8::try_from(ch).unwrap())
12171209
} else {
12181210
// We can't feasibly support Unicode in
12191211
// byte oriented classes. Byte classes don't
@@ -1661,16 +1653,7 @@ mod tests {
16611653
assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
16621654
assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));
16631655

1664-
assert_eq!(
1665-
t_err("(?-u)☃"),
1666-
TestError {
1667-
kind: hir::ErrorKind::UnicodeNotAllowed,
1668-
span: Span::new(
1669-
Position::new(5, 1, 6),
1670-
Position::new(8, 1, 7)
1671-
),
1672-
}
1673-
);
1656+
assert_eq!(t("(?-u)☃"), hir_lit("☃"));
16741657
assert_eq!(
16751658
t_err(r"(?-u)\xFF"),
16761659
TestError {
@@ -1748,16 +1731,7 @@ mod tests {
17481731
);
17491732
assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));
17501733

1751-
assert_eq!(
1752-
t_err("(?i-u)β"),
1753-
TestError {
1754-
kind: hir::ErrorKind::UnicodeNotAllowed,
1755-
span: Span::new(
1756-
Position::new(6, 1, 7),
1757-
Position::new(8, 1, 8),
1758-
),
1759-
}
1760-
);
1734+
assert_eq!(t("(?i-u)β"), hir_lit("β"),);
17611735
}
17621736

17631737
#[test]

src/bytes.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ bytes:
6868
1. The `u` flag can be disabled even when disabling it might cause the regex to
6969
match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in
7070
"ASCII compatible" mode.
71-
2. In ASCII compatible mode, neither Unicode scalar values nor Unicode
72-
character classes are allowed.
71+
2. In ASCII compatible mode, Unicode character classes are not allowed. Literal
72+
Unicode scalar values outside of character classes are allowed.
7373
3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`)
7474
revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps
7575
to `[[:digit:]]` and `\s` maps to `[[:space:]]`.

0 commit comments

Comments
 (0)