Skip to content

Commit 5ef45fb

Browse files
committed
Initial working implementation of JS's new custom glob
1 parent 196f0da commit 5ef45fb

File tree

4 files changed

+204
-44
lines changed

4 files changed

+204
-44
lines changed

playwright/_impl/_glob.py

+8-12
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,12 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
import re
1514

1615
# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping
1716
escaped_chars = {"$", "^", "+", ".", "*", "(", ")", "|", "\\", "?", "{", "}", "[", "]"}
1817

1918

20-
def glob_to_regex(glob: str) -> "re.Pattern[str]":
19+
def glob_to_regex_pattern(glob: str) -> str:
2120
tokens = ["^"]
2221
in_group = False
2322

@@ -46,23 +45,20 @@ def glob_to_regex(glob: str) -> "re.Pattern[str]":
4645
else:
4746
tokens.append("([^/]*)")
4847
else:
49-
if c == "?":
50-
tokens.append(".")
51-
elif c == "[":
52-
tokens.append("[")
53-
elif c == "]":
54-
tokens.append("]")
55-
elif c == "{":
48+
if c == "{":
5649
in_group = True
5750
tokens.append("(")
5851
elif c == "}":
5952
in_group = False
6053
tokens.append(")")
61-
elif c == "," and in_group:
62-
tokens.append("|")
54+
elif c == ",":
55+
if in_group:
56+
tokens.append("|")
57+
else:
58+
tokens.append("\\" + c)
6359
else:
6460
tokens.append("\\" + c if c in escaped_chars else c)
6561
i += 1
6662

6763
tokens.append("$")
68-
return re.compile("".join(tokens))
64+
return "".join(tokens)

playwright/_impl/_helper.py

+82-17
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
is_target_closed_error,
4545
rewrite_error,
4646
)
47-
from playwright._impl._glob import glob_to_regex
47+
from playwright._impl._glob import glob_to_regex_pattern
4848
from playwright._impl._greenlets import RouteGreenlet
4949
from playwright._impl._str_utils import escape_regex_flags
5050

@@ -144,31 +144,96 @@ class FrameNavigatedEvent(TypedDict):
144144

145145

146146
def url_matches(
147-
base_url: Optional[str], url_string: str, match: Optional[URLMatch]
147+
base_url: Optional[str],
148+
url_string: str,
149+
match: Optional[URLMatch],
150+
websocket_url: bool = None,
148151
) -> bool:
149152
if not match:
150153
return True
151-
if isinstance(match, str) and match[0] != "*":
152-
# Allow http(s) baseURL to match ws(s) urls.
153-
if (
154-
base_url
155-
and re.match(r"^https?://", base_url)
156-
and re.match(r"^wss?://", url_string)
157-
):
158-
base_url = re.sub(r"^http", "ws", base_url)
159-
if base_url:
160-
match = urljoin(base_url, match)
161-
parsed = urlparse(match)
162-
if parsed.path == "":
163-
parsed = parsed._replace(path="/")
164-
match = parsed.geturl()
165154
if isinstance(match, str):
166-
match = glob_to_regex(match)
155+
match = re.compile(
156+
resolve_glob_to_regex_pattern(base_url, match, websocket_url)
157+
)
167158
if isinstance(match, Pattern):
168159
return bool(match.search(url_string))
169160
return match(url_string)
170161

171162

163+
def resolve_glob_to_regex_pattern(
    base_url: Optional[str], glob: str, websocket_url: Optional[bool] = None
) -> str:
    """Resolve ``glob`` against ``base_url`` and convert it to a regex source string.

    Args:
        base_url: Base URL the glob is resolved against; may be ``None``.
        glob: The glob pattern to resolve and convert.
        websocket_url: When truthy, ``base_url`` is first passed through
            ``to_websocket_base_url`` before the glob is resolved.

    Returns:
        The string produced by ``glob_to_regex_pattern`` for the resolved glob.
    """
    if websocket_url:
        base_url = to_websocket_base_url(base_url)
    resolved_glob = resolve_glob_base(base_url, glob)
    return glob_to_regex_pattern(resolved_glob)
170+
171+
172+
def to_websocket_base_url(base_url: Optional[str]) -> Optional[str]:
    """Return ``base_url`` with a leading ``http`` replaced by ``ws``.

    Only values that start with ``http://`` or ``https://`` are rewritten;
    ``None`` and every other value are returned unchanged.
    """
    if base_url is None or not re.match(r"^https?://", base_url):
        return base_url
    return re.sub(r"^http", "ws", base_url)
176+
177+
178+
def resolve_glob_base(base_url: Optional[str], match: str) -> str:
    """Resolve a URL glob against ``base_url`` and return the resulting glob.

    Globs that start with ``*`` are returned unchanged. Otherwise each path
    segment is swapped for a placeholder, the result is joined onto
    ``base_url`` with ``urljoin``, and the original segments are substituted
    back. An empty path in the result is normalized to ``/``.

    Args:
        base_url: URL to resolve against; ``None`` is treated as ``""``.
        match: Glob pattern to resolve.

    Returns:
        The resolved glob as a string.
    """
    # startswith (rather than match[0]) keeps an empty glob from raising IndexError.
    if match.startswith("*"):
        return match

    # Maps each placeholder back to the glob text it stands in for.
    token_map: Dict[str, str] = {}

    def map_token(original: str, replacement: str) -> str:
        # Empty pieces need no placeholder and must not be registered.
        if not original:
            return ""
        token_map[replacement] = original
        return replacement

    # Escaped `\\?` behaves the same as `?` in our glob patterns.
    match = match.replace(r"\\?", "?")
    # Glob symbols may be escaped in the URL and some of them such as ? affect resolution,
    # so we replace them with safe components first.
    processed_parts = []
    for index, token in enumerate(match.split("/")):
        if token in (".", "..", ""):
            processed_parts.append(token)
            continue
        # Handle special case of http*://, note that the new scheme has to be
        # a web scheme so that slashes are properly inserted after domain.
        if index == 0 and token.endswith(":"):
            # Using a simple replacement for the scheme part
            processed_parts.append(map_token(token, "http:"))
            continue
        question_index = token.find("?")
        if question_index == -1:
            processed_parts.append(map_token(token, f"$_{index}_$"))
        else:
            # Keep a real "?" in the placeholder so urljoin still sees a query.
            new_prefix = map_token(token[:question_index], f"$_{index}_$")
            new_suffix = map_token(token[question_index:], f"?$_{index}_$")
            processed_parts.append(new_prefix + new_suffix)

    relative_path = "/".join(processed_parts)
    resolved_url = urljoin(base_url if base_url is not None else "", relative_path)

    # Put the original glob text back in place of each placeholder.
    for replacement, original in token_map.items():
        resolved_url = resolved_url.replace(replacement, original, 1)

    # In Node.js, new URL('http://localhost') returns 'http://localhost/'.
    # To ensure the same url matching behavior, do the same.
    split = resolved_url.split("://", maxsplit=1)
    if len(split) == 2:
        core_url = "http://" + split[1]
    else:
        core_url = match
    parsed = urlparse(core_url, allow_fragments=True)
    if len(split) == 2:
        # urlparse doesn't like stars in the scheme
        parsed = parsed._replace(scheme=split[0])
    if parsed.path == "":
        parsed = parsed._replace(path="/")
    resolved_url = parsed.geturl()

    return resolved_url
235+
236+
172237
class HarLookupResult(TypedDict, total=False):
173238
action: Literal["error", "redirect", "fulfill", "noentry"]
174239
message: Optional[str]

playwright/_impl/_network.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,7 @@ def prepare_interception_patterns(
754754
return patterns
755755

756756
def matches(self, ws_url: str) -> bool:
757-
return url_matches(self._base_url, ws_url, self.url)
757+
return url_matches(self._base_url, ws_url, self.url, True)
758758

759759
async def handle(self, websocket_route: "WebSocketRoute") -> None:
760760
coro_or_future = self.handler(websocket_route)

tests/async/test_page_route.py

+113-14
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020

2121
import pytest
2222

23-
from playwright._impl._glob import glob_to_regex
23+
from playwright._impl._glob import glob_to_regex_pattern
24+
from playwright._impl._helper import url_matches
2425
from playwright.async_api import (
2526
Browser,
2627
BrowserContext,
@@ -29,6 +30,7 @@
2930
Playwright,
3031
Request,
3132
Route,
33+
expect,
3234
)
3335
from tests.server import Server, TestServerRequest
3436
from tests.utils import must
@@ -1051,17 +1053,19 @@ async def handle_request(route: Route) -> None:
10511053
assert await response.json() == {"foo": "bar"}
10521054

10531055

1054-
async def test_glob_to_regex() -> None:
1056+
async def test_should_work_with_glob() -> None:
1057+
def glob_to_regex(pattern: str) -> re.Pattern:
1058+
return re.compile(glob_to_regex_pattern(pattern))
1059+
10551060
assert glob_to_regex("**/*.js").match("https://localhost:8080/foo.js")
10561061
assert not glob_to_regex("**/*.css").match("https://localhost:8080/foo.js")
1057-
assert not glob_to_regex("*.js").match("https://localhost:8080/foo.js")
1062+
assert not glob_to_regex("*.js").match(
1063+
"https://localhost:8080/foo.js"
1064+
) # Doesn't match path separator
10581065
assert glob_to_regex("https://**/*.js").match("https://localhost:8080/foo.js")
10591066
assert glob_to_regex("http://localhost:8080/simple/path.js").match(
10601067
"http://localhost:8080/simple/path.js"
10611068
)
1062-
assert glob_to_regex("http://localhost:8080/?imple/path.js").match(
1063-
"http://localhost:8080/Simple/path.js"
1064-
)
10651069
assert glob_to_regex("**/{a,b}.js").match("https://localhost:8080/a.js")
10661070
assert glob_to_regex("**/{a,b}.js").match("https://localhost:8080/b.js")
10671071
assert not glob_to_regex("**/{a,b}.js").match("https://localhost:8080/c.js")
@@ -1081,15 +1085,110 @@ async def test_glob_to_regex() -> None:
10811085
"http://localhost:3000/signin-oidcnice"
10821086
)
10831087

1084-
assert glob_to_regex("**/three-columns/settings.html?**id=[a-z]**").match(
1088+
# range [] is NOT supported
1089+
assert glob_to_regex("**/api/v[0-9]").fullmatch("http://example.com/api/v[0-9]")
1090+
assert not glob_to_regex("**/api/v[0-9]").fullmatch(
1091+
"http://example.com/api/version"
1092+
)
1093+
assert not glob_to_regex("**/api/v[0-9]").fullmatch(
1094+
"http://example.com/api/v1"
1095+
) # Should not match if [] is literal
1096+
1097+
# query params
1098+
assert glob_to_regex("**/api\\?param").match("http://example.com/api?param")
1099+
assert not glob_to_regex("**/api\\?param").match("http://example.com/api-param")
1100+
1101+
assert glob_to_regex("**/three-columns/settings.html\\?**id=settings-**").match(
10851102
"http://mydomain:8080/blah/blah/three-columns/settings.html?id=settings-e3c58efe-02e9-44b0-97ac-dd138100cf7c&blah"
10861103
)
10871104

1088-
assert glob_to_regex("\\?") == re.compile(r"^\?$")
1089-
assert glob_to_regex("\\") == re.compile(r"^\\$")
1090-
assert glob_to_regex("\\\\") == re.compile(r"^\\$")
1091-
assert glob_to_regex("\\[") == re.compile(r"^\[$")
1092-
assert glob_to_regex("[a-z]") == re.compile(r"^[a-z]$")
1093-
assert glob_to_regex("$^+.\\*()|\\?\\{\\}\\[\\]") == re.compile(
1094-
r"^\$\^\+\.\*\(\)\|\?\{\}\[\]$"
1105+
print(glob_to_regex("\\?").pattern)
1106+
assert glob_to_regex("\\?").pattern == r"^\?$"
1107+
assert glob_to_regex("\\").pattern == r"^\\$"
1108+
assert glob_to_regex("\\\\").pattern == r"^\\$"
1109+
assert glob_to_regex("\\[").pattern == r"^\[$"
1110+
assert glob_to_regex("[a-z]").pattern == r"^\[a-z\]$"
1111+
assert (
1112+
glob_to_regex("$^+.\\*()|\\?\\{\\}\\[\\]").pattern
1113+
== r"^\$\^\+\.\*\(\)\|\?\{\}\[\]$"
1114+
)
1115+
1116+
# --- url_matches tests ---
1117+
# Basic exact and wildcard matching
1118+
assert url_matches(None, "http://playwright.dev/", "http://playwright.dev")
1119+
assert url_matches(None, "http://playwright.dev/?a=b", "http://playwright.dev?a=b")
1120+
assert url_matches(None, "http://playwright.dev/", "h*://playwright.dev")
1121+
assert url_matches(
1122+
None, "http://api.playwright.dev/?x=y", "http://*.playwright.dev?x=y"
1123+
)
1124+
assert url_matches(None, "http://playwright.dev/foo/bar", "**/foo/**")
1125+
1126+
# Relative path matching with base URL
1127+
assert url_matches("http://playwright.dev", "http://playwright.dev/?x=y", "?x=y")
1128+
assert url_matches(
1129+
"http://playwright.dev/foo/", "http://playwright.dev/foo/bar?x=y", "./bar?x=y"
1130+
)
1131+
1132+
# This is not supported, we treat ? as a query separator.
1133+
assert not url_matches(
1134+
None,
1135+
"http://localhost:8080/Simple/path.js",
1136+
"http://localhost:8080/?imple/path.js",
1137+
)
1138+
assert not url_matches(None, "http://playwright.dev/", "http://playwright.?ev")
1139+
assert url_matches(None, "http://playwright./?ev", "http://playwright.?ev")
1140+
assert not url_matches(
1141+
None, "http://playwright.dev/foo", "http://playwright.dev/f??"
1142+
)
1143+
assert url_matches(None, "http://playwright.dev/f??", "http://playwright.dev/f??")
1144+
assert url_matches(
1145+
None, "http://playwright.dev/?x=y", r"http://playwright.dev\?x=y"
10951146
)
1147+
assert url_matches(
1148+
None, "http://playwright.dev/?x=y", r"http://playwright.dev/\?x=y"
1149+
)
1150+
assert url_matches(
1151+
"http://playwright.dev/foo", "http://playwright.dev/foo?bar", "?bar"
1152+
)
1153+
assert url_matches(
1154+
"http://playwright.dev/foo", "http://playwright.dev/foo?bar", r"\\?bar"
1155+
)
1156+
assert url_matches("http://first.host/", "http://second.host/foo", "**/foo")
1157+
assert url_matches("http://playwright.dev/", "http://localhost/", "*//localhost/")
1158+
1159+
1160+
async def test_should_not_support_question_in_glob_pattern(
    page: Page, playwright: Playwright, server: Server
) -> None:
    """Check that ``?`` in a route glob is not treated as an any-character wildcard."""

    def respond_with(payload: bytes):
        # Factory so each route handler captures its own payload.
        return lambda req: (req.write(payload), req.finish())

    server_routes = (
        ("/index", b"index-no-hello"),
        ("/index123hello", b"index123hello"),
        ("/index?hello", b"index?hello"),
        ("/index1hello", b"index1hello"),
    )
    for route_path, payload in server_routes:
        server.set_route(route_path, respond_with(payload))

    async def handle_any_char(route: Route) -> None:
        await route.fulfill(body="intercepted any character")

    async def handle_question_mark(route: Route) -> None:
        await route.fulfill(body="intercepted question mark")

    # Registration order is kept: unescaped pattern first, escaped pattern second.
    await page.route("**/index?hello", handle_any_char)
    await page.route(r"**/index\?hello", handle_question_mark)

    expected_bodies = (
        ("/index?hello", "intercepted question mark"),
        ("/index", "index-no-hello"),
        ("/index1hello", "index1hello"),
        ("/index123hello", "index123hello"),
    )
    for url_path, body_text in expected_bodies:
        await page.goto(server.PREFIX + url_path)
        await expect(page.locator("body")).to_have_text(body_text)

0 commit comments

Comments
 (0)