diff --git a/playwright/_impl/_glob.py b/playwright/_impl/_glob.py
index 2d899a789..08b7ce466 100644
--- a/playwright/_impl/_glob.py
+++ b/playwright/_impl/_glob.py
@@ -11,13 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import re
 
 # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping
 escaped_chars = {"$", "^", "+", ".", "*", "(", ")", "|", "\\", "?", "{", "}", "[", "]"}
 
 
-def glob_to_regex(glob: str) -> "re.Pattern[str]":
+def glob_to_regex_pattern(glob: str) -> str:
     tokens = ["^"]
     in_group = False
 
@@ -46,23 +45,20 @@ def glob_to_regex(glob: str) -> "re.Pattern[str]":
             else:
                 tokens.append("([^/]*)")
         else:
-            if c == "?":
-                tokens.append(".")
-            elif c == "[":
-                tokens.append("[")
-            elif c == "]":
-                tokens.append("]")
-            elif c == "{":
+            if c == "{":
                 in_group = True
                 tokens.append("(")
             elif c == "}":
                 in_group = False
                 tokens.append(")")
-            elif c == "," and in_group:
-                tokens.append("|")
+            elif c == ",":
+                if in_group:
+                    tokens.append("|")
+                else:
+                    tokens.append("\\" + c)
             else:
                 tokens.append("\\" + c if c in escaped_chars else c)
         i += 1
 
     tokens.append("$")
-    return re.compile("".join(tokens))
+    return "".join(tokens)
diff --git a/playwright/_impl/_helper.py b/playwright/_impl/_helper.py
index 2f7ab57b0..96acb8857 100644
--- a/playwright/_impl/_helper.py
+++ b/playwright/_impl/_helper.py
@@ -44,7 +44,7 @@
     is_target_closed_error,
     rewrite_error,
 )
-from playwright._impl._glob import glob_to_regex
+from playwright._impl._glob import glob_to_regex_pattern
 from playwright._impl._greenlets import RouteGreenlet
 from playwright._impl._str_utils import escape_regex_flags
 
@@ -144,31 +144,107 @@ class FrameNavigatedEvent(TypedDict):
 
 
 def url_matches(
-    base_url: Optional[str], url_string: str, match: Optional[URLMatch]
+    base_url: Optional[str],
+    url_string: str,
+    match: Optional[URLMatch],
+    websocket_url: Optional[bool] = None,
 ) -> bool:
+    """Match url_string against a glob string, compiled regex, or predicate."""
     if not match:
         return True
-    if isinstance(match, str) and match[0] != "*":
-        # Allow http(s) baseURL to match ws(s) urls.
-        if (
-            base_url
-            and re.match(r"^https?://", base_url)
-            and re.match(r"^wss?://", url_string)
-        ):
-            base_url = re.sub(r"^http", "ws", base_url)
-        if base_url:
-            match = urljoin(base_url, match)
-            parsed = urlparse(match)
-            if parsed.path == "":
-                parsed = parsed._replace(path="/")
-                match = parsed.geturl()
     if isinstance(match, str):
-        match = glob_to_regex(match)
+        match = re.compile(
+            resolve_glob_to_regex_pattern(base_url, match, websocket_url)
+        )
     if isinstance(match, Pattern):
         return bool(match.search(url_string))
     return match(url_string)
 
 
+def resolve_glob_to_regex_pattern(
+    base_url: Optional[str], glob: str, websocket_url: Optional[bool] = None
+) -> str:
+    """Resolve glob against base_url and return the regex source that matches it."""
+    if websocket_url:
+        base_url = to_websocket_base_url(base_url)
+    glob = resolve_glob_base(base_url, glob)
+    return glob_to_regex_pattern(glob)
+
+
+def to_websocket_base_url(base_url: Optional[str]) -> Optional[str]:
+    """Rewrite an http(s):// base URL to ws(s)://; leave anything else untouched."""
+    if base_url is not None and re.match(r"^https?://", base_url):
+        base_url = re.sub(r"^http", "ws", base_url)
+    return base_url
+
+
+def resolve_glob_base(base_url: Optional[str], match: str) -> str:
+    """Resolve a glob against base_url; globs starting with "*" are returned as is."""
+    if match.startswith("*"):
+        return match
+
+    token_map: Dict[str, str] = {}
+
+    def map_token(original: str, replacement: str) -> str:
+        if len(original) == 0:
+            return ""
+        token_map[replacement] = original
+        return replacement
+
+    # Escaped `\\?` behaves the same as `?` in our glob patterns.
+    match = match.replace(r"\\?", "?")
+    # Glob symbols may be escaped in the URL and some of them such as ? affect resolution,
+    # so we replace them with safe components first.
+    processed_parts = []
+    for index, token in enumerate(match.split("/")):
+        if token in (".", "..", ""):
+            processed_parts.append(token)
+            continue
+        # Handle special case of http*://, note that the new schema has to be
+        # a web schema so that slashes are properly inserted after domain.
+        if index == 0 and token.endswith(":"):
+            # Using a simple replacement for the scheme part
+            processed_parts.append(map_token(token, "http:"))
+            continue
+        question_index = token.find("?")
+        if question_index == -1:
+            processed_parts.append(map_token(token, f"$_{index}_$"))
+        else:
+            new_prefix = map_token(token[:question_index], f"$_{index}_$")
+            new_suffix = map_token(token[question_index:], f"?$_{index}_$")
+            processed_parts.append(new_prefix + new_suffix)
+
+    relative_path = "/".join(processed_parts)
+    resolved_url = urljoin(base_url if base_url is not None else "", relative_path)
+
+    for replacement, original in token_map.items():
+        resolved_url = resolved_url.replace(replacement, original, 1)
+
+    return ensure_trailing_slash(resolved_url)
+
+
+# In Node.js, new URL('http://localhost') returns 'http://localhost/'.
+# To ensure the same url matching behavior, do the same.
+def ensure_trailing_slash(url: str) -> str:
+    split = url.split("://", maxsplit=1)
+    if len(split) == 2:
+        # URL parser doesn't like strange/unknown schemes, so we replace it for parsing, then put it back
+        parsable_url = "http://" + split[1]
+    else:
+        # Given current rules, this should never happen _and_ still be a valid matcher. We require the protocol to be part of the match,
+        # so either the user is using a glob that starts with "*" (and none of this code is running), or the user actually has `something://` in `match`
+        parsable_url = url
+    parsed = urlparse(parsable_url, allow_fragments=True)
+    if len(split) == 2:
+        # Replace the scheme that we removed earlier
+        parsed = parsed._replace(scheme=split[0])
+    if parsed.path == "":
+        parsed = parsed._replace(path="/")
+        url = parsed.geturl()
+
+    return url
+
+
 class HarLookupResult(TypedDict, total=False):
     action: Literal["error", "redirect", "fulfill", "noentry"]
     message: Optional[str]
diff --git a/playwright/_impl/_network.py b/playwright/_impl/_network.py
index 4b15531af..6492c4311 100644
--- a/playwright/_impl/_network.py
+++ b/playwright/_impl/_network.py
@@ -754,7 +754,7 @@ def prepare_interception_patterns(
         return patterns
 
     def matches(self, ws_url: str) -> bool:
-        return url_matches(self._base_url, ws_url, self.url)
+        return url_matches(self._base_url, ws_url, self.url, True)
 
     async def handle(self, websocket_route: "WebSocketRoute") -> None:
         coro_or_future = self.handler(websocket_route)
diff --git a/tests/async/test_page_route.py b/tests/async/test_page_route.py
index 017bdac9a..b04f96145 100644
--- a/tests/async/test_page_route.py
+++ b/tests/async/test_page_route.py
@@ -20,7 +20,8 @@
 
 import pytest
 
-from playwright._impl._glob import glob_to_regex
+from playwright._impl._glob import glob_to_regex_pattern
+from playwright._impl._helper import url_matches
 from playwright.async_api import (
     Browser,
     BrowserContext,
@@ -29,6 +30,7 @@
     Playwright,
     Request,
     Route,
+    expect,
 )
 from tests.server import Server, TestServerRequest
 from tests.utils import must
@@ -1051,17 +1053,19 @@ async def handle_request(route: Route) -> None:
     assert await response.json() == {"foo": "bar"}
 
 
-async def test_glob_to_regex() -> None:
+async def test_should_work_with_glob() -> None:
+    def glob_to_regex(pattern: str) -> re.Pattern:
+        return re.compile(glob_to_regex_pattern(pattern))
+
     assert glob_to_regex("**/*.js").match("https://localhost:8080/foo.js")
     assert not glob_to_regex("**/*.css").match("https://localhost:8080/foo.js")
-    assert not glob_to_regex("*.js").match("https://localhost:8080/foo.js")
+    assert not glob_to_regex("*.js").match(
+        "https://localhost:8080/foo.js"
+    )  # Doesn't match path separator
     assert glob_to_regex("https://**/*.js").match("https://localhost:8080/foo.js")
     assert glob_to_regex("http://localhost:8080/simple/path.js").match(
         "http://localhost:8080/simple/path.js"
     )
-    assert glob_to_regex("http://localhost:8080/?imple/path.js").match(
-        "http://localhost:8080/Simple/path.js"
-    )
     assert glob_to_regex("**/{a,b}.js").match("https://localhost:8080/a.js")
     assert glob_to_regex("**/{a,b}.js").match("https://localhost:8080/b.js")
     assert not glob_to_regex("**/{a,b}.js").match("https://localhost:8080/c.js")
@@ -1081,15 +1085,119 @@ async def test_glob_to_regex() -> None:
         "http://localhost:3000/signin-oidcnice"
     )
 
-    assert glob_to_regex("**/three-columns/settings.html?**id=[a-z]**").match(
+    # range [] is NOT supported
+    assert glob_to_regex("**/api/v[0-9]").fullmatch("http://example.com/api/v[0-9]")
+    assert not glob_to_regex("**/api/v[0-9]").fullmatch(
+        "http://example.com/api/version"
+    )
+    assert not glob_to_regex("**/api/v[0-9]").fullmatch(
+        "http://example.com/api/v1"
+    )  # Should not match if [] is literal
+
+    # query params
+    assert glob_to_regex("**/api\\?param").match("http://example.com/api?param")
+    assert not glob_to_regex("**/api\\?param").match("http://example.com/api-param")
+
+    assert glob_to_regex("**/three-columns/settings.html\\?**id=settings-**").match(
         "http://mydomain:8080/blah/blah/three-columns/settings.html?id=settings-e3c58efe-02e9-44b0-97ac-dd138100cf7c&blah"
     )
 
-    assert glob_to_regex("\\?") == re.compile(r"^\?$")
-    assert glob_to_regex("\\") == re.compile(r"^\\$")
-    assert glob_to_regex("\\\\") == re.compile(r"^\\$")
-    assert glob_to_regex("\\[") == re.compile(r"^\[$")
-    assert glob_to_regex("[a-z]") == re.compile(r"^[a-z]$")
-    assert glob_to_regex("$^+.\\*()|\\?\\{\\}\\[\\]") == re.compile(
-        r"^\$\^\+\.\*\(\)\|\?\{\}\[\]$"
+    assert glob_to_regex("\\?").pattern == r"^\?$"
+    assert glob_to_regex("\\").pattern == r"^\\$"
+    assert glob_to_regex("\\\\").pattern == r"^\\$"
+    assert glob_to_regex("\\[").pattern == r"^\[$"
+    assert glob_to_regex("[a-z]").pattern == r"^\[a-z\]$"
+    assert (
+        glob_to_regex("$^+.\\*()|\\?\\{\\}\\[\\]").pattern
+        == r"^\$\^\+\.\*\(\)\|\?\{\}\[\]$"
+    )
+
+    # --- url_matches tests ---
+    # Basic exact and wildcard matching
+    assert url_matches(None, "http://playwright.dev/", "http://playwright.dev")
+    assert url_matches(None, "http://playwright.dev/?a=b", "http://playwright.dev?a=b")
+    assert url_matches(None, "http://playwright.dev/", "h*://playwright.dev")
+    assert url_matches(
+        None, "http://api.playwright.dev/?x=y", "http://*.playwright.dev?x=y"
+    )
+    assert url_matches(None, "http://playwright.dev/foo/bar", "**/foo/**")
+
+    # Relative path matching with base URL
+    assert url_matches("http://playwright.dev", "http://playwright.dev/?x=y", "?x=y")
+    assert url_matches(
+        "http://playwright.dev/foo/", "http://playwright.dev/foo/bar?x=y", "./bar?x=y"
+    )
+
+    # This is not supported, we treat ? as a query separator.
+    assert not url_matches(
+        None,
+        "http://localhost:8080/Simple/path.js",
+        "http://localhost:8080/?imple/path.js",
+    )
+    assert not url_matches(None, "http://playwright.dev/", "http://playwright.?ev")
+    assert url_matches(None, "http://playwright./?ev", "http://playwright.?ev")
+    assert not url_matches(
+        None, "http://playwright.dev/foo", "http://playwright.dev/f??"
+    )
+    assert url_matches(None, "http://playwright.dev/f??", "http://playwright.dev/f??")
+    assert url_matches(
+        None, "http://playwright.dev/?x=y", r"http://playwright.dev\?x=y"
+    )
+    assert url_matches(
+        None, "http://playwright.dev/?x=y", r"http://playwright.dev/\?x=y"
+    )
+    assert url_matches(
+        "http://playwright.dev/foo", "http://playwright.dev/foo?bar", "?bar"
+    )
+    assert url_matches(
+        "http://playwright.dev/foo", "http://playwright.dev/foo?bar", r"\\?bar"
+    )
+    assert url_matches("http://first.host/", "http://second.host/foo", "**/foo")
+    assert url_matches("http://playwright.dev/", "http://localhost/", "*//localhost/")
+
+    # Added for Python implementation
+    assert url_matches(
+        None,
+        "custom://example.com/foo/bar?id=123",
+        "{custom,another}://example.com/foo/bar?id=123",
+    )
+    assert not url_matches(
+        None, "custom://example.com/foo/bar?id=123", "**example.com/foo/bar?id=123"
     )
+
+
+async def test_should_not_support_question_in_glob_pattern(
+    page: Page, playwright: Playwright, server: Server
+) -> None:
+    server.set_route("/index", lambda req: (req.write(b"index-no-hello"), req.finish()))
+    server.set_route(
+        "/index123hello", lambda req: (req.write(b"index123hello"), req.finish())
+    )
+    server.set_route(
+        "/index?hello", lambda req: (req.write(b"index?hello"), req.finish())
+    )
+    server.set_route(
+        "/index1hello", lambda req: (req.write(b"index1hello"), req.finish())
+    )
+
+    async def handle_any_char(route: Route) -> None:
+        await route.fulfill(body="intercepted any character")
+
+    await page.route("**/index?hello", handle_any_char)
+
+    async def handle_question_mark(route: Route) -> None:
+        await route.fulfill(body="intercepted question mark")
+
+    await page.route(r"**/index\?hello", handle_question_mark)
+
+    await page.goto(server.PREFIX + "/index?hello")
+    await expect(page.locator("body")).to_have_text("intercepted question mark")
+
+    await page.goto(server.PREFIX + "/index")
+    await expect(page.locator("body")).to_have_text("index-no-hello")
+
+    await page.goto(server.PREFIX + "/index1hello")
+    await expect(page.locator("body")).to_have_text("index1hello")
+
+    await page.goto(server.PREFIX + "/index123hello")
+    await expect(page.locator("body")).to_have_text("index123hello")
diff --git a/tests/async/test_request_intercept.py b/tests/async/test_request_intercept.py
index 316e0b102..75746bbca 100644
--- a/tests/async/test_request_intercept.py
+++ b/tests/async/test_request_intercept.py
@@ -175,3 +175,17 @@ async def test_should_give_access_to_the_intercepted_response_body(
         route.fulfill(response=response),
         eval_task,
     )
+
+
+async def test_should_intercept_by_glob(page: Page, server: Server) -> None:
+    await page.goto(server.EMPTY_PAGE)
+    await page.route(
+        "http://localhos**?*oo",
+        lambda route: route.fulfill(body="intercepted", status=200),
+    )
+
+    result = await page.evaluate(
+        "url => fetch(url).then(r => r.text())", server.PREFIX + "/?foo"
+    )
+
+    assert result == "intercepted"
diff --git a/tests/sync/test_request_intercept.py b/tests/sync/test_request_intercept.py
index 8df41c0c2..a54c0ad71 100644
--- a/tests/sync/test_request_intercept.py
+++ b/tests/sync/test_request_intercept.py
@@ -131,3 +131,17 @@ def handle_route(route: Route) -> None:
     assert request.uri.decode() == "/title.html"
     original = (assetdir / "title.html").read_text()
     assert response.text() == original
+
+
+def test_should_intercept_by_glob(page: Page, server: Server) -> None:
+    page.goto(server.EMPTY_PAGE)
+    page.route(
+        "http://localhos**?*oo",
+        lambda route: route.fulfill(body="intercepted", status=200),
+    )
+
+    result = page.evaluate(
+        "url => fetch(url).then(r => r.text())", server.PREFIX + "/?foo"
+    )
+
+    assert result == "intercepted"