Skip to content

chore(roll): roll glob changes from Playwright 1.52.0 #2824

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 8 additions & 12 deletions playwright/_impl/_glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions#escaping
escaped_chars = {"$", "^", "+", ".", "*", "(", ")", "|", "\\", "?", "{", "}", "[", "]"}


def glob_to_regex(glob: str) -> "re.Pattern[str]":
def glob_to_regex_pattern(glob: str) -> str:
tokens = ["^"]
in_group = False

Expand Down Expand Up @@ -46,23 +45,20 @@ def glob_to_regex(glob: str) -> "re.Pattern[str]":
else:
tokens.append("([^/]*)")
else:
if c == "?":
tokens.append(".")
elif c == "[":
tokens.append("[")
elif c == "]":
tokens.append("]")
elif c == "{":
if c == "{":
in_group = True
tokens.append("(")
elif c == "}":
in_group = False
tokens.append(")")
elif c == "," and in_group:
tokens.append("|")
elif c == ",":
if in_group:
tokens.append("|")
else:
tokens.append("\\" + c)
else:
tokens.append("\\" + c if c in escaped_chars else c)
i += 1

tokens.append("$")
return re.compile("".join(tokens))
return "".join(tokens)
102 changes: 85 additions & 17 deletions playwright/_impl/_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
is_target_closed_error,
rewrite_error,
)
from playwright._impl._glob import glob_to_regex
from playwright._impl._glob import glob_to_regex_pattern
from playwright._impl._greenlets import RouteGreenlet
from playwright._impl._str_utils import escape_regex_flags

Expand Down Expand Up @@ -144,31 +144,99 @@ class FrameNavigatedEvent(TypedDict):


def url_matches(
    base_url: Optional[str],
    url_string: str,
    match: Optional[URLMatch],
    websocket_url: Optional[bool] = None,
) -> bool:
    """Return whether ``url_string`` satisfies the matcher ``match``.

    Args:
        base_url: Base URL that a string glob is resolved against.
        url_string: The URL being tested.
        match: A glob string, a compiled pattern, or a predicate callable.
            A falsy matcher matches every URL.
        websocket_url: Forwarded to ``resolve_glob_to_regex_pattern``; when
            truthy the base URL is converted to its WebSocket form first.

    Returns:
        ``True`` when the URL matches, ``False`` otherwise.
    """
    if not match:
        return True
    if isinstance(match, str):
        # Base-URL resolution and the glob -> regex translation both live in
        # resolve_glob_to_regex_pattern; this function only compiles the result.
        match = re.compile(
            resolve_glob_to_regex_pattern(base_url, match, websocket_url)
        )
    if isinstance(match, Pattern):
        return bool(match.search(url_string))
    # Anything that is neither a string nor a pattern is called as a predicate.
    return match(url_string)


def resolve_glob_to_regex_pattern(
    base_url: Optional[str], glob: str, websocket_url: Optional[bool] = None
) -> str:
    """Resolve ``glob`` against ``base_url`` and translate it to a regex source.

    Args:
        base_url: Base URL used to resolve the glob; may be ``None``.
        glob: The URL glob to translate.
        websocket_url: When truthy, ``base_url`` is first passed through
            ``to_websocket_base_url``.

    Returns:
        The regular-expression source string produced by
        ``glob_to_regex_pattern`` (not a compiled pattern).
    """
    if websocket_url:
        base_url = to_websocket_base_url(base_url)
    glob = resolve_glob_base(base_url, glob)
    return glob_to_regex_pattern(glob)


def to_websocket_base_url(base_url: Optional[str]) -> Optional[str]:
    """Rewrite an ``http(s)://`` base URL to the matching ``ws(s)://`` URL.

    Args:
        base_url: The base URL, or ``None``.

    Returns:
        ``base_url`` with its leading ``http`` replaced by ``ws`` when it
        starts with ``http://`` or ``https://``; otherwise the value is
        returned unchanged (including ``None``).
    """
    if base_url is None:
        return None
    if not re.match(r"^https?://", base_url):
        return base_url
    # Only the leading "http" is swapped, so "https" becomes "wss".
    return re.sub(r"^http", "ws", base_url)


def resolve_glob_base(base_url: Optional[str], match: str) -> str:
    """Resolve the glob ``match`` against ``base_url`` while keeping glob syntax.

    Each path segment of the glob is swapped for a placeholder, the result is
    resolved with ``urljoin``, and the original segments are substituted back
    afterwards. An empty path in the resolved URL is normalised to ``/``.

    Args:
        base_url: Base URL to resolve against; ``None`` is treated as ``""``.
        match: The glob pattern. Globs starting with ``*`` are returned as is.

    Returns:
        The resolved glob string.
    """
    # startswith() instead of match[0] so an empty glob does not raise IndexError.
    if match.startswith("*"):
        return match

    token_map: Dict[str, str] = {}

    def map_token(original: str, replacement: str) -> str:
        # Remember which placeholder stands for which original text; empty
        # pieces need no placeholder at all.
        if len(original) == 0:
            return ""
        token_map[replacement] = original
        return replacement

    # Escaped `\\?` behaves the same as `?` in our glob patterns.
    match = match.replace(r"\\?", "?")
    # Glob symbols may be escaped in the URL and some of them such as ? affect resolution,
    # so we replace them with safe components first.
    processed_parts = []
    for index, token in enumerate(match.split("/")):
        if token in (".", "..", ""):
            processed_parts.append(token)
            continue
        # Handle special case of http*://, note that the new scheme has to be
        # a web scheme so that slashes are properly inserted after domain.
        if index == 0 and token.endswith(":"):
            # Using a simple replacement for the scheme part
            processed_parts.append(map_token(token, "http:"))
            continue
        question_index = token.find("?")
        if question_index == -1:
            processed_parts.append(map_token(token, f"$_{index}_$"))
        else:
            # Split at the first "?" so the placeholder URL keeps a real query
            # separator for urljoin to see.
            new_prefix = map_token(token[:question_index], f"$_{index}_$")
            new_suffix = map_token(token[question_index:], f"?$_{index}_$")
            processed_parts.append(new_prefix + new_suffix)

    relative_path = "/".join(processed_parts)
    resolved_url = urljoin(base_url if base_url is not None else "", relative_path)

    # Put the original glob segments back, one occurrence per placeholder.
    for replacement, original in token_map.items():
        resolved_url = resolved_url.replace(replacement, original, 1)

    # In Node.js, new URL('http://localhost') returns 'http://localhost/'.
    # To ensure the same url matching behavior, do the same.
    split = resolved_url.split("://", maxsplit=1)
    if len(split) == 2:
        # URL parser doesn't like strange/unknown schemes, so we replace it for parsing, then put it back
        parsable_url = "http://" + split[1]
    else:
        # Given current rules, this should never happen _and_ still be a valid matcher. We require the protocol to be part of the match,
        # so either the user is using a glob that starts with "*" (and none of this code is running), or the user actually has `something://` in `match`
        parsable_url = resolved_url
    parsed = urlparse(parsable_url, allow_fragments=True)
    if len(split) == 2:
        # Replace the scheme that we removed earlier
        parsed = parsed._replace(scheme=split[0])
    if parsed.path == "":
        parsed = parsed._replace(path="/")
    resolved_url = parsed.geturl()

    return resolved_url


class HarLookupResult(TypedDict, total=False):
action: Literal["error", "redirect", "fulfill", "noentry"]
message: Optional[str]
Expand Down
2 changes: 1 addition & 1 deletion playwright/_impl/_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,7 @@ def prepare_interception_patterns(
return patterns

def matches(self, ws_url: str) -> bool:
    """Return whether ``ws_url`` matches this handler's URL matcher.

    The final ``True`` is the ``websocket_url`` flag of ``url_matches``.
    """
    return url_matches(self._base_url, ws_url, self.url, True)

async def handle(self, websocket_route: "WebSocketRoute") -> None:
coro_or_future = self.handler(websocket_route)
Expand Down
136 changes: 122 additions & 14 deletions tests/async/test_page_route.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@

import pytest

from playwright._impl._glob import glob_to_regex
from playwright._impl._glob import glob_to_regex_pattern
from playwright._impl._helper import url_matches
from playwright.async_api import (
Browser,
BrowserContext,
Expand All @@ -29,6 +30,7 @@
Playwright,
Request,
Route,
expect,
)
from tests.server import Server, TestServerRequest
from tests.utils import must
Expand Down Expand Up @@ -1051,17 +1053,19 @@ async def handle_request(route: Route) -> None:
assert await response.json() == {"foo": "bar"}


async def test_glob_to_regex() -> None:
async def test_should_work_with_glob() -> None:
def glob_to_regex(pattern: str) -> re.Pattern:
return re.compile(glob_to_regex_pattern(pattern))

assert glob_to_regex("**/*.js").match("https://localhost:8080/foo.js")
assert not glob_to_regex("**/*.css").match("https://localhost:8080/foo.js")
assert not glob_to_regex("*.js").match("https://localhost:8080/foo.js")
assert not glob_to_regex("*.js").match(
"https://localhost:8080/foo.js"
) # Doesn't match path separator
assert glob_to_regex("https://**/*.js").match("https://localhost:8080/foo.js")
assert glob_to_regex("http://localhost:8080/simple/path.js").match(
"http://localhost:8080/simple/path.js"
)
assert glob_to_regex("http://localhost:8080/?imple/path.js").match(
"http://localhost:8080/Simple/path.js"
)
assert glob_to_regex("**/{a,b}.js").match("https://localhost:8080/a.js")
assert glob_to_regex("**/{a,b}.js").match("https://localhost:8080/b.js")
assert not glob_to_regex("**/{a,b}.js").match("https://localhost:8080/c.js")
Expand All @@ -1081,15 +1085,119 @@ async def test_glob_to_regex() -> None:
"http://localhost:3000/signin-oidcnice"
)

assert glob_to_regex("**/three-columns/settings.html?**id=[a-z]**").match(
# range [] is NOT supported
assert glob_to_regex("**/api/v[0-9]").fullmatch("http://example.com/api/v[0-9]")
assert not glob_to_regex("**/api/v[0-9]").fullmatch(
"http://example.com/api/version"
)
assert not glob_to_regex("**/api/v[0-9]").fullmatch(
"http://example.com/api/v1"
) # Should not match if [] is literal

# query params
assert glob_to_regex("**/api\\?param").match("http://example.com/api?param")
assert not glob_to_regex("**/api\\?param").match("http://example.com/api-param")

assert glob_to_regex("**/three-columns/settings.html\\?**id=settings-**").match(
"http://mydomain:8080/blah/blah/three-columns/settings.html?id=settings-e3c58efe-02e9-44b0-97ac-dd138100cf7c&blah"
)

assert glob_to_regex("\\?") == re.compile(r"^\?$")
assert glob_to_regex("\\") == re.compile(r"^\\$")
assert glob_to_regex("\\\\") == re.compile(r"^\\$")
assert glob_to_regex("\\[") == re.compile(r"^\[$")
assert glob_to_regex("[a-z]") == re.compile(r"^[a-z]$")
assert glob_to_regex("$^+.\\*()|\\?\\{\\}\\[\\]") == re.compile(
r"^\$\^\+\.\*\(\)\|\?\{\}\[\]$"
assert glob_to_regex("\\?").pattern == r"^\?$"
assert glob_to_regex("\\").pattern == r"^\\$"
assert glob_to_regex("\\\\").pattern == r"^\\$"
assert glob_to_regex("\\[").pattern == r"^\[$"
assert glob_to_regex("[a-z]").pattern == r"^\[a-z\]$"
assert (
glob_to_regex("$^+.\\*()|\\?\\{\\}\\[\\]").pattern
== r"^\$\^\+\.\*\(\)\|\?\{\}\[\]$"
)

# --- url_matches tests ---
# Basic exact and wildcard matching
assert url_matches(None, "http://playwright.dev/", "http://playwright.dev")
assert url_matches(None, "http://playwright.dev/?a=b", "http://playwright.dev?a=b")
assert url_matches(None, "http://playwright.dev/", "h*://playwright.dev")
assert url_matches(
None, "http://api.playwright.dev/?x=y", "http://*.playwright.dev?x=y"
)
assert url_matches(None, "http://playwright.dev/foo/bar", "**/foo/**")

# Relative path matching with base URL
assert url_matches("http://playwright.dev", "http://playwright.dev/?x=y", "?x=y")
assert url_matches(
"http://playwright.dev/foo/", "http://playwright.dev/foo/bar?x=y", "./bar?x=y"
)

# This is not supported, we treat ? as a query separator.
assert not url_matches(
None,
"http://localhost:8080/Simple/path.js",
"http://localhost:8080/?imple/path.js",
)
assert not url_matches(None, "http://playwright.dev/", "http://playwright.?ev")
assert url_matches(None, "http://playwright./?ev", "http://playwright.?ev")
assert not url_matches(
None, "http://playwright.dev/foo", "http://playwright.dev/f??"
)
assert url_matches(None, "http://playwright.dev/f??", "http://playwright.dev/f??")
assert url_matches(
None, "http://playwright.dev/?x=y", r"http://playwright.dev\?x=y"
)
assert url_matches(
None, "http://playwright.dev/?x=y", r"http://playwright.dev/\?x=y"
)
assert url_matches(
"http://playwright.dev/foo", "http://playwright.dev/foo?bar", "?bar"
)
assert url_matches(
"http://playwright.dev/foo", "http://playwright.dev/foo?bar", r"\\?bar"
)
assert url_matches("http://first.host/", "http://second.host/foo", "**/foo")
assert url_matches("http://playwright.dev/", "http://localhost/", "*//localhost/")

# Added for Python implementation
assert url_matches(
None,
"custom://example.com/foo/bar?id=123",
"{custom,another}://example.com/foo/bar?id=123",
)
assert not url_matches(
None, "custom://example.com/foo/bar?id=123", "**example.com/foo/bar?id=123"
)


async def test_should_not_support_question_in_glob_pattern(
    page: Page, playwright: Playwright, server: Server
) -> None:
    """Check that ``?`` in a route glob is not a single-character wildcard.

    Two routes are registered, ``**/index?hello`` and ``**/index\\?hello``.
    Only ``/index?hello`` is expected to be intercepted; ``/index``,
    ``/index1hello`` and ``/index123hello`` are expected to return the
    bodies served by the test server.
    """
    server.set_route("/index", lambda req: (req.write(b"index-no-hello"), req.finish()))
    server.set_route(
        "/index123hello", lambda req: (req.write(b"index123hello"), req.finish())
    )
    server.set_route(
        "/index?hello", lambda req: (req.write(b"index?hello"), req.finish())
    )
    server.set_route(
        "/index1hello", lambda req: (req.write(b"index1hello"), req.finish())
    )

    async def handle_any_char(route: Route) -> None:
        await route.fulfill(body="intercepted any character")

    await page.route("**/index?hello", handle_any_char)

    async def handle_question_mark(route: Route) -> None:
        await route.fulfill(body="intercepted question mark")

    await page.route(r"**/index\?hello", handle_question_mark)

    await page.goto(server.PREFIX + "/index?hello")
    await expect(page.locator("body")).to_have_text("intercepted question mark")

    await page.goto(server.PREFIX + "/index")
    await expect(page.locator("body")).to_have_text("index-no-hello")

    await page.goto(server.PREFIX + "/index1hello")
    await expect(page.locator("body")).to_have_text("index1hello")

    await page.goto(server.PREFIX + "/index123hello")
    await expect(page.locator("body")).to_have_text("index123hello")
14 changes: 14 additions & 0 deletions tests/async/test_request_intercept.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,17 @@ async def test_should_give_access_to_the_intercepted_response_body(
route.fulfill(response=response),
eval_task,
)


async def test_should_intercept_by_glob(page: Page, server: Server) -> None:
    """Route ``http://localhos**?*oo`` and expect a fetch of ``/?foo`` to be fulfilled by it."""
    await page.goto(server.EMPTY_PAGE)
    await page.route(
        "http://localhos**?*oo",
        lambda route: route.fulfill(body="intercepted", status=200),
    )

    result = await page.evaluate(
        "url => fetch(url).then(r => r.text())", server.PREFIX + "/?foo"
    )

    assert result == "intercepted"
14 changes: 14 additions & 0 deletions tests/sync/test_request_intercept.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,17 @@ def handle_route(route: Route) -> None:
assert request.uri.decode() == "/title.html"
original = (assetdir / "title.html").read_text()
assert response.text() == original


def test_should_intercept_by_glob(page: Page, server: Server) -> None:
    """Route ``http://localhos**?*oo`` and expect a fetch of ``/?foo`` to be fulfilled by it."""
    page.goto(server.EMPTY_PAGE)
    page.route(
        "http://localhos**?*oo",
        lambda route: route.fulfill(body="intercepted", status=200),
    )

    result = page.evaluate(
        "url => fetch(url).then(r => r.text())", server.PREFIX + "/?foo"
    )

    assert result == "intercepted"
Loading