Commit b658989
squashed commit

- add FetchResolveCache
- pipe in headers arg
- provide full context in Link.comes_from
- pull in etag and date and cache the outputs
- remove cache_link_parsing
- introduce persistent cache for link parsing
- cache link evaluation when possible (further speedup, somehow)
- handle --no-cache-dir

1 parent 04d4030 · commit b658989
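
The "pull in etag and date" bullet refers to HTTP validator caching: the ETag and Date headers of a fetched index page are stored so a later run can issue a conditional request and skip re-downloading an unchanged page. A minimal sketch of that idea (the helper name `conditional_headers` is illustrative, not part of this commit):

    from typing import Dict, Optional

    def conditional_headers(etag: Optional[str], date: Optional[str]) -> Dict[str, str]:
        """Turn validators saved from a previous response into request headers."""
        headers: Dict[str, str] = {}
        if etag is not None:
            # An unchanged entity lets the server answer 304 Not Modified.
            headers["If-None-Match"] = etag
        if date is not None:
            headers["If-Modified-Since"] = date
        return headers

Such headers would flow through the new `headers` argument threaded into `fetch_response` and `_get_simple_response` below.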

File tree

7 files changed: +497 −113 lines changed

src/pip/_internal/cache.py

Lines changed: 58 additions & 22 deletions
@@ -7,7 +7,7 @@
 import logging
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Type
 
 from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
 from pip._vendor.packaging.utils import canonicalize_name
@@ -41,7 +41,9 @@ def __init__(self, cache_dir: str) -> None:
         assert not cache_dir or os.path.isabs(cache_dir)
         self.cache_dir = cache_dir or None
 
-    def _get_cache_path_parts(self, link: Link) -> List[str]:
+    def _get_cache_path_parts(
+        self, link: Link, *, interpreter_dependent: bool
+    ) -> List[str]:
         """Get parts of part that must be os.path.joined with cache_dir"""
 
         # We want to generate an url to use as our cache key, we don't want to
@@ -53,13 +55,14 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:
         if link.subdirectory_fragment:
             key_parts["subdirectory"] = link.subdirectory_fragment
 
-        # Include interpreter name, major and minor version in cache key
-        # to cope with ill-behaved sdists that build a different wheel
-        # depending on the python version their setup.py is being run on,
-        # and don't encode the difference in compatibility tags.
-        # https://github.com/pypa/pip/issues/7296
-        key_parts["interpreter_name"] = interpreter_name()
-        key_parts["interpreter_version"] = interpreter_version()
+        if interpreter_dependent:
+            # Include interpreter name, major and minor version in cache key
+            # to cope with ill-behaved sdists that build a different wheel
+            # depending on the python version their setup.py is being run on,
+            # and don't encode the difference in compatibility tags.
+            # https://github.com/pypa/pip/issues/7296
+            key_parts["interpreter_name"] = interpreter_name()
+            key_parts["interpreter_version"] = interpreter_version()
 
         # Encode our key url with sha224, we'll use this because it has similar
         # security properties to sha256, but with a shorter total output (and
@@ -87,26 +90,47 @@ class LinkMetadataCache(Cache):
     """Persistently store the metadata of dists found at each link."""
 
     def get_path_for_link(self, link: Link) -> str:
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         return os.path.join(self.cache_dir, "link-metadata", *parts)
 
 
-class WheelCacheBase(Cache):
-    """Specializations to the cache concept for wheels."""
+class SerializableEntry(abc.ABC):
+    @classmethod
+    @abc.abstractmethod
+    def suffix(cls) -> str:
+        ...
 
     @abc.abstractmethod
-    def get(
-        self,
-        link: Link,
-        package_name: Optional[str],
-        supported_tags: List[Tag],
-    ) -> Link:
-        """Returns a link to a cached item if it exists, otherwise returns the
-        passed link.
-        """
+    def serialize(self) -> Dict[str, Any]:
         ...
 
+
+class FetchResolveCache(Cache):
+    def get_path_for_link(self, link: Link) -> str:
+        # We are reading index links to extract other links from, not executing any
+        # python code, so these caches are interpreter-independent.
+        parts = self._get_cache_path_parts(link, interpreter_dependent=False)
+        assert self.cache_dir
+        return os.path.join(self.cache_dir, "fetch-resolve", *parts)
+
+    def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
+        hashed = _hash_dict(entry.serialize())
+        return self.cache_path(link) / f"{hashed}{entry.suffix()}"
+
+    def clear_hashed_entries(
+        self, link: Link, entry_type: Type[SerializableEntry]
+    ) -> None:
+        for hashed_entry in self.cache_path(link).glob(f"*{entry_type.suffix()}"):
+            logger.debug(
+                "unlinking invalidated hashed link eval cache entry %s", hashed_entry
+            )
+            hashed_entry.unlink()
+
+
+class WheelCacheBase(Cache):
+    """Specializations to the cache concept for wheels."""
+
     def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
         can_not_cache = not self.cache_dir or not canonical_package_name or not link
         if can_not_cache:
@@ -119,6 +143,18 @@ def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
             candidates.append((candidate, path))
         return candidates
 
+    @abc.abstractmethod
+    def get(
+        self,
+        link: Link,
+        package_name: Optional[str],
+        supported_tags: List[Tag],
+    ) -> Link:
+        """Returns a link to a cached item if it exists, otherwise returns the
+        passed link.
+        """
+        ...
+
 
 class SimpleWheelCache(WheelCacheBase):
     """A cache of wheels for future installs."""
@@ -141,7 +177,7 @@ def get_path_for_link(self, link: Link) -> str:
 
         :param link: The link of the sdist for which this will cache wheels.
        """
-        parts = self._get_cache_path_parts(link)
+        parts = self._get_cache_path_parts(link, interpreter_dependent=True)
         assert self.cache_dir
         # Store wheels within the root cache_dir
         return os.path.join(self.cache_dir, "wheels", *parts)
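
For a sense of how `hashed_entry_path` is meant to be used: each `SerializableEntry` subclass picks a file suffix, and `_hash_dict` keys the file name on the serialized payload, so distinct payloads get distinct cache files under the same per-link directory, while `clear_hashed_entries` can drop them all at once. A sketch with a hypothetical subclass (`LinkEvalEntry` and its fields are illustrative, not defined in this commit):

    from typing import Any, Dict

    # Hypothetical SerializableEntry subclass; name and fields are illustrative.
    class LinkEvalEntry(SerializableEntry):
        @classmethod
        def suffix(cls) -> str:
            return ".evaluation"

        def __init__(self, project_name: str, py_version: str) -> None:
            self.project_name = project_name
            self.py_version = py_version

        def serialize(self) -> Dict[str, Any]:
            # Must be deterministic: _hash_dict() derives the file name from it.
            return {"project_name": self.project_name, "py_version": self.py_version}

    # entry_path = cache.hashed_entry_path(link, LinkEvalEntry("pip", "3.11"))
    # cache.clear_hashed_entries(link, LinkEvalEntry)  # drop all ".evaluation" files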

src/pip/_internal/cli/req_command.py

Lines changed: 6 additions & 1 deletion
@@ -12,7 +12,7 @@
 from optparse import Values
 from typing import TYPE_CHECKING, Any, List, Optional, Tuple
 
-from pip._internal.cache import LinkMetadataCache, WheelCache
+from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache
 from pip._internal.cli import cmdoptions
 from pip._internal.cli.base_command import Command
 from pip._internal.cli.command_context import CommandContextMixIn
@@ -506,8 +506,13 @@ def _build_package_finder(
             ignore_requires_python=ignore_requires_python,
         )
 
+        if options.cache_dir:
+            fetch_resolve_cache = FetchResolveCache(options.cache_dir)
+        else:
+            fetch_resolve_cache = None
         return PackageFinder.create(
             link_collector=link_collector,
             selection_prefs=selection_prefs,
             target_python=target_python,
+            fetch_resolve_cache=fetch_resolve_cache,
         )
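
This guard is what handles `--no-cache-dir`: pip resolves that flag to a falsy `options.cache_dir`, so no `FetchResolveCache` is constructed and `PackageFinder.create` receives `None`. Consumers then treat a missing cache as "always fetch fresh", along the lines of this sketch (the helper is hypothetical; the None-handling inside `PackageFinder` is assumed, not shown in this diff):

    from typing import Optional

    # Hypothetical consumer-side pattern for an Optional cache.
    def entry_dir_for(cache: Optional[FetchResolveCache], link: Link) -> Optional[str]:
        if cache is None:
            return None  # --no-cache-dir: no persistent fetch/resolve state
        return cache.get_path_for_link(link)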

src/pip/_internal/index/collector.py

Lines changed: 35 additions & 65 deletions
@@ -4,7 +4,6 @@
 
 import collections
 import email.message
-import functools
 import itertools
 import json
 import logging
@@ -96,7 +95,9 @@ class _NotHTTP(Exception):
     pass
 
 
-def _ensure_api_response(url: str, session: PipSession) -> None:
+def _ensure_api_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> None:
     """
     Send a HEAD request to the URL, and ensure the response contains a simple
     API Response.
@@ -108,13 +109,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
     if scheme not in {"http", "https"}:
         raise _NotHTTP()
 
-    resp = session.head(url, allow_redirects=True)
+    resp = session.head(url, allow_redirects=True, headers=headers)
     raise_for_status(resp)
 
     _ensure_api_header(resp)
 
 
-def _get_simple_response(url: str, session: PipSession) -> Response:
+def _get_simple_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Response:
     """Access an Simple API response with GET, and return the response.
 
     This consists of three parts:
@@ -128,10 +131,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
     and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_api_response(url, session=session)
+        _ensure_api_response(url, session=session, headers=headers)
 
     logger.debug("Getting page %s", redact_auth_from_url(url))
 
+    logger.debug("headers: %s", str(headers))
+    if headers is None:
+        headers = {}
     resp = session.get(
         url,
         headers={
@@ -156,6 +162,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
             # once per 10 minutes.
             # For more information, please see pypa/pip#5670.
             "Cache-Control": "max-age=0",
+            **headers,
         },
     )
     raise_for_status(resp)
@@ -188,44 +195,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
     return None
 
 
-class CacheablePageContent:
-    def __init__(self, page: "IndexContent") -> None:
-        assert page.cache_link_parsing
-        self.page = page
-
-    def __eq__(self, other: object) -> bool:
-        return isinstance(other, type(self)) and self.page.url == other.page.url
-
-    def __hash__(self) -> int:
-        return hash(self.page.url)
-
-
-class ParseLinks(Protocol):
-    def __call__(self, page: "IndexContent") -> Iterable[Link]:
-        ...
-
-
-def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
-    """
-    Given a function that parses an Iterable[Link] from an IndexContent, cache the
-    function's result (keyed by CacheablePageContent), unless the IndexContent
-    `page` has `page.cache_link_parsing == False`.
-    """
-
-    @functools.lru_cache(maxsize=None)
-    def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
-        return list(fn(cacheable_page.page))
-
-    @functools.wraps(fn)
-    def wrapper_wrapper(page: "IndexContent") -> List[Link]:
-        if page.cache_link_parsing:
-            return wrapper(CacheablePageContent(page))
-        return list(fn(page))
-
-    return wrapper_wrapper
-
-
-@with_cached_index_content
 def parse_links(page: "IndexContent") -> Iterable[Link]:
     """
     Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
@@ -235,7 +204,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            link = Link.from_json(file, page.url)
+            link = Link.from_json(file, page.url, page_content=page)
             if link is None:
                 continue
             yield link
@@ -248,7 +217,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = Link.from_element(anchor, page_url=url, base_url=base_url)
+        link = Link.from_element(
+            anchor, page_url=url, base_url=base_url, page_content=page
+        )
         if link is None:
             continue
         yield link
@@ -263,20 +234,19 @@ def __init__(
         content_type: str,
         encoding: Optional[str],
         url: str,
-        cache_link_parsing: bool = True,
+        etag: Optional[str] = None,
+        date: Optional[str] = None,
     ) -> None:
         """
         :param encoding: the encoding to decode the given content.
         :param url: the URL from which the HTML was downloaded.
-        :param cache_link_parsing: whether links parsed from this page's url
-                                   should be cached. PyPI index urls should
-                                   have this set to False, for example.
         """
         self.content = content
         self.content_type = content_type
         self.encoding = encoding
         self.url = url
-        self.cache_link_parsing = cache_link_parsing
+        self.etag = etag
+        self.date = date
 
     def __str__(self) -> str:
         return redact_auth_from_url(self.url)
@@ -320,21 +290,22 @@ def _handle_get_simple_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)
 
 
-def _make_index_content(
-    response: Response, cache_link_parsing: bool = True
-) -> IndexContent:
+def _make_index_content(response: Response) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
     return IndexContent(
         response.content,
         response.headers["Content-Type"],
         encoding=encoding,
         url=response.url,
-        cache_link_parsing=cache_link_parsing,
+        etag=response.headers.get("ETag", None),
+        date=response.headers.get("Date", None),
     )
 
 
-def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
-    url = link.url.split("#", 1)[0]
+def _get_index_content(
+    link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Optional["IndexContent"]:
+    url = link.url_without_fragment
 
     # Check for VCS schemes that do not support lookup as web pages.
     vcs_scheme = _match_vcs_scheme(url)
@@ -361,7 +332,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
         logger.debug(" file: URL is directory, getting %s", url)
 
     try:
-        resp = _get_simple_response(url, session=session)
+        resp = _get_simple_response(url, session=session, headers=headers)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
@@ -377,9 +348,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
             exc.request_desc,
             exc.content_type,
         )
-    except NetworkConnectionError as exc:
-        _handle_get_simple_fail(link, exc)
-    except RetryError as exc:
+    except (NetworkConnectionError, RetryError) as exc:
         _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
@@ -390,7 +359,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
     except requests.Timeout:
         _handle_get_simple_fail(link, "timed out")
     else:
-        return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)
+        return _make_index_content(resp)
     return None
 
 
@@ -454,11 +423,14 @@
     def find_links(self) -> List[str]:
         return self.search_scope.find_links
 
-    def fetch_response(self, location: Link) -> Optional[IndexContent]:
+    def fetch_response(
+        self, location: Link, headers: Optional[Dict[str, str]] = None
+    ) -> Optional[IndexContent]:
         """
         Fetch an HTML page containing package links.
         """
-        return _get_index_content(location, session=self.session)
+        logger.debug("headers: %s", str(headers))
+        return _get_index_content(location, session=self.session, headers=headers)
 
     def collect_sources(
         self,
@@ -472,7 +444,6 @@ def collect_sources(
                 candidates_from_page=candidates_from_page,
                 page_validator=self.session.is_secure_origin,
                 expand_dir=False,
-                cache_link_parsing=False,
             )
             for loc in self.search_scope.get_index_urls_locations(project_name)
         ).values()
@@ -482,7 +453,6 @@
                 candidates_from_page=candidates_from_page,
                 page_validator=self.session.is_secure_origin,
                 expand_dir=True,
-                cache_link_parsing=True,
            )
            for loc in self.find_links
        ).values()
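
Two details worth noting in this file. First, the deleted `with_cached_index_content` machinery was an in-process `functools.lru_cache`; per the commit message, the persistent caches supersede it, which also makes the `cache_link_parsing` flag unnecessary. Second, the caller-supplied `headers` dict is spread last into the defaults in `_get_simple_response`, so conditional-request headers can override the built-in `Cache-Control: max-age=0`. A minimal illustration of that merge (header values here are made up):

    # ** spread order: later entries win, so caller-supplied headers override
    # the defaults.
    defaults = {"Cache-Control": "max-age=0", "Accept": "text/html"}
    caller = {"Cache-Control": "no-cache", "If-None-Match": '"abc123"'}
    merged = {**defaults, **caller}
    assert merged["Cache-Control"] == "no-cache"
    assert merged["If-None-Match"] == '"abc123"'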
