 
 import collections
 import email.message
-import functools
 import itertools
 import json
 import logging
@@ -96,7 +95,9 @@ class _NotHTTP(Exception):
     pass
 
 
-def _ensure_api_response(url: str, session: PipSession) -> None:
+def _ensure_api_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> None:
     """
     Send a HEAD request to the URL, and ensure the response contains a simple
     API Response.
@@ -108,13 +109,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None:
     if scheme not in {"http", "https"}:
         raise _NotHTTP()
 
-    resp = session.head(url, allow_redirects=True)
+    resp = session.head(url, allow_redirects=True, headers=headers)
     raise_for_status(resp)
 
     _ensure_api_header(resp)
 
 
-def _get_simple_response(url: str, session: PipSession) -> Response:
+def _get_simple_response(
+    url: str, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Response:
     """Access an Simple API response with GET, and return the response.
 
     This consists of three parts:
@@ -128,10 +131,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
        and raise `_NotAPIContent` otherwise.
     """
     if is_archive_file(Link(url).filename):
-        _ensure_api_response(url, session=session)
+        _ensure_api_response(url, session=session, headers=headers)
 
     logger.debug("Getting page %s", redact_auth_from_url(url))
 
+    logger.debug("headers: %s", str(headers))
+    if headers is None:
+        headers = {}
     resp = session.get(
         url,
         headers={
@@ -156,6 +162,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response:
             # once per 10 minutes.
             # For more information, please see pypa/pip#5670.
             "Cache-Control": "max-age=0",
+            **headers,
         },
     )
     raise_for_status(resp)
@@ -188,44 +195,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
     return None
 
 
-class CacheablePageContent:
-    def __init__(self, page: "IndexContent") -> None:
-        assert page.cache_link_parsing
-        self.page = page
-
-    def __eq__(self, other: object) -> bool:
-        return isinstance(other, type(self)) and self.page.url == other.page.url
-
-    def __hash__(self) -> int:
-        return hash(self.page.url)
-
-
-class ParseLinks(Protocol):
-    def __call__(self, page: "IndexContent") -> Iterable[Link]:
-        ...
-
-
-def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
-    """
-    Given a function that parses an Iterable[Link] from an IndexContent, cache the
-    function's result (keyed by CacheablePageContent), unless the IndexContent
-    `page` has `page.cache_link_parsing == False`.
-    """
-
-    @functools.lru_cache(maxsize=None)
-    def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
-        return list(fn(cacheable_page.page))
-
-    @functools.wraps(fn)
-    def wrapper_wrapper(page: "IndexContent") -> List[Link]:
-        if page.cache_link_parsing:
-            return wrapper(CacheablePageContent(page))
-        return list(fn(page))
-
-    return wrapper_wrapper
-
-
-@with_cached_index_content
 def parse_links(page: "IndexContent") -> Iterable[Link]:
     """
     Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
@@ -235,7 +204,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            link = Link.from_json(file, page.url)
+            link = Link.from_json(file, page.url, page_content=page)
             if link is None:
                 continue
             yield link
@@ -248,7 +217,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = Link.from_element(anchor, page_url=url, base_url=base_url)
+        link = Link.from_element(
+            anchor, page_url=url, base_url=base_url, page_content=page
+        )
         if link is None:
             continue
         yield link
@@ -263,20 +234,19 @@ def __init__(
         content_type: str,
         encoding: Optional[str],
         url: str,
-        cache_link_parsing: bool = True,
+        etag: Optional[str] = None,
+        date: Optional[str] = None,
     ) -> None:
         """
         :param encoding: the encoding to decode the given content.
         :param url: the URL from which the HTML was downloaded.
-        :param cache_link_parsing: whether links parsed from this page's url
-                                   should be cached. PyPI index urls should
-                                   have this set to False, for example.
         """
         self.content = content
         self.content_type = content_type
         self.encoding = encoding
         self.url = url
-        self.cache_link_parsing = cache_link_parsing
+        self.etag = etag
+        self.date = date
 
     def __str__(self) -> str:
         return redact_auth_from_url(self.url)
@@ -320,21 +290,22 @@ def _handle_get_simple_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)
 
 
-def _make_index_content(
-    response: Response, cache_link_parsing: bool = True
-) -> IndexContent:
+def _make_index_content(response: Response) -> IndexContent:
     encoding = _get_encoding_from_headers(response.headers)
     return IndexContent(
         response.content,
         response.headers["Content-Type"],
         encoding=encoding,
         url=response.url,
-        cache_link_parsing=cache_link_parsing,
+        etag=response.headers.get("ETag", None),
+        date=response.headers.get("Date", None),
     )
 
 
-def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
-    url = link.url.split("#", 1)[0]
+def _get_index_content(
+    link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None
+) -> Optional["IndexContent"]:
+    url = link.url_without_fragment
 
     # Check for VCS schemes that do not support lookup as web pages.
     vcs_scheme = _match_vcs_scheme(url)
@@ -361,7 +332,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
         logger.debug(" file: URL is directory, getting %s", url)
 
     try:
-        resp = _get_simple_response(url, session=session)
+        resp = _get_simple_response(url, session=session, headers=headers)
     except _NotHTTP:
         logger.warning(
             "Skipping page %s because it looks like an archive, and cannot "
@@ -377,9 +348,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
             exc.request_desc,
             exc.content_type,
         )
-    except NetworkConnectionError as exc:
-        _handle_get_simple_fail(link, exc)
-    except RetryError as exc:
+    except (NetworkConnectionError, RetryError) as exc:
         _handle_get_simple_fail(link, exc)
     except SSLError as exc:
         reason = "There was a problem confirming the ssl certificate: "
@@ -390,7 +359,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon
     except requests.Timeout:
         _handle_get_simple_fail(link, "timed out")
     else:
-        return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)
+        return _make_index_content(resp)
 
     return None
 
 
@@ -454,11 +423,14 @@ def create(
     def find_links(self) -> List[str]:
         return self.search_scope.find_links
 
-    def fetch_response(self, location: Link) -> Optional[IndexContent]:
+    def fetch_response(
+        self, location: Link, headers: Optional[Dict[str, str]] = None
+    ) -> Optional[IndexContent]:
         """
         Fetch an HTML page containing package links.
         """
-        return _get_index_content(location, session=self.session)
+        logger.debug("headers: %s", str(headers))
+        return _get_index_content(location, session=self.session, headers=headers)
 
     def collect_sources(
         self,
@@ -472,7 +444,6 @@ def collect_sources(
                 candidates_from_page=candidates_from_page,
                 page_validator=self.session.is_secure_origin,
                 expand_dir=False,
-                cache_link_parsing=False,
             )
             for loc in self.search_scope.get_index_urls_locations(project_name)
         ).values()
@@ -482,7 +453,6 @@ def collect_sources(
                 candidates_from_page=candidates_from_page,
                 page_validator=self.session.is_secure_origin,
                 expand_dir=True,
-                cache_link_parsing=True,
            )
             for loc in self.find_links
         ).values()
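
A minimal sketch of how a caller might combine the new `headers` parameter of `LinkCollector.fetch_response` with the `etag`/`date` fields this diff stores on `IndexContent`. The `_cached_pages` dict and `fetch_if_changed` helper below are hypothetical and not part of this change; `If-None-Match`/`If-Modified-Since` are standard HTTP conditional-request headers, and handling of an actual `304 Not Modified` response is left out here.

```python
from typing import Dict, Optional

from pip._internal.index.collector import IndexContent, LinkCollector
from pip._internal.models.link import Link

# Hypothetical per-process cache of previously fetched index pages, keyed by URL.
_cached_pages: Dict[str, IndexContent] = {}


def fetch_if_changed(collector: LinkCollector, location: Link) -> Optional[IndexContent]:
    """Illustrative helper: send a conditional request using the ETag/Date
    remembered from the last response for this URL, if any."""
    headers: Dict[str, str] = {}
    cached = _cached_pages.get(location.url)
    if cached is not None:
        if cached.etag is not None:
            # Standard validator: the server may answer 304 Not Modified.
            headers["If-None-Match"] = cached.etag
        if cached.date is not None:
            headers["If-Modified-Since"] = cached.date

    page = collector.fetch_response(location, headers=headers or None)
    if page is not None:
        _cached_pages[location.url] = page
    return page
```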