# Copyright 2024 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Parse SimpleAPI HTML in Starlark.
"""

def parse_simpleapi_html(*, url, content):
    """Get the package URLs for given shas by parsing the Simple API HTML.

    Args:
        url(str): The URL that the HTML content can be downloaded from.
        content(str): The Simple API HTML content.

    Returns:
        A struct with two dicts, `sdists` and `whls`, each keyed by the sha256
        of the artifact. Anchors whose filename ends with ".whl" go to `whls`,
        everything else goes to `sdists`. Each value is a struct with:
        * filename: The filename of the artifact.
        * url: The URL to download the artifact.
        * sha256: The sha256 of the artifact.
        * metadata_sha256: The whl METADATA sha256 if we can download it. If this is
          present, then the 'metadata_url' is also present. Defaults to "" and is
          always "" for sdists.
        * metadata_url: The URL for the METADATA if we can download it. Defaults to ""
          and is always "" for sdists.
        * yanked: True if the text "data-yanked" occurs in the anchor.
    """
    sdists = {}
    whls = {}

    # Everything before the first anchor is the page header; every following
    # chunk starts right after the opening `<a href="` of one anchor.
    lines = content.split("<a href=\"")

    _, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
    api_version, _, _ = api_version.partition("\"")

    # We must assume the 1.0 if it is not present
    # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
    api_version = api_version or "1.0"
    api_version = tuple([int(i) for i in api_version.split(".")])

    if api_version >= (2, 0):
        # We don't expect to have version 2.0 here, but have this check in place just in case.
        # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
        fail("Unsupported API version: {}".format(api_version))

    # Each line follows the following pattern
    # <a href="https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
    for line in lines[1:]:
        dist_url, _, tail = line.partition("#sha256=")
        sha256, _, tail = tail.partition("\"")

        # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
        yanked = "data-yanked" in line

        # What is left of the anchor is `<attributes>>filename</a>...`.
        head, _, _ = tail.rpartition("</a>")
        maybe_metadata, _, filename = head.rpartition(">")

        metadata_sha256 = ""
        metadata_url = ""

        # Implement https://peps.python.org/pep-0714/, the first attribute
        # that is found wins.
        for attribute in ["data-core-metadata", "data-dist-info-metadata"]:
            metadata_marker = attribute + "=\"sha256="
            if metadata_marker in maybe_metadata:
                _, _, metadata_tail = maybe_metadata.partition(metadata_marker)
                metadata_sha256, _, _ = metadata_tail.partition("\"")
                metadata_url = dist_url + ".metadata"
                break

        if filename.endswith(".whl"):
            whls[sha256] = struct(
                filename = filename,
                url = _absolute_url(url, dist_url),
                sha256 = sha256,
                metadata_sha256 = metadata_sha256,
                metadata_url = _absolute_url(url, metadata_url) if metadata_url else "",
                yanked = yanked,
            )
        else:
            # Metadata attributes are only honoured for wheels.
            sdists[sha256] = struct(
                filename = filename,
                url = _absolute_url(url, dist_url),
                sha256 = sha256,
                metadata_sha256 = "",
                metadata_url = "",
                yanked = yanked,
            )

    return struct(
        sdists = sdists,
        whls = whls,
    )

def _get_root_directory(url):
    """Return the `scheme://host` prefix of the given URL.

    Args:
        url(str): A URL containing "://".

    Returns:
        The URL truncated before the first "/" that follows the scheme
        separator, or the whole URL if there is no such "/".
    """
    scheme_end = url.find("://")
    if scheme_end == -1:
        fail("Invalid URL format")

    scheme = url[:scheme_end]
    host_end = url.find("/", scheme_end + 3)
    if host_end == -1:
        host_end = len(url)
    host = url[scheme_end + 3:host_end]

    return "{}://{}".format(scheme, host)

def _is_downloadable(url):
    """Checks if the URL would be accepted by the Bazel downloader.

    This is based on Bazel's HttpUtils::isUrlSupportedByDownloader
    """
    return url.startswith("http://") or url.startswith("https://") or url.startswith("file://")

def _absolute_url(index_url, candidate):
    """Resolve a link found in the index page against the index URL.

    Args:
        index_url(str): The URL that the index page was downloaded from.
        candidate(str): The link as found in the page; may be empty, a full
            URL, a host-absolute path or a relative path.

    Returns:
        The candidate unchanged if it is empty or already downloadable,
        otherwise the candidate joined onto the index URL.
    """
    if candidate == "":
        return candidate

    if _is_downloadable(candidate):
        return candidate

    if candidate.startswith("/"):
        # absolute path
        root_directory = _get_root_directory(index_url)
        return "{}{}".format(root_directory, candidate)

    if candidate.startswith(".."):
        # relative path with up references; every ".." drops one trailing
        # path segment from the index URL.
        candidate_parts = candidate.split("..")
        last = candidate_parts[-1]
        for _ in range(len(candidate_parts) - 1):
            index_url = index_url.rstrip("/").rpartition("/")[0]

        return "{}/{}".format(index_url, last.strip("/"))

    # relative path without up-references; strip the trailing slash of the
    # index URL so that the result does not contain a double slash.
    return "{}/{}".format(index_url.rstrip("/"), candidate)