fix(pypi): change the parallelisation scheme for querying SimpleAPI (#2531)

Instead of querying everything in parallel and yielding a lot of 404
warnings, let's query the main index first and then query the other
indexes only for the packages that were not yet found.
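In rough Starlark pseudocode, the new scheme looks like this (a sketch with hypothetical names, not the actual implementation in `simpleapi_download.bzl`; `query` stands in for the real `read_simpleapi` helper):

    def query_with_fallback(index_urls, packages, query):
        found = {}
        for index_url in index_urls:
            # Only packages still missing after the previous indexes are
            # looked up on this one, so 404 warnings disappear for anything
            # the main index already serves.
            remaining = [pkg for pkg in packages if pkg not in found]
            if not remaining:
                break
            for pkg in remaining:
                result = query(index_url, pkg)
                if result.success:
                    found[pkg] = result.output
        return found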

What is more, we now print a suggested value of
`experimental_index_url_overrides`
for users to adopt.

Whilst at it, add a unit test to check the new logic.

Fixes #2100, since this is the best `rules_python` can do for now.

---------

Co-authored-by: Douglas Thor <[email protected]>
aignas and dougthor42 authored Dec 31, 2024
1 parent 9035db2 commit 475a99e
Showing 5 changed files with 202 additions and 39 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -56,6 +56,10 @@ Unreleased changes template.
* Bazel 6 support is dropped and Bazel 7.4.1 is the minimum supported
version, per our Bazel support matrix. Earlier versions are not
tested by CI, so functionality cannot be guaranteed.
* ({bzl:obj}`pip.parse`) From now on we make fewer calls to the indexes when
  fetching metadata from SimpleAPI. Within each index the calls are made in
  parallel, but the indexes are queried one after another, so extension
  evaluation may slow down unless
  {bzl:obj}`pip.parse.experimental_index_url_overrides` is used.
* ({bzl:obj}`pip.parse`) Only query SimpleAPI for packages that have
sha values in the `requirements.txt` file.

5 changes: 5 additions & 0 deletions python/private/pypi/extension.bzl
@@ -657,6 +657,11 @@ The indexes must support Simple API as described here:
https://packaging.python.org/en/latest/specifications/simple-repository-api/
This is equivalent to `--extra-index-urls` `pip` option.
:::{versionchanged} 1.1.0
Starting with this version we iterate over each specified index until
we find metadata for all referenced distributions.
:::
""",
default = [],
),
99 changes: 60 additions & 39 deletions python/private/pypi/simpleapi_download.bzl
@@ -20,9 +20,17 @@ load("@bazel_features//:features.bzl", "bazel_features")
load("//python/private:auth.bzl", "get_auth")
load("//python/private:envsubst.bzl", "envsubst")
load("//python/private:normalize_name.bzl", "normalize_name")
load("//python/private:text_util.bzl", "render")
load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")

def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
def simpleapi_download(
ctx,
*,
attr,
cache,
parallel_download = True,
read_simpleapi = None,
_fail = fail):
"""Download Simple API HTML.
Args:
@@ -49,6 +57,9 @@ def simpleapi_download(ctx, *, attr, cache, parallel_download = True):
reflected when re-evaluating the extension unless we do
`bazel clean --expunge`.
parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads.
read_simpleapi: a function for reading and parsing the SimpleAPI contents.
Used in tests.
_fail: a function to print a failure. Used in tests.
Returns:
dict of pkg name to the parsed HTML contents - a list of structs.
@@ -64,15 +75,22 @@

# NOTE @aignas 2024-03-31: we are not merging results from multiple indexes
# to replicate how `pip` would handle this case.
async_downloads = {}
contents = {}
index_urls = [attr.index_url] + attr.extra_index_urls
for pkg in attr.sources:
pkg_normalized = normalize_name(pkg)

success = False
for index_url in index_urls:
result = _read_simpleapi(
read_simpleapi = read_simpleapi or _read_simpleapi

found_on_index = {}
warn_overrides = False
for i, index_url in enumerate(index_urls):
if i != 0:
# Warn the user about a potential fix for the overrides
warn_overrides = True

async_downloads = {}
sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
for pkg in sources:
pkg_normalized = normalize_name(pkg)
result = read_simpleapi(
ctx = ctx,
url = "{}/{}/".format(
index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
@@ -84,42 +102,45 @@
)
if hasattr(result, "wait"):
# We will process it in a separate loop:
async_downloads.setdefault(pkg_normalized, []).append(
struct(
pkg_normalized = pkg_normalized,
wait = result.wait,
),
async_downloads[pkg] = struct(
pkg_normalized = pkg_normalized,
wait = result.wait,
)
continue

if result.success:
elif result.success:
contents[pkg_normalized] = result.output
success = True
break

if not async_downloads and not success:
fail("Failed to download metadata from urls: {}".format(
", ".join(index_urls),
))

if not async_downloads:
return contents

# If we use `block` == False, then we need to have a second loop that is
# collecting all of the results as they were being downloaded in parallel.
for pkg, downloads in async_downloads.items():
success = False
for download in downloads:
found_on_index[pkg] = index_url

if not async_downloads:
continue

# If we use `block` == False, then we need to have a second loop that is
# collecting all of the results as they were being downloaded in parallel.
for pkg, download in async_downloads.items():
result = download.wait()

if result.success and download.pkg_normalized not in contents:
if result.success:
contents[download.pkg_normalized] = result.output
success = True

if not success:
fail("Failed to download metadata from urls: {}".format(
", ".join(index_urls),
))
found_on_index[pkg] = index_url

failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
if failed_sources:
_fail("Failed to download metadata for {} for from urls: {}".format(
failed_sources,
index_urls,
))
return None

if warn_overrides:
index_url_overrides = {
pkg: found_on_index[pkg]
for pkg in attr.sources
if found_on_index[pkg] != attr.index_url
}

# buildifier: disable=print
print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format(
render.dict(index_url_overrides),
))

return contents

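For context on the `wait()` calls above: the per-index parallelism relies on Bazel 7.1's non-blocking downloads, where `ctx.download(..., block = False)` returns a token whose `wait()` blocks until that particular transfer finishes. A simplified sketch of the two-phase pattern (the real code goes through the `_read_simpleapi` wrapper; `allow_fail = True` is assumed so that `result.success` is meaningful):

    # Phase 1: start every download for this index without blocking.
    pending = {}
    for pkg in sources:
        pending[pkg] = ctx.download(
            url = ...,
            output = ...,
            allow_fail = True,
            block = False,
        )

    # Phase 2: collect the results; wait() blocks per download.
    for pkg, token in pending.items():
        result = token.wait()
        if result.success:
            found_on_index[pkg] = index_url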
5 changes: 5 additions & 0 deletions tests/pypi/simpleapi_download/BUILD.bazel
@@ -0,0 +1,5 @@
load("simpleapi_download_tests.bzl", "simpleapi_download_test_suite")

simpleapi_download_test_suite(
name = "simpleapi_download_tests",
)
128 changes: 128 additions & 0 deletions tests/pypi/simpleapi_download/simpleapi_download_tests.bzl
@@ -0,0 +1,128 @@
# Copyright 2024 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""

load("@rules_testing//lib:test_suite.bzl", "test_suite")
load("//python/private/pypi:simpleapi_download.bzl", "simpleapi_download") # buildifier: disable=bzl-visibility

_tests = []

def _test_simple(env):
calls = []

def read_simpleapi(ctx, url, attr, cache, block):
_ = ctx # buildifier: disable=unused-variable
_ = attr
_ = cache
env.expect.that_bool(block).equals(False)
calls.append(url)
if "foo" in url and "main" in url:
return struct(
output = "",
success = False,
)
else:
return struct(
output = "data from {}".format(url),
success = True,
)

contents = simpleapi_download(
ctx = struct(
os = struct(environ = {}),
),
attr = struct(
index_url_overrides = {},
index_url = "main",
extra_index_urls = ["extra"],
sources = ["foo", "bar", "baz"],
envsubst = [],
),
cache = {},
parallel_download = True,
read_simpleapi = read_simpleapi,
)

env.expect.that_collection(calls).contains_exactly([
"extra/foo/",
"main/bar/",
"main/baz/",
"main/foo/",
])
env.expect.that_dict(contents).contains_exactly({
"bar": "data from main/bar/",
"baz": "data from main/baz/",
"foo": "data from extra/foo/",
})

_tests.append(_test_simple)

def _test_fail(env):
calls = []
fails = []

def read_simpleapi(ctx, url, attr, cache, block):
_ = ctx # buildifier: disable=unused-variable
_ = attr
_ = cache
env.expect.that_bool(block).equals(False)
calls.append(url)
if "foo" in url:
return struct(
output = "",
success = False,
)
else:
return struct(
output = "data from {}".format(url),
success = True,
)

simpleapi_download(
ctx = struct(
os = struct(environ = {}),
),
attr = struct(
index_url_overrides = {},
index_url = "main",
extra_index_urls = ["extra"],
sources = ["foo", "bar", "baz"],
envsubst = [],
),
cache = {},
parallel_download = True,
read_simpleapi = read_simpleapi,
_fail = fails.append,
)

env.expect.that_collection(fails).contains_exactly([
"""Failed to download metadata for ["foo"] for from urls: ["main", "extra"]""",
])
env.expect.that_collection(calls).contains_exactly([
"extra/foo/",
"main/bar/",
"main/baz/",
"main/foo/",
])

_tests.append(_test_fail)

def simpleapi_download_test_suite(name):
"""Create the test suite.
Args:
name: the name of the test suite
"""
test_suite(name = name, basic_tests = _tests)
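When the extension prints a suggested `index_url_overrides` value, it can be copied into the `pip.parse` call so that the listed packages are fetched from the right index directly, skipping the fallback probing. An illustrative `MODULE.bazel` snippet (the URLs and the package name are made up):

    pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip")
    pip.parse(
        hub_name = "pypi",
        python_version = "3.11",
        requirements_lock = "//:requirements.txt",
        experimental_index_url = "https://pypi.org/simple",
        experimental_extra_index_urls = ["https://example.org/simple"],
        experimental_index_url_overrides = {
            "foo": "https://example.org/simple",
        },
    )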
