From 39aeb9a2b790a2b473d3b4ceafb26e80bdd36009 Mon Sep 17 00:00:00 2001
From: Ge Yao <my@yaoge123.com>
Date: Sun, 24 May 2026 17:21:29 +0800
Subject: [PATCH 1/3] Add pip-index.py for PEP 503 simple-index style mirrors

Adapted from ustclug/ustcmirror-images pytorch/sync.py (originally
pytorch.py). Crawls PEP 503 / PEP 691 simple HTML indexes recursively
and rewrites href attributes so saved index pages point back to this
mirror. The same script is used for multiple jobs by parameterising
endpoint discovery and href rewriting via environment variables.

Additions on top of the upstream script:

  - Multi-host href rewrite (REWRITE_HOSTS):
    handles upstreams that emit absolute URLs across multiple origins
    (e.g. download.pytorch.org + download-r2.pytorch.org).

  - EXTRA_REWRITES:
    redirect hrefs that point at sibling hosts (e.g. files.pythonhosted.org)
    to a sibling local mirror prefix (e.g. /pypi/web), so transitive deps
    stay on the same mirror without forcing users to set --extra-index-url.

  - DEVPI_MODE:
    query a devpi server's channel JSON API and crawl only that channel's
    own projects via .../<channel>/+simple/<project>/ instead of walking
    the full inherited PyPI namespace.

  - aiohttp instead of httpx, so it can run inside the existing shared
    tunathu/tunasync-scripts:latest image with no extra dependencies.
---
 pip-index.py | 447 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 447 insertions(+)
 create mode 100755 pip-index.py
diff --git a/pip-index.py b/pip-index.py
new file mode 100755
index 0000000..8f10170
--- /dev/null
+++ b/pip-index.py
@@ -0,0 +1,447 @@
+#!/usr/bin/env python3
+"""
+pip-index.py - Mirror script for pip-style HTTP package indexes.
+
+Adapted from ustclug/ustcmirror-images pytorch/sync.py (originally
+pytorch.py). Crawls PEP 503 / PEP 691 simple HTML indexes recursively
+and rewrites href attributes so saved index pages point back to this
+mirror. Used by the pytorch and jetson-pypi tunasync jobs; the same
+script is mounted into tunathu/tunasync-scripts:latest via docker_volumes.
+
+Additions on top of the upstream script:
+  - Multi-host href rewrite (download.pytorch.org + download-r2.pytorch.org).
+  - EXTRA_REWRITES so hrefs to other hosts (e.g. files.pythonhosted.org)
+    can be redirected to a sibling local mirror prefix (e.g. /pypi/web).
+  - DEVPI_MODE: query devpi channel JSON API to crawl only that channel's
+    own projects via .../<channel>/+simple/<project>/ instead of walking
+    the full inherited PyPI namespace.
+  - Switched httpx -> aiohttp so it runs inside the shared
+    tunathu/tunasync-scripts:latest image with no extra dependencies.
+
+Compatible with: PyTorch download server, NVIDIA Jetson AI Lab pypi
+(devpi-backed), and other PEP 503 simple HTML indexes that emit
+<a href="..."> per file.
+
+Not for: pypi.org full mirroring (use shadowmire.py), Conda channels,
+apt / yum repositories, git or docker registries.
+
+Environment variables:
+  TO / TUNASYNC_WORKING_DIR  Mirror data directory (tunasync injects).
+  TUNASYNC_MIRROR_NAME       Mirror name (tunasync injects). Used as the
+                             default URLBASE when URLBASE is unset.
+  URLBASE                    Local URL prefix used when rewriting hrefs.
+                             Defaults to "/<TUNASYNC_MIRROR_NAME>/".
+                             Always normalised to leading + trailing "/".
+  REWRITE_HOSTS              Comma-separated upstream hosts whose absolute
+                             hrefs should be rewritten to URLBASE. Hrefs
+                             starting with "/" are always rewritten too.
+                             Defaults to the host of TUNASYNC_UPSTREAM_URL.
+  EXTRA_REWRITES             Comma-separated "host=prefix" rules for hrefs
+                             of OTHER hosts, e.g.
+                             "files.pythonhosted.org=/pypi/web".
+                             Default empty.
+  USE_PYTORCH_RELEASES       PyTorch-only. "1" -> additionally consume
+                             pytorch.github.io releases.json (or
+                             published_versions.json with GET_ALL=1) to
+                             discover extra index pages. Default "0".
+  GET_ALL                    PyTorch-only, with USE_PYTORCH_RELEASES=1.
+                             "1" -> use published_versions.json (full).
+                             "0" -> use releases.json (recommended).
+  DEVPI_MODE                 "1" -> treat each ".../<channel>/+simple/"
+                             entry in CUSTOM_ENDPOINTS as a devpi channel
+                             and crawl only its own projects via JSON.
+                             Falls back to PEP 503 if the JSON call fails.
+                             Default "0".
+  CUSTOM_ENDPOINTS           Comma-separated list of additional index URLs
+                             to crawl. Required for non-PyTorch upstreams.
+  NO_NIGHTLY                 "1" -> skip URLs containing "/nightly/".
+                             Default "1".
+  JOBS                       Concurrent download semaphore. Default "1".
+  TIMEOUT                    Per-request total timeout (seconds).
+                             Default "120".
+  DRY_RUN                    "1" -> log only, do not write anything.
+  https_proxy / HTTPS_PROXY  Honoured automatically (aiohttp trust_env).
+"""
+
+from contextlib import contextmanager
+from typing import IO, Any, Generator
+import aiohttp
+from pathlib import Path
+import os
+import re
+from urllib.parse import urlparse, urljoin, unquote
+import asyncio
+import time
+import logging
+
+LOG_FORMAT = "%(asctime)s %(levelname)s: %(message)s (%(filename)s:%(lineno)d)"
+log_level = logging.DEBUG if os.environ.get("DEBUG") else logging.INFO
+logging.basicConfig(level=log_level, format=LOG_FORMAT)
+
+# PyTorch-specific GitHub raw URLs, only used when USE_PYTORCH_RELEASES=1.
+RELEASES_URL = "https://raw.githubusercontent.com/pytorch/pytorch.github.io/refs/heads/site/releases.json"
+PUBLISHED_VERSION_URL = "https://raw.githubusercontent.com/pytorch/pytorch.github.io/refs/heads/site/published_versions.json"
+A_RE = re.compile(r"<a ([^>]*)>")
+HREF_RE = re.compile(r'href="([^"]+)"')
+
+
+base = Path(os.environ.get("TO", os.environ.get("TUNASYNC_WORKING_DIR", ".")))
+dry_run = os.environ.get("DRY_RUN", "0") == "1"
+jobs = int(os.environ.get("JOBS", "1"))
+timeout_sec = int(os.environ.get("TIMEOUT", "120"))
+
+# URLBASE defaults to /<TUNASYNC_MIRROR_NAME>/ when unset.
+mirror_name = os.environ.get("TUNASYNC_MIRROR_NAME", "")
+default_urlbase = f"/{mirror_name}/" if mirror_name else "/pytorch/"
+urlbase = os.environ.get("URLBASE", default_urlbase)
+if not urlbase.endswith("/"):
+    urlbase += "/"
+if not urlbase.startswith("/"):
+    urlbase = "/" + urlbase
+
+# if true, additionally read PyTorch releases.json / published_versions.json
+use_pytorch_releases = os.environ.get("USE_PYTORCH_RELEASES", "0") == "1"
+# if true, use PUBLISHED_VERSION_URL to get the list of URLs (only with USE_PYTORCH_RELEASES=1)
+get_all = os.environ.get("GET_ALL", "0") == "1"
+# if true, expand devpi channel endpoints via JSON API
+devpi_mode = os.environ.get("DEVPI_MODE", "0") == "1"
+# allow custom endpoints, e.g., https://download.pytorch.org/whl/xpu (Intel GPU builds)
+custom_endpoints = [
+    e.strip() for e in os.environ.get("CUSTOM_ENDPOINTS", "").split(",") if e.strip()
+]
+
+# REWRITE_HOSTS defaults to the host of TUNASYNC_UPSTREAM_URL when unset.
+upstream_url = os.environ.get("TUNASYNC_UPSTREAM_URL", "")
+if upstream_url:
+    default_hosts = urlparse(upstream_url).netloc
+else:
+    default_hosts = "download.pytorch.org,download-r2.pytorch.org"
+rewrite_hosts = [
+    h.strip()
+    for h in os.environ.get("REWRITE_HOSTS", default_hosts).split(",")
+    if h.strip()
+]
+# EXTRA_REWRITES: host=prefix rules to rewrite hrefs of OTHER hosts to
+# different mirror prefixes. See docstring for usage and Issue #86 context.
+extra_rewrites: list[tuple[str, str]] = []
+for rule in os.environ.get("EXTRA_REWRITES", "").split(","):
+    rule = rule.strip()
+    if not rule:
+        continue
+    if "=" not in rule:
+        logging.warning(f"Ignoring invalid EXTRA_REWRITES rule: {rule!r}")
+        continue
+    host, prefix = rule.split("=", 1)
+    host = host.strip()
+    prefix = prefix.strip()
+    if not host or not prefix:
+        logging.warning(f"Ignoring invalid EXTRA_REWRITES rule: {rule!r}")
+        continue
+    if not prefix.endswith("/"):
+        prefix += "/"
+    if not prefix.startswith("/"):
+        prefix = "/" + prefix
+    extra_rewrites.append((host, prefix))
+# exclude nightly builds, by default
+no_nightly = os.environ.get("NO_NIGHTLY", "1") == "1"
+
+sem = asyncio.Semaphore(jobs)
+
+
+@contextmanager
+def overwrite(
+    file_path: Path, mode: str = "w", tmp_suffix: str = ".tmp"
+) -> Generator[IO[Any], None, None]:
+    tmp_path = file_path.parent / (file_path.name + tmp_suffix)
+    try:
+        with open(tmp_path, mode) as tmp_file:
+            yield tmp_file
+        tmp_path.rename(file_path)
+    except Exception:
+        # well, just keep the tmp_path in error case.
+        raise
+
+
+async def show_progress(url, start_time, get_downloaded, total):
+    try:
+        while True:
+            await asyncio.sleep(5)
+            downloaded = get_downloaded()
+            elapsed = time.monotonic() - start_time
+            if total > 0:
+                logging.info(
+                    f"Progress of {url}: {downloaded}/{total} "
+                    f"({downloaded / total:.2%}), elapsed: {elapsed:.0f}s"
+                )
+            else:
+                logging.info(
+                    f"Progress of {url}: {downloaded} bytes, elapsed: {elapsed:.0f}s"
+                )
+    except asyncio.CancelledError:
+        pass
+
+
+async def get_with_progress(client: aiohttp.ClientSession, url: str) -> bytes:
+    for attempt in range(3):
+        try:
+            async with client.get(url, allow_redirects=True) as resp:
+                resp.raise_for_status()
+                total = int(resp.headers.get("Content-Length", 0))
+                downloaded = 0
+
+                progress_task = asyncio.create_task(
+                    show_progress(url, time.monotonic(), lambda: downloaded, total)
+                )
+                chunks = []
+                try:
+                    async for chunk in resp.content.iter_chunked(65536):
+                        downloaded += len(chunk)
+                        chunks.append(chunk)
+                finally:
+                    progress_task.cancel()
+                    try:
+                        await progress_task
+                    except asyncio.CancelledError:
+                        pass
+                return b"".join(chunks)
+        except Exception as e:
+            if attempt == 2:
+                raise e
+            logging.warning(f"Failed to download {url}, retrying ({attempt + 1})...")
+            await asyncio.sleep(5)
+    assert False, "impossible"
+
+
+async def get_devpi_projects(
+    client: aiohttp.ClientSession, channel_url: str
+) -> list[str] | None:
+    """Query a devpi channel via JSON API for its own projects.
+
+    Returns a list of project names owned by the channel, or None if the
+    upstream does not look like devpi or the request fails.
+    """
+    try:
+        async with client.get(
+            channel_url,
+            headers={"Accept": "application/json"},
+            allow_redirects=True,
+        ) as resp:
+            if resp.status != 200:
+                return None
+            data = await resp.json(content_type=None)
+    except (aiohttp.ClientError, asyncio.TimeoutError, ValueError):
+        return None
+    result = data.get("result") if isinstance(data, dict) else None
+    if not isinstance(result, dict):
+        return None
+    projects = result.get("projects")
+    if not isinstance(projects, list):
+        return None
+    return [p for p in projects if isinstance(p, str) and p]
+
+
+def rewrite_index(index_resp: str) -> str:
+    """Rewrite href attributes so the saved index page points at this mirror."""
+    # Relative-from-root form: href="/whl/..." -> href="<URLBASE>whl/..."
+    index_resp = index_resp.replace('href="/', f'href="{urlbase}')
+    # Absolute upstream URLs from any rewrite host.
+    for host in rewrite_hosts:
+        index_resp = index_resp.replace(
+            f'href="https://{host}/', f'href="{urlbase}'
+        )
+        index_resp = index_resp.replace(
+            f'href="http://{host}/', f'href="{urlbase}'
+        )
+    # Absolute URLs from OTHER hosts that have an explicit local prefix
+    # (e.g. files.pythonhosted.org -> /pypi/web/). See Issue #86.
+    for host, prefix in extra_rewrites:
+        index_resp = index_resp.replace(
+            f'href="https://{host}/', f'href="{prefix}'
+        )
+        index_resp = index_resp.replace(
+            f'href="http://{host}/', f'href="{prefix}'
+        )
+    return index_resp
+
+
+async def recursive_download(client: aiohttp.ClientSession, url: str):
+    path = unquote(urlparse(url).path)
+    while path.startswith("/"):
+        path = path[1:]
+    if url.endswith("/") or url.endswith(".html"):
+        # index.html (current) or torch_stable.html (old)
+        async with sem:
+            logging.info(f"Getting {url}")
+            contents = await get_with_progress(client, url)
+            index_resp = contents.decode("utf-8")
+            if url.endswith("/"):
+                filename = "index.html"
+            else:
+                filename = url.split("/")[-1]
+                assert filename.endswith(".html"), f"Unexpected HTML file: {filename}"
+
+        # Derive the upstream base (scheme://netloc) from the current url so
+        # absolute-from-root hrefs ("/whl/foo") can be resolved without
+        # hardcoding download.pytorch.org.
+        parsed = urlparse(url)
+        upstream_base = f"{parsed.scheme}://{parsed.netloc}"
+
+        tasks = []
+        for m in A_RE.finditer(index_resp):
+            attr = m.group(1)
+            href = HREF_RE.search(attr)
+            assert href is not None, f"Invalid href in {attr}"
+            suburl = href.group(1).split("#")[0]
+            if suburl.startswith("/"):
+                suburl = urljoin(upstream_base, suburl)
+            else:
+                suburl = urljoin(url, suburl)
+            tasks.append(asyncio.create_task(recursive_download(client, suburl)))
+            if suburl.endswith(".whl") and "data-core-metadata" in attr:
+                tasks.append(
+                    asyncio.create_task(
+                        recursive_download(client, suburl + ".metadata")
+                    )
+                )
+        if tasks:
+            await asyncio.gather(*tasks)
+        if not dry_run:
+            index_resp = rewrite_index(index_resp)
+            os.makedirs(base / path, exist_ok=True)
+            with overwrite(base / path / filename, "w") as f:
+                f.write(index_resp)
+    else:
+        if (base / path).exists():
+            return
+        if not dry_run:
+            os.makedirs((base / path).parent, exist_ok=True)
+            async with sem:
+                logging.info(f"Downloading {url} to {base / path}")
+                try:
+                    with overwrite(base / path, "wb") as f:
+                        contents = await get_with_progress(client, url)
+                        # Large files
+                        await asyncio.to_thread(f.write, contents)
+                except aiohttp.ClientResponseError as e:
+                    # Some urls are blocked by upstream, e.g.,
+                    # https://download.pytorch.org/whl/cu128/
+                    # nvidia_cudnn_cu12-9.8.0.87-py3-none-manylinux_2_27_aarch64.whl
+                    # This is a workaround to skip those files.
+                    if e.status == 403:
+                        logging.warning(f"Forbidden: {url}, skipping.")
+                    else:
+                        raise e
+
+
+async def expand_devpi_endpoint(
+    client: aiohttp.ClientSession, endpoint: str
+) -> list[str]:
+    """Expand a devpi-style "<channel>/+simple/" endpoint into per-project
+    "<channel>/+simple/<project>/" URLs.
+
+    Returns the expanded URL list. If the endpoint does not look like a
+    devpi channel or the JSON API is unavailable, returns [endpoint] so
+    the caller falls back to PEP 503 crawling.
+    """
+    if "/+simple/" not in endpoint:
+        return [endpoint]
+    channel_url = endpoint.split("/+simple/", 1)[0]
+    projects = await get_devpi_projects(client, channel_url)
+    if projects is None:
+        logging.warning(
+            f"DEVPI_MODE: cannot get projects for {channel_url}, "
+            "falling back to PEP 503 crawl"
+        )
+        return [endpoint]
+    base_simple = endpoint
+    if not base_simple.endswith("/"):
+        base_simple += "/"
+    expanded = [f"{base_simple}{p}/" for p in projects]
+    logging.info(
+        f"DEVPI_MODE: expanded {channel_url} to {len(expanded)} project endpoints"
+    )
+    return expanded
+
+
+async def main():
+    timeout_obj = aiohttp.ClientTimeout(total=timeout_sec)
+    connector = aiohttp.TCPConnector(limit=jobs)
+    async with aiohttp.ClientSession(
+        headers={
+            "User-Agent": "pip-index-sync"
+        },
+        timeout=timeout_obj,
+        connector=connector,
+        trust_env=True,
+    ) as client:
+        urls = set()
+
+        def add_endpoint(url: str):
+            if no_nightly and "/nightly/" in url:
+                logging.info(f"Skipping nightly build: {url}")
+                return
+            if url.endswith(".html"):
+                urls.add(url)
+            else:
+                if not url.endswith("/"):
+                    url += "/"
+                urls.add(url)
+
+        if devpi_mode:
+            for endpoint in custom_endpoints:
+                expanded = await expand_devpi_endpoint(client, endpoint)
+                for u in expanded:
+                    add_endpoint(u)
+        else:
+            for endpoint in custom_endpoints:
+                add_endpoint(endpoint)
+
+        if use_pytorch_releases:
+            if not get_all:
+                logging.info("Getting releases info from GitHub...")
+                async with client.get(RELEASES_URL) as resp:
+                    resp.raise_for_status()
+                    releases = await resp.json(content_type=None)
+                releases = releases["release"]
+
+                for os_ in releases:
+                    for version in releases[os_]:
+                        url = version["installation"].split(" ")[-1]
+                        if not url.startswith("https://download.pytorch.org"):
+                            continue
+                        if url.startswith("https://download.pytorch.org/whl/"):
+                            add_endpoint(url)
+            else:
+                logging.info("Getting published versions from GitHub...")
+                async with client.get(PUBLISHED_VERSION_URL) as resp:
+                    resp.raise_for_status()
+                    published_versions = await resp.json(content_type=None)
+                published_versions = published_versions["versions"]
+
+                def find_commands(obj: dict) -> list[str]:
+                    commands = []
+                    assert isinstance(obj, dict), f"unexpected JSON schema {obj}"
+                    for key, value in obj.items():
+                        if key == "command" and value is not None:
+                            assert isinstance(value, str), f"unexpected command {value}"
+                            commands.append(value)
+                        elif isinstance(value, dict):
+                            commands.extend(find_commands(value))
+                    return commands
+
+                for command in find_commands(published_versions):
+                    command = command.split(" ")[-1]
+                    if command.startswith("https://download.pytorch.org/whl/"):
+                        add_endpoint(command)
+
+        if not urls:
+            logging.warning(
+                "No URLs to crawl. Set CUSTOM_ENDPOINTS or USE_PYTORCH_RELEASES=1."
+            )
+            return
+
+        await asyncio.gather(*(recursive_download(client, url) for url in urls))
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

From 199a8be1d0fc31fc4913ba2f898a311bed90ed5d Mon Sep 17 00:00:00 2001
From: Ge Yao <my@yaoge123.com>
Date: Sun, 24 May 2026 17:50:03 +0800
Subject: [PATCH 2/3] pip-index.py: stream large files, sanitize paths, dedup
 crawl

Address review feedback:
- Add safe_local_path() that resolves URL paths under base and rejects
  any candidate that escapes base (rejects '..', absolute paths, NUL).
- Stop treating '.../torch_stable.html' as a directory: when the URL
  ends with .html, write into <parent>/<filename>, not
  <full path>/<filename>.
- Add stream_to_file(): wheels are now written chunk-by-chunk through
  asyncio.to_thread(fh.write, chunk) into a sibling .tmp file, which is
  unlinked on failure and atomically renamed on success. The previous
  implementation buffered the whole response in memory before writing.
- Maintain a process-wide visited URL set so cross-linked index pages
  do not re-enter recursive_download() for the same URL.
- Replace 'raise e' in get_with_progress and recursive_download with a
  bare 'raise' so the original traceback is preserved.
---
 pip-index.py | 112 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 93 insertions(+), 19 deletions(-)

diff --git a/pip-index.py b/pip-index.py
index 8f10170..16e1e56 100755
--- a/pip-index.py
+++ b/pip-index.py
@@ -146,6 +146,27 @@
 no_nightly = os.environ.get("NO_NIGHTLY", "1") == "1"
 
 sem = asyncio.Semaphore(jobs)
+# Track URLs we have already started processing so links from cross-referenced
+# index pages do not cause duplicate work or infinite recursion.
+visited: set[str] = set()
+
+
+def safe_local_path(base_dir: Path, raw_path: str) -> Path:
+    """Resolve `raw_path` against `base_dir` and reject path traversal.
+
+    Leading slashes are stripped from the already-unquoted URL path, which
+    is then joined to `base_dir`. The resolved path is required to live
+    below `base_dir` so a hostile upstream cannot escape the mirror via '..'.
+    """
+    while raw_path.startswith("/"):
+        raw_path = raw_path[1:]
+    candidate = (base_dir / raw_path).resolve()
+    base_resolved = base_dir.resolve()
+    try:
+        candidate.relative_to(base_resolved)
+    except ValueError as exc:
+        raise ValueError(f"refusing path outside base: {candidate}") from exc
+    return candidate
 
 
 @contextmanager
@@ -182,6 +203,7 @@ async def show_progress(url, start_time, get_downloaded, total):
 
 
 async def get_with_progress(client: aiohttp.ClientSession, url: str) -> bytes:
+    """Fetch `url` fully into memory. Use only for index pages."""
     for attempt in range(3):
         try:
             async with client.get(url, allow_redirects=True) as resp:
@@ -204,14 +226,53 @@ async def get_with_progress(client: aiohttp.ClientSession, url: str) -> bytes:
                     except asyncio.CancelledError:
                         pass
                 return b"".join(chunks)
-        except Exception as e:
+        except Exception:
             if attempt == 2:
-                raise e
+                raise
             logging.warning(f"Failed to download {url}, retrying ({attempt + 1})...")
             await asyncio.sleep(5)
     assert False, "impossible"
 
 
+async def stream_to_file(
+    client: aiohttp.ClientSession, url: str, dest: Path
+) -> None:
+    """Stream `url` to `dest` via a sibling .tmp file. Memory-bounded."""
+    for attempt in range(3):
+        tmp = dest.parent / (dest.name + ".tmp")
+        try:
+            async with client.get(url, allow_redirects=True) as resp:
+                resp.raise_for_status()
+                total = int(resp.headers.get("Content-Length", 0))
+                downloaded = 0
+                progress_task = asyncio.create_task(
+                    show_progress(url, time.monotonic(), lambda: downloaded, total)
+                )
+                try:
+                    dest.parent.mkdir(parents=True, exist_ok=True)
+                    with open(tmp, "wb") as fh:
+                        async for chunk in resp.content.iter_chunked(65536):
+                            downloaded += len(chunk)
+                            await asyncio.to_thread(fh.write, chunk)
+                finally:
+                    progress_task.cancel()
+                    try:
+                        await progress_task
+                    except asyncio.CancelledError:
+                        pass
+            os.replace(tmp, dest)
+            return
+        except Exception:
+            try:
+                tmp.unlink()
+            except FileNotFoundError:
+                pass
+            if attempt == 2:
+                raise
+            logging.warning(f"Failed to download {url}, retrying ({attempt + 1})...")
+            await asyncio.sleep(5)
+
+
 async def get_devpi_projects(
     client: aiohttp.ClientSession, channel_url: str
 ) -> list[str] | None:
@@ -265,9 +326,14 @@ def rewrite_index(index_resp: str) -> str:
 
 
 async def recursive_download(client: aiohttp.ClientSession, url: str):
-    path = unquote(urlparse(url).path)
-    while path.startswith("/"):
-        path = path[1:]
+    # Skip URLs we already started processing. Index pages frequently link
+    # back into themselves and cross-link wheel artifacts; without this the
+    # crawl could grow exponentially or even loop.
+    if url in visited:
+        return
+    visited.add(url)
+
+    raw_path = unquote(urlparse(url).path)
     if url.endswith("/") or url.endswith(".html"):
         # index.html (current) or torch_stable.html (old)
         async with sem:
@@ -276,9 +342,16 @@ async def recursive_download(client: aiohttp.ClientSession, url: str):
             index_resp = contents.decode("utf-8")
             if url.endswith("/"):
                 filename = "index.html"
+                # Treat the directory portion as the local path so we do not
+                # turn the .html filename into a directory.
+                index_dir = safe_local_path(base, raw_path)
             else:
                 filename = url.split("/")[-1]
                 assert filename.endswith(".html"), f"Unexpected HTML file: {filename}"
+                # `raw_path` already includes the .html filename. Strip it so
+                # `index_dir` is the parent directory.
+                parent = raw_path.rsplit("/", 1)[0] if "/" in raw_path else ""
+                index_dir = safe_local_path(base, parent)
 
         # Derive the upstream base (scheme://netloc) from the current url so
         # absolute-from-root hrefs ("/whl/foo") can be resolved without
@@ -296,32 +369,33 @@ async def recursive_download(client: aiohttp.ClientSession, url: str):
                 suburl = urljoin(upstream_base, suburl)
             else:
                 suburl = urljoin(url, suburl)
+            if suburl in visited:
+                continue
             tasks.append(asyncio.create_task(recursive_download(client, suburl)))
             if suburl.endswith(".whl") and "data-core-metadata" in attr:
-                tasks.append(
-                    asyncio.create_task(
-                        recursive_download(client, suburl + ".metadata")
+                meta_url = suburl + ".metadata"
+                if meta_url not in visited:
+                    tasks.append(
+                        asyncio.create_task(
+                            recursive_download(client, meta_url)
+                        )
                     )
-                )
         if tasks:
             await asyncio.gather(*tasks)
         if not dry_run:
             index_resp = rewrite_index(index_resp)
-            os.makedirs(base / path, exist_ok=True)
-            with overwrite(base / path / filename, "w") as f:
+            index_dir.mkdir(parents=True, exist_ok=True)
+            with overwrite(index_dir / filename, "w") as f:
                 f.write(index_resp)
     else:
-        if (base / path).exists():
+        dest = safe_local_path(base, raw_path)
+        if dest.exists():
             return
         if not dry_run:
-            os.makedirs((base / path).parent, exist_ok=True)
             async with sem:
-                logging.info(f"Downloading {url} to {base / path}")
+                logging.info(f"Downloading {url} to {dest}")
                 try:
-                    with overwrite(base / path, "wb") as f:
-                        contents = await get_with_progress(client, url)
-                        # Large files
-                        await asyncio.to_thread(f.write, contents)
+                    await stream_to_file(client, url, dest)
                 except aiohttp.ClientResponseError as e:
                     # Some urls are blocked by upstream, e.g.,
                     # https://download.pytorch.org/whl/cu128/
@@ -330,7 +404,7 @@ async def recursive_download(client: aiohttp.ClientSession, url: str):
                     if e.status == 403:
                         logging.warning(f"Forbidden: {url}, skipping.")
                     else:
-                        raise e
+                        raise
 
 
 async def expand_devpi_endpoint(

From de8d0d53ee0af695a79a3f1a33d79efa4ec950d0 Mon Sep 17 00:00:00 2001
From: Ge Yao <my@yaoge123.com>
Date: Sun, 24 May 2026 19:06:47 +0800
Subject: [PATCH 3/3] pip-index.py: drop asyncio.to_thread for fh.write

On reflection after the Copilot review, match the rest of the repo
(apt-sync.py, adoptium.py, github-release.py, etc.), which all call plain
f.write(chunk) inside the download loop. With the default JOBS=1 there is
no concurrent coroutine to starve, and asyncio.to_thread per chunk adds
~50us of thread scheduling overhead that outweighs any event-loop
unblocking benefit.
---
 pip-index.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pip-index.py b/pip-index.py
index 16e1e56..80f2c18 100755
--- a/pip-index.py
+++ b/pip-index.py
@@ -253,7 +253,7 @@ async def stream_to_file(
                     with open(tmp, "wb") as fh:
                         async for chunk in resp.content.iter_chunked(65536):
                             downloaded += len(chunk)
-                            await asyncio.to_thread(fh.write, chunk)
+                            fh.write(chunk)
                 finally:
                     progress_task.cancel()
                     try: