Skip to content

Commit 90e66dc

Browse files
authored
Merge pull request #1114 from jobselko/fix_1099
Support repairing metadata files
2 parents a7eb175 + 2b45134 commit 90e66dc

File tree

3 files changed

+242
-9
lines changed

3 files changed

+242
-9
lines changed

CHANGES/1099.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added support for recreating and fixing metadata files in the `repair_metadata` endpoint.

pulp_python/app/tasks/repair.py

Lines changed: 122 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
from django.db.models.query import QuerySet
99
from pulp_python.app.models import PythonPackageContent, PythonRepository
1010
from pulp_python.app.utils import (
11+
artifact_to_metadata_artifact,
1112
artifact_to_python_content_data,
1213
fetch_json_release_metadata,
1314
parse_metadata,
1415
)
15-
from pulpcore.plugin.models import ContentArtifact, ProgressReport
16+
from pulpcore.plugin.models import Artifact, ContentArtifact, ProgressReport
1617
from pulpcore.plugin.util import get_domain
1718

1819
log = logging.getLogger(__name__)
@@ -41,16 +42,25 @@ def repair(repository_pk: UUID) -> None:
4142
content_set = repository.latest_version().content.values_list("pk", flat=True)
4243
content = PythonPackageContent.objects.filter(pk__in=content_set)
4344

44-
num_repaired, pkgs_not_repaired = repair_metadata(content)
45+
num_repaired, pkgs_not_repaired, num_metadata_repaired, pkgs_metadata_not_repaired = (
46+
repair_metadata(content)
47+
)
48+
# Convert set() to 0
49+
if not pkgs_not_repaired:
50+
pkgs_not_repaired = 0
51+
if not pkgs_metadata_not_repaired:
52+
pkgs_metadata_not_repaired = 0
53+
4554
log.info(
4655
_(
4756
"{} packages' metadata repaired. Not repaired packages due to either "
48-
"inaccessible URL or mismatched sha256: {}."
49-
).format(num_repaired, pkgs_not_repaired)
57+
"inaccessible URL or mismatched sha256: {}. "
58+
"{} metadata files repaired. Packages whose metadata files could not be repaired: {}."
59+
).format(num_repaired, pkgs_not_repaired, num_metadata_repaired, pkgs_metadata_not_repaired)
5060
)
5161

5262

53-
def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str]]:
63+
def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str], int, set[str]]:
5464
"""
5565
Repairs metadata for a queryset of PythonPackageContent objects
5666
and updates the progress report.
@@ -59,9 +69,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
5969
content (QuerySet[PythonPackageContent]): The queryset of items to repair.
6070
6171
Returns:
62-
tuple[int, set[str]]: A tuple containing:
72+
tuple[int, set[str], int, set[str]]: A tuple containing:
6373
- The number of packages that were repaired.
6474
- A set of packages' PKs that were not repaired.
75+
- The number of metadata files that were repaired.
76+
- A set of packages' PKs without repaired metadata artifacts.
6577
"""
6678
immediate_content = (
6779
content.filter(contentartifact__artifact__isnull=False)
@@ -87,6 +99,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
8799
# Keep track of on-demand packages that were not repaired
88100
pkgs_not_repaired = set()
89101

102+
# Metadata artifacts and content artifacts
103+
metadata_batch = []
104+
total_metadata_repaired = 0
105+
pkgs_metadata_not_repaired = set()
106+
90107
progress_report = ProgressReport(
91108
message="Repairing packages' metadata",
92109
code="repair.metadata",
@@ -102,6 +119,13 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
102119
.artifact
103120
)
104121
new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
122+
total_metadata_repaired += update_metadata_artifact_if_needed(
123+
package,
124+
new_data.get("metadata_sha256"),
125+
main_artifact,
126+
metadata_batch,
127+
pkgs_metadata_not_repaired,
128+
)
105129
total_repaired += update_package_if_needed(
106130
package, new_data, batch, set_of_update_fields
107131
)
@@ -163,7 +187,12 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
163187
total_repaired += len(batch)
164188
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
165189

166-
return total_repaired, pkgs_not_repaired
190+
if metadata_batch:
191+
not_repaired = _process_metadata_batch(metadata_batch)
192+
pkgs_metadata_not_repaired.update(not_repaired)
193+
total_metadata_repaired += len(metadata_batch) - len(not_repaired)
194+
195+
return total_repaired, pkgs_not_repaired, total_metadata_repaired, pkgs_metadata_not_repaired
167196

168197

169198
def update_package_if_needed(
@@ -202,3 +231,89 @@ def update_package_if_needed(
202231
set_of_update_fields.clear()
203232

204233
return total_repaired
234+
235+
236+
def update_metadata_artifact_if_needed(
    package: PythonPackageContent,
    new_metadata_sha256: str | None,
    main_artifact: Artifact,
    metadata_batch: list[tuple],
    pkgs_metadata_not_repaired: set[str],
) -> int:
    """
    Queue a wheel package for metadata artifact repair when its metadata file is missing
    or does not match the freshly computed metadata_sha256.

    Only packages whose filename ends with ".whl" and that have a truthy
    new_metadata_sha256 are considered. Queued items are processed in bulk once the
    batch reaches BULK_SIZE; anything left over must be flushed by the caller.

    Args:
        package: Package to check for metadata changes.
        new_metadata_sha256: The correct metadata_sha256 extracted from the main artifact, or None.
        main_artifact: The main package artifact used to generate metadata.
        metadata_batch: List of (package, main_artifact) tuples awaiting batch processing
            (updated in-place; cleared after a flush).
        pkgs_metadata_not_repaired: Set of package PKs that failed repair (updated in-place).

    Returns:
        Number of metadata artifacts repaired by this call. This is non-zero only when
        the call triggered a batch flush; otherwise 0.
    """
    if not package.filename.endswith(".whl") or not new_metadata_sha256:
        return 0

    # Fetch at most one row with a single query instead of evaluating the whole queryset
    # for a truthiness check and then querying again for the first element.
    metadata_ca = package.contentartifact_set.filter(relative_path__endswith=".metadata").first()

    if metadata_ca is None:
        # Create missing
        needs_repair = True
    elif new_metadata_sha256 != package.metadata_sha256:
        # Fix existing: only replace the artifact when it is absent or has the wrong digest
        metadata_artifact = metadata_ca.artifact
        needs_repair = metadata_artifact is None or metadata_artifact.sha256 != new_metadata_sha256
    else:
        # NOTE(review): when the stored metadata_sha256 already equals the new one, the
        # existing metadata artifact is not inspected here -- confirm that a missing or
        # mismatched artifact cannot occur in that state.
        needs_repair = False

    if needs_repair:
        metadata_batch.append((package, main_artifact))

    # ">=" rather than "==" so that a batch handed over already at/over the limit is
    # still flushed instead of growing without bound.
    if len(metadata_batch) < BULK_SIZE:
        return 0

    # Count from the real batch length (not the BULK_SIZE constant) so the tally stays
    # correct for any batch size, mirroring the final flush done by the caller.
    flushed = len(metadata_batch)
    not_repaired = _process_metadata_batch(metadata_batch)
    pkgs_metadata_not_repaired.update(not_repaired)
    metadata_batch.clear()

    return flushed - len(not_repaired)
283+
284+
285+
def _process_metadata_batch(metadata_batch: list[tuple]) -> set[str]:
    """
    Build a metadata artifact for every queued package and persist the matching
    ContentArtifacts in one bulk operation.

    Args:
        metadata_batch: List of (package, main_artifact) tuples.

    Returns:
        Set of package PKs for which metadata artifacts could not be created.
    """
    failed_pks = set()
    pending = []

    for pkg, wheel_artifact in metadata_batch:
        generated = artifact_to_metadata_artifact(pkg.filename, wheel_artifact)
        if not generated:
            # No metadata artifact was produced for this package; report it to the caller.
            failed_pks.add(pkg.pk)
            continue
        pending.append(
            ContentArtifact(
                artifact=generated,
                content=pkg,
                relative_path=f"{pkg.filename}.metadata",
            )
        )

    if not pending:
        return failed_pks

    # On a (content, relative_path) conflict the existing row is kept and only its
    # artifact is replaced, so one call covers both "create missing" and "fix existing".
    ContentArtifact.objects.bulk_create(
        pending,
        update_conflicts=True,
        update_fields=["artifact"],
        unique_fields=["content", "relative_path"],
    )
    return failed_pks

pulp_python/tests/functional/api/test_repair.py

Lines changed: 119 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
@pytest.fixture
1212
def create_content_direct(python_bindings):
13-
def _create(artifact_filename, content_data):
13+
def _create(artifact_filename, content_data, metadata_artifact_filename=None):
1414
commands = (
1515
"from pulpcore.plugin.models import Artifact, ContentArtifact; "
1616
"from pulpcore.plugin.util import get_url; "
@@ -21,8 +21,16 @@ def _create(artifact_filename, content_data):
2121
"c.save(); "
2222
f"ca = ContentArtifact(artifact=a, content=c, relative_path=c.filename); "
2323
"ca.save(); "
24-
"print(get_url(c))"
2524
)
25+
if metadata_artifact_filename:
26+
commands += (
27+
f"a2 = Artifact.init_and_validate('{metadata_artifact_filename}'); "
28+
"a2.save(); "
29+
"ca2_filename = c.filename + '.metadata'; "
30+
f"ca2 = ContentArtifact(artifact=a2, content=c, relative_path=ca2_filename); "
31+
"ca2.save(); "
32+
)
33+
commands += "print(get_url(c))"
2634
process = subprocess.run(["pulpcore-manager", "shell", "-c", commands], capture_output=True)
2735

2836
assert process.returncode == 0
@@ -214,3 +222,112 @@ def test_metadata_repair_endpoint(
214222
assert new_content.author == author
215223
assert new_content.packagetype == packagetype
216224
assert new_content.requires_python == requires_python
225+
226+
227+
def test_metadata_artifact_repair_endpoint(
    create_content_direct,
    delete_orphans_pre,
    download_python_file,
    monitor_task,
    move_to_repository,
    pulpcore_bindings,
    python_bindings,
    python_repo_factory,
):
    """
    Verify that the `Repositories.repair_metadata` endpoint fixes a package's
    metadata_sha256 together with its metadata Artifact and ContentArtifact.
    """
    packages_url = urljoin(PYTHON_FIXTURES_URL, "packages/")

    def fetch(name):
        """Download a fixture from the packages directory and return the local file."""
        return download_python_file(name, urljoin(packages_url, name))

    # 1. Setup tested data
    python_repo = python_repo_factory()

    filename_1 = "scipy-1.1.0-cp27-none-win_amd64.whl"
    filename_2 = "scipy-1.1.0-cp27-cp27m-manylinux1_x86_64.whl"
    filename_3 = "scipy-1.1.0-cp34-none-win32.whl"
    filename_4 = "scipy-1.1.0-cp35-none-win32.whl"

    # Initial (possibly broken) metadata_sha256 stored on each package
    initial_sha256 = {
        # missing metadata_sha256, missing metadata Artifact + ContentArtifact
        filename_1: None,
        # correct metadata_sha256, missing metadata Artifact + ContentArtifact
        filename_2: "7f303850d9be88fff27eaeb393c2fd3a6c1a130e21758b8294fc5bb2f38e02f6",
        # wrong metadata_sha256, missing metadata Artifact + ContentArtifact
        filename_3: "1234",
        # wrong metadata_sha256, wrong metadata Artifact, correct metadata ContentArtifact
        filename_4: "5678",
    }
    wheel_files = {name: fetch(name) for name in initial_sha256}
    # The fourth package deliberately gets the first package's metadata file attached
    wrong_metadata_file = fetch(f"{filename_1}.metadata")

    # 2. Create content
    created = []
    for name, sha256 in initial_sha256.items():
        data = {
            "name": "scipy",
            "version": "1.1.0",
            "filename": name,
            "metadata_sha256": sha256,
        }
        if name == filename_4:
            content = create_content_direct(wheel_files[name], data, wrong_metadata_file)
        else:
            content = create_content_direct(wheel_files[name], data)
        created.append((data, content))

    content_hrefs = {}
    for data, content in created:
        for field, test_value in data.items():
            assert getattr(content, field) == test_value
        content_hrefs[data["filename"]] = content.pulp_href
    move_to_repository(python_repo.pulp_href, list(content_hrefs.values()))

    # 3. Repair metadata and metadata files
    response = python_bindings.RepositoriesPythonApi.repair_metadata(python_repo.pulp_href)
    monitor_task(response.task)

    # 4. Check new metadata and metadata files
    shared_sha256 = "747d24e500308067c4e5fd0e20fb2d4fd6595a3fb7b1d2ffa717217fb6a53364"
    expected = [
        (filename_1, "15ae132303b2774a0d839d01c618cf99fc92716adfaaa2bc1267142ab2b76b98"),
        (filename_2, "7f303850d9be88fff27eaeb393c2fd3a6c1a130e21758b8294fc5bb2f38e02f6"),
        # filename_3 and filename_4 have the same metadata file
        (filename_3, shared_sha256),
        (filename_4, shared_sha256),
    ]
    main_artifact_hrefs = set()
    metadata_artifact_hrefs = set()
    for filename, metadata_sha256 in expected:
        href = content_hrefs[filename]
        results = pulpcore_bindings.ContentApi.list(pulp_href__in=[href]).results
        assert results
        artifacts = results[0].artifacts
        assert len(artifacts) == 2

        main_href = artifacts.get(filename)
        main_artifact_hrefs.add(main_href)
        main_artifact = pulpcore_bindings.ArtifactsApi.read(main_href)

        metadata_href = artifacts.get(f"{filename}.metadata")
        metadata_artifact_hrefs.add(metadata_href)
        metadata_artifact = pulpcore_bindings.ArtifactsApi.read(metadata_href)

        pkg = python_bindings.ContentPackagesApi.read(href)
        assert pkg.metadata_sha256 == metadata_sha256
        assert main_artifact.sha256 == pkg.sha256
        assert metadata_artifact.sha256 == pkg.metadata_sha256

    # Check deduplication
    assert len(main_artifact_hrefs) == 4
    assert len(metadata_artifact_hrefs) == 3

0 commit comments

Comments
 (0)