[Fixes #14000] Refact extraMetadata#14286
Conversation
There was a problem hiding this comment.
Code Review
This pull request deprecates the legacy ExtraMetadata model and migrates its storage to SparseField entries, introducing a backward-compatible adapter to maintain the deprecated /extra_metadata/ endpoint and serializer field. The review feedback highlights several important improvement opportunities in the adapter, such as preventing potential TypeError and AttributeError exceptions when handling non-dictionary metadata, avoiding log pollution from deprecation warnings during bulk serialization, and returning a 400 Bad Request error instead of silently skipping oversized metadata entries.
| try: | ||
| metadata = json.loads(sparse_field.value) if sparse_field.value else {} | ||
| except (json.JSONDecodeError, TypeError): | ||
| metadata = {} | ||
| return {**{"id": sparse_field.pk}, **metadata} |
There was a problem hiding this comment.
If metadata is not a dictionary (e.g., if it is parsed as a list, string, or integer, or if it is None), unpacking it with **metadata will raise a TypeError. We should explicitly verify that metadata is a dictionary before unpacking to prevent potential 500 server errors.
| try: | |
| metadata = json.loads(sparse_field.value) if sparse_field.value else {} | |
| except (json.JSONDecodeError, TypeError): | |
| metadata = {} | |
| return {**{"id": sparse_field.pk}, **metadata} | |
| try: | |
| metadata = json.loads(sparse_field.value) if sparse_field.value else {} | |
| except (json.JSONDecodeError, TypeError): | |
| metadata = {} | |
| if not isinstance(metadata, dict): | |
| metadata = {} | |
| return {**{"id": sparse_field.pk}, **metadata} |
| for sf in qs: | ||
| try: | ||
| meta = json.loads(sf.value) if sf.value else {} | ||
| except (json.JSONDecodeError, TypeError): | ||
| continue | ||
| if str(meta.get(key)) == str(value): | ||
| filtered.append(sf) |
There was a problem hiding this comment.
If meta is not a dictionary (e.g., if it is a list or string), calling meta.get(key) will raise an AttributeError, causing a 500 server error. We should verify that meta is a dictionary before attempting to access its keys.
| for sf in qs: | |
| try: | |
| meta = json.loads(sf.value) if sf.value else {} | |
| except (json.JSONDecodeError, TypeError): | |
| continue | |
| if str(meta.get(key)) == str(value): | |
| filtered.append(sf) | |
| for sf in qs: | |
| try: | |
| meta = json.loads(sf.value) if sf.value else {} | |
| if not isinstance(meta, dict): | |
| continue | |
| except (json.JSONDecodeError, TypeError): | |
| continue | |
| if str(meta.get(key)) == str(value): | |
| filtered.append(sf) |
| def get_attribute(self, instance): | ||
| warnings.warn(DEPRECATION_REASON, DeprecationWarning, stacklevel=2) | ||
| logger.warning(DEPRECATION_REASON) |
There was a problem hiding this comment.
Calling logger.warning inside get_attribute will flood the logs on bulk serialization (e.g., when listing resources). It is better to rely on warnings.warn to signal deprecation, avoiding log pollution.
| def get_attribute(self, instance): | |
| warnings.warn(DEPRECATION_REASON, DeprecationWarning, stacklevel=2) | |
| logger.warning(DEPRECATION_REASON) | |
| def get_attribute(self, instance): | |
| warnings.warn(DEPRECATION_REASON, DeprecationWarning, stacklevel=2) |
| if len(value) > SPARSE_FIELD_VALUE_MAX_LENGTH: | ||
| logger.warning( | ||
| "extra_metadata entry skipped: serialized value exceeds " | ||
| f"{SPARSE_FIELD_VALUE_MAX_LENGTH} characters" | ||
| ) | ||
| continue |
There was a problem hiding this comment.
| if len(value) > SPARSE_FIELD_VALUE_MAX_LENGTH: | ||
| logger.warning( | ||
| "extra_metadata entry skipped: serialized value exceeds " | ||
| f"{SPARSE_FIELD_VALUE_MAX_LENGTH} characters" | ||
| ) | ||
| continue |
There was a problem hiding this comment.
Codecov Report❌ Patch coverage is Additional details and impacted files@@ Coverage Diff @@
## master #14286 +/- ##
==========================================
- Coverage 74.92% 74.84% -0.09%
==========================================
Files 975 979 +4
Lines 59902 60384 +482
Branches 8157 8244 +87
==========================================
+ Hits 44884 45192 +308
- Misses 13194 13349 +155
- Partials 1824 1843 +19 🚀 New features to boost your workflow:
|
b315c76 to
48e7442
Compare
| continue | ||
| meta_dict.pop("id", None) | ||
| value = json.dumps(meta_dict) | ||
| if len(value) > SPARSE_FIELD_VALUE_MAX_LENGTH: | ||
| return Response( | ||
| {"detail": f"serialized value exceeds {SPARSE_FIELD_VALUE_MAX_LENGTH} characters"}, status=400 | ||
| ) | ||
| name = _next_sparse_name(resource) | ||
| try: | ||
| SparseField.objects.create(resource=resource, name=name, value=value) | ||
| except IntegrityError: | ||
| # Concurrent request created the same name; retry with UUID | ||
| name = f"{SPARSE_FIELD_PREFIX}{uuid.uuid4().hex[:12]}" | ||
| SparseField.objects.create(resource=resource, name=name, value=value) | ||
|
|
||
| result = [_sparse_to_legacy(sf) for sf in _sparse_fields_for_resource(resource)] | ||
| return Response(result, status=201) |
| def test_creation_should_rase_exec_for_unsupported_files(self): | ||
| self.client.force_login(self.admin) |
| def test_creation_from_url_should_create_the_doc(self): | ||
| """ | ||
| If file_path is not available, should raise error | ||
| """ |
| for filter_key, filter_value in metadata_filters.items(): | ||
| # Extract field name from "metadata__fieldname" | ||
| if not filter_key.startswith("metadata__"): | ||
| continue | ||
|
|
||
| found_metadata_filter = True | ||
| field_name = filter_key[len("metadata__") :] | ||
|
|
||
| # Query SparseFields for this field name | ||
| sparse_fields = SparseField.objects.filter(name=field_name) | ||
|
|
||
| batch_pks = set() | ||
| for sf in sparse_fields: | ||
| try: | ||
| # Try to parse as JSON if it looks like JSON | ||
| stored_value = json.loads(sf.value) if sf.value and sf.value.startswith("{") else sf.value | ||
| except (json.JSONDecodeError, TypeError): | ||
| stored_value = sf.value | ||
|
|
||
| # Compare values (convert to string for consistent comparison) | ||
| if str(stored_value) == str(filter_value): | ||
| batch_pks.add(sf.resource.pk) | ||
|
|
||
| filtered_pks.update(batch_pks) | ||
|
|
| "extension": "jpeg", | ||
| } | ||
| } | ||
| actual = self.client.post(self.url, data=payload, format="json") |
| detail=True, | ||
| methods=["get", "put", "delete", "post"], | ||
| url_path=r"extra_metadata", | ||
| url_name="extra-metadata", | ||
| ) |
57c60c9 to
98776ad
Compare
adad877 to
4e975f2
Compare
|
/gemini review |
4e975f2 to
4297467
Compare
|
/gemini review |
There was a problem hiding this comment.
Code Review
This pull request replaces the legacy ExtraMetadata model with SparseField entries to store custom metadata, introducing backward-compatible adapters and a migration script to handle existing data. The code review highlights several critical issues: a logic bug in filter_sparse_fields that changes query filtering from AND to OR logic, potential N+1 query performance bottlenecks in both the sparse field filtering and the deprecated metadata POST endpoint, a prefiltering bug when dealing with boolean or null values, and database performance concerns in the migration script that could be resolved by using batched bulk_create instead of get_or_create in a loop.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| def filter_sparse_fields(self, queryset, metadata_filters): | ||
| """ | ||
| Filter queryset by sparse field values (metadata custom fields). | ||
|
|
||
| Queryset is filtered by interrogating SparseField entries that match | ||
| the given metadata filter specifications. | ||
|
|
||
| Args: | ||
| queryset: ResourceBase queryset to filter | ||
| metadata_filters: dict with keys like "metadata__key" and values to match | ||
|
|
||
| Returns: | ||
| Filtered queryset containing only resources with matching sparse fields | ||
| """ | ||
| if not metadata_filters: | ||
| return queryset | ||
|
|
||
| filtered_pks = set() | ||
| found_metadata_filter = False | ||
|
|
||
| for filter_key, filter_value in metadata_filters.items(): | ||
| # Extract field name from "metadata__fieldname" | ||
| if not filter_key.startswith("metadata__"): | ||
| continue | ||
|
|
||
| found_metadata_filter = True | ||
| field_name = filter_key[len("metadata__") :] | ||
|
|
||
| # Text prefilter to reduce the set to deserialize; the actual | ||
| # semantic match is performed after json.loads. | ||
| sparse_fields = ( | ||
| SparseField.objects.filter(name__startswith="extra_") | ||
| .filter(value__icontains=field_name) | ||
| .filter(value__icontains=str(filter_value)) | ||
| ) | ||
|
|
||
| batch_pks = set() | ||
| for sf in sparse_fields: | ||
| try: | ||
| # ExtraMetadata was a JSONfield | ||
| stored_value = json.loads(sf.value) if sf.value and sf.value.startswith("{") else sf.value | ||
| except (json.JSONDecodeError, TypeError): | ||
| logger.warning( | ||
| f"Bad migrated ExtraMetadata into SparseField: {sf.name} for resource {sf.resource.id}:{sf.resource.title}" | ||
| ) | ||
| continue | ||
|
|
||
| if not isinstance(stored_value, dict): | ||
| logger.warning( | ||
| f"Unexpected non-dict value in SparseField: {sf.name} for resource {sf.resource.id}:{sf.resource.title}" | ||
| ) | ||
| continue | ||
|
|
||
| # Compare values in a type-agnostic way. | ||
| if str(stored_value.get(field_name, None)) == str(filter_value): | ||
| batch_pks.add(sf.resource.pk) | ||
|
|
||
| filtered_pks.update(batch_pks) | ||
|
|
||
| # Filter by the collected PKs | ||
| # If we processed metadata filters, return only resources with matching values | ||
| if found_metadata_filter: | ||
| filtered = queryset.filter(pk__in=filtered_pks) if filtered_pks else queryset.none() | ||
| else: | ||
| # No metadata filters found, return queryset as-is | ||
| filtered = queryset | ||
|
|
||
| return filtered |
There was a problem hiding this comment.
There are multiple critical issues in the filter_sparse_fields implementation:
- Logic Bug (AND vs OR): The original implementation used Django's
.filter(**metadata_filters)which combined multiple metadata filters usingANDlogic. The new implementation accumulates PKs across all filters intofiltered_pksusingfiltered_pks.update(batch_pks), which effectively changes the logic toOR. This will return incorrect results when filtering by multiple metadata fields. - N+1 Queries: Accessing
sf.resource.idandsf.resource.pkinside the loop triggers a database query for eachSparseFieldrecord becauseselect_related("resource")is not used. Usingsf.resource_iddirectly avoids these queries. - Boolean/None Prefilter Bug: If
filter_valueis a boolean (e.g.,True),str(filter_value)evaluates to"True". However, the JSON string stored invaluecontains"true"(lowercase). This causes theicontainsprefilter to fail to find any records. Booleans andNoneshould be mapped to their JSON string representations ("true","false","null").
def filter_sparse_fields(self, queryset, metadata_filters):
"""
Filter queryset by sparse field values (metadata custom fields).
Queryset is filtered by interrogating SparseField entries that match
the given metadata filter specifications.
Args:
queryset: ResourceBase queryset to filter
metadata_filters: dict with keys like "metadata__key" and values to match
Returns:
Filtered queryset containing only resources with matching sparse fields
"""
if not metadata_filters:
return queryset
filtered_pks = None
found_metadata_filter = False
for filter_key, filter_value in metadata_filters.items():
# Extract field name from "metadata__fieldname"
if not filter_key.startswith("metadata__"):
continue
found_metadata_filter = True
field_name = filter_key[len("metadata__") :]
if isinstance(filter_value, bool):
prefilter_val = "true" if filter_value else "false"
elif filter_value is None:
prefilter_val = "null"
else:
prefilter_val = str(filter_value)
# Text prefilter to reduce the set to deserialize; the actual
# semantic match is performed after json.loads.
sparse_fields = (
SparseField.objects.filter(name__startswith="extra_")
.filter(value__icontains=field_name)
.filter(value__icontains=prefilter_val)
)
batch_pks = set()
for sf in sparse_fields:
try:
# ExtraMetadata was a JSONfield
stored_value = json.loads(sf.value) if sf.value and sf.value.startswith("{") else sf.value
except (json.JSONDecodeError, TypeError):
logger.warning(
f"Bad migrated ExtraMetadata into SparseField: {sf.name} for resource {sf.resource_id}"
)
continue
if not isinstance(stored_value, dict):
logger.warning(
f"Unexpected non-dict value in SparseField: {sf.name} for resource {sf.resource_id}"
)
continue
# Compare values in a type-agnostic way.
if str(stored_value.get(field_name, None)) == str(filter_value):
batch_pks.add(sf.resource_id)
if filtered_pks is None:
filtered_pks = batch_pks
else:
filtered_pks.intersection_update(batch_pks)
# Filter by the collected PKs
# If we processed metadata filters, return only resources with matching values
if found_metadata_filter:
filtered = queryset.filter(pk__in=filtered_pks) if filtered_pks else queryset.none()
else:
# No metadata filters found, return queryset as-is
filtered = queryset
return filtered| def migrate_extrametadata_to_sparsefields(apps, schema_editor): | ||
| """ | ||
| Migrate ExtraMetadata to SparseField. | ||
| Each ExtraMetadata object (with its JSON dict) is stored as a single | ||
| SparseField entry with name 'extra_<pk>' and value as JSON string. | ||
| Entries whose serialized JSON exceeds 1024 characters are skipped with a warning. | ||
| """ | ||
| ExtraMetadata = apps.get_model("base", "ExtraMetadata") | ||
| SparseField = apps.get_model("metadata", "SparseField") | ||
|
|
||
| for extra_meta in ExtraMetadata.objects.select_related("resource").iterator(): | ||
| name = f"extra_{extra_meta.pk}" | ||
| value = json.dumps(extra_meta.metadata) | ||
| if len(value) > 1024: | ||
| logger.warning( | ||
| f"ExtraMetadata pk={extra_meta.pk} for resource {extra_meta.resource.id}:{extra_meta.resource.title} " | ||
| f"skipped during migration to SparseField: " | ||
| f"serialized value exceeds 1024 characters" | ||
| ) | ||
| continue | ||
| SparseField.objects.get_or_create( | ||
| resource=extra_meta.resource, | ||
| name=name, | ||
| defaults={"value": value}, | ||
| ) |
There was a problem hiding this comment.
Using SparseField.objects.get_or_create inside a loop executes a SELECT and potentially an INSERT query for every single ExtraMetadata record. On large production databases with thousands of records, this will cause significant migration delays and potential timeouts. Using bulk_create with batching is highly recommended for performance and efficiency.
def migrate_extrametadata_to_sparsefields(apps, schema_editor):
"""
Migrate ExtraMetadata to SparseField.
Each ExtraMetadata object (with its JSON dict) is stored as a single
SparseField entry with name 'extra_<pk>' and value as JSON string.
Entries whose serialized JSON exceeds 1024 characters are skipped with a warning.
"""
ExtraMetadata = apps.get_model("base", "ExtraMetadata")
SparseField = apps.get_model("metadata", "SparseField")
to_create = []
for extra_meta in ExtraMetadata.objects.select_related("resource").iterator():
name = f"extra_{extra_meta.pk}"
value = json.dumps(extra_meta.metadata)
if len(value) > 1024:
logger.warning(
f"ExtraMetadata pk={extra_meta.pk} for resource {extra_meta.resource.id}:{extra_meta.resource.title} "
f"skipped during migration to SparseField: "
f"serialized value exceeds 1024 characters"
)
continue
to_create.append(
SparseField(
resource=extra_meta.resource,
name=name,
value=value,
)
)
if len(to_create) >= 1000:
SparseField.objects.bulk_create(to_create, ignore_conflicts=True)
to_create = []
if to_create:
SparseField.objects.bulk_create(to_create, ignore_conflicts=True)| def _next_sparse_name(resource): | ||
| """Generate the next available ``extra_<N>`` name. | ||
|
|
||
| Falls back to a UUID-based suffix if a name collision occurs (e.g. under | ||
| concurrent requests), keeping the ``extra_`` prefix intact. | ||
| The numeric part is still preferred for readability. | ||
| """ | ||
| existing = ( | ||
| SparseField.objects.filter( | ||
| resource=resource, | ||
| name__startswith=SPARSE_FIELD_PREFIX, | ||
| ) | ||
| .order_by("-name") | ||
| .values_list("name", flat=True) | ||
| ) | ||
| max_n = 0 | ||
| for name in existing: | ||
| suffix = name[len(SPARSE_FIELD_PREFIX) :] | ||
| try: | ||
| max_n = max(max_n, int(suffix)) | ||
| except (ValueError, TypeError): | ||
| pass | ||
| candidate = f"{SPARSE_FIELD_PREFIX}{max_n + 1}" | ||
| # Guard against a race: if the candidate already exists, use a UUID suffix | ||
| if SparseField.objects.filter(resource=resource, name=candidate).exists(): | ||
| candidate = f"{SPARSE_FIELD_PREFIX}{uuid.uuid4().hex[:12]}" | ||
| return candidate |
There was a problem hiding this comment.
To avoid N+1 queries when creating multiple metadata entries, _next_sparse_name should accept an optional list of existing_names that are already loaded in memory.
def _next_sparse_name(resource, existing_names=None):
"""Generate the next available ``extra_<N>`` name.
Falls back to a UUID-based suffix if a name collision occurs (e.g. under
concurrent requests), keeping the ``extra_`` prefix intact.
The numeric part is still preferred for readability.
"""
if existing_names is None:
existing_names = list(
SparseField.objects.filter(
resource=resource,
name__startswith=SPARSE_FIELD_PREFIX,
).values_list("name", flat=True)
)
max_n = 0
for name in existing_names:
suffix = name[len(SPARSE_FIELD_PREFIX) :]
try:
max_n = max(max_n, int(suffix))
except (ValueError, TypeError):
pass
candidate = f"{SPARSE_FIELD_PREFIX}{max_n + 1}"
# Guard against a race: if the candidate already exists, use a UUID suffix
if candidate in existing_names or SparseField.objects.filter(resource=resource, name=candidate).exists():
candidate = f"{SPARSE_FIELD_PREFIX}{uuid.uuid4().hex[:12]}"
return candidate| @staticmethod | ||
| def _extra_metadata_post(request, resource): | ||
| data = request.data | ||
| if isinstance(data, str): | ||
| try: | ||
| data = json.loads(data) | ||
| except (json.JSONDecodeError, TypeError): | ||
| return Response( | ||
| {"detail": "Invalid JSON payload."}, | ||
| status=400, | ||
| ) | ||
|
|
||
| if not isinstance(data, list): | ||
| return Response( | ||
| {"detail": "Expected a JSON list of metadata objects."}, | ||
| status=400, | ||
| ) | ||
|
|
||
| cleaned = [] | ||
| for meta_dict in data: | ||
| if not isinstance(meta_dict, dict): | ||
| logger.warning(f"Skipping non-dict metadata entry in POST payload: {meta_dict}") | ||
| continue | ||
| meta_dict = dict(meta_dict) # avoid mutating caller input | ||
| meta_dict.pop("id", None) | ||
| value = json.dumps(meta_dict) | ||
| if len(value) > SPARSE_FIELD_VALUE_MAX_LENGTH: | ||
| return Response( | ||
| {"detail": f"serialized value exceeds {SPARSE_FIELD_VALUE_MAX_LENGTH} characters"}, status=400 | ||
| ) | ||
| cleaned.append(value) | ||
|
|
||
| for value in cleaned: | ||
| name = _next_sparse_name(resource) | ||
| try: | ||
| SparseField.objects.create(resource=resource, name=name, value=value) | ||
| except IntegrityError: | ||
| # Concurrent request created the same name; retry with UUID | ||
| name = f"{SPARSE_FIELD_PREFIX}{uuid.uuid4().hex[:12]}" | ||
| SparseField.objects.create(resource=resource, name=name, value=value) | ||
|
|
||
| result = [_sparse_to_legacy(sf) for sf in _sparse_fields_for_resource(resource)] | ||
| return Response(result, status=201) |
There was a problem hiding this comment.
In _extra_metadata_post, _next_sparse_name is called inside a loop, which executes a database query on every iteration. Pre-fetching the existing names once before the loop and passing them to _next_sparse_name avoids these N+1 queries.
@staticmethod
def _extra_metadata_post(request, resource):
data = request.data
if isinstance(data, str):
try:
data = json.loads(data)
except (json.JSONDecodeError, TypeError):
return Response(
{"detail": "Invalid JSON payload."},
status=400,
)
if not isinstance(data, list):
return Response(
{"detail": "Expected a JSON list of metadata objects."},
status=400,
)
cleaned = []
for meta_dict in data:
if not isinstance(meta_dict, dict):
logger.warning(f"Skipping non-dict metadata entry in POST payload: {meta_dict}")
continue
meta_dict = dict(meta_dict) # avoid mutating caller input
meta_dict.pop("id", None)
value = json.dumps(meta_dict)
if len(value) > SPARSE_FIELD_VALUE_MAX_LENGTH:
return Response(
{"detail": f"serialized value exceeds {SPARSE_FIELD_VALUE_MAX_LENGTH} characters"}, status=400
)
cleaned.append(value)
existing_names = list(
SparseField.objects.filter(
resource=resource,
name__startswith=SPARSE_FIELD_PREFIX,
).values_list("name", flat=True)
)
for value in cleaned:
name = _next_sparse_name(resource, existing_names=existing_names)
existing_names.append(name)
try:
SparseField.objects.create(resource=resource, name=name, value=value)
except IntegrityError:
# Concurrent request created the same name; retry with UUID
name = f"{SPARSE_FIELD_PREFIX}{uuid.uuid4().hex[:12]}"
SparseField.objects.create(resource=resource, name=name, value=value)
result = [_sparse_to_legacy(sf) for sf in _sparse_fields_for_resource(resource)]
return Response(result, status=201)| ) | ||
|
|
||
| # Verify sparse field was created | ||
| _sf_check = SparseField.objects.filter(resource=_r, name="extra_1") |
There was a problem hiding this comment.
MINOR: the update_or_create return the new object and a boolean saying if the object was created or not, so this part is not required. it should be:
obj, created = SparseField.objects.update_or_create(
resource=_r,
name="extra_1",
defaults={"value": json.dumps({"category": "category"})},
)
self.assertTrue(created, "SparseField should have been created")
There was a problem hiding this comment.
_sf_check is also used for first()
| import re | ||
| import json | ||
| import logging | ||
| from schema import Schema |
There was a problem hiding this comment.
we can remove the schema==0.7.7 dependency from the pyproject.toml
Checklist
For all pull requests:
The following are required only for core and extension modules (they are welcomed, but not required, for contrib modules):
Submitting the PR does not require you to check all items, but by the time it gets merged, they should be either satisfied or inapplicable.