diff --git a/.env.example b/.env.example index f02b437..c65267d 100644 --- a/.env.example +++ b/.env.example @@ -173,3 +173,7 @@ CELERY_SINGLE_PROCESS=1 BOOST_ENDPOINT_THROTTLE_INFO=60/minute BOOST_ENDPOINT_THROTTLE_ADD_OR_UPDATE=10/hour + +# Comma-separated hostnames allowed for git clone URLs (HTTPS only). +# Default when unset: github.com. Set to empty to disable host allowlisting (not recommended). +BOOST_ALLOWED_CLONE_HOSTS=github.com diff --git a/docs/boost-endpoint-api.md b/docs/boost-endpoint-api.md index 9b01d91..0c61834 100644 --- a/docs/boost-endpoint-api.md +++ b/docs/boost-endpoint-api.md @@ -343,7 +343,7 @@ Top-level entry point called by the Celery task. Creates a temporary directory, For each submodule the following steps run in order: -1. **Path validation** — the submodule name is checked against the temp directory root to prevent path traversal. +1. **Path validation** — the submodule name is checked against the temp directory root to prevent path traversal. Organization and submodule names must match `^[A-Za-z0-9._-]+$`. Clone URLs are validated before any `git` subprocess: only `https://` (and SCP-style SSH normalized to HTTPS for checks) is allowed; resolved addresses must not be private, loopback, or link-local; the hostname must appear in `ALLOWED_CLONE_HOSTS` (default `github.com`, overridable via `BOOST_ALLOWED_CLONE_HOSTS`). 2. **Clone** — `git clone -b local-{lang_code} --depth 1 https://github.com/{organization}/{submodule}.git` into a temporary subdirectory. A 300-second timeout applies. Clone failure is recorded in `errors` and processing stops for that submodule. @@ -404,7 +404,8 @@ HTTP `400` responses and submodule `errors` lists use the same object schema. Va | `required_field` | Missing `organization`, `version`, or `add_or_update`; empty `add_or_update` dict | | `invalid_language_code` | Non-string or blank language key in `add_or_update` | | `invalid_submodule_list` | Submodule value is not a non-empty list | -| `invalid_submodule` | Submodule name fails path validation | +| `invalid_submodule` | Submodule name fails path or segment validation | +| `invalid_clone_url` | Organization/submodule or resolved clone URL fails SSRF checks (bad scheme, private IP, or host not in `ALLOWED_CLONE_HOSTS`) | | `clone_failed` | `git clone` failed or timed out | | `no_documentation_files` | No supported doc files found after scan | | `permission_denied` | Missing `project.add` or `project.edit` | diff --git a/docs/deployment-runbook.md b/docs/deployment-runbook.md index 75fdd99..1d58874 100644 --- a/docs/deployment-runbook.md +++ b/docs/deployment-runbook.md @@ -86,6 +86,7 @@ Runtime plugin env vars (set in `.env`, read by `settings_override.py` at boot): |----------|-------------------|-------| | `BOOST_ENDPOINT_THROTTLE_INFO` | `60/minute` | Scoped rate for `GET /boost-endpoint/info/` | | `BOOST_ENDPOINT_THROTTLE_ADD_OR_UPDATE` | `10/hour` | Scoped rate for `POST /boost-endpoint/add-or-update/` | +| `BOOST_ALLOWED_CLONE_HOSTS` | `github.com` | Comma-separated hostnames permitted for git clone URLs (HTTPS only; SSRF mitigation) | ### Weblate environment variables @@ -110,6 +111,7 @@ Key variables (full reference in `.env.example`): | `CELERY_SINGLE_PROCESS` | `1` | `.env` | Weblate Celery worker process count; increase when tasks queue | | `BOOST_ENDPOINT_THROTTLE_INFO` | `60/minute` | `.env` | Plugin rate limit (see above) | | `BOOST_ENDPOINT_THROTTLE_ADD_OR_UPDATE` | `10/hour` | `.env` | Plugin rate limit (see above) | +| `BOOST_ALLOWED_CLONE_HOSTS` | `github.com` | `.env` | Hostnames allowed for git clone URLs (see above) | | `WEBLATE_EMAIL_HOST` | `smtp.example.com` | `.env` | SMTP server; set user/password for production | | `WEBLATE_GITHUB_USERNAME` | — | `.env` | GitHub account for VCS; required with token for add-or-update | | `WEBLATE_GITHUB_TOKEN` | — | `.env` | GitHub PAT (`repo` scope); rotate via pre-deploy checklist | diff --git a/src/boost_weblate/endpoint/errors.py b/src/boost_weblate/endpoint/errors.py index b5e756a..95b4152 100644 --- a/src/boost_weblate/endpoint/errors.py +++ b/src/boost_weblate/endpoint/errors.py @@ -16,6 +16,7 @@ class BoostEndpointErrorCode(StrEnum): """Stable machine-readable error codes for Boost endpoint failures.""" INVALID_SUBMODULE = "invalid_submodule" + INVALID_CLONE_URL = "invalid_clone_url" CLONE_FAILED = "clone_failed" NO_DOCUMENTATION_FILES = "no_documentation_files" PERMISSION_DENIED = "permission_denied" diff --git a/src/boost_weblate/endpoint/serializers.py b/src/boost_weblate/endpoint/serializers.py index 435a86f..01ad283 100644 --- a/src/boost_weblate/endpoint/serializers.py +++ b/src/boost_weblate/endpoint/serializers.py @@ -6,8 +6,10 @@ from __future__ import annotations +from enum import StrEnum from typing import Any +from django.core.exceptions import ValidationError from rest_framework import serializers from boost_weblate.endpoint.errors import ( @@ -15,6 +17,24 @@ boost_validation_errors, to_error_dict, ) +from boost_weblate.endpoint.validators import validate_repo_segment + + +class DrfValidationCode(StrEnum): + """DRF ``ErrorDetail.code`` values mapped by this serializer.""" + + REQUIRED = "required" + NOT_A_LIST = "not_a_list" + EMPTY = "empty" + + +class RequestField(StrEnum): + """Top-level add-or-update request field names.""" + + ORGANIZATION = "organization" + ADD_OR_UPDATE = "add_or_update" + VERSION = "version" + EXTENSIONS = "extensions" class AddOrUpdateRequestSerializer(serializers.Serializer): @@ -53,6 +73,7 @@ class AddOrUpdateRequestSerializer(serializers.Serializer): def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) self._custom_validation_errors: list[dict[str, Any]] = [] + self._custom_error_fields: set[str] = set() self._structured_errors: list[dict[str, Any]] = [] @property @@ -61,6 +82,7 @@ def structured_errors(self) -> list[dict[str, Any]]: def is_valid(self, *, raise_exception: bool = False) -> bool: self._custom_validation_errors = [] + self._custom_error_fields = set() valid = super().is_valid(raise_exception=False) if not valid: self._structured_errors = self._to_structured_errors() @@ -73,16 +95,14 @@ def is_valid(self, *, raise_exception: bool = False) -> bool: def _to_structured_errors(self) -> list[dict[str, Any]]: structured = list(self._custom_validation_errors) for field, messages in self.errors.items(): - if field == "add_or_update" and self._custom_validation_errors: + if field in self._custom_error_fields: continue - for subfield, message, drf_code in self._flatten_field_errors( - field, messages - ): + for subfield, message, drf_code in self._flatten_field_errors(messages): code = self._code_for_drf_error(field, drf_code, subfield=subfield) metadata: dict[str, Any] = {"field": field} if drf_code is not None: metadata["drf_code"] = drf_code - if subfield and field == "add_or_update": + if subfield and field == RequestField.ADD_OR_UPDATE: metadata["language"] = subfield structured.append(to_error_dict(code, message, **metadata)) return structured @@ -93,7 +113,7 @@ def _message_and_drf_code(err: Any) -> tuple[str, str | None]: @staticmethod def _flatten_field_errors( - field: str, messages: Any + messages: Any, ) -> list[tuple[str | None, str, str | None]]: """Flatten nested DRF errors into (subfield, message, drf_code) triplets.""" results: list[tuple[str | None, str, str | None]] = [] @@ -104,7 +124,7 @@ def _flatten_field_errors( for msg in value: if isinstance(msg, dict) or hasattr(msg, "items"): nested = AddOrUpdateRequestSerializer._flatten_field_errors( - field, msg + msg ) results.extend( (key_str if sub is None else sub, message, drf_code) @@ -116,9 +136,7 @@ def _flatten_field_errors( ) results.append((key_str, message, drf_code)) elif isinstance(value, dict) or hasattr(value, "items"): - nested = AddOrUpdateRequestSerializer._flatten_field_errors( - field, value - ) + nested = AddOrUpdateRequestSerializer._flatten_field_errors(value) results.extend( (key_str if sub is None else sub, message, drf_code) for sub, message, drf_code in nested @@ -143,16 +161,35 @@ def _code_for_drf_error( *, subfield: str | None = None, ) -> BoostEndpointErrorCode: - if drf_code == "required": + if drf_code == DrfValidationCode.REQUIRED: return BoostEndpointErrorCode.REQUIRED_FIELD - if drf_code == "not_a_list": + if drf_code == DrfValidationCode.NOT_A_LIST: return BoostEndpointErrorCode.INVALID_SUBMODULE_LIST - if drf_code == "empty": - if field == "add_or_update" and subfield: + if drf_code == DrfValidationCode.EMPTY: + if field == RequestField.ADD_OR_UPDATE and subfield: return BoostEndpointErrorCode.INVALID_SUBMODULE_LIST return BoostEndpointErrorCode.REQUIRED_FIELD return BoostEndpointErrorCode.REQUIRED_FIELD + def validate_organization(self, value: str) -> str: + """Reject organization names that would produce unsafe clone URLs.""" + try: + return validate_repo_segment(value, field="organization") + except ValidationError as exc: + self._custom_error_fields.add(RequestField.ORGANIZATION) + self._custom_validation_errors.extend( + boost_validation_errors( + [ + ( + BoostEndpointErrorCode.INVALID_CLONE_URL, + str(exc), + {"field": RequestField.ORGANIZATION}, + ) + ] + ) + ) + raise serializers.ValidationError(str(exc)) from exc + def validate_extensions(self, value: list[str] | None) -> list[str] | None: """Strip entries and remove blanks so all-empty input does not filter files.""" if value is None: @@ -171,7 +208,10 @@ def validate_add_or_update(self, value: dict[str, Any]) -> dict[str, Any]: "add_or_update: each key must be a non-empty language " f"code; got {repr(lang_code)}" ), - {"field": "add_or_update", "language": str(lang_code)}, + { + "field": RequestField.ADD_OR_UPDATE, + "language": str(lang_code), + }, ) ) continue @@ -184,7 +224,7 @@ def validate_add_or_update(self, value: dict[str, Any]) -> dict[str, Any]: f"submodule names; key {lang_code!r} is not a list " f"(got {type(submodules).__name__})." ), - {"field": "add_or_update", "language": lang_code}, + {"field": RequestField.ADD_OR_UPDATE, "language": lang_code}, ) ) elif len(submodules) == 0: @@ -195,10 +235,43 @@ def validate_add_or_update(self, value: dict[str, Any]) -> dict[str, Any]: "add_or_update: each value must be a non-empty list of " f"submodule names; key {lang_code!r} has an empty list." ), - {"field": "add_or_update", "language": lang_code}, + {"field": RequestField.ADD_OR_UPDATE, "language": lang_code}, ) ) + else: + for submodule in submodules: + if not isinstance(submodule, str): + items.append( + ( + BoostEndpointErrorCode.INVALID_SUBMODULE_LIST, + ( + "add_or_update: each submodule name must be a " + f"string; key {lang_code!r} has " + f"{type(submodule).__name__}." + ), + { + "field": RequestField.ADD_OR_UPDATE, + "language": lang_code, + }, + ) + ) + break + try: + validate_repo_segment(submodule, field="submodule") + except ValidationError as exc: + items.append( + ( + BoostEndpointErrorCode.INVALID_SUBMODULE, + str(exc), + { + "field": RequestField.ADD_OR_UPDATE, + "language": lang_code, + "submodule": submodule, + }, + ) + ) if items: - self._custom_validation_errors = boost_validation_errors(items) - raise serializers.ValidationError({"add_or_update": "invalid"}) + self._custom_error_fields.add(RequestField.ADD_OR_UPDATE) + self._custom_validation_errors.extend(boost_validation_errors(items)) + raise serializers.ValidationError({RequestField.ADD_OR_UPDATE: "invalid"}) return value diff --git a/src/boost_weblate/endpoint/services.py b/src/boost_weblate/endpoint/services.py index 207471f..ffddf0b 100644 --- a/src/boost_weblate/endpoint/services.py +++ b/src/boost_weblate/endpoint/services.py @@ -36,6 +36,7 @@ from django.conf import settings from django.contrib.messages import get_messages +from django.core.exceptions import ValidationError from django.db import transaction from weblate.formats.models import FILE_FORMATS from weblate.lang.models import Language @@ -50,6 +51,11 @@ append_error, to_error_dict, ) +from boost_weblate.endpoint.validators import ( + github_https_clone_url, + github_ssh_repo_url, + validate_repo_segment, +) if TYPE_CHECKING: from weblate.lang.models import LanguageQuerySet @@ -311,7 +317,7 @@ def get_supported_extensions(self) -> set[str]: def clone_repository(self, submodule: str, target_dir: str, branch: str) -> bool: """Clone a git repository to target directory.""" - repo_url = f"https://github.com/{self.organization}/{submodule}.git" + repo_url = github_https_clone_url(self.organization, submodule) try: LOGGER.info("Cloning %s to %s", repo_url, target_dir) @@ -508,7 +514,15 @@ def create_or_update_component( return None, False # Single clone per repo: first component gets real repo, others use weblate:// - real_repo = f"git@github.com:{self.organization}/{submodule}.git" + try: + real_repo = github_ssh_repo_url(self.organization, submodule) + except ValidationError as exc: + LOGGER.error( + "Invalid repo URL for %s/%s: %s", self.organization, submodule, exc + ) + report_error(cause="Component creation/update") + return None, False + repo_owner = ( Component.objects.filter(project=project, repo=real_repo) .order_by("slug") @@ -902,6 +916,19 @@ def process_submodule( "errors": [], } + try: + validate_repo_segment(self.organization, field="organization") + validate_repo_segment(submodule, field="submodule") + except ValidationError as exc: + append_error( + result, + BoostEndpointErrorCode.INVALID_CLONE_URL, + str(exc), + submodule=submodule, + organization=self.organization, + ) + return result + # Create temp directory for this submodule temp_submodule_dir = os.path.join(temp_dir, submodule) resolved = Path(temp_submodule_dir).resolve() @@ -919,9 +946,21 @@ def process_submodule( os.makedirs(temp_submodule_dir, exist_ok=True) # Clone repository - if not self.clone_repository( - submodule, temp_submodule_dir, f"local-{self.lang_code}" - ): + try: + cloned = self.clone_repository( + submodule, temp_submodule_dir, f"local-{self.lang_code}" + ) + except ValidationError as exc: + append_error( + result, + BoostEndpointErrorCode.INVALID_CLONE_URL, + str(exc), + submodule=submodule, + organization=self.organization, + ) + return result + + if not cloned: append_error( result, BoostEndpointErrorCode.CLONE_FAILED, diff --git a/src/boost_weblate/endpoint/validators.py b/src/boost_weblate/endpoint/validators.py new file mode 100644 index 0000000..aa456bc --- /dev/null +++ b/src/boost_weblate/endpoint/validators.py @@ -0,0 +1,170 @@ +# SPDX-FileCopyrightText: 2026 Andrew Zhang +# +# SPDX-License-Identifier: BSL-1.0 + +"""URL validation for git clone operations (SSRF mitigation).""" + +from __future__ import annotations + +import ipaddress +import re +import socket +from urllib.parse import urlparse + +from django.conf import settings +from django.core.exceptions import ValidationError + +_REPO_SEGMENT_RE = re.compile(r"^[A-Za-z0-9._-]+$") + +# SCP-style SSH: git@host:path/to/repo.git +_SCP_SSH_RE = re.compile(r"^git@([^:/]+):(.+)$") + + +def validate_repo_segment(name: str, *, field: str) -> str: + """Restrict organization/submodule to safe GitHub path segments.""" + if not name or not name.strip(): + raise ValidationError(f"{field}: must be a non-empty string") + if not _REPO_SEGMENT_RE.fullmatch(name): + raise ValidationError( + f"{field}: invalid characters in {name!r}; " + "allowed: letters, digits, '.', '_', '-'" + ) + return name + + +def _normalize_clone_url(url: str) -> str: + """Convert SCP-style SSH URLs to HTTPS for validation; reject other forms.""" + url = url.strip() + if not url: + raise ValidationError("Clone URL must not be empty") + + scp_match = _SCP_SSH_RE.match(url) + if scp_match: + host, path = scp_match.groups() + path = path.lstrip("/") + return f"https://{host}/{path}" + + parsed = urlparse(url) + if parsed.scheme in ("", "file") or url.startswith("/"): + raise ValidationError( + f"Unsupported clone URL scheme: {parsed.scheme or 'local path'!r}" + ) + if parsed.scheme == "ssh": + raise ValidationError("Unsupported clone URL scheme: 'ssh'") + if parsed.scheme not in ("https",): + raise ValidationError(f"Unsupported clone URL scheme: {parsed.scheme!r}") + + return url + + +def _hostname_allowed(hostname: str, allowed_hosts: list[str]) -> bool: + hostname = hostname.rstrip(".").lower() + for allowed in allowed_hosts: + allowed = allowed.rstrip(".").lower() + if hostname == allowed or hostname.endswith(f".{allowed}"): + return True + return False + + +def _resolve_addresses( + hostname: str, +) -> frozenset[ipaddress.IPv4Address | ipaddress.IPv6Address]: + try: + infos = socket.getaddrinfo( + hostname, None, type=socket.SOCK_STREAM, proto=socket.IPPROTO_TCP + ) + except socket.gaierror as exc: + raise ValidationError( + f"Could not resolve clone URL host {hostname!r}: {exc}" + ) from exc + + addresses: set[ipaddress.IPv4Address | ipaddress.IPv6Address] = set() + for info in infos: + sockaddr = info[4] + if not sockaddr: + continue + ip_str = sockaddr[0] + try: + addresses.add(ipaddress.ip_address(ip_str)) + except ValueError: + continue + + if not addresses: + raise ValidationError(f"Could not resolve clone URL host {hostname!r}") + + return frozenset(addresses) + + +def _reject_unsafe_ip(addr: ipaddress.IPv4Address | ipaddress.IPv6Address) -> None: + if addr.is_unspecified: + raise ValidationError(f"Clone URL resolves to unspecified IP address: {addr}") + if addr.is_private: + raise ValidationError(f"Clone URL resolves to private IP address: {addr}") + if addr.is_loopback: + raise ValidationError(f"Clone URL resolves to loopback IP address: {addr}") + if addr.is_link_local: + raise ValidationError(f"Clone URL resolves to link-local IP address: {addr}") + if addr.is_reserved: + raise ValidationError(f"Clone URL resolves to reserved IP address: {addr}") + if addr.is_multicast: + raise ValidationError(f"Clone URL resolves to multicast IP address: {addr}") + + +def _check_literal_host_ip(hostname: str) -> None: + try: + addr = ipaddress.ip_address(hostname) + except ValueError: + return + _reject_unsafe_ip(addr) + + +def validate_clone_url(url: str) -> str: + """Validate a git clone URL before any subprocess or VCS operation. + + Resolves the hostname twice and rejects if the address set changes (DNS + rebinding mitigation). Git still performs its own resolution; this narrows + the rebind window without cloning to a raw IP (which would break TLS/SNI). + """ + normalized = _normalize_clone_url(url) + parsed = urlparse(normalized) + + hostname = parsed.hostname + if not hostname: + raise ValidationError("Clone URL must include a hostname") + + _check_literal_host_ip(hostname) + + allowed_hosts: list[str] = getattr(settings, "ALLOWED_CLONE_HOSTS", []) + if allowed_hosts and not _hostname_allowed(hostname, allowed_hosts): + raise ValidationError( + f"Clone URL host {hostname!r} is not in ALLOWED_CLONE_HOSTS" + ) + + first_addresses = _resolve_addresses(hostname) + for addr in first_addresses: + _reject_unsafe_ip(addr) + + second_addresses = _resolve_addresses(hostname) + if second_addresses != first_addresses: + raise ValidationError( + "Clone URL host address changed between DNS lookups (possible rebinding)" + ) + + return normalized + + +def github_https_clone_url(organization: str, submodule: str) -> str: + """Build and validate an HTTPS GitHub clone URL.""" + org = validate_repo_segment(organization, field="organization") + repo = validate_repo_segment(submodule, field="submodule") + url = f"https://github.com/{org}/{repo}.git" + return validate_clone_url(url) + + +def github_ssh_repo_url(organization: str, submodule: str) -> str: + """Build and validate an SCP-style GitHub SSH repo URL.""" + org = validate_repo_segment(organization, field="organization") + repo = validate_repo_segment(submodule, field="submodule") + url = f"git@github.com:{org}/{repo}.git" + validate_clone_url(url) + return url diff --git a/src/boost_weblate/settings_override.py b/src/boost_weblate/settings_override.py index a46ce56..75c0359 100644 --- a/src/boost_weblate/settings_override.py +++ b/src/boost_weblate/settings_override.py @@ -119,6 +119,21 @@ def boost_endpoint_throttle_rates() -> dict[str, str]: BOOST_ENDPOINT_THROTTLE_RATES = boost_endpoint_throttle_rates() +_DEFAULT_ALLOWED_CLONE_HOSTS = ("github.com",) + + +def allowed_clone_hosts() -> list[str]: + """Hostnames permitted for git clone URLs (env override optional).""" + raw = os.environ.get("BOOST_ALLOWED_CLONE_HOSTS") + if raw is None: + return list(_DEFAULT_ALLOWED_CLONE_HOSTS) + if raw.strip() == "": + return [] + return [host.strip().lower() for host in raw.split(",") if host.strip()] + + +ALLOWED_CLONE_HOSTS = allowed_clone_hosts() + def merge_boost_endpoint_throttle_rates( rest_framework: dict[str, Any], diff --git a/tests/django_qbk_format_settings.py b/tests/django_qbk_format_settings.py index 8e0bcb0..22cf141 100644 --- a/tests/django_qbk_format_settings.py +++ b/tests/django_qbk_format_settings.py @@ -23,7 +23,10 @@ import weblate.settings_example as _wl_example -from boost_weblate.settings_override import merge_boost_endpoint_throttle_rates +from boost_weblate.settings_override import ( + ALLOWED_CLONE_HOSTS, + merge_boost_endpoint_throttle_rates, +) for _key, _value in _wl_example.__dict__.items(): if _key.isupper(): @@ -63,3 +66,5 @@ ] REST_FRAMEWORK = merge_boost_endpoint_throttle_rates(_wl_example.REST_FRAMEWORK) + +ALLOWED_CLONE_HOSTS = ALLOWED_CLONE_HOSTS diff --git a/tests/endpoint/test_serializers.py b/tests/endpoint/test_serializers.py index 8929e35..22d8894 100644 --- a/tests/endpoint/test_serializers.py +++ b/tests/endpoint/test_serializers.py @@ -7,7 +7,11 @@ from rest_framework.exceptions import ErrorDetail from boost_weblate.endpoint.errors import BoostEndpointErrorCode -from boost_weblate.endpoint.serializers import AddOrUpdateRequestSerializer +from boost_weblate.endpoint.serializers import ( + AddOrUpdateRequestSerializer, + DrfValidationCode, + RequestField, +) def _error_codes(errors: list[dict]) -> list[str]: @@ -108,13 +112,11 @@ def test_flatten_field_errors_propagates_error_detail_code() -> None: "zh_Hans": [ ErrorDetail( 'Expected a list of items but got type "str".', - code="not_a_list", + code=DrfValidationCode.NOT_A_LIST, ) ] } - flattened = AddOrUpdateRequestSerializer._flatten_field_errors( - "add_or_update", nested - ) + flattened = AddOrUpdateRequestSerializer._flatten_field_errors(nested) assert flattened == [ ( "zh_Hans", @@ -126,23 +128,29 @@ def test_flatten_field_errors_propagates_error_detail_code() -> None: def test_code_for_drf_error_maps_drf_codes() -> None: assert ( - AddOrUpdateRequestSerializer._code_for_drf_error("organization", "required") + AddOrUpdateRequestSerializer._code_for_drf_error( + RequestField.ORGANIZATION, DrfValidationCode.REQUIRED + ) == BoostEndpointErrorCode.REQUIRED_FIELD ) assert ( AddOrUpdateRequestSerializer._code_for_drf_error( - "add_or_update", "not_a_list", subfield="zh_Hans" + RequestField.ADD_OR_UPDATE, + DrfValidationCode.NOT_A_LIST, + subfield="zh_Hans", ) == BoostEndpointErrorCode.INVALID_SUBMODULE_LIST ) assert ( AddOrUpdateRequestSerializer._code_for_drf_error( - "add_or_update", "empty", subfield="zh_Hans" + RequestField.ADD_OR_UPDATE, DrfValidationCode.EMPTY, subfield="zh_Hans" ) == BoostEndpointErrorCode.INVALID_SUBMODULE_LIST ) assert ( - AddOrUpdateRequestSerializer._code_for_drf_error("add_or_update", "empty") + AddOrUpdateRequestSerializer._code_for_drf_error( + RequestField.ADD_OR_UPDATE, DrfValidationCode.EMPTY + ) == BoostEndpointErrorCode.REQUIRED_FIELD ) @@ -157,3 +165,70 @@ def test_add_or_update_serializer_missing_required_fields() -> None: assert all( e["metadata"].get("drf_code") == "required" for e in ser.structured_errors ) + + +def test_add_or_update_serializer_rejects_invalid_organization() -> None: + ser = AddOrUpdateRequestSerializer( + data={ + "organization": "bad/org", + "version": "v", + "add_or_update": {"zh_Hans": ["json"]}, + } + ) + assert not ser.is_valid() + assert BoostEndpointErrorCode.INVALID_CLONE_URL.value in _error_codes( + ser.structured_errors + ) + + +def test_add_or_update_serializer_rejects_invalid_submodule_segment() -> None: + ser = AddOrUpdateRequestSerializer( + data={ + "organization": "o", + "version": "v", + "add_or_update": {"zh_Hans": ["../evil"]}, + } + ) + assert not ser.is_valid() + assert BoostEndpointErrorCode.INVALID_SUBMODULE.value in _error_codes( + ser.structured_errors + ) + + +def test_add_or_update_serializer_accumulates_custom_errors() -> None: + ser = AddOrUpdateRequestSerializer( + data={ + "organization": "bad/org", + "version": "v", + "add_or_update": {"zh_Hans": ["../evil"]}, + } + ) + assert not ser.is_valid() + codes = _error_codes(ser.structured_errors) + assert BoostEndpointErrorCode.INVALID_CLONE_URL.value in codes + assert BoostEndpointErrorCode.INVALID_SUBMODULE.value in codes + fields = {e["metadata"]["field"] for e in ser.structured_errors} + assert fields == {"organization", "add_or_update"} + + +def test_invalid_organization_still_flattens_other_drf_errors() -> None: + ser = AddOrUpdateRequestSerializer( + data={ + "organization": "bad/org", + "add_or_update": {"zh_Hans": ["json"]}, + } + ) + assert not ser.is_valid() + codes = _error_codes(ser.structured_errors) + assert BoostEndpointErrorCode.INVALID_CLONE_URL.value in codes + assert BoostEndpointErrorCode.REQUIRED_FIELD.value in codes + org_errors = [ + e for e in ser.structured_errors if e["metadata"]["field"] == "organization" + ] + version_errors = [ + e for e in ser.structured_errors if e["metadata"]["field"] == "version" + ] + assert len(org_errors) == 1 + assert org_errors[0]["code"] == BoostEndpointErrorCode.INVALID_CLONE_URL.value + assert len(version_errors) == 1 + assert version_errors[0]["metadata"]["drf_code"] == "required" diff --git a/tests/endpoint/test_services.py b/tests/endpoint/test_services.py index 354ea56..f3ce70f 100644 --- a/tests/endpoint/test_services.py +++ b/tests/endpoint/test_services.py @@ -13,6 +13,7 @@ from unittest.mock import MagicMock, patch import pytest +from django.core.exceptions import ValidationError from boost_weblate.endpoint.errors import BoostEndpointErrorCode from boost_weblate.endpoint.services import ( @@ -309,6 +310,14 @@ def test_multiple_files_multiple_configs(self) -> None: class TestCloneRepository: def setup_method(self) -> None: self.svc = _make_svc() + self._clone_url_patcher = patch( + "boost_weblate.endpoint.services.github_https_clone_url", + side_effect=lambda org, sub: f"https://github.com/{org}/{sub}.git", + ) + self._clone_url_patcher.start() + + def teardown_method(self) -> None: + self._clone_url_patcher.stop() def test_successful_clone_returns_true(self) -> None: mock_result = MagicMock() @@ -366,6 +375,14 @@ def fake_run(cmd, **_kwargs): assert "-b" in cmd assert "mybranch" in cmd + def test_rejects_invalid_clone_url(self) -> None: + with patch( + "boost_weblate.endpoint.services.github_https_clone_url", + side_effect=ValidationError("bad url"), + ): + with pytest.raises(ValidationError, match="bad url"): + self.svc.clone_repository("mylib", "/tmp/mylib", "main") + # --------------------------------------------------------------------------- # get_or_create_project (mocks Project ORM) @@ -469,6 +486,11 @@ def fake_get_or_create(**kwargs): class TestCreateOrUpdateComponent: def setup_method(self) -> None: self.svc = _make_svc(lang_code="zh_Hans", version="1.0") + self._ssh_url_patcher = patch( + "boost_weblate.endpoint.services.github_ssh_repo_url", + side_effect=lambda org, sub: f"git@github.com:{org}/{sub}.git", + ) + self._ssh_url_patcher.start() self.project = MagicMock() self.project.pk = 1 self.project.slug = "boost-json-documentation-zh_hans" @@ -481,6 +503,9 @@ def setup_method(self) -> None: "file_format": "adoc", } + def teardown_method(self) -> None: + self._ssh_url_patcher.stop() + def test_missing_config_keys_returns_none(self) -> None: bad_config = {"component_slug": "x"} # missing many keys result, created = self.svc.create_or_update_component( @@ -983,6 +1008,23 @@ def test_clone_failure_returns_error_result(self, tmp_path) -> None: assert result["success"] is False assert _has_error_code(result["errors"], BoostEndpointErrorCode.CLONE_FAILED) + def test_clone_url_validation_failure_returns_invalid_clone_url( + self, tmp_path + ) -> None: + with patch.object( + self.svc, + "clone_repository", + side_effect=ValidationError("allowlist rejected"), + ): + result = self.svc.process_submodule("json", str(tmp_path)) + assert result["success"] is False + assert _has_error_code( + result["errors"], BoostEndpointErrorCode.INVALID_CLONE_URL + ) + assert not _has_error_code( + result["errors"], BoostEndpointErrorCode.CLONE_FAILED + ) + def test_no_docs_found_returns_error(self, tmp_path) -> None: with ( patch.object(self.svc, "clone_repository", return_value=True), @@ -1101,7 +1143,15 @@ def test_invalid_submodule_name_path_traversal(self, tmp_path) -> None: result = self.svc.process_submodule("../evil", str(tmp_path)) assert result["success"] is False assert _has_error_code( - result["errors"], BoostEndpointErrorCode.INVALID_SUBMODULE + result["errors"], BoostEndpointErrorCode.INVALID_CLONE_URL + ) + + def test_invalid_organization_rejected_before_clone(self, tmp_path) -> None: + svc = _make_svc(organization="bad/org") + result = svc.process_submodule("json", str(tmp_path)) + assert result["success"] is False + assert _has_error_code( + result["errors"], BoostEndpointErrorCode.INVALID_CLONE_URL ) diff --git a/tests/endpoint/test_validators.py b/tests/endpoint/test_validators.py new file mode 100644 index 0000000..0bd0085 --- /dev/null +++ b/tests/endpoint/test_validators.py @@ -0,0 +1,242 @@ +# SPDX-FileCopyrightText: 2026 Andrew Zhang +# +# SPDX-License-Identifier: BSL-1.0 + +"""Tests for clone URL validation (SSRF mitigation).""" + +from __future__ import annotations + +import socket +from unittest.mock import patch + +import pytest +from django.core.exceptions import ValidationError +from django.test import override_settings + +from boost_weblate.endpoint.validators import ( + github_https_clone_url, + github_ssh_repo_url, + validate_clone_url, + validate_repo_segment, +) + +_PUBLIC_IP = "8.8.8.8" +_PRIVATE_IPS = ("10.0.0.1", "172.16.0.1", "192.168.1.1", "127.0.0.1", "169.254.169.254") + + +def _mock_getaddrinfo_public(*_args, **_kwargs): + return [ + ( + socket.AF_INET, + socket.SOCK_STREAM, + socket.IPPROTO_TCP, + "", + (_PUBLIC_IP, 0), + ) + ] + + +def _mock_getaddrinfo_ip(ip: str): + def _getaddrinfo(*_args, **_kwargs): + return [ + ( + socket.AF_INET, + socket.SOCK_STREAM, + socket.IPPROTO_TCP, + "", + (ip, 0), + ) + ] + + return _getaddrinfo + + +def _mock_getaddrinfo_rebind(*_args, **_kwargs): + calls = {"n": 0} + + def _getaddrinfo(*_a, **_kw): + calls["n"] += 1 + ip = _PUBLIC_IP if calls["n"] == 1 else "10.0.0.1" + return [ + ( + socket.AF_INET, + socket.SOCK_STREAM, + socket.IPPROTO_TCP, + "", + (ip, 0), + ) + ] + + return _getaddrinfo + + +class TestValidateRepoSegment: + @pytest.mark.parametrize( + "name", + ["CppDigest", "json", "my-lib", "foo.bar", "Boost_1_90"], + ) + def test_accepts_safe_segments(self, name: str) -> None: + assert validate_repo_segment(name, field="submodule") == name + + @pytest.mark.parametrize( + "name", + ["", "../evil", "foo/bar", "user@host", "foo:bar", "foo bar", "foo%bar"], + ) + def test_rejects_unsafe_segments(self, name: str) -> None: + with pytest.raises(ValidationError): + validate_repo_segment(name, field="organization") + + +class TestValidateCloneUrlBadScheme: + @pytest.mark.parametrize( + "url", + [ + "file:///etc/passwd", + "ftp://example.com/repo.git", + "gopher://example.com/repo.git", + "http://github.com/boostorg/json.git", + "ssh://git@github.com/boostorg/json.git", + ], + ) + def test_rejects_non_https_schemes(self, url: str) -> None: + with pytest.raises(ValidationError): + validate_clone_url(url) + + +class TestValidateCloneUrlPrivateIp: + @pytest.mark.parametrize("ip", _PRIVATE_IPS) + def test_rejects_private_resolved_ips(self, ip: str) -> None: + url = "https://github.com/boostorg/json.git" + with ( + patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_ip(ip), + ), + pytest.raises(ValidationError), + ): + validate_clone_url(url) + + @pytest.mark.parametrize("ip", _PRIVATE_IPS) + def test_rejects_literal_private_host(self, ip: str) -> None: + url = f"https://{ip}/repo.git" + with pytest.raises(ValidationError): + validate_clone_url(url) + + +class TestValidateCloneUrlUnspecifiedIp: + @pytest.mark.parametrize("ip", ("0.0.0.0", "::")) + def test_rejects_unspecified_resolved_ips(self, ip: str) -> None: + url = "https://github.com/boostorg/json.git" + with ( + patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_ip(ip), + ), + pytest.raises(ValidationError, match="unspecified"), + ): + validate_clone_url(url) + + @pytest.mark.parametrize("ip", ("0.0.0.0", "::")) + def test_rejects_literal_unspecified_host(self, ip: str) -> None: + url = f"https://[{ip}]/repo.git" if ":" in ip else f"https://{ip}/repo.git" + with pytest.raises(ValidationError, match="unspecified"): + validate_clone_url(url) + + +class TestValidateCloneUrlAllowlist: + @override_settings(ALLOWED_CLONE_HOSTS=["github.com"]) + def test_rejects_non_allowlisted_host(self) -> None: + url = "https://evil.com/org/repo.git" + with ( + patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_public, + ), + pytest.raises(ValidationError, match="ALLOWED_CLONE_HOSTS"), + ): + validate_clone_url(url) + + @override_settings(ALLOWED_CLONE_HOSTS=["github.com"]) + def test_accepts_allowlisted_host(self) -> None: + url = "https://github.com/boostorg/json.git" + with patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_public, + ): + assert validate_clone_url(url) == url + + @override_settings(ALLOWED_CLONE_HOSTS=["github.com"]) + def test_accepts_subdomain_of_allowlisted_host(self) -> None: + url = "https://api.github.com/org/repo.git" + with patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_public, + ): + assert validate_clone_url(url) == url + + @override_settings(ALLOWED_CLONE_HOSTS=[]) + def test_empty_allowlist_skips_host_check(self) -> None: + url = "https://example.com/org/repo.git" + with patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_public, + ): + assert validate_clone_url(url) == url + + +class TestValidateCloneUrlDnsRebinding: + @override_settings(ALLOWED_CLONE_HOSTS=["github.com"]) + def test_rejects_changed_addresses_between_lookups(self) -> None: + url = "https://github.com/boostorg/json.git" + with ( + patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_rebind(), + ), + pytest.raises(ValidationError, match="rebinding"), + ): + validate_clone_url(url) + + +class TestValidateCloneUrlHappyPath: + @override_settings(ALLOWED_CLONE_HOSTS=["github.com"]) + def test_https_github_url(self) -> None: + url = "https://github.com/boostorg/json.git" + with patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_public, + ): + assert validate_clone_url(url) == url + + @override_settings(ALLOWED_CLONE_HOSTS=["github.com"]) + def test_scp_ssh_url_normalized(self) -> None: + scp = "git@github.com:boostorg/json.git" + with patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_public, + ): + assert validate_clone_url(scp) == "https://github.com/boostorg/json.git" + + +class TestGithubUrlBuilders: + @override_settings(ALLOWED_CLONE_HOSTS=["github.com"]) + def test_github_https_clone_url(self) -> None: + with patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_public, + ): + url = github_https_clone_url("boostorg", "json") + assert url == "https://github.com/boostorg/json.git" + + @override_settings(ALLOWED_CLONE_HOSTS=["github.com"]) + def test_github_ssh_repo_url(self) -> None: + with patch( + "boost_weblate.endpoint.validators.socket.getaddrinfo", + side_effect=_mock_getaddrinfo_public, + ): + url = github_ssh_repo_url("boostorg", "json") + assert url == "git@github.com:boostorg/json.git" + + def test_builders_reject_bad_segments(self) -> None: + with pytest.raises(ValidationError): + github_https_clone_url("bad/org", "json") diff --git a/tests/test_settings_override.py b/tests/test_settings_override.py index 4a689b2..0a52030 100644 --- a/tests/test_settings_override.py +++ b/tests/test_settings_override.py @@ -6,9 +6,12 @@ from __future__ import annotations +import importlib import importlib.util from pathlib import Path +import pytest + _QBK = "boost_weblate.formats.quickbook.QuickBookFormat" @@ -75,3 +78,24 @@ def test_merge_boost_endpoint_throttle_rates_preserves_upstream() -> None: assert rates["anon"] == "100/day" assert rates["info"] == BOOST_ENDPOINT_THROTTLE_RATES["info"] assert rates["add-or-update"] == BOOST_ENDPOINT_THROTTLE_RATES["add-or-update"] + + +def test_allowed_clone_hosts_default(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("BOOST_ALLOWED_CLONE_HOSTS", raising=False) + + import boost_weblate.settings_override as so + + importlib.reload(so) + + assert so.allowed_clone_hosts() == ["github.com"] + assert so.ALLOWED_CLONE_HOSTS == ["github.com"] + + +def test_allowed_clone_hosts_parses_env(monkeypatch: pytest.MonkeyPatch) -> None: + from boost_weblate.settings_override import allowed_clone_hosts + + monkeypatch.setenv("BOOST_ALLOWED_CLONE_HOSTS", "GitHub.com, GitLab.com") + assert allowed_clone_hosts() == ["github.com", "gitlab.com"] + + monkeypatch.setenv("BOOST_ALLOWED_CLONE_HOSTS", "") + assert allowed_clone_hosts() == []