From 8c6ee45b1d4235c92a46acb2e5b6631e664572f1 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 15:21:46 -0400 Subject: [PATCH 01/16] design: machine-readable validate output with store/reload Design plan for enhancing `dandi validate` with: - Structured output formats (-f json/json_pp/json_lines/yaml) - Auto-save _validation.jsonl sidecar alongside .log files - --load to reload/re-render stored results with different groupings - Upload validation persistence for later inspection - Extended grouping options (severity, id, validator, standard, dandiset) - Refactoring into dandi/validate/ subpackage (git mv separately) - _record_version field on ValidationResult for forward compatibility - VisiData integration via native JSONL support Addresses #1515, #1753, #1748; enhances #1743. Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- .../specs/validate-machine-readable-output.md | 900 ++++++++++++++++++ 1 file changed, 900 insertions(+) create mode 100644 .specify/specs/validate-machine-readable-output.md diff --git a/.specify/specs/validate-machine-readable-output.md b/.specify/specs/validate-machine-readable-output.md new file mode 100644 index 000000000..904e0f0e2 --- /dev/null +++ b/.specify/specs/validate-machine-readable-output.md @@ -0,0 +1,900 @@ +# Design: Machine-Readable Validation Output with Store/Reload + +## Summary + +Add machine-readable output formats to `dandi validate`, support saving validation +results to disk, and reloading/re-rendering them later with different grouping, +filtering, and display options. 
This enables workflows like: + +- Sweeping all BIDS example datasets or all DANDI datasets, storing results +- Storing validation errors during `upload` for later inspection +- Re-rendering stored results with alternative groupings/filters without re-running validation + +## Related Issues & PRs + +| # | Title | Status | Relevance | +|---|-------|--------|-----------| +| [#1515](https://github.com/dandi/dandi-cli/issues/1515) | `validate: Add -f\|--format option` | Open | Direct: requests JSON/YAML output formats | +| [#1753](https://github.com/dandi/dandi-cli/issues/1753) | Provide easy means for introspecting upload validation failures | Open | Direct: decouple execution from rendering via JSON dumps | +| [#1743](https://github.com/dandi/dandi-cli/pull/1743) | Add filtering of issues by type/ID or by file location | Draft | Related: post-validation filtering, blocked pending store/cache | +| [#1737](https://github.com/dandi/dandi-cli/issues/1737) | `upload,validate: Add --validators option` | Open | Related: selective validator control | +| [#1748](https://github.com/dandi/dandi-cli/issues/1748) | Tidy up the validate command function | Open | Prerequisite: refactor before adding options | +| [#1448](https://github.com/dandi/dandi-cli/issues/1448) | Align validation with server | Open | Consistency between client/server validation | +| [#1624](https://github.com/dandi/dandi-cli/pull/1624) | Serialize Severity by name | Merged | Prerequisite: meaningful JSON Severity output | +| [#1514](https://github.com/dandi/dandi-cli/pull/1514) | Overhaul validation results (Pydantic Origins) | Merged | Foundation: Pydantic-based models, JSON-ready | +| [#1619](https://github.com/dandi/dandi-cli/pull/1619) | Deduplicate validation results | Merged | Foundation: no duplicate records | +| [#1597](https://github.com/dandi/dandi-cli/issues/1597) | Replace bidsschematools with bids-validator-deno | Closed | Mentions filtering + result summaries as follow-ups | + +## Current State + 
+### What exists + +- **`ValidationResult`** (Pydantic `BaseModel` in `validate_types.py`): fully JSON-serializable + with `model_dump(mode="json")` and `model_validate_json()` round-trip support. + Fields: `id`, `origin` (validator+standard+versions), `scope`, `severity`, `message`, + `path`, `dandiset_path`, `dataset_path`, `asset_paths`, `within_asset_paths`, + `path_regex`, `metadata`. The `origin_result` field is excluded from serialization. + **No schema/format version field exists on the model itself.** +- **`Severity_`** annotated type serializes as `"ERROR"`, `"WARNING"`, etc. (not numeric). +- **`Origin`** is a Pydantic `BaseModel` with `type`, `validator`, `validator_version`, + `standard`, `standard_version`, `standard_schema_version`. +- **CLI output** is human-only: colored text via `click.secho()` with grouping by + `none`/`path`, filtering by `--min-severity` and `--ignore REGEX`. +- **`dandi ls`** already supports `-f json|json_pp|json_lines|yaml` via `formatter.py` + with `JSONFormatter`, `JSONLinesFormatter`, `YAMLFormatter` context-manager classes. +- **Upload** runs `get_validation_errors()` per-file but only logs/raises on errors; + no structured output is saved. +- **Log files** are written to `platformdirs.user_log_dir("dandi-cli", "dandi")` + (typically `~/.local/share/logs/dandi-cli/`) with pattern + `YYYY.MM.DD-HH.MM.SSZ-PID.log`. The path is stored in `ctx.obj.logfile` in the + Click context (set in `command.py:main()`). +- **Current module layout**: `dandi/validate.py` + `dandi/validate_types.py` as + top-level modules. As more validation modules are anticipated (I/O, reporting, + additional validator backends), this needs to become a subpackage. + +### What's missing + +1. No `--format` option on `validate` (only human-readable output) +2. No `--output` to save results to a file +3. No `--load` to reload previously-saved results +4. No summary statistics +5. No grouping by `validator`, `severity`, `id`, `standard`, etc. +6. 
Upload validation results are not persisted for later review +7. No shared utility for writing/reading validation JSONL files +8. No schema version on `ValidationResult` for forward-compatible serialization +9. Validation modules are flat files, not a subpackage + +### Importers of current validation modules + +The following files import from `validate.py` or `validate_types.py` and will +need import path updates after the subpackage refactoring: + +- `dandi/cli/cmd_validate.py` — `from ..validate import validate`; `from ..validate_types import ...` +- `dandi/cli/tests/test_cmd_validate.py` — `from ...validate_types import ...` +- `dandi/cli/tests/test_command.py` — `from ..cmd_validate import validate` +- `dandi/upload.py` — `from .validate_types import Severity` +- `dandi/organize.py` — `from .validate_types import ...` +- `dandi/pynwb_utils.py` — `from .validate_types import ...` +- `dandi/files/bases.py` — `from dandi.validate_types import ...` +- `dandi/files/bids.py` — `from ..validate_types import ...`; `from dandi.validate import validate_bids` +- `dandi/files/zarr.py` — `from ..validate_types import ...` +- `dandi/bids_validator_deno/_validator.py` — `from dandi.validate_types import ...` +- `dandi/tests/test_validate.py` — `from ..validate import validate`; `from ..validate_types import ...` +- `dandi/tests/test_validate_types.py` — `from dandi.validate_types import ...` + +## Design Principles + +### Separation of concerns (per #1753) + +Three decoupled stages: + +1. **Execution**: Run validators, produce `ValidationResult` objects +2. **Serialization**: Dump results to JSONL (the interchange format) +3. **Rendering**: Display results with grouping, filtering, formatting + +Currently stages 1+3 are coupled in `cmd_validate.py`. This design introduces +stage 2 and makes stage 3 work from either stage 1 (live) or stage 2 (loaded). 
+ +``` +Live validation: validate() → [ValidationResult] → filter/group → render + ↓ + save(sidecar) + +Stored results: load(files) → [ValidationResult] → filter/group → render + +Upload results: upload() → [ValidationResult] → save(sidecar) + ↓ + load(sidecar) → filter/group → render +``` + +### Uniform output across all formats — no envelope/non-envelope split + +All structured formats (`json`, `json_pp`, `json_lines`, `yaml`) emit the +**same data** — a flat list of `ValidationResult` records. No format gets a +special envelope wrapper that others lack. This avoids having two schemas to +maintain and document. + +**JSONL**: one `ValidationResult` per line (pure results, `cat`-composable): + +```bash +cat results/*.jsonl | jq 'select(.severity == "ERROR")' # just works +wc -l results/*.jsonl # = issue count +grep BIDS.NON_BIDS results/*.jsonl # fast text search +vd results/*.jsonl # instant tabular view +``` + +**VisiData integration**: VisiData natively loads `.jsonl` as tabular data — +each `ValidationResult` becomes a row with columns for `id`, `severity`, +`path`, `origin.validator`, `message`, etc. This gives immediate interactive +sorting, filtering, frequency tables, and pivoting with no extra code. Since +VisiData is Python-based, future integration is possible (e.g., a `dandi` +VisiData plugin that adds custom commands for grouping by dandiset, linking +to BIDS spec rules, or opening the offending file). + +**JSON / JSON pretty-printed**: a JSON array of `ValidationResult` objects: + +```json +[ + {"id": "BIDS.NON_BIDS_PATH_PLACEHOLDER", "origin": {...}, "severity": "ERROR", ...}, + {"id": "NWBI.check_data_orientation", "origin": {...}, "severity": "WARNING", ...} +] +``` + +**YAML**: same array structure in YAML syntax. + +Summary statistics are handled separately via `--summary` (human output) or +post-processing (`jq`, `--load` + `--summary`). They are not baked into the +serialized data. 
### Prior art: bids-validator `output.json`

The bids-validator ([example](https://github.com/bids-standard/bids-examples/blob/3aa560cc/2d_mb_pcasl/derivatives/bids-validator/output.json))
uses an envelope format:

```json
{
  "issues": {
    "issues": [{"code": "...", "severity": "warning", "location": "...", ...}],
    "codeMessages": {"JSON_KEY_RECOMMENDED": "A JSON file is missing..."}
  },
  "summary": {"subjects": [...], "totalFiles": 11, "schemaVersion": "1.2.1", ...}
}
```

Their format is a **final report** — one monolithic JSON per dataset. Our design
differs intentionally: we produce **composable JSONL records** that can be
concatenated across datasets, piped through standard Unix tools, and loaded
into tabular tools like VisiData. The `_record_version` field on each record
provides the self-describing property that their envelope provides at the
file level.

### Schema version on `ValidationResult`

Add a `_record_version` field to `ValidationResult` to enable forward-compatible
deserialization. This lets loaders detect and handle older record formats
gracefully:

```python
class ValidationResult(BaseModel):
    _record_version: str = "1"  # schema version for this record format
    id: str
    origin: Origin
    # ... rest of fields
```

Serialized as `"_record_version": "1"` in every JSON line. The loader can
check this and warn/adapt if it encounters a newer version. The underscore
prefix signals it is metadata about the record format, not validation content.

> **Implementation note**: Pydantic v2 treats underscore-prefixed class
> attributes as *private attributes* and omits them from `model_dump()` /
> `model_dump_json()`, so the declaration above would not actually appear in
> the serialized output. To emit `"_record_version"`, declare the field under
> a public name with a serialization alias — e.g.
> `record_version: str = Field(default="1", alias="_record_version",
> serialization_alias="_record_version")` — and dump with `by_alias=True`.

### Grouping is human-rendering only

Structured output always emits a flat results list. `--grouping` only affects
human-readable display. Downstream tools can trivially group with
`jq group_by(.id)` etc., and a stable flat schema is more useful than a format
that varies by grouping option.
+ +### Orthogonal options + +`--format` and `--output` are independent: +- `--format` controls serialization format (default: `human`) +- `--output` controls destination (default: stdout) +- Neither implies the other — if `--output` is given without `--format`, + the format must be specified explicitly (error otherwise, since writing + colored human text to a file is not useful) + +## Design + +### Step 0a: Refactor into `dandi/validate/` subpackage + +**Goal**: Convert flat validation modules into a proper subpackage to +accommodate growing validation functionality (I/O, reporting, future validators). + +**Critical**: The `git mv` must be committed **separately** from any content +changes to preserve git rename tracking. + +#### Commit 1: Pure rename (git mv only) + +```bash +mkdir -p dandi/validate +git mv dandi/validate.py dandi/validate/core.py +git mv dandi/validate_types.py dandi/validate/types.py +# Create __init__.py that re-exports everything for backwards compatibility +``` + +`dandi/validate/__init__.py` (created, not moved): + +```python +"""Validation of DANDI datasets against schemas and standards. 
+ +This package provides validation functionality for dandisets, including: +- DANDI schema validation +- BIDS standard validation +- File layout and organization validation +- Metadata completeness checking +""" +# Re-export public API for backwards compatibility +from .core import validate, validate_bids # noqa: F401 +from .types import ( # noqa: F401 + ORIGIN_INTERNAL_DANDI, + ORIGIN_VALIDATION_DANDI, + ORIGIN_VALIDATION_DANDI_LAYOUT, + ORIGIN_VALIDATION_DANDI_ZARR, + Origin, + OriginType, + Scope, + Severity, + Severity_, + Standard, + ValidationResult, + Validator, +) +``` + +This `__init__.py` means **all existing imports continue to work unchanged**: +- `from dandi.validate import validate` — works (via `__init__.py`) +- `from dandi.validate_types import ValidationResult` — **breaks**, needs update + +#### Commit 2: Update all import paths + +Update all importers listed above to use the new subpackage paths: +- `from dandi.validate_types import X` → `from dandi.validate.types import X` + (or `from dandi.validate import X` where re-exported) +- `from dandi.validate import validate` — already works via `__init__.py` +- `from ..validate_types import X` → `from ..validate.types import X` + +Also move test files: +- `dandi/tests/test_validate.py` → `dandi/validate/tests/test_core.py` +- `dandi/tests/test_validate_types.py` → `dandi/validate/tests/test_types.py` + +#### Resulting subpackage layout + +``` +dandi/validate/ +├── __init__.py # Re-exports for backwards compat +├── core.py # validate(), validate_bids() — was validate.py +├── types.py # ValidationResult, Origin, etc. 
— was validate_types.py +├── io.py # NEW: JSONL read/write utilities +└── tests/ + ├── __init__.py + ├── test_core.py # was dandi/tests/test_validate.py + ├── test_types.py # was dandi/tests/test_validate_types.py + └── test_io.py # NEW: tests for I/O utilities +``` + +### Step 0b: Refactor `cmd_validate.py` (addresses #1748) + +Before adding new options, decompose the `validate()` CLI function. It currently +handles argument parsing, validation execution, filtering, and rendering in one +function. Extract: + +- `_collect_results()` — run validation, return `list[ValidationResult]` +- `_filter_results()` — apply `--min-severity`, `--ignore` +- `_render_results()` — dispatch to human or structured output + +This keeps complexity per function under the LAD threshold (max-complexity 10) +and makes it natural to slot in `--load` as an alternative to `_collect_results()`. + +Adds `@click.pass_context` to `validate()` to access `ctx.obj.logfile`. + +### Step 0c: Add `_record_version` to `ValidationResult` + +Add the schema version field to `ValidationResult` in `dandi/validate/types.py`: + +```python +class ValidationResult(BaseModel): + _record_version: str = "1" + id: str + origin: Origin + # ... existing fields unchanged +``` + +This is a backwards-compatible addition — existing serialized records without +`_record_version` will deserialize fine (Pydantic fills the default). The loader +in `io.py` can log a debug message if it encounters an unknown version. + +### Shared I/O utility: `dandi/validate/io.py` + +New module shared by `cmd_validate.py` and `upload.py`: + +```python +"""Read and write validation results in JSONL format. + +JSONL files contain one ValidationResult per line — pure results, no envelope. +This makes them fully cat-composable, grep-searchable, and jq-processable. 
+""" + +from __future__ import annotations + +import logging +from pathlib import Path + +from .types import ValidationResult + +lgr = logging.getLogger(__name__) + +CURRENT_RECORD_VERSION = "1" + + +def write_validation_jsonl( + results: list[ValidationResult], + path: str | Path, +) -> Path: + """Write validation results to a JSONL file (one result per line). + + Parameters + ---------- + results : list[ValidationResult] + Validation results to write. + path : str | Path + Output file path. + + Returns + ------- + Path + The path written to. + """ + path = Path(path) + with open(path, "w") as f: + for r in results: + f.write(r.model_dump_json() + "\n") + return path + + +def append_validation_jsonl( + results: list[ValidationResult], + path: str | Path, +) -> None: + """Append validation results to an existing JSONL file.""" + path = Path(path) + with open(path, "a") as f: + for r in results: + f.write(r.model_dump_json() + "\n") + + +def load_validation_jsonl(*paths: str | Path) -> list[ValidationResult]: + """Load and concatenate validation results from JSONL files. + + Each line must be a JSON-serialized ValidationResult. Blank lines and + lines that fail to parse as ValidationResult are skipped with a warning. + + Parameters + ---------- + *paths + One or more JSONL file paths. + + Returns + ------- + list[ValidationResult] + Concatenated results from all files. 
+ """ + results: list[ValidationResult] = [] + for p in paths: + p = Path(p) + with open(p) as f: + for lineno, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + try: + vr = ValidationResult.model_validate_json(line) + except Exception: + lgr.debug( + "Skipping unrecognized line %d in %s", lineno, p + ) + continue + rv = getattr(vr, "_record_version", CURRENT_RECORD_VERSION) + if rv != CURRENT_RECORD_VERSION: + lgr.debug( + "Record version %s (expected %s) at %s:%d", + rv, CURRENT_RECORD_VERSION, p, lineno, + ) + results.append(vr) + return results + + +def validation_sidecar_path(logfile: str | Path) -> Path: + """Derive the validation sidecar path from a log file path. + + Given ``2026.03.19-14.30.00Z-12345.log``, returns + ``2026.03.19-14.30.00Z-12345_validation.jsonl`` in the same directory. + """ + logfile = Path(logfile) + return logfile.with_name(logfile.stem + "_validation.jsonl") +``` + +### Phase 1a: Format output (`-f/--format`) — addresses #1515 + +**Goal**: Add structured output formats to `dandi validate`. + +#### CLI changes + +```python +@click.option( + "--format", "-f", + "output_format", # avoid shadowing builtin + help="Output format.", + type=click.Choice(["human", "json", "json_pp", "json_lines", "yaml"]), + default="human", +) +``` + +#### Structured output schema (uniform across formats) + +All structured formats emit a flat list of `ValidationResult` records. 
+ +**json / json_pp** — JSON array: +```json +[ + { + "_record_version": "1", + "id": "BIDS.NON_BIDS_PATH_PLACEHOLDER", + "origin": { + "type": "VALIDATION", + "validator": "bids-validator-deno", + "validator_version": "2.0.6", + "standard": "BIDS", + "standard_version": "1.9.0", + "standard_schema_version": "0.11.3" + }, + "scope": "file", + "severity": "ERROR", + "message": "File does not match any pattern known to BIDS.", + "path": "sub-01/anat/junk.txt", + "dandiset_path": "/data/dandiset001", + "dataset_path": "/data/dandiset001", + "asset_paths": null, + "within_asset_paths": null, + "path_regex": null, + "metadata": null + } +] +``` + +**json_lines** — one record per line (same fields, `cat`-composable): +``` +{"_record_version":"1","id":"BIDS.NON_BIDS_PATH_PLACEHOLDER","origin":{...},...} +{"_record_version":"1","id":"NWBI.check_data_orientation","origin":{...},...} +``` + +**yaml** — YAML list of the same records. + +No envelope in any format. Summary is a separate concern (see Phase 1c). + +#### Implementation approach + +Reuse `formatter.py` infrastructure. `ValidationResult.model_dump(mode="json")` +produces a dict compatible with existing `JSONFormatter`/`YAMLFormatter`. +The `validate()` CLI function collects all results into a list (already does +for filtering), then dispatches to `display_errors()` (human) or structured +formatter. + +### Phase 1b: File output (`-o/--output`) + auto-save sidecar + +**Goal**: Write results to file and auto-save a `_validation.jsonl` sidecar +alongside the `.log` file. + +#### CLI changes + +```python +@click.option( + "--output", "-o", + help="Write output to file instead of stdout. 
" + "Requires --format to be set to a structured format.", + type=click.Path(dir_okay=False, writable=True), + default=None, +) +``` + +Validation: +- If `--output` is given and `--format` is `human` (default), raise + `click.UsageError("--output requires --format to be set to a structured format")` + +#### Auto-save sidecar + +`dandi validate` writes a `_validation.jsonl` sidecar next to its log file +**by default**, but **skips the sidecar when `--output` is specified** (the +user has already chosen where to save structured results — no need for a +redundant copy): + +```python +# After collecting all_results: +if all_results and not output: + sidecar = validation_sidecar_path(ctx.obj.logfile) + write_validation_jsonl(all_results, sidecar) + lgr.info("Validation results saved to %s", sidecar) +``` + +Empty results (clean validation) also skip the sidecar to avoid clutter. +The sidecar accumulation rate matches the already-existing `.log` files. + +### Phase 1c: Summary flag (`--summary`) + +**Goal**: Add summary statistics as a display option, decoupled from the +serialized data format. + +```python +@click.option( + "--summary / --no-summary", + help="Show summary statistics after validation output.", + default=False, +) +``` + +Human output appends: + +``` +Summary: + Total issues: 42 + By severity: ERROR: 5, WARNING: 12, HINT: 25 + By validator: bids-validator-deno: 30, nwbinspector: 10, dandi: 2 + Files with errors: 8/150 +``` + +For structured formats, `--summary` appends a summary object **to stderr** +(keeping stdout as pure machine-parseable results), or is simply computed by +the consumer from the flat results list. The summary is never mixed into the +structured output stream itself. + +### Phase 2: Load and Re-Render (`--load`) — addresses #1753 + +**Goal**: Reload previously-saved validation results and apply all display options. 
+ +#### CLI option + +```python +@click.option( + "--load", + help="Load validation results from JSONL file(s) instead of running " + "validation. Accepts multiple --load flags. Shell glob expansion " + "is supported (e.g. results/*.jsonl).", + type=click.Path(exists=True, dir_okay=False), + multiple=True, + default=(), +) +``` + +#### Mutual exclusivity with paths + +`--load` and positional `paths` are mutually exclusive. Enforced explicitly: + +```python +if load and paths: + raise click.UsageError( + "--load and PATH arguments are mutually exclusive. " + "Use --load to re-render stored results, or provide paths to validate." + ) +``` + +#### Behavior + +- When `--load` is specified, skip all validation execution +- Use `load_validation_jsonl()` from `dandi/validate/io.py` to read and concatenate +- Apply `--min-severity`, `--ignore`, `--grouping`, `--format`, `--output` identically +- **Exit code**: reflects the loaded results — non-zero if any ERROR-severity + issues are present after filtering. Rationale: the user is asking "are there + errors in these results?" and the answer should be in the exit code regardless + of whether validation was live or loaded. +- Records with unknown `_record_version` are loaded with a debug-level warning + but not rejected (forward-compatible reading) + +#### Example workflows + +```bash +# Run validation (auto-saves _validation.jsonl sidecar) +dandi validate /data/dandiset001 + +# Re-render the auto-saved results with different filters +dandi validate --load ~/.local/share/logs/dandi-cli/*_validation.jsonl \ + --min-severity ERROR +dandi validate --load results.jsonl --grouping path +dandi validate --load results.jsonl -f json_pp --ignore "NWBI\." 
+ +# Cross-dataset sweep: validate many, then analyze together +for ds in bids-examples/*/; do + dandi validate "$ds" -f json_lines -o "results/$(basename $ds).jsonl" || true +done +dandi validate --load results/*.jsonl --grouping id --min-severity ERROR +``` + +Note: shell glob expansion is done by the shell, not by Click. `--load +results/*.jsonl` works because the shell expands the glob into multiple +`--load` arguments before the CLI sees them. + +### Phase 3: Upload validation sidecar — addresses #1753 + +**Goal**: Persist all validation results from `dandi upload` for later inspection. + +#### The problem with `ctx.obj.logfile` in upload + +The `upload()` function in `upload.py` is both a Python API and a CLI backend. +The Click context (`ctx.obj.logfile`) is only available via CLI. The upload +function's validation happens inside a deeply nested generator (`_upload_item`). + +**Solution**: Pass the sidecar path as an optional parameter to `upload()`: + +```python +def upload( + paths: Sequence[str | Path] | None = None, + existing: UploadExisting = UploadExisting.REFRESH, + validation: UploadValidation = UploadValidation.REQUIRE, + # ... existing params ... + validation_log_path: str | Path | None = None, # NEW +) -> None: +``` + +The CLI wrapper (`cmd_upload.py`) derives the sidecar path from `ctx.obj.logfile` +and passes it in. Programmatic callers can pass their own path or leave it as +`None` (no sidecar written). 
+ +#### Implementation + +```python +# In cmd_upload.py, the CLI function: +sidecar = validation_sidecar_path(ctx.obj.logfile) +upload(..., validation_log_path=sidecar) + +# In upload.py, inside the upload loop: +# Collect all validation results as each file is validated +if validation_log_path is not None: + append_validation_jsonl(validation_statuses, validation_log_path) +``` + +After the upload completes (or if it fails due to validation errors): + +```python +if validation_log_path and Path(validation_log_path).exists(): + lgr.info( + "Validation results saved to %s\n" + " Use `dandi validate --load %s` to review.", + validation_log_path, validation_log_path, + ) +``` + +Uses `append_validation_jsonl()` from `dandi/validate/io.py` — the file is +opened in append mode for each batch, allowing incremental writes as files are +validated during upload without holding all results in memory. + +### Phase 4: Extended grouping options — enhances #1743 work + +**Goal**: Support additional grouping strategies for human-readable output. + +Extend `--grouping` from `{none, path}` to: + +| Value | Groups by | Use case | +|-------|-----------|----------| +| `none` | No grouping (flat list) | Default, simple review | +| `path` | File/dataset path | Per-file review | +| `severity` | Severity level | Triage by priority | +| `id` | Error ID | Find most common issues | +| `validator` | Validator name | Per-tool review | +| `standard` | Standard (BIDS/NWB/etc) | Per-standard review | +| `dandiset` | Dandiset path | Multi-dandiset sweeps | + +Human output with grouping adds section headers and counts: + +``` +=== ERROR (5 issues) === +[BIDS.NON_BIDS_PATH_PLACEHOLDER] sub-01/junk.txt — File does not match... +... + +=== WARNING (12 issues) === +[NWBI.check_data_orientation] sub-01/sub-01.nwb — Data may not be... +... +``` + +**Structured output is unaffected** — always a flat results list regardless +of `--grouping`. 
Downstream tools group trivially: +`jq -s 'group_by(.origin.validator)'`. + +### Phase 5: Cross-dataset sweep support and tooling integration (optional) + +- Helper script or command for batch validation across directories +- Consider a `dandi validate-report` subcommand for aggregate analysis + across multiple loaded JSONL files (cross-tabulation, top-N errors, etc.) +- **VisiData plugin**: Since VisiData is Python-based and already opens JSONL + natively, a `dandi` VisiData plugin could add: + - Custom column extractors for nested `origin.*` fields (flattened for tabular view) + - Frequency sheet by error ID / validator / standard + - Keybinding to open the offending file or jump to BIDS spec rule + - Integration with `dandi validate --load` for round-trip editing + (e.g., mark false positives, export filtered subset) + +## Path Serialization + +Paths in `ValidationResult` are serialized as-is (whatever `Path.__str__` +produces). For portability across machines: + +- **Recommendation**: When the path is within a dandiset, make it relative + to `dandiset_path`. When within a BIDS dataset, make it relative to + `dataset_path`. This is not a requirement for Phase 1 but should be + considered for Phase 2+ to make loaded results meaningful across machines. 
+ +## Use Case: Sweeping BIDS Example Datasets / All DANDI Datasets + +```bash +# Validate all BIDS examples, save results per-dataset +for ds in bids-examples/*/; do + dandi validate "$ds" -f json_lines -o "results/$(basename $ds).jsonl" || true +done + +# Combine and analyze with jq +cat results/*.jsonl | jq 'select(.severity == "ERROR")' | jq -s 'group_by(.id)' + +# Or reload individual results with different views +dandi validate --load results/ds001.jsonl --grouping id --min-severity ERROR +dandi validate --load results/ds001.jsonl -f json_pp + +# Reload ALL results across datasets +dandi validate --load results/*.jsonl --grouping id --min-severity ERROR --summary + +# Interactive exploration with VisiData — sort, filter, pivot, frequency tables +vd results/*.jsonl +``` + +For all DANDI datasets: + +```bash +for dandiset_id in $(dandi ls -f json_lines https://dandiarchive.org/ | jq -r '.identifier'); do + dandi download "DANDI:${dandiset_id}/draft" -o "/data/${dandiset_id}" --download dandiset.yaml + dandi validate "/data/${dandiset_id}" -f json_lines -o "results/${dandiset_id}.jsonl" || true +done + +# Aggregate cross-dandiset analysis +cat results/*.jsonl | jq -s ' + group_by(.id) + | map({id: .[0].id, count: length, severity: .[0].severity}) + | sort_by(-.count) +' +``` + +## Implementation Order + +### Step 0a: Refactor into `dandi/validate/` subpackage + +- `git mv dandi/validate.py dandi/validate/core.py` — committed alone +- `git mv dandi/validate_types.py dandi/validate/types.py` — committed alone +- Create `dandi/validate/__init__.py` with re-exports for backwards compat +- Separate commit: update all import paths across the codebase +- Separate commit: move test files to `dandi/validate/tests/` +- All existing tests must pass after each commit + +### Step 0b: Refactor `cmd_validate.py` — addresses #1748 + +- Extract `_collect_results()`, `_filter_results()`, `_render_results()` +- Keep `validate()` CLI function as thin orchestrator +- Existing 
behavior unchanged, existing tests must pass +- Add `@click.pass_context` to `validate()` to access `ctx.obj.logfile` + +### Step 0c: Add `_record_version` to `ValidationResult` + +- Add `_record_version: str = "1"` field +- Verify serialization round-trip includes it +- No behavioral change to existing code + +### Step 1a: `--format` (structured output to stdout) — addresses #1515 + +- Add `--format` option with choices `human|json|json_pp|json_lines|yaml` +- All structured formats emit flat list of ValidationResult records +- Reuse `formatter.py` infrastructure for JSON/YAML +- Tests: CliRunner for each format, round-trip serialization + +### Step 1b: `--output` + auto-save sidecar + +- Add `--output` option, enforce `--format` must be structured +- Create `dandi/validate/io.py` with shared `write_validation_jsonl()`, + `append_validation_jsonl()`, `load_validation_jsonl()`, + `validation_sidecar_path()` +- Auto-save `_validation.jsonl` sidecar when results are non-empty and + `--output` is not specified (user already has their output file) +- Tests: file creation, sidecar naming, empty-results no-op, sidecar + suppressed when `--output` is used + +### Step 1c: `--summary` + +- Add `--summary` flag for human output +- For structured formats, summary to stderr (not mixed into data stream) +- Tests: summary output format, counts accuracy + +### Step 2: `--load` (reload and re-render) — addresses #1753 + +- Add `--load` option (multiple, mutually exclusive with paths) +- Use `load_validation_jsonl()` from shared utility +- All filtering/grouping/format options work on loaded results +- Exit code reflects loaded results +- `_record_version` checked with debug-level warning for unknown versions +- Tests: load + filter, multi-file concatenation, mutual exclusivity error, + forward-compatible loading of unknown versions + +### Step 3: Upload validation sidecar — addresses #1753 + +- Add `validation_log_path` parameter to `upload()` +- CLI wrapper passes sidecar path 
derived from `ctx.obj.logfile` +- Use `append_validation_jsonl()` for incremental writes +- Announce sidecar path on validation errors +- Tests: sidecar creation during upload, content correctness + +### Step 4: Extended grouping (human-only) — enhances #1743 + +- Add `severity`, `id`, `validator`, `standard`, `dandiset` grouping options +- Implement section headers + counts for human output +- Structured output unchanged (always flat) +- This subsumes some of the filtering work in draft PR #1743 + +### Step 5: Cross-dataset sweep support (optional) + +- Helper script or `dandi validate-report` subcommand +- Aggregate analysis across multiple JSONL files + +## Backwards Compatibility + +- `dandi validate` with no new options behaves identically to today +- Exit code semantics preserved: non-zero if any ERROR-severity issues +- When using `--format` other than `human`, colored output is suppressed +- When using `--load`, exit code reflects loaded results +- The auto-save sidecar is the only new side effect; it writes when there + are results and `--output` is not specified (for `validate`), or always + (for `upload`). Follows the same lifecycle as the existing `.log` files +- The `dandi/validate/` subpackage `__init__.py` re-exports all public API, + so `from dandi.validate import validate` continues to work. Only direct + `from dandi.validate_types import ...` imports need updating. 
+ +## Testing Strategy + +| Component | Test type | Approach | +|-----------|-----------|----------| +| Subpackage refactor | Smoke | All existing tests pass after `git mv` + import updates | +| `--format` output | CLI unit | `click.CliRunner`, assert JSON structure, round-trip | +| `dandi/validate/io.py` | Unit | Write → load round-trip, append, empty files, corrupt lines | +| `_record_version` | Unit | Serialization includes field, loader handles missing/unknown | +| `--load` | CLI unit | Load from fixture files, multi-file concat, mutual exclusivity | +| `--output` | CLI unit | Verify file creation, content matches stdout format | +| Sidecar auto-save | CLI unit | Verify `_validation.jsonl` created next to mock logfile | +| Upload sidecar | Integration | Upload with Docker Compose fixture, verify sidecar | +| Extended grouping | CLI unit | Each grouping value, section headers, counts | +| Summary | CLI unit | Verify counts match actual results | +| Edge cases | Unit | Empty results, None severity, very long paths, Unicode messages | + +All new tests marked `@pytest.mark.ai_generated`. 
+ +## File Inventory + +| File | Change | +|------|--------| +| `dandi/validate/__init__.py` | **New** — subpackage with re-exports | +| `dandi/validate/core.py` | **Renamed** from `dandi/validate.py` | +| `dandi/validate/types.py` | **Renamed** from `dandi/validate_types.py` + add `_record_version` | +| `dandi/validate/io.py` | **New** — shared JSONL read/write utilities | +| `dandi/validate/tests/__init__.py` | **New** | +| `dandi/validate/tests/test_core.py` | **Renamed** from `dandi/tests/test_validate.py` | +| `dandi/validate/tests/test_types.py` | **Renamed** from `dandi/tests/test_validate_types.py` | +| `dandi/validate/tests/test_io.py` | **New** — tests for I/O utilities | +| `dandi/cli/cmd_validate.py` | Refactor + add `--format`, `--output`, `--load`, `--summary`, grouping extensions | +| `dandi/upload.py` | Add `validation_log_path` parameter, write sidecar | +| `dandi/cli/cmd_upload.py` | Pass sidecar path to `upload()` | +| `dandi/cli/tests/test_cmd_validate.py` | Extend with format/load/output/summary tests | +| `dandi/pynwb_utils.py` | Update imports | +| `dandi/organize.py` | Update imports | +| `dandi/files/bases.py` | Update imports | +| `dandi/files/bids.py` | Update imports | +| `dandi/files/zarr.py` | Update imports | +| `dandi/bids_validator_deno/_validator.py` | Update imports | From 3ab47413dbb5c5b61393e1086306b4eaf586a3f4 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 15:53:32 -0400 Subject: [PATCH 02/16] refactor: git mv validate.py and validate_types.py into dandi/validate/ subpackage Pure file move with no content changes, plus __init__.py re-exports for backward compatibility. Imports will be updated in the next commit. 
Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/validate/__init__.py | 46 +++++++++++++++++++ dandi/{validate.py => validate/core.py} | 0 .../{validate_types.py => validate/types.py} | 0 3 files changed, 46 insertions(+) create mode 100644 dandi/validate/__init__.py rename dandi/{validate.py => validate/core.py} (100%) rename dandi/{validate_types.py => validate/types.py} (100%) diff --git a/dandi/validate/__init__.py b/dandi/validate/__init__.py new file mode 100644 index 000000000..ce70a48d9 --- /dev/null +++ b/dandi/validate/__init__.py @@ -0,0 +1,46 @@ +"""Validation of DANDI datasets against schemas and standards. + +This subpackage provides validation functionality for dandisets, including: +- DANDI schema validation +- BIDS standard validation +- File layout and organization validation +- Metadata completeness checking + +Submodules: +- core: Main validation functions (validate, validate_bids) +- types: Data types and models (ValidationResult, Origin, Severity, etc.) 
+- io: JSONL read/write utilities for validation results +""" + +from .core import validate, validate_bids +from .types import ( + ORIGIN_INTERNAL_DANDI, + ORIGIN_VALIDATION_DANDI, + ORIGIN_VALIDATION_DANDI_LAYOUT, + ORIGIN_VALIDATION_DANDI_ZARR, + Origin, + OriginType, + Scope, + Severity, + Severity_, + Standard, + ValidationResult, + Validator, +) + +__all__ = [ + "ORIGIN_INTERNAL_DANDI", + "ORIGIN_VALIDATION_DANDI", + "ORIGIN_VALIDATION_DANDI_LAYOUT", + "ORIGIN_VALIDATION_DANDI_ZARR", + "Origin", + "OriginType", + "Scope", + "Severity", + "Severity_", + "Standard", + "ValidationResult", + "Validator", + "validate", + "validate_bids", +] diff --git a/dandi/validate.py b/dandi/validate/core.py similarity index 100% rename from dandi/validate.py rename to dandi/validate/core.py diff --git a/dandi/validate_types.py b/dandi/validate/types.py similarity index 100% rename from dandi/validate_types.py rename to dandi/validate/types.py From 7f80ca2ab7e4c8fc6f03497fbd5c9578cdc28c8f Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 15:54:30 -0400 Subject: [PATCH 03/16] refactor: update all import paths for dandi/validate/ subpackage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update imports across 13 files to use the new subpackage structure: - dandi.validate_types → dandi.validate.types - dandi.validate → dandi.validate.core (for explicit imports) - Relative imports adjusted accordingly Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/bids_validator_deno/_validator.py | 2 +- dandi/cli/cmd_validate.py | 4 ++-- dandi/cli/tests/test_cmd_validate.py | 2 +- dandi/files/bases.py | 2 +- dandi/files/bids.py | 4 ++-- dandi/files/zarr.py | 4 ++-- dandi/organize.py | 2 +- dandi/pynwb_utils.py | 2 +- dandi/tests/test_bids_validator_deno/test_validator.py | 2 +- dandi/tests/test_validate.py | 4 ++-- dandi/tests/test_validate_types.py | 2 +- dandi/upload.py | 2 +- dandi/validate/core.py | 8 ++++---- 13 
files changed, 20 insertions(+), 20 deletions(-) diff --git a/dandi/bids_validator_deno/_validator.py b/dandi/bids_validator_deno/_validator.py index d97f953db..e811f2c5a 100644 --- a/dandi/bids_validator_deno/_validator.py +++ b/dandi/bids_validator_deno/_validator.py @@ -13,7 +13,7 @@ from pydantic import DirectoryPath, validate_call from dandi.utils import find_parent_directory_containing -from dandi.validate_types import ( +from dandi.validate.types import ( Origin, OriginType, Scope, diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index b0b5012f5..c8c104341 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -11,8 +11,8 @@ from .base import devel_debug_option, devel_option, map_to_click_exceptions from ..utils import pluralize -from ..validate import validate as validate_ -from ..validate_types import Severity, ValidationResult +from ..validate.core import validate as validate_ +from ..validate.types import Severity, ValidationResult @click.command() diff --git a/dandi/cli/tests/test_cmd_validate.py b/dandi/cli/tests/test_cmd_validate.py index 6694ef3ec..978bd7e91 100644 --- a/dandi/cli/tests/test_cmd_validate.py +++ b/dandi/cli/tests/test_cmd_validate.py @@ -5,7 +5,7 @@ from ..cmd_validate import _process_issues, validate from ...tests.xfail import mark_xfail_windows_python313_posixsubprocess -from ...validate_types import ( +from ...validate.types import ( Origin, OriginType, Scope, diff --git a/dandi/files/bases.py b/dandi/files/bases.py index 15bc616c4..b5be008bf 100644 --- a/dandi/files/bases.py +++ b/dandi/files/bases.py @@ -29,7 +29,7 @@ from dandi.metadata.core import get_default_metadata from dandi.misctypes import DUMMY_DANDI_ETAG, Digest, LocalReadableFile, P from dandi.utils import post_upload_size_check, pre_upload_size_check, yaml_load -from dandi.validate_types import ( +from dandi.validate.types import ( ORIGIN_INTERNAL_DANDI, ORIGIN_VALIDATION_DANDI, Origin, diff --git a/dandi/files/bids.py 
b/dandi/files/bids.py index 7e43163ec..bcda0f6d2 100644 --- a/dandi/files/bids.py +++ b/dandi/files/bids.py @@ -16,7 +16,7 @@ from ..consts import ZARR_MIME_TYPE, dandiset_metadata_file from ..metadata.core import add_common_metadata, prepare_metadata from ..misctypes import Digest -from ..validate_types import ( +from ..validate.types import ( ORIGIN_VALIDATION_DANDI_LAYOUT, Scope, Severity, @@ -92,7 +92,7 @@ def _get_metadata(self) -> None: with self._lock: if self._asset_metadata is None: # Import here to avoid circular import - from dandi.validate import validate_bids + from dandi.validate.core import validate_bids # === Validate the dataset using bidsschematools === # This is done to obtain the metadata for each asset in the dataset diff --git a/dandi/files/zarr.py b/dandi/files/zarr.py index efb1efd2d..805ce3a16 100644 --- a/dandi/files/zarr.py +++ b/dandi/files/zarr.py @@ -10,10 +10,10 @@ import json import os import os.path -import urllib.parse from pathlib import Path from time import sleep from typing import Any, Optional +import urllib.parse from dandischema.models import BareAsset, DigestType from pydantic import BaseModel, ConfigDict, ValidationError @@ -47,7 +47,7 @@ ) from .bases import LocalDirectoryAsset -from ..validate_types import ( +from ..validate.types import ( ORIGIN_VALIDATION_DANDI_ZARR, Origin, OriginType, diff --git a/dandi/organize.py b/dandi/organize.py index ac313fe86..4ba0047ba 100644 --- a/dandi/organize.py +++ b/dandi/organize.py @@ -43,7 +43,7 @@ pluralize, yaml_load, ) -from .validate_types import ( +from .validate.types import ( ORIGIN_VALIDATION_DANDI_LAYOUT, Scope, Severity, diff --git a/dandi/pynwb_utils.py b/dandi/pynwb_utils.py index 7ffe552a8..5010725e4 100644 --- a/dandi/pynwb_utils.py +++ b/dandi/pynwb_utils.py @@ -42,7 +42,7 @@ ) from .misctypes import Readable from .utils import get_module_version, is_url -from .validate_types import ( +from .validate.types import ( Origin, OriginType, Scope, diff --git 
a/dandi/tests/test_bids_validator_deno/test_validator.py b/dandi/tests/test_bids_validator_deno/test_validator.py index d0e812c49..2782c42f2 100644 --- a/dandi/tests/test_bids_validator_deno/test_validator.py +++ b/dandi/tests/test_bids_validator_deno/test_validator.py @@ -27,7 +27,7 @@ ) from dandi.consts import dandiset_metadata_file from dandi.tests.fixtures import BIDS_TESTDATA_SELECTION -from dandi.validate_types import ( +from dandi.validate.types import ( OriginType, Scope, Severity, diff --git a/dandi/tests/test_validate.py b/dandi/tests/test_validate.py index edbf4ae8f..2a07a1809 100644 --- a/dandi/tests/test_validate.py +++ b/dandi/tests/test_validate.py @@ -7,8 +7,8 @@ from .fixtures import BIDS_TESTDATA_SELECTION from .. import __version__ from ..consts import dandiset_metadata_file -from ..validate import validate -from ..validate_types import ( +from ..validate.core import validate +from ..validate.types import ( Origin, OriginType, Scope, diff --git a/dandi/tests/test_validate_types.py b/dandi/tests/test_validate_types.py index eb43e251a..9b3b3abf1 100644 --- a/dandi/tests/test_validate_types.py +++ b/dandi/tests/test_validate_types.py @@ -6,7 +6,7 @@ from pydantic import ValidationError import pytest -from dandi.validate_types import ( +from dandi.validate.types import ( Origin, OriginType, Scope, diff --git a/dandi/upload.py b/dandi/upload.py index 1694aee23..324e31129 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -49,7 +49,7 @@ from .support import pyout as pyouts from .support.pyout import naturalsize from .utils import ensure_datetime, path_is_subpath, pluralize -from .validate_types import Severity +from .validate.types import Severity def _check_dandidownload_paths(dfile: DandiFile) -> None: diff --git a/dandi/validate/core.py b/dandi/validate/core.py index c32000dab..f30cf95b5 100644 --- a/dandi/validate/core.py +++ b/dandi/validate/core.py @@ -13,10 +13,7 @@ import os from pathlib import Path -from .consts import 
dandiset_metadata_file -from .files import find_dandi_files -from .utils import find_parent_directory_containing -from .validate_types import ( +from .types import ( ORIGIN_VALIDATION_DANDI_LAYOUT, Origin, OriginType, @@ -26,6 +23,9 @@ ValidationResult, Validator, ) +from ..consts import dandiset_metadata_file +from ..files import find_dandi_files +from ..utils import find_parent_directory_containing BIDS_TO_DANDI = { "subject": "subject_id", From ddd5c3e9a14b87037801955f4aa3f41f403253ba Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 15:55:04 -0400 Subject: [PATCH 04/16] refactor: move validation tests into dandi/validate/tests/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - test_validate.py → dandi/validate/tests/test_core.py - test_validate_types.py → dandi/validate/tests/test_types.py - Update relative imports in moved test files - Fix circular import: don't eagerly import core in __init__.py Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/validate/__init__.py | 7 ++++--- dandi/validate/tests/__init__.py | 0 .../test_validate.py => validate/tests/test_core.py} | 10 +++++----- .../tests/test_types.py} | 0 4 files changed, 9 insertions(+), 8 deletions(-) create mode 100644 dandi/validate/tests/__init__.py rename dandi/{tests/test_validate.py => validate/tests/test_core.py} (97%) rename dandi/{tests/test_validate_types.py => validate/tests/test_types.py} (100%) diff --git a/dandi/validate/__init__.py b/dandi/validate/__init__.py index ce70a48d9..659b81765 100644 --- a/dandi/validate/__init__.py +++ b/dandi/validate/__init__.py @@ -10,9 +10,12 @@ - core: Main validation functions (validate, validate_bids) - types: Data types and models (ValidationResult, Origin, Severity, etc.) - io: JSONL read/write utilities for validation results + +Note: core is NOT eagerly imported here to avoid circular imports +(core → dandi.files → dandi.validate.types → dandi.validate.__init__). 
+Import from dandi.validate.core directly for validate/validate_bids. """ -from .core import validate, validate_bids from .types import ( ORIGIN_INTERNAL_DANDI, ORIGIN_VALIDATION_DANDI, @@ -41,6 +44,4 @@ "Standard", "ValidationResult", "Validator", - "validate", - "validate_bids", ] diff --git a/dandi/validate/tests/__init__.py b/dandi/validate/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dandi/tests/test_validate.py b/dandi/validate/tests/test_core.py similarity index 97% rename from dandi/tests/test_validate.py rename to dandi/validate/tests/test_core.py index 2a07a1809..64c32090d 100644 --- a/dandi/tests/test_validate.py +++ b/dandi/validate/tests/test_core.py @@ -4,11 +4,8 @@ import pytest -from .fixtures import BIDS_TESTDATA_SELECTION -from .. import __version__ -from ..consts import dandiset_metadata_file -from ..validate.core import validate -from ..validate.types import ( +from ..core import validate +from ..types import ( Origin, OriginType, Scope, @@ -17,6 +14,9 @@ ValidationResult, Validator, ) +from ... import __version__ +from ...consts import dandiset_metadata_file +from ...tests.fixtures import BIDS_TESTDATA_SELECTION def test_validate_nwb_error(simple3_nwb: Path) -> None: diff --git a/dandi/tests/test_validate_types.py b/dandi/validate/tests/test_types.py similarity index 100% rename from dandi/tests/test_validate_types.py rename to dandi/validate/tests/test_types.py From c962bf1ac2ee0ab70b4ee7c73ba599848d0bd5d4 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 16:03:30 -0400 Subject: [PATCH 05/16] refactor: extract _collect_results() and _filter_results() from validate CLI Decompose the monolithic validate() click command into helpers: - _collect_results(): runs validation and collects results - _filter_results(): applies min-severity and ignore filters - _process_issues(): simplified, no longer handles ignore (moved to _filter) No behavior changes; all existing tests pass unchanged. 
Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/cli/cmd_validate.py | 94 +++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index c8c104341..bc86b4982 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -1,6 +1,5 @@ from __future__ import annotations -from collections.abc import Iterable import logging import os import re @@ -15,6 +14,57 @@ from ..validate.types import Severity, ValidationResult +def _collect_results( + paths: tuple[str, ...], + schema: str | None, + devel_debug: bool, + allow_any_path: bool, +) -> list[ValidationResult]: + """Run validation and collect all results into a list.""" + # Avoid heavy import by importing within function: + from ..pynwb_utils import ignore_benign_pynwb_warnings + + # Don't log validation warnings, as this command reports them to the user + # anyway: + root = logging.getLogger() + for h in root.handlers: + h.addFilter(lambda r: not getattr(r, "validating", False)) + + if not paths: + paths = (os.curdir,) + # below we are using load_namespaces but it causes HDMF to whine if there + # is no cached name spaces in the file. 
It is benign but not really useful + # at this point, so we ignore it although ideally there should be a formal + # way to get relevant warnings (not errors) from PyNWB + ignore_benign_pynwb_warnings() + + return list( + validate_( + *paths, + schema_version=schema, + devel_debug=devel_debug, + allow_any_path=allow_any_path, + ) + ) + + +def _filter_results( + results: list[ValidationResult], + min_severity: str, + ignore: str | None, +) -> list[ValidationResult]: + """Filter results by minimum severity and ignore pattern.""" + min_severity_value = Severity[min_severity].value + filtered = [ + r + for r in results + if r.severity is not None and r.severity.value >= min_severity_value + ] + if ignore is not None: + filtered = [r for r in filtered if not re.search(ignore, r.id)] + return filtered + + @click.command() @click.option( "--schema", help="Validate against new BIDS schema version.", metavar="VERSION" @@ -102,49 +152,15 @@ def validate( Exits with non-0 exit code if any file is not compliant. """ - # Avoid heavy import by importing within function: - from ..pynwb_utils import ignore_benign_pynwb_warnings - - # Don't log validation warnings, as this command reports them to the user - # anyway: - root = logging.getLogger() - for h in root.handlers: - h.addFilter(lambda r: not getattr(r, "validating", False)) - - if not paths: - paths = (os.curdir,) - # below we are using load_namespaces but it causes HDMF to whine if there - # is no cached name spaces in the file. 
It is benign but not really useful - # at this point, so we ignore it although ideally there should be a formal - # way to get relevant warnings (not errors) from PyNWB - ignore_benign_pynwb_warnings() - - validator_result = validate_( - *paths, - schema_version=schema, - devel_debug=devel_debug, - allow_any_path=allow_any_path, - ) - - min_severity_value = Severity[min_severity].value - - filtered_results = [ - i - for i in validator_result - if i.severity is not None and i.severity.value >= min_severity_value - ] - - _process_issues(filtered_results, grouping, ignore) + results = _collect_results(paths, schema, devel_debug, allow_any_path) + filtered = _filter_results(results, min_severity, ignore) + _process_issues(filtered, grouping) def _process_issues( - validator_result: Iterable[ValidationResult], + issues: list[ValidationResult], grouping: str, - ignore: str | None = None, ) -> None: - issues = [i for i in validator_result if i.severity is not None] - if ignore is not None: - issues = [i for i in issues if not re.search(ignore, i.id)] purviews = [i.purview for i in issues] if grouping == "none": display_errors( From af45790af0b2eaab765eb00c5118b3cb052f7cd2 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 16:09:17 -0400 Subject: [PATCH 06/16] feat: add record_version field to ValidationResult Add record_version: str = "1" for forward-compatible serialization. Uses no underscore prefix since Pydantic v2 excludes underscore-prefixed fields from serialization. 
Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/validate/__init__.py | 2 ++ dandi/validate/tests/test_types.py | 24 ++++++++++++++++++++++++ dandi/validate/types.py | 6 ++++++ 3 files changed, 32 insertions(+) diff --git a/dandi/validate/__init__.py b/dandi/validate/__init__.py index 659b81765..2066ac89c 100644 --- a/dandi/validate/__init__.py +++ b/dandi/validate/__init__.py @@ -17,6 +17,7 @@ """ from .types import ( + CURRENT_RECORD_VERSION, ORIGIN_INTERNAL_DANDI, ORIGIN_VALIDATION_DANDI, ORIGIN_VALIDATION_DANDI_LAYOUT, @@ -32,6 +33,7 @@ ) __all__ = [ + "CURRENT_RECORD_VERSION", "ORIGIN_INTERNAL_DANDI", "ORIGIN_VALIDATION_DANDI", "ORIGIN_VALIDATION_DANDI_LAYOUT", diff --git a/dandi/validate/tests/test_types.py b/dandi/validate/tests/test_types.py index 9b3b3abf1..259b5f4f0 100644 --- a/dandi/validate/tests/test_types.py +++ b/dandi/validate/tests/test_types.py @@ -7,6 +7,7 @@ import pytest from dandi.validate.types import ( + CURRENT_RECORD_VERSION, Origin, OriginType, Scope, @@ -156,3 +157,26 @@ def test_invalid_severity_validation(self, invalid_severity: Any) -> None: assert excinfo.value.error_count() == 1 assert excinfo.value.errors()[0]["loc"][-1] == "severity" + + @pytest.mark.ai_generated + def test_record_version_serialization(self) -> None: + """Test that record_version is included in JSON serialization.""" + r = ValidationResult( + id="foo", + origin=FOO_ORIGIN, + scope=Scope.FILE, + severity=Severity.ERROR, + ) + assert r.record_version == CURRENT_RECORD_VERSION + + json_dump = r.model_dump(mode="json") + assert json_dump["record_version"] == CURRENT_RECORD_VERSION + + json_str = r.model_dump_json() + json_dict = json.loads(json_str) + assert json_dict["record_version"] == CURRENT_RECORD_VERSION + + # Round-trip + r2 = ValidationResult.model_validate_json(json_str) + assert r2.record_version == CURRENT_RECORD_VERSION + assert r == r2 diff --git a/dandi/validate/types.py b/dandi/validate/types.py index 8356a0d51..30d9fa61d 100644 --- 
a/dandi/validate/types.py +++ b/dandi/validate/types.py @@ -180,7 +180,13 @@ class Scope(Enum): DATASET = "dataset" +CURRENT_RECORD_VERSION = "1" + + class ValidationResult(BaseModel): + record_version: str = CURRENT_RECORD_VERSION + """Version of the serialized record format for forward compatibility""" + id: str origin: Origin From 9db6f30c0b1fb5c4fc31875bb27ae17a87daf5e7 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 16:19:14 -0400 Subject: [PATCH 07/16] feat: add --format option to dandi validate CLI Add -f/--format {human,json,json_pp,json_lines,yaml} to produce structured output using existing formatter infrastructure. Structured formats suppress colored text and 'No errors found' message. Exit code still reflects validation results. Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/cli/cmd_validate.py | 52 +++++++++++++++++++++++- dandi/cli/tests/test_cmd_validate.py | 60 ++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index bc86b4982..f129fbf84 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -3,16 +3,20 @@ import logging import os import re +import sys from typing import cast import warnings import click from .base import devel_debug_option, devel_option, map_to_click_exceptions +from .formatter import JSONFormatter, JSONLinesFormatter, YAMLFormatter from ..utils import pluralize from ..validate.core import validate as validate_ from ..validate.types import Severity, ValidationResult +STRUCTURED_FORMATS = ("json", "json_pp", "json_lines", "yaml") + def _collect_results( paths: tuple[str, ...], @@ -136,6 +140,14 @@ def validate_bids( type=click.Choice([i.name for i in Severity], case_sensitive=True), default="HINT", ) +@click.option( + "--format", + "-f", + "output_format", + help="Output format.", + type=click.Choice(["human", "json", "json_pp", "json_lines", "yaml"]), + default="human", +) 
@click.argument("paths", nargs=-1, type=click.Path(exists=True, dir_okay=True)) @devel_debug_option() @map_to_click_exceptions @@ -144,6 +156,7 @@ def validate( ignore: str | None, grouping: str, min_severity: str, + output_format: str = "human", schema: str | None = None, devel_debug: bool = False, allow_any_path: bool = False, @@ -154,7 +167,44 @@ def validate( """ results = _collect_results(paths, schema, devel_debug, allow_any_path) filtered = _filter_results(results, min_severity, ignore) - _process_issues(filtered, grouping) + + if output_format == "human": + _process_issues(filtered, grouping) + else: + _render_structured(filtered, output_format, sys.stdout) + _exit_if_errors(filtered) + + +def _get_formatter(output_format: str, out: object = None): + """Create a formatter for the given output format.""" + if output_format == "json": + return JSONFormatter(out=out) + elif output_format == "json_pp": + return JSONFormatter(indent=2, out=out) + elif output_format == "json_lines": + return JSONLinesFormatter(out=out) + elif output_format == "yaml": + return YAMLFormatter(out=out) + else: + raise ValueError(f"Unknown format: {output_format}") + + +def _render_structured( + results: list[ValidationResult], + output_format: str, + out: object, +) -> None: + """Render validation results in a structured format.""" + formatter = _get_formatter(output_format, out=out) + with formatter: + for r in results: + formatter(r.model_dump(mode="json")) + + +def _exit_if_errors(results: list[ValidationResult]) -> None: + """Raise SystemExit(1) if any result has severity >= ERROR.""" + if any(r.severity is not None and r.severity >= Severity.ERROR for r in results): + raise SystemExit(1) def _process_issues( diff --git a/dandi/cli/tests/test_cmd_validate.py b/dandi/cli/tests/test_cmd_validate.py index 978bd7e91..fe06c34ec 100644 --- a/dandi/cli/tests/test_cmd_validate.py +++ b/dandi/cli/tests/test_cmd_validate.py @@ -1,7 +1,9 @@ +import json from pathlib import Path from 
click.testing import CliRunner import pytest +import ruamel.yaml from ..cmd_validate import _process_issues, validate from ...tests.xfail import mark_xfail_windows_python313_posixsubprocess @@ -149,3 +151,61 @@ def test_validate_bids_error_grouping_notification( # Does it notify the user correctly? notification_substring = "Invalid value for '--grouping'" assert notification_substring in r.output + + +@pytest.mark.ai_generated +def test_validate_format_json(simple2_nwb: Path) -> None: + """Test --format json outputs a valid JSON array.""" + r = CliRunner().invoke(validate, ["-f", "json", str(simple2_nwb)]) + assert r.exit_code == 1 # NO_DANDISET_FOUND is an error + data = json.loads(r.output) + assert isinstance(data, list) + assert len(data) >= 1 + # Check structure of first result + rec = data[0] + assert "id" in rec + assert "origin" in rec + assert "severity" in rec + assert "record_version" in rec + + +@pytest.mark.ai_generated +def test_validate_format_json_lines(simple2_nwb: Path) -> None: + """Test --format json_lines outputs one JSON object per line.""" + r = CliRunner().invoke(validate, ["-f", "json_lines", str(simple2_nwb)]) + assert r.exit_code == 1 + lines = [line for line in r.output.strip().split("\n") if line.strip()] + assert len(lines) >= 1 + for line in lines: + rec = json.loads(line) + assert "id" in rec + assert "severity" in rec + assert "record_version" in rec + + +@pytest.mark.ai_generated +def test_validate_format_yaml(simple2_nwb: Path) -> None: + """Test --format yaml outputs valid YAML.""" + r = CliRunner().invoke(validate, ["-f", "yaml", str(simple2_nwb)]) + assert r.exit_code == 1 + yaml = ruamel.yaml.YAML(typ="safe") + data = yaml.load(r.output) + assert isinstance(data, list) + assert len(data) >= 1 + rec = data[0] + assert "id" in rec + assert "severity" in rec + assert "record_version" in rec + + +@pytest.mark.ai_generated +def test_validate_format_no_errors_no_message(tmp_path: Path) -> None: + """Structured formats should not 
emit 'No errors found.' text.""" + (tmp_path / "dandiset.yaml").write_text( + "identifier: 12346\nname: Foo\ndescription: Dandiset Foo\n" + ) + r = CliRunner().invoke(validate, ["-f", "json", str(tmp_path)]) + assert r.exit_code == 0 + assert "No errors found" not in r.output + data = json.loads(r.output) + assert data == [] From 2128a868e61ab65e5cdbc09aa77cc36b33758ac0 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 16:21:49 -0400 Subject: [PATCH 08/16] feat: add --output option, io.py module, and auto-save sidecar - Create dandi/validate/io.py with write/append/load JSONL utilities and validation_sidecar_path() helper - Add -o/--output option to write structured output to file - Auto-save _validation.jsonl sidecar next to logfile when using structured format without --output Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/cli/cmd_validate.py | 36 +++++++++ dandi/cli/tests/test_cmd_validate.py | 30 +++++++ dandi/validate/io.py | 116 ++++++++++++++++++++++++++ dandi/validate/tests/test_io.py | 117 +++++++++++++++++++++++++++ 4 files changed, 299 insertions(+) create mode 100644 dandi/validate/io.py create mode 100644 dandi/validate/tests/test_io.py diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index f129fbf84..0681942f7 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -13,8 +13,11 @@ from .formatter import JSONFormatter, JSONLinesFormatter, YAMLFormatter from ..utils import pluralize from ..validate.core import validate as validate_ +from ..validate.io import validation_sidecar_path, write_validation_jsonl from ..validate.types import Severity, ValidationResult +lgr = logging.getLogger(__name__) + STRUCTURED_FORMATS = ("json", "json_pp", "json_lines", "yaml") @@ -148,15 +151,27 @@ def validate_bids( type=click.Choice(["human", "json", "json_pp", "json_lines", "yaml"]), default="human", ) +@click.option( + "--output", + "-o", + "output_file", + help="Write output to file instead of 
stdout. " + "Requires --format to be set to a structured format.", + type=click.Path(dir_okay=False, writable=True), + default=None, +) @click.argument("paths", nargs=-1, type=click.Path(exists=True, dir_okay=True)) +@click.pass_context @devel_debug_option() @map_to_click_exceptions def validate( + ctx: click.Context, paths: tuple[str, ...], ignore: str | None, grouping: str, min_severity: str, output_format: str = "human", + output_file: str | None = None, schema: str | None = None, devel_debug: bool = False, allow_any_path: bool = False, @@ -165,16 +180,37 @@ def validate( Exits with non-0 exit code if any file is not compliant. """ + if output_file is not None and output_format not in STRUCTURED_FORMATS: + raise click.UsageError( + "--output requires --format to be set to a structured format " + "(json, json_pp, json_lines, yaml)." + ) + results = _collect_results(paths, schema, devel_debug, allow_any_path) filtered = _filter_results(results, min_severity, ignore) if output_format == "human": _process_issues(filtered, grouping) + elif output_file is not None: + with open(output_file, "w") as fh: + _render_structured(filtered, output_format, fh) + lgr.info("Validation output written to %s", output_file) + _exit_if_errors(filtered) else: _render_structured(filtered, output_format, sys.stdout) + # Auto-save sidecar next to logfile + if filtered and hasattr(ctx, "obj") and ctx.obj is not None: + _auto_save_sidecar(filtered, ctx.obj.logfile) _exit_if_errors(filtered) +def _auto_save_sidecar(results: list[ValidationResult], logfile: str) -> None: + """Write validation sidecar JSONL next to the logfile.""" + sidecar = validation_sidecar_path(logfile) + write_validation_jsonl(results, sidecar) + lgr.info("Validation sidecar saved to %s", sidecar) + + def _get_formatter(output_format: str, out: object = None): """Create a formatter for the given output format.""" if output_format == "json": diff --git a/dandi/cli/tests/test_cmd_validate.py 
b/dandi/cli/tests/test_cmd_validate.py index fe06c34ec..77a1d74ee 100644 --- a/dandi/cli/tests/test_cmd_validate.py +++ b/dandi/cli/tests/test_cmd_validate.py @@ -209,3 +209,33 @@ def test_validate_format_no_errors_no_message(tmp_path: Path) -> None: assert "No errors found" not in r.output data = json.loads(r.output) assert data == [] + + +@pytest.mark.ai_generated +def test_validate_output_file(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --output writes to file instead of stdout.""" + outfile = tmp_path / "results.jsonl" + r = CliRunner().invoke( + validate, + ["-f", "json_lines", "-o", str(outfile), str(simple2_nwb)], + ) + assert r.exit_code == 1 + assert outfile.exists() + lines = outfile.read_text().strip().split("\n") + assert len(lines) >= 1 + rec = json.loads(lines[0]) + assert "id" in rec + # stdout should be empty (output went to file) + assert r.output.strip() == "" + + +@pytest.mark.ai_generated +def test_validate_output_requires_format(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --output without --format gives a usage error.""" + outfile = tmp_path / "results.jsonl" + r = CliRunner().invoke( + validate, + ["-o", str(outfile), str(simple2_nwb)], + ) + assert r.exit_code != 0 + assert "--output requires --format" in r.output diff --git a/dandi/validate/io.py b/dandi/validate/io.py new file mode 100644 index 000000000..57974ad52 --- /dev/null +++ b/dandi/validate/io.py @@ -0,0 +1,116 @@ +"""I/O utilities for validation results in JSONL format. + +Provides functions for writing, appending, and loading validation results +as JSONL (JSON Lines) files — one ValidationResult per line. 
+""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path + +from .types import CURRENT_RECORD_VERSION, ValidationResult + +lgr = logging.getLogger(__name__) + + +def write_validation_jsonl( + results: list[ValidationResult], + path: str | Path, +) -> Path: + """Write validation results to a JSONL file, overwriting if it exists. + + Parameters + ---------- + results + List of ValidationResult objects to write. + path + File path to write to. + + Returns + ------- + Path + The path written to (as a Path object). + """ + path = Path(path) + with path.open("w") as f: + for r in results: + f.write(r.model_dump_json()) + f.write("\n") + return path + + +def append_validation_jsonl( + results: list[ValidationResult], + path: str | Path, +) -> None: + """Append validation results to a JSONL file. + + Parameters + ---------- + results + List of ValidationResult objects to append. + path + File path to append to. Created if it does not exist. + """ + path = Path(path) + with path.open("a") as f: + for r in results: + f.write(r.model_dump_json()) + f.write("\n") + + +def load_validation_jsonl(*paths: str | Path) -> list[ValidationResult]: + """Load and concatenate validation results from one or more JSONL files. + + Parameters + ---------- + paths + One or more file paths to load from. + + Returns + ------- + list[ValidationResult] + All results from all files, in order. 
+ """ + results: list[ValidationResult] = [] + for p in paths: + p = Path(p) + with p.open() as f: + for line_no, line in enumerate(f, 1): + line = line.strip() + if not line: + continue + data = json.loads(line) + version = data.get("record_version", "0") + if version != CURRENT_RECORD_VERSION: + lgr.warning( + "%s:%d: record_version %r != current %r, " "loading anyway", + p, + line_no, + version, + CURRENT_RECORD_VERSION, + ) + results.append(ValidationResult.model_validate_json(line)) + return results + + +def validation_sidecar_path(logfile: str | Path) -> Path: + """Derive the validation sidecar path from a logfile path. + + The sidecar is placed next to the logfile with ``_validation.jsonl`` + appended to the stem. + + Parameters + ---------- + logfile + Path to the logfile. + + Returns + ------- + Path + Path to the sidecar file. + """ + logfile = Path(logfile) + return logfile.parent / (logfile.stem + "_validation.jsonl") diff --git a/dandi/validate/tests/test_io.py b/dandi/validate/tests/test_io.py new file mode 100644 index 000000000..fc694b1bb --- /dev/null +++ b/dandi/validate/tests/test_io.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from dandi.validate.io import ( + append_validation_jsonl, + load_validation_jsonl, + validation_sidecar_path, + write_validation_jsonl, +) +from dandi.validate.types import ( + Origin, + OriginType, + Scope, + Severity, + ValidationResult, + Validator, +) + +FOO_ORIGIN = Origin( + type=OriginType.INTERNAL, + validator=Validator.dandi, + validator_version="1.0.0", +) + + +def _make_result(id_: str, severity: Severity = Severity.WARNING) -> ValidationResult: + return ValidationResult( + id=id_, + origin=FOO_ORIGIN, + scope=Scope.FILE, + severity=severity, + message=f"Message for {id_}", + path=Path(f"/tmp/{id_}.nwb"), + ) + + +@pytest.mark.ai_generated +class TestWriteAndLoad: + def test_round_trip(self, tmp_path: Path) -> None: + """Write results to JSONL and load 
them back.""" + results = [_make_result("A"), _make_result("B", Severity.ERROR)] + out = tmp_path / "results.jsonl" + ret = write_validation_jsonl(results, out) + assert ret == out + assert out.exists() + + loaded = load_validation_jsonl(out) + assert len(loaded) == 2 + assert loaded[0].id == "A" + assert loaded[1].id == "B" + assert loaded[1].severity == Severity.ERROR + + def test_append(self, tmp_path: Path) -> None: + """Append results to an existing JSONL file.""" + out = tmp_path / "results.jsonl" + write_validation_jsonl([_make_result("A")], out) + append_validation_jsonl([_make_result("B")], out) + + loaded = load_validation_jsonl(out) + assert len(loaded) == 2 + assert loaded[0].id == "A" + assert loaded[1].id == "B" + + def test_append_creates_file(self, tmp_path: Path) -> None: + """Append creates the file if it doesn't exist.""" + out = tmp_path / "new.jsonl" + append_validation_jsonl([_make_result("A")], out) + assert out.exists() + loaded = load_validation_jsonl(out) + assert len(loaded) == 1 + + def test_empty_file(self, tmp_path: Path) -> None: + """Loading an empty file returns an empty list.""" + out = tmp_path / "empty.jsonl" + write_validation_jsonl([], out) + loaded = load_validation_jsonl(out) + assert loaded == [] + + def test_multi_file_load(self, tmp_path: Path) -> None: + """Load from multiple JSONL files and concatenate.""" + f1 = tmp_path / "a.jsonl" + f2 = tmp_path / "b.jsonl" + write_validation_jsonl([_make_result("A")], f1) + write_validation_jsonl([_make_result("B"), _make_result("C")], f2) + + loaded = load_validation_jsonl(f1, f2) + assert len(loaded) == 3 + assert [r.id for r in loaded] == ["A", "B", "C"] + + def test_blank_lines_skipped(self, tmp_path: Path) -> None: + """Blank lines in JSONL should be silently skipped.""" + out = tmp_path / "results.jsonl" + write_validation_jsonl([_make_result("A")], out) + # Add blank lines + with out.open("a") as f: + f.write("\n\n") + loaded = load_validation_jsonl(out) + assert len(loaded) 
== 1 + + +@pytest.mark.ai_generated +class TestSidecarPath: + def test_derives_from_logfile(self) -> None: + """Sidecar path is derived from logfile by appending _validation.jsonl.""" + logfile = Path("/var/log/dandi/2026.03.19-12.00.00Z-12345.log") + sidecar = validation_sidecar_path(logfile) + assert sidecar == Path( + "/var/log/dandi/2026.03.19-12.00.00Z-12345_validation.jsonl" + ) + + def test_string_input(self) -> None: + """String input is accepted.""" + sidecar = validation_sidecar_path("/tmp/test.log") + assert sidecar == Path("/tmp/test_validation.jsonl") From d400cfd41051075f5ef4c46b777c40a261d15ed7 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 16:41:19 -0400 Subject: [PATCH 09/16] feat: add --summary option to dandi validate CLI Add --summary/--no-summary flag that shows statistics after validation: total issues, breakdown by severity, validator, and standard. For human output, printed to stdout; for structured formats, printed to stderr. Also refactors _process_issues into _render_human (no exit) + _exit_if_errors, keeping _process_issues as backward-compatible wrapper. 
Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/cli/cmd_validate.py | 71 +++++++++++++++++++++++----- dandi/cli/tests/test_cmd_validate.py | 10 ++++ 2 files changed, 70 insertions(+), 11 deletions(-) diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index 0681942f7..6907576f0 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -160,6 +160,11 @@ def validate_bids( type=click.Path(dir_okay=False, writable=True), default=None, ) +@click.option( + "--summary/--no-summary", + help="Show summary statistics.", + default=False, +) @click.argument("paths", nargs=-1, type=click.Path(exists=True, dir_okay=True)) @click.pass_context @devel_debug_option() @@ -172,6 +177,7 @@ def validate( min_severity: str, output_format: str = "human", output_file: str | None = None, + summary: bool = False, schema: str | None = None, devel_debug: bool = False, allow_any_path: bool = False, @@ -190,14 +196,21 @@ def validate( filtered = _filter_results(results, min_severity, ignore) if output_format == "human": - _process_issues(filtered, grouping) + _render_human(filtered, grouping) + if summary: + _print_summary(filtered, sys.stdout) + _exit_if_errors(filtered) elif output_file is not None: with open(output_file, "w") as fh: _render_structured(filtered, output_format, fh) lgr.info("Validation output written to %s", output_file) + if summary: + _print_summary(filtered, sys.stderr) _exit_if_errors(filtered) else: _render_structured(filtered, output_format, sys.stdout) + if summary: + _print_summary(filtered, sys.stderr) # Auto-save sidecar next to logfile if filtered and hasattr(ctx, "obj") and ctx.obj is not None: _auto_save_sidecar(filtered, ctx.obj.logfile) @@ -211,6 +224,40 @@ def _auto_save_sidecar(results: list[ValidationResult], logfile: str) -> None: lgr.info("Validation sidecar saved to %s", sidecar) +def _print_summary(results: list[ValidationResult], out: object) -> None: + """Print summary statistics about validation results.""" 
+ from collections import Counter + + total = len(results) + print("\n--- Validation Summary ---", file=out) + print(f"Total issues: {total}", file=out) + if not total: + return + + severity_counts = Counter( + r.severity.name if r.severity is not None else "NONE" for r in results + ) + print("By severity:", file=out) + for sev in ("CRITICAL", "ERROR", "WARNING", "HINT", "INFO"): + if sev in severity_counts: + print(f" {sev}: {severity_counts[sev]}", file=out) + + validator_counts = Counter(r.origin.validator.value for r in results) + if validator_counts: + print("By validator:", file=out) + for validator, count in validator_counts.most_common(): + print(f" {validator}: {count}", file=out) + + standard_counts = Counter( + r.origin.standard.value if r.origin.standard is not None else "N/A" + for r in results + ) + if standard_counts: + print("By standard:", file=out) + for standard, count in standard_counts.most_common(): + print(f" {standard}: {count}", file=out) + + def _get_formatter(output_format: str, out: object = None): """Create a formatter for the given output format.""" if output_format == "json": @@ -243,10 +290,11 @@ def _exit_if_errors(results: list[ValidationResult]) -> None: raise SystemExit(1) -def _process_issues( +def _render_human( issues: list[ValidationResult], grouping: str, ) -> None: + """Render validation results in human-readable colored format.""" purviews = [i.purview for i in issues] if grouping == "none": display_errors( @@ -273,18 +321,19 @@ def _process_issues( " 'none'." 
) - if ( - max( - (i.severity for i in issues if i.severity is not None), - default=Severity.INFO, - ) - >= Severity.ERROR - ): - raise SystemExit(1) - else: + if not any(r.severity is not None and r.severity >= Severity.ERROR for r in issues): click.secho("No errors found.", fg="green") +def _process_issues( + issues: list[ValidationResult], + grouping: str, +) -> None: + """Legacy wrapper: render human output and exit if errors.""" + _render_human(issues, grouping) + _exit_if_errors(issues) + + def _get_severity_color(severities: list[Severity]) -> str: max_severity = max(severities, default=Severity.INFO) if max_severity >= Severity.ERROR: diff --git a/dandi/cli/tests/test_cmd_validate.py b/dandi/cli/tests/test_cmd_validate.py index 77a1d74ee..e229c8a86 100644 --- a/dandi/cli/tests/test_cmd_validate.py +++ b/dandi/cli/tests/test_cmd_validate.py @@ -239,3 +239,13 @@ def test_validate_output_requires_format(simple2_nwb: Path, tmp_path: Path) -> N ) assert r.exit_code != 0 assert "--output requires --format" in r.output + + +@pytest.mark.ai_generated +def test_validate_summary_human(simple2_nwb: Path) -> None: + """Test --summary in human format shows statistics.""" + r = CliRunner().invoke(validate, ["--summary", str(simple2_nwb)]) + assert r.exit_code != 0 + assert "Validation Summary" in r.output + assert "Total issues:" in r.output + assert "By severity:" in r.output From 8843d80168e4810395a135736cd533aaac00b7e2 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 16:42:23 -0400 Subject: [PATCH 10/16] feat: add --load option to dandi validate CLI Add --load to reload previously-saved JSONL validation results and re-render them with different formats/filters/grouping. Mutually exclusive with positional paths. Exit code reflects loaded results. Skip auto-save sidecar when loading. 
Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/cli/cmd_validate.py | 23 ++++++++++++-- dandi/cli/tests/test_cmd_validate.py | 45 ++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index 6907576f0..713047afb 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -165,6 +165,13 @@ def validate_bids( help="Show summary statistics.", default=False, ) +@click.option( + "--load", + help="Load validation results from JSONL file(s) instead of running validation.", + type=click.Path(exists=True, dir_okay=False), + multiple=True, + default=(), +) @click.argument("paths", nargs=-1, type=click.Path(exists=True, dir_okay=True)) @click.pass_context @devel_debug_option() @@ -178,6 +185,7 @@ def validate( output_format: str = "human", output_file: str | None = None, summary: bool = False, + load: tuple[str, ...] = (), schema: str | None = None, devel_debug: bool = False, allow_any_path: bool = False, @@ -192,7 +200,16 @@ def validate( "(json, json_pp, json_lines, yaml)." 
) - results = _collect_results(paths, schema, devel_debug, allow_any_path) + if load and paths: + raise click.UsageError("--load and positional paths are mutually exclusive.") + + if load: + from ..validate.io import load_validation_jsonl + + results = load_validation_jsonl(*load) + else: + results = _collect_results(paths, schema, devel_debug, allow_any_path) + filtered = _filter_results(results, min_severity, ignore) if output_format == "human": @@ -211,8 +228,8 @@ def validate( _render_structured(filtered, output_format, sys.stdout) if summary: _print_summary(filtered, sys.stderr) - # Auto-save sidecar next to logfile - if filtered and hasattr(ctx, "obj") and ctx.obj is not None: + # Auto-save sidecar next to logfile (skip when loading) + if not load and filtered and hasattr(ctx, "obj") and ctx.obj is not None: _auto_save_sidecar(filtered, ctx.obj.logfile) _exit_if_errors(filtered) diff --git a/dandi/cli/tests/test_cmd_validate.py b/dandi/cli/tests/test_cmd_validate.py index e229c8a86..537ea42b1 100644 --- a/dandi/cli/tests/test_cmd_validate.py +++ b/dandi/cli/tests/test_cmd_validate.py @@ -249,3 +249,48 @@ def test_validate_summary_human(simple2_nwb: Path) -> None: assert "Validation Summary" in r.output assert "Total issues:" in r.output assert "By severity:" in r.output + + +@pytest.mark.ai_generated +def test_validate_load(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --load reads results from a JSONL file.""" + # First, produce a JSONL file + outfile = tmp_path / "results.jsonl" + r = CliRunner().invoke( + validate, + ["-f", "json_lines", "-o", str(outfile), str(simple2_nwb)], + ) + assert r.exit_code == 1 + assert outfile.exists() + + # Now load it + r = CliRunner().invoke(validate, ["--load", str(outfile)]) + assert r.exit_code == 1 # loaded errors still produce exit 1 + assert "DANDI.NO_DANDISET_FOUND" in r.output + + +@pytest.mark.ai_generated +def test_validate_load_with_format(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --load 
combined with --format.""" + outfile = tmp_path / "results.jsonl" + r = CliRunner().invoke( + validate, + ["-f", "json_lines", "-o", str(outfile), str(simple2_nwb)], + ) + assert outfile.exists() + + r = CliRunner().invoke(validate, ["--load", str(outfile), "-f", "json"]) + assert r.exit_code == 1 + data = json.loads(r.output) + assert isinstance(data, list) + assert len(data) >= 1 + + +@pytest.mark.ai_generated +def test_validate_load_mutual_exclusivity(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --load and paths are mutually exclusive.""" + outfile = tmp_path / "dummy.jsonl" + outfile.write_text("") + r = CliRunner().invoke(validate, ["--load", str(outfile), str(simple2_nwb)]) + assert r.exit_code != 0 + assert "mutually exclusive" in r.output From 69a554c71382ab5a20efe4da91a88d52cf90673d Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 16:46:18 -0400 Subject: [PATCH 11/16] feat: persist upload validation results to sidecar JSONL - Add validation_log_path parameter to upload() - In upload validation loop, append results to sidecar via append_validation_jsonl() when validation_log_path is set - CLI cmd_upload derives sidecar path from logfile and passes it Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/cli/cmd_upload.py | 11 +++++++++-- dandi/upload.py | 7 +++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index 1f057dc36..baa438633 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -96,7 +96,8 @@ def upload( can point to specific files you would like to validate and have uploaded. 
""" # Avoid heavy imports by importing with function: - from ..upload import upload + from ..upload import upload as upload_ + from ..validate.io import validation_sidecar_path if jobs_pair is None: jobs = None @@ -104,7 +105,12 @@ def upload( else: jobs, jobs_per_file = jobs_pair - upload( + sidecar = None + ctx = click.get_current_context() + if ctx.obj is not None: + sidecar = validation_sidecar_path(ctx.obj.logfile) + + upload_( paths, existing=existing, validation=validation, @@ -115,4 +121,5 @@ def upload( jobs=jobs, jobs_per_file=jobs_per_file, sync=sync, + validation_log_path=sidecar, ) diff --git a/dandi/upload.py b/dandi/upload.py index 324e31129..48c9f94b5 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -112,6 +112,7 @@ def upload( jobs: int | None = None, jobs_per_file: int | None = None, sync: bool = False, + validation_log_path: str | Path | None = None, ) -> None: if paths: paths = [Path(p).absolute() for p in paths] @@ -300,6 +301,12 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: ): yield {"status": "pre-validating"} validation_statuses = dfile.get_validation_errors() + if validation_log_path is not None and validation_statuses: + from .validate.io import append_validation_jsonl + + append_validation_jsonl( + validation_statuses, validation_log_path + ) validation_errors = [ s for s in validation_statuses From cabefcfe390c4c497f13e75e8d51c10e0fdf213d Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 16:47:35 -0400 Subject: [PATCH 12/16] fix: add proper type annotations to cmd_validate helpers Fix mypy errors by using IO[str] instead of object for file-like output parameters in _print_summary, _get_formatter, and _render_structured. 
Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/cli/cmd_validate.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index 713047afb..3a07d6839 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -4,7 +4,7 @@ import os import re import sys -from typing import cast +from typing import IO, cast import warnings import click @@ -241,7 +241,7 @@ def _auto_save_sidecar(results: list[ValidationResult], logfile: str) -> None: lgr.info("Validation sidecar saved to %s", sidecar) -def _print_summary(results: list[ValidationResult], out: object) -> None: +def _print_summary(results: list[ValidationResult], out: IO[str]) -> None: """Print summary statistics about validation results.""" from collections import Counter @@ -275,7 +275,9 @@ def _print_summary(results: list[ValidationResult], out: object) -> None: print(f" {standard}: {count}", file=out) -def _get_formatter(output_format: str, out: object = None): +def _get_formatter( + output_format: str, out: IO[str] | None = None +) -> JSONFormatter | JSONLinesFormatter | YAMLFormatter: """Create a formatter for the given output format.""" if output_format == "json": return JSONFormatter(out=out) @@ -292,7 +294,7 @@ def _get_formatter(output_format: str, out: object = None): def _render_structured( results: list[ValidationResult], output_format: str, - out: object, + out: IO[str], ) -> None: """Render validation results in a structured format.""" formatter = _get_formatter(output_format, out=out) From adacc0685501d244568e507554381e0346b0d4f2 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 17:50:00 -0400 Subject: [PATCH 13/16] feat: auto-detect --format from --output file extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When --output is given without explicit --format, infer the format from the file extension: .json → json_pp, .jsonl → 
json_lines, .yaml/.yml → yaml. Error only if extension is unrecognized. Update design doc to reflect this behavior. Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- .../specs/validate-machine-readable-output.md | 17 ++++++---- dandi/cli/cmd_validate.py | 27 ++++++++++++--- dandi/cli/tests/test_cmd_validate.py | 33 +++++++++++++++++-- 3 files changed, 64 insertions(+), 13 deletions(-) diff --git a/.specify/specs/validate-machine-readable-output.md b/.specify/specs/validate-machine-readable-output.md index 904e0f0e2..db42361e1 100644 --- a/.specify/specs/validate-machine-readable-output.md +++ b/.specify/specs/validate-machine-readable-output.md @@ -198,9 +198,10 @@ that varies by grouping option. `--format` and `--output` are independent: - `--format` controls serialization format (default: `human`) - `--output` controls destination (default: stdout) -- Neither implies the other — if `--output` is given without `--format`, - the format must be specified explicitly (error otherwise, since writing - colored human text to a file is not useful) +- When `--output` is given without `--format`, the format is auto-detected + from the file extension: `.json` → `json_pp`, `.jsonl` → `json_lines`, + `.yaml`/`.yml` → `yaml`. If the extension is unrecognized, an error is + raised (since writing colored human text to a file is not useful) ## Design @@ -506,8 +507,10 @@ alongside the `.log` file. ``` Validation: -- If `--output` is given and `--format` is `human` (default), raise - `click.UsageError("--output requires --format to be set to a structured format")` +- If `--output` is given and `--format` is `human` (default), auto-detect + format from file extension (`.json` → `json_pp`, `.jsonl` → `json_lines`, + `.yaml`/`.yml` → `yaml`). 
If extension is unrecognized, raise + `click.UsageError` #### Auto-save sidecar @@ -800,7 +803,9 @@ cat results/*.jsonl | jq -s ' ### Step 1b: `--output` + auto-save sidecar -- Add `--output` option, enforce `--format` must be structured +- Add `--output` option, auto-detect format from extension (`.json` → + `json_pp`, `.jsonl` → `json_lines`, `.yaml`/`.yml` → `yaml`); error + if extension unrecognized and `--format` not given - Create `dandi/validate/io.py` with shared `write_validation_jsonl()`, `append_validation_jsonl()`, `load_validation_jsonl()`, `validation_sidecar_path()` diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index 3a07d6839..8966b5320 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -20,6 +20,19 @@ STRUCTURED_FORMATS = ("json", "json_pp", "json_lines", "yaml") +_EXT_TO_FORMAT = { + ".json": "json_pp", + ".jsonl": "json_lines", + ".yaml": "yaml", + ".yml": "yaml", +} + + +def _format_from_ext(path: str) -> str | None: + """Infer output format from file extension, or None if unrecognized.""" + ext = os.path.splitext(path)[1].lower() + return _EXT_TO_FORMAT.get(ext) + def _collect_results( paths: tuple[str, ...], @@ -194,11 +207,15 @@ def validate( Exits with non-0 exit code if any file is not compliant. """ - if output_file is not None and output_format not in STRUCTURED_FORMATS: - raise click.UsageError( - "--output requires --format to be set to a structured format " - "(json, json_pp, json_lines, yaml)." - ) + # Auto-detect format from output file extension when --format not given + if output_file is not None and output_format == "human": + output_format = _format_from_ext(output_file) + if output_format is None: + raise click.UsageError( + "--output requires --format to be set to a structured format " + "(json, json_pp, json_lines, yaml), or use a recognized " + "extension (.json, .jsonl, .yaml, .yml)." 
+ ) if load and paths: raise click.UsageError("--load and positional paths are mutually exclusive.") diff --git a/dandi/cli/tests/test_cmd_validate.py b/dandi/cli/tests/test_cmd_validate.py index 537ea42b1..07b772259 100644 --- a/dandi/cli/tests/test_cmd_validate.py +++ b/dandi/cli/tests/test_cmd_validate.py @@ -231,8 +231,8 @@ def test_validate_output_file(simple2_nwb: Path, tmp_path: Path) -> None: @pytest.mark.ai_generated def test_validate_output_requires_format(simple2_nwb: Path, tmp_path: Path) -> None: - """Test --output without --format gives a usage error.""" - outfile = tmp_path / "results.jsonl" + """Test --output with unrecognized extension and no --format gives error.""" + outfile = tmp_path / "results.txt" r = CliRunner().invoke( validate, ["-o", str(outfile), str(simple2_nwb)], @@ -241,6 +241,35 @@ def test_validate_output_requires_format(simple2_nwb: Path, tmp_path: Path) -> N assert "--output requires --format" in r.output +@pytest.mark.ai_generated +def test_validate_output_auto_format(simple2_nwb: Path, tmp_path: Path) -> None: + """Test --output auto-detects format from file extension.""" + outfile = tmp_path / "results.jsonl" + r = CliRunner().invoke( + validate, + ["-o", str(outfile), str(simple2_nwb)], + ) + assert r.exit_code == 1 # NO_DANDISET_FOUND + assert outfile.exists() + lines = outfile.read_text().strip().split("\n") + # Should be json_lines format (one JSON per line) + rec = json.loads(lines[0]) + assert "id" in rec + + # .json → json_pp (indented) + outjson = tmp_path / "results.json" + r = CliRunner().invoke( + validate, + ["-o", str(outjson), str(simple2_nwb)], + ) + assert r.exit_code == 1 + content = outjson.read_text() + data = json.loads(content) + assert isinstance(data, list) + # json_pp produces indented output + assert "\n " in content + + @pytest.mark.ai_generated def test_validate_summary_human(simple2_nwb: Path) -> None: """Test --summary in human format shows statistics.""" From 
1100de52a448f728d62c268ac054b73456758c3c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 18:07:02 -0400 Subject: [PATCH 14/16] feat: add extended grouping options to dandi validate Add severity, id, validator, standard, and dandiset as --grouping options. Uses section headers with counts (e.g. "=== ERROR (5 issues) ===") for human output. Structured output is unaffected (always flat). Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/cli/cmd_validate.py | 62 +++++++++++++++++++++++----- dandi/cli/tests/test_cmd_validate.py | 55 +++++++++++++++++++++++- 2 files changed, 105 insertions(+), 12 deletions(-) diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index 8966b5320..3614eaaf7 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -146,7 +146,10 @@ def validate_bids( "--grouping", "-g", help="How to group error/warning reporting.", - type=click.Choice(["none", "path"], case_sensitive=False), + type=click.Choice( + ["none", "path", "severity", "id", "validator", "standard", "dandiset"], + case_sensitive=False, + ), default="none", ) @click.option("--ignore", metavar="REGEX", help="Regex matching error IDs to ignore") @@ -209,13 +212,14 @@ def validate( """ # Auto-detect format from output file extension when --format not given if output_file is not None and output_format == "human": - output_format = _format_from_ext(output_file) - if output_format is None: + detected = _format_from_ext(output_file) + if detected is None: raise click.UsageError( "--output requires --format to be set to a structured format " "(json, json_pp, json_lines, yaml), or use a recognized " "extension (.json, .jsonl, .yaml, .yml)." 
) + output_format = detected if load and paths: raise click.UsageError("--load and positional paths are mutually exclusive.") @@ -326,13 +330,31 @@ def _exit_if_errors(results: list[ValidationResult]) -> None: raise SystemExit(1) +def _group_key(issue: ValidationResult, grouping: str) -> str: + """Extract the grouping key from a ValidationResult.""" + if grouping == "path": + return issue.purview or "(no path)" + elif grouping == "severity": + return issue.severity.name if issue.severity is not None else "NONE" + elif grouping == "id": + return issue.id + elif grouping == "validator": + return issue.origin.validator.value + elif grouping == "standard": + return issue.origin.standard.value if issue.origin.standard else "N/A" + elif grouping == "dandiset": + return str(issue.dandiset_path) if issue.dandiset_path else "(no dandiset)" + else: + raise NotImplementedError(f"Unsupported grouping: {grouping}") + + def _render_human( issues: list[ValidationResult], grouping: str, ) -> None: """Render validation results in human-readable colored format.""" - purviews = [i.purview for i in issues] if grouping == "none": + purviews = [i.purview for i in issues] display_errors( purviews, [i.id for i in issues], @@ -340,9 +362,8 @@ def _render_human( [i.message for i in issues], ) elif grouping == "path": - # The purviews are the paths, if we group by path, we need to de-duplicate. - # typing complains if we just take the set, though the code works otherwise. - purviews = list(set(purviews)) + # Legacy path grouping: de-duplicate purviews, show per-path + purviews = list(set(i.purview for i in issues)) for purview in purviews: applies_to = [i for i in issues if purview == i.purview] display_errors( @@ -352,10 +373,29 @@ def _render_human( [i.message for i in applies_to], ) else: - raise NotImplementedError( - "The `grouping` parameter values currently supported are 'path' and" - " 'none'." 
- ) + # Generic grouped rendering with section headers + from collections import OrderedDict + + groups: OrderedDict[str, list[ValidationResult]] = OrderedDict() + for issue in issues: + key = _group_key(issue, grouping) + groups.setdefault(key, []).append(issue) + + for key, group_issues in groups.items(): + header = f"=== {key} ({pluralize(len(group_issues), 'issue')}) ===" + fg = _get_severity_color( + cast( + "list[Severity]", + [i.severity for i in group_issues if i.severity is not None], + ) + ) + click.secho(header, fg=fg, bold=True) + for issue in group_issues: + msg = f" [{issue.id}] {issue.purview} — {issue.message}" + ifg = _get_severity_color( + [issue.severity] if issue.severity is not None else [] + ) + click.secho(msg, fg=ifg) if not any(r.severity is not None and r.severity >= Severity.ERROR for r in issues): click.secho("No errors found.", fg="green") diff --git a/dandi/cli/tests/test_cmd_validate.py b/dandi/cli/tests/test_cmd_validate.py index 07b772259..27868258f 100644 --- a/dandi/cli/tests/test_cmd_validate.py +++ b/dandi/cli/tests/test_cmd_validate.py @@ -5,7 +5,7 @@ import pytest import ruamel.yaml -from ..cmd_validate import _process_issues, validate +from ..cmd_validate import _process_issues, _render_human, validate from ...tests.xfail import mark_xfail_windows_python313_posixsubprocess from ...validate.types import ( Origin, @@ -323,3 +323,56 @@ def test_validate_load_mutual_exclusivity(simple2_nwb: Path, tmp_path: Path) -> r = CliRunner().invoke(validate, ["--load", str(outfile), str(simple2_nwb)]) assert r.exit_code != 0 assert "mutually exclusive" in r.output + + +@pytest.mark.ai_generated +@pytest.mark.parametrize( + "grouping", + ["severity", "id", "validator", "standard", "dandiset"], +) +def test_render_human_grouping(grouping: str, capsys: pytest.CaptureFixture) -> None: + """Test extended grouping renders section headers with counts.""" + origin = Origin( + type=OriginType.VALIDATION, + validator=Validator.nwbinspector, + 
validator_version="", + ) + issues = [ + ValidationResult( + id="NWBI.check_data_orientation", + origin=origin, + scope=Scope.FILE, + message="Data may be in the wrong orientation.", + path=Path("sub-01/sub-01.nwb"), + severity=Severity.WARNING, + dandiset_path=Path("/data/ds001"), + ), + ValidationResult( + id="NWBI.check_missing_unit", + origin=origin, + scope=Scope.FILE, + message="Missing text for attribute 'unit'.", + path=Path("sub-02/sub-02.nwb"), + severity=Severity.WARNING, + dandiset_path=Path("/data/ds001"), + ), + ] + _render_human(issues, grouping=grouping) + captured = capsys.readouterr().out + + # Section headers with "===" must appear + assert "===" in captured + # Both issues should be rendered + assert "NWBI.check_data_orientation" in captured + assert "NWBI.check_missing_unit" in captured + # Issue counts in headers + assert "issue" in captured + + +@pytest.mark.ai_generated +def test_validate_grouping_severity_cli(simple2_nwb: Path) -> None: + """Test --grouping=severity via CLI.""" + r = CliRunner().invoke(validate, ["--grouping=severity", str(simple2_nwb)]) + assert r.exit_code != 0 + assert "===" in r.output + assert "ERROR" in r.output From 2ddd5d4fda3bcd64efd4ad8a06cd62db91085cbc Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 19 Mar 2026 19:39:48 -0400 Subject: [PATCH 15/16] feat: add --max-per-group truncation to dandi validate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Limit how many results are shown per leaf group (or in the flat list when no grouping is used). Excess results are replaced by a TruncationNotice placeholder — a distinct dataclass (not a ValidationResult) so consumers can isinstance() check. - TruncationNotice dataclass + LeafItem/TruncatedResults type aliases - _truncate_leaves() walks the grouped tree, caps leaf lists - Human output: "... 
and N more issues" in cyan - Structured output: {"_truncated": true, "omitted_count": N} sentinel - Headers show original counts including omitted items - Works without grouping (flat list) and with multi-level grouping Co-Authored-By: Claude Code 2.1.63 / Claude Opus 4.6 --- dandi/cli/cmd_validate.py | 254 ++++++++++++++--- dandi/cli/tests/test_cmd_validate.py | 400 ++++++++++++++++++++++++++- 2 files changed, 615 insertions(+), 39 deletions(-) diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index 3614eaaf7..9d37fe7b1 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -1,10 +1,13 @@ from __future__ import annotations +from collections import OrderedDict +import dataclasses +import json as json_mod import logging import os import re import sys -from typing import IO, cast +from typing import IO, Union, cast import warnings import click @@ -18,6 +21,14 @@ lgr = logging.getLogger(__name__) + +@dataclasses.dataclass +class TruncationNotice: + """Placeholder indicating omitted results in truncated output.""" + + omitted_count: int + + STRUCTURED_FORMATS = ("json", "json_pp", "json_lines", "yaml") _EXT_TO_FORMAT = { @@ -131,7 +142,9 @@ def validate_bids( "`dandi validate` instead. Proceeding to parse the call to `dandi validate` now.", DeprecationWarning, ) - ctx.invoke(validate, paths=paths, grouping=grouping) + ctx.invoke( + validate, paths=paths, grouping=(grouping,) if grouping != "none" else () + ) @click.command() @@ -145,12 +158,13 @@ def validate_bids( @click.option( "--grouping", "-g", - help="How to group error/warning reporting.", + help="How to group output. Repeat for hierarchical nesting, e.g. 
-g severity -g id.", type=click.Choice( ["none", "path", "severity", "id", "validator", "standard", "dandiset"], case_sensitive=False, ), - default="none", + multiple=True, + default=(), ) @click.option("--ignore", metavar="REGEX", help="Regex matching error IDs to ignore") @click.option( @@ -181,6 +195,13 @@ def validate_bids( help="Show summary statistics.", default=False, ) +@click.option( + "--max-per-group", + type=int, + default=None, + help="Limit results per group (or total if ungrouped). " + "Excess results are replaced by a count of omitted items.", +) @click.option( "--load", help="Load validation results from JSONL file(s) instead of running validation.", @@ -196,11 +217,12 @@ def validate( ctx: click.Context, paths: tuple[str, ...], ignore: str | None, - grouping: str, + grouping: tuple[str, ...], min_severity: str, output_format: str = "human", output_file: str | None = None, summary: bool = False, + max_per_group: int | None = None, load: tuple[str, ...] = (), schema: str | None = None, devel_debug: bool = False, @@ -210,6 +232,9 @@ def validate( Exits with non-0 exit code if any file is not compliant. """ + # Normalize grouping: strip "none" values + grouping = tuple(g for g in grouping if g != "none") + # Auto-detect format from output file extension when --format not given if output_file is not None and output_format == "human": detected = _format_from_ext(output_file) @@ -221,6 +246,13 @@ def validate( ) output_format = detected + # JSONL is incompatible with grouping (flat format, no nesting) + if grouping and output_format == "json_lines": + raise click.UsageError( + "--grouping is incompatible with json_lines format " + "(JSONL is a flat format that cannot represent nested groups)." 
+ ) + if load and paths: raise click.UsageError("--load and positional paths are mutually exclusive.") @@ -234,19 +266,31 @@ def validate( filtered = _filter_results(results, min_severity, ignore) if output_format == "human": - _render_human(filtered, grouping) + _render_human(filtered, grouping, max_per_group=max_per_group) if summary: _print_summary(filtered, sys.stdout) _exit_if_errors(filtered) elif output_file is not None: with open(output_file, "w") as fh: - _render_structured(filtered, output_format, fh) + _render_structured( + filtered, + output_format, + fh, + grouping, + max_per_group=max_per_group, + ) lgr.info("Validation output written to %s", output_file) if summary: _print_summary(filtered, sys.stderr) _exit_if_errors(filtered) else: - _render_structured(filtered, output_format, sys.stdout) + _render_structured( + filtered, + output_format, + sys.stdout, + grouping, + max_per_group=max_per_group, + ) if summary: _print_summary(filtered, sys.stderr) # Auto-save sidecar next to logfile (skip when loading) @@ -316,12 +360,39 @@ def _render_structured( results: list[ValidationResult], output_format: str, out: IO[str], + grouping: tuple[str, ...] 
= (), + max_per_group: int | None = None, ) -> None: """Render validation results in a structured format.""" - formatter = _get_formatter(output_format, out=out) - with formatter: - for r in results: - formatter(r.model_dump(mode="json")) + if grouping: + # Grouped output: build nested dict, serialize directly + grouped: GroupedResults | TruncatedResults = _group_results(results, grouping) + if max_per_group is not None: + grouped = _truncate_leaves(grouped, max_per_group) + data = _serialize_grouped(grouped) + if output_format in ("json", "json_pp"): + indent = 2 if output_format == "json_pp" else None + json_mod.dump(data, out, indent=indent, sort_keys=True, default=str) + out.write("\n") + elif output_format == "yaml": + import ruamel.yaml + + yaml = ruamel.yaml.YAML(typ="safe") + yaml.default_flow_style = False + yaml.dump(data, out) + else: + raise ValueError(f"Unsupported format for grouped output: {output_format}") + else: + items: list[dict] = [r.model_dump(mode="json") for r in results] + if max_per_group is not None and len(items) > max_per_group: + items = items[:max_per_group] + items.append( + {"_truncated": True, "omitted_count": len(results) - max_per_group} + ) + formatter = _get_formatter(output_format, out=out) + with formatter: + for item in items: + formatter(item) def _exit_if_errors(results: list[ValidationResult]) -> None: @@ -348,20 +419,88 @@ def _group_key(issue: ValidationResult, grouping: str) -> str: raise NotImplementedError(f"Unsupported grouping: {grouping}") +# Recursive grouped type: either a nested OrderedDict or leaf list +GroupedResults = Union["OrderedDict[str, GroupedResults]", list[ValidationResult]] + +# Leaf items after possible truncation +LeafItem = Union[ValidationResult, TruncationNotice] +TruncatedResults = Union["OrderedDict[str, TruncatedResults]", list[LeafItem]] + + +def _group_results( + results: list[ValidationResult], + levels: tuple[str, ...], +) -> GroupedResults: + """Group results recursively by the given 
hierarchy of grouping levels. + + Returns a nested OrderedDict with leaf values as lists of ValidationResult. + With zero levels, returns the flat list unchanged. + """ + if not levels: + return results + key_fn = levels[0] + remaining = levels[1:] + groups: OrderedDict[str, list[ValidationResult]] = OrderedDict() + for r in results: + k = _group_key(r, key_fn) + groups.setdefault(k, []).append(r) + if remaining: + return OrderedDict((k, _group_results(v, remaining)) for k, v in groups.items()) + # mypy can't resolve the recursive type alias, but this is correct: + # OrderedDict[str, list[VR]] is a valid GroupedResults + return cast("GroupedResults", groups) + + +def _truncate_leaves( + grouped: GroupedResults | TruncatedResults, max_per_group: int +) -> TruncatedResults: + """Truncate leaf lists to *max_per_group* items, appending a TruncationNotice.""" + if isinstance(grouped, list): + if len(grouped) > max_per_group: + kept: list[LeafItem] = list(grouped[:max_per_group]) + kept.append(TruncationNotice(len(grouped) - max_per_group)) + return kept + return cast("TruncatedResults", grouped) + return OrderedDict( + (k, _truncate_leaves(v, max_per_group)) for k, v in grouped.items() + ) + + +def _serialize_grouped(grouped: GroupedResults | TruncatedResults) -> dict | list: + """Convert grouped results to a JSON-serializable nested dict/list.""" + if isinstance(grouped, list): + result: list[dict] = [] + for item in grouped: + if isinstance(item, TruncationNotice): + result.append({"_truncated": True, "omitted_count": item.omitted_count}) + else: + result.append(item.model_dump(mode="json")) + return result + return {k: _serialize_grouped(v) for k, v in grouped.items()} + + def _render_human( issues: list[ValidationResult], - grouping: str, + grouping: tuple[str, ...], + max_per_group: int | None = None, ) -> None: """Render validation results in human-readable colored format.""" - if grouping == "none": - purviews = [i.purview for i in issues] + if not grouping: + 
shown = issues + omitted = 0 + if max_per_group is not None and len(issues) > max_per_group: + shown = issues[:max_per_group] + omitted = len(issues) - max_per_group + purviews = [i.purview for i in shown] display_errors( purviews, - [i.id for i in issues], - cast("list[Severity]", [i.severity for i in issues]), - [i.message for i in issues], + [i.id for i in shown], + cast("list[Severity]", [i.severity for i in shown]), + [i.message for i in shown], ) - elif grouping == "path": + if omitted: + click.secho(f"... and {pluralize(omitted, 'more issue')}", fg="cyan") + elif grouping == ("path",): # Legacy path grouping: de-duplicate purviews, show per-path purviews = list(set(i.purview for i in issues)) for purview in purviews: @@ -373,39 +512,80 @@ def _render_human( [i.message for i in applies_to], ) else: - # Generic grouped rendering with section headers - from collections import OrderedDict + grouped: GroupedResults | TruncatedResults = _group_results(issues, grouping) + if max_per_group is not None: + grouped = _truncate_leaves(grouped, max_per_group) + _render_human_grouped(grouped, depth=0) + + if not any(r.severity is not None and r.severity >= Severity.ERROR for r in issues): + click.secho("No errors found.", fg="green") + - groups: OrderedDict[str, list[ValidationResult]] = OrderedDict() - for issue in issues: - key = _group_key(issue, grouping) - groups.setdefault(key, []).append(issue) +def _count_leaves(grouped: GroupedResults | TruncatedResults) -> int: + """Count total items in a grouped structure (including omitted counts).""" + if isinstance(grouped, list): + return sum( + item.omitted_count if isinstance(item, TruncationNotice) else 1 + for item in grouped + ) + return sum(_count_leaves(v) for v in grouped.values()) - for key, group_issues in groups.items(): - header = f"=== {key} ({pluralize(len(group_issues), 'issue')}) ===" + +def _render_human_grouped( + grouped: GroupedResults | TruncatedResults, + depth: int, +) -> None: + """Recursively render 
grouped results with nested indented section headers.""" + indent = " " * depth + if isinstance(grouped, list): + # Leaf level: render individual issues + for issue in grouped: + if isinstance(issue, TruncationNotice): + click.secho( + f"{indent}... and {pluralize(issue.omitted_count, 'more issue')}", + fg="cyan", + ) + continue + msg = f"{indent}[{issue.id}] {issue.purview} — {issue.message}" + fg = _get_severity_color( + [issue.severity] if issue.severity is not None else [] + ) + click.secho(msg, fg=fg) + else: + for key, value in grouped.items(): + count = _count_leaves(value) + header = f"{indent}=== {key} ({pluralize(count, 'issue')}) ===" + # Determine color from all issues in this group + all_issues = _collect_all_issues(value) fg = _get_severity_color( cast( "list[Severity]", - [i.severity for i in group_issues if i.severity is not None], + [i.severity for i in all_issues if i.severity is not None], ) ) click.secho(header, fg=fg, bold=True) - for issue in group_issues: - msg = f" [{issue.id}] {issue.purview} — {issue.message}" - ifg = _get_severity_color( - [issue.severity] if issue.severity is not None else [] - ) - click.secho(msg, fg=ifg) + _render_human_grouped(value, depth + 1) - if not any(r.severity is not None and r.severity >= Severity.ERROR for r in issues): - click.secho("No errors found.", fg="green") + +def _collect_all_issues( + grouped: GroupedResults | TruncatedResults, +) -> list[ValidationResult]: + """Flatten a grouped structure into a list of all ValidationResults.""" + if isinstance(grouped, list): + return [item for item in grouped if isinstance(item, ValidationResult)] + result: list[ValidationResult] = [] + for v in grouped.values(): + result.extend(_collect_all_issues(v)) + return result def _process_issues( issues: list[ValidationResult], - grouping: str, + grouping: str | tuple[str, ...], ) -> None: """Legacy wrapper: render human output and exit if errors.""" + if isinstance(grouping, str): + grouping = (grouping,) if grouping 
!= "none" else () _render_human(issues, grouping) _exit_if_errors(issues) diff --git a/dandi/cli/tests/test_cmd_validate.py b/dandi/cli/tests/test_cmd_validate.py index 27868258f..f804956d3 100644 --- a/dandi/cli/tests/test_cmd_validate.py +++ b/dandi/cli/tests/test_cmd_validate.py @@ -1,11 +1,20 @@ import json from pathlib import Path +from typing import cast from click.testing import CliRunner import pytest import ruamel.yaml -from ..cmd_validate import _process_issues, _render_human, validate +from ..cmd_validate import ( + GroupedResults, + TruncationNotice, + _group_results, + _process_issues, + _render_human, + _truncate_leaves, + validate, +) from ...tests.xfail import mark_xfail_windows_python313_posixsubprocess from ...validate.types import ( Origin, @@ -357,7 +366,7 @@ def test_render_human_grouping(grouping: str, capsys: pytest.CaptureFixture) -> dandiset_path=Path("/data/ds001"), ), ] - _render_human(issues, grouping=grouping) + _render_human(issues, grouping=(grouping,)) captured = capsys.readouterr().out # Section headers with "===" must appear @@ -376,3 +385,390 @@ def test_validate_grouping_severity_cli(simple2_nwb: Path) -> None: assert r.exit_code != 0 assert "===" in r.output assert "ERROR" in r.output + + +@pytest.mark.ai_generated +def test_render_human_multilevel_grouping(capsys: pytest.CaptureFixture) -> None: + """Test multi-level grouping renders nested section headers.""" + origin = Origin( + type=OriginType.VALIDATION, + validator=Validator.nwbinspector, + validator_version="", + ) + issues = [ + ValidationResult( + id="NWBI.check_data_orientation", + origin=origin, + scope=Scope.FILE, + message="Data may be in the wrong orientation.", + path=Path("sub-01/sub-01.nwb"), + severity=Severity.WARNING, + dandiset_path=Path("/data/ds001"), + ), + ValidationResult( + id="NWBI.check_missing_unit", + origin=origin, + scope=Scope.FILE, + message="Missing text for attribute 'unit'.", + path=Path("sub-02/sub-02.nwb"), + severity=Severity.WARNING, + 
dandiset_path=Path("/data/ds001"), + ), + ValidationResult( + id="NWBI.check_data_orientation", + origin=origin, + scope=Scope.FILE, + message="Data may be in the wrong orientation.", + path=Path("sub-03/sub-03.nwb"), + severity=Severity.ERROR, + dandiset_path=Path("/data/ds001"), + ), + ] + _render_human(issues, grouping=("severity", "id")) + captured = capsys.readouterr().out + + # Should have nested headers: severity then id + assert "=== WARNING" in captured + assert "=== ERROR" in captured + assert "=== NWBI.check_data_orientation" in captured + assert "=== NWBI.check_missing_unit" in captured + # Nested headers should be indented + lines = captured.split("\n") + # Find inner headers — they should have leading spaces + inner_headers = [ln for ln in lines if "===" in ln and ln.startswith(" ")] + assert len(inner_headers) >= 2 # at least 2 inner group headers + + +@pytest.mark.ai_generated +def test_validate_multilevel_grouping_human_cli(simple2_nwb: Path) -> None: + """Test -g severity -g id via CLI produces nested headers.""" + r = CliRunner().invoke(validate, ["-g", "severity", "-g", "id", str(simple2_nwb)]) + assert r.exit_code != 0 + assert "===" in r.output + # Should have nested structure + lines = r.output.split("\n") + inner_headers = [ln for ln in lines if "===" in ln and ln.startswith(" ")] + assert len(inner_headers) >= 1 + + +@pytest.mark.ai_generated +def test_validate_multilevel_grouping_json_cli(simple2_nwb: Path) -> None: + """Test -g severity -f json_pp via CLI produces nested JSON dict.""" + r = CliRunner().invoke( + validate, ["-g", "severity", "-f", "json_pp", str(simple2_nwb)] + ) + assert r.exit_code == 1 + data = json.loads(r.output) + # With grouping, output should be a dict (not a list) + assert isinstance(data, dict) + # Keys should be severity names + for key in data: + assert key in ("CRITICAL", "ERROR", "WARNING", "HINT", "INFO", "NONE") + # Values should be lists of validation result dicts + for v in data.values(): + assert 
isinstance(v, list) + for rec in v: + assert "id" in rec + + +@pytest.mark.ai_generated +def test_validate_multilevel_grouping_json_two_levels(simple2_nwb: Path) -> None: + """Test -g severity -g id -f json_pp produces two-level nested JSON.""" + r = CliRunner().invoke( + validate, ["-g", "severity", "-g", "id", "-f", "json_pp", str(simple2_nwb)] + ) + assert r.exit_code == 1 + data = json.loads(r.output) + assert isinstance(data, dict) + # Each value should be a dict (second grouping level) + for severity_key, id_groups in data.items(): + assert isinstance(id_groups, dict) + for id_key, results in id_groups.items(): + assert isinstance(results, list) + for rec in results: + assert "id" in rec + + +@pytest.mark.ai_generated +def test_validate_grouping_yaml_cli(simple2_nwb: Path) -> None: + """Test -g severity -f yaml produces grouped YAML output.""" + r = CliRunner().invoke(validate, ["-g", "severity", "-f", "yaml", str(simple2_nwb)]) + assert r.exit_code == 1 + yaml = ruamel.yaml.YAML(typ="safe") + data = yaml.load(r.output) + assert isinstance(data, dict) + for key in data: + assert key in ("CRITICAL", "ERROR", "WARNING", "HINT", "INFO", "NONE") + + +@pytest.mark.ai_generated +def test_validate_grouping_jsonl_error(simple2_nwb: Path) -> None: + """Test -g severity -f json_lines gives a UsageError.""" + r = CliRunner().invoke( + validate, ["-g", "severity", "-f", "json_lines", str(simple2_nwb)] + ) + assert r.exit_code != 0 + assert "incompatible" in r.output + + +@pytest.mark.ai_generated +def test_validate_grouping_none_explicit(simple2_nwb: Path) -> None: + """Test -g none is treated as no grouping.""" + r = CliRunner().invoke(validate, ["-g", "none", str(simple2_nwb)]) + assert r.exit_code != 0 + # Should NOT have section headers + assert "===" not in r.output + + +@pytest.mark.ai_generated +def test_group_results_unit() -> None: + """Unit test for _group_results with multiple levels.""" + from collections import OrderedDict + + origin = Origin( + 
type=OriginType.VALIDATION, + validator=Validator.nwbinspector, + validator_version="", + ) + issues = [ + ValidationResult( + id="A.one", + origin=origin, + scope=Scope.FILE, + message="msg1", + path=Path("f1.nwb"), + severity=Severity.ERROR, + ), + ValidationResult( + id="A.two", + origin=origin, + scope=Scope.FILE, + message="msg2", + path=Path("f2.nwb"), + severity=Severity.WARNING, + ), + ValidationResult( + id="A.one", + origin=origin, + scope=Scope.FILE, + message="msg3", + path=Path("f3.nwb"), + severity=Severity.ERROR, + ), + ] + + # Zero levels: returns flat list + result = _group_results(issues, ()) + assert result is issues + + # One level + result = _group_results(issues, ("severity",)) + assert isinstance(result, OrderedDict) + assert "ERROR" in result + assert "WARNING" in result + assert len(result["ERROR"]) == 2 + assert len(result["WARNING"]) == 1 + + # Two levels + result2 = _group_results(issues, ("severity", "id")) + assert isinstance(result2, OrderedDict) + error_group = result2["ERROR"] + assert isinstance(error_group, OrderedDict) + assert "A.one" in error_group + assert len(error_group["A.one"]) == 2 + warning_group = result2["WARNING"] + assert isinstance(warning_group, OrderedDict) + assert "A.two" in warning_group + assert len(warning_group["A.two"]) == 1 + + +def _make_jsonl(tmp_path: Path, n: int = 5) -> Path: + """Write *n* synthetic validation results to a JSONL file and return its path.""" + outfile = tmp_path / "results.jsonl" + lines = [] + for i in range(n): + sev = "ERROR" if i % 2 == 0 else "WARNING" + rec = { + "id": f"TEST.issue_{i}", + "origin": { + "type": "VALIDATION", + "validator": "nwbinspector", + "validator_version": "", + }, + "scope": "file", + "severity": sev, + "path": f"sub-{i:02d}/sub-{i:02d}.nwb", + "message": f"Synthetic issue number {i}", + "record_version": "1", + } + lines.append(json.dumps(rec)) + outfile.write_text("\n".join(lines) + "\n") + return outfile + + +@pytest.mark.ai_generated +def 
test_max_per_group_flat(tmp_path: Path) -> None: + """--max-per-group without grouping truncates the flat list.""" + jsonl = _make_jsonl(tmp_path, n=5) + r = CliRunner().invoke(validate, ["--load", str(jsonl), "--max-per-group", "2"]) + assert r.exit_code == 1 + assert "3 more issues" in r.output + # Only 2 real issues should be listed + assert "TEST.issue_0" in r.output + assert "TEST.issue_1" in r.output + assert "TEST.issue_4" not in r.output + + +@pytest.mark.ai_generated +def test_max_per_group_with_grouping(tmp_path: Path) -> None: + """-g severity --max-per-group 1 truncates each severity group independently.""" + jsonl = _make_jsonl(tmp_path, n=6) + r = CliRunner().invoke( + validate, + ["--load", str(jsonl), "-g", "severity", "--max-per-group", "1"], + ) + assert r.exit_code == 1 + # Each group should show "more issue(s)" + assert "more issue" in r.output + # Headers should reflect original counts (including omitted) + assert "=== ERROR" in r.output + assert "=== WARNING" in r.output + + +@pytest.mark.ai_generated +def test_max_per_group_json(tmp_path: Path) -> None: + """-f json_pp --max-per-group 2 emits _truncated placeholder in JSON.""" + jsonl = _make_jsonl(tmp_path, n=5) + r = CliRunner().invoke( + validate, + ["--load", str(jsonl), "-f", "json_pp", "--max-per-group", "2"], + ) + assert r.exit_code == 1 # ERRORs in test data + data = json.loads(r.output) + assert isinstance(data, list) + # Last item should be a truncation notice + assert data[-1]["_truncated"] is True + assert data[-1]["omitted_count"] == 3 + # Only 2 real results before the notice + real = [d for d in data if "_truncated" not in d] + assert len(real) == 2 + + +@pytest.mark.ai_generated +def test_max_per_group_multilevel(tmp_path: Path) -> None: + """-g severity -g id --max-per-group 1 truncates only at leaf level.""" + jsonl = _make_jsonl(tmp_path, n=6) + r = CliRunner().invoke( + validate, + [ + "--load", + str(jsonl), + "-g", + "severity", + "-g", + "id", + "--max-per-group", + 
"1", + ], + ) + assert r.exit_code == 1 + # All severity groups should appear + assert "=== ERROR" in r.output + assert "=== WARNING" in r.output + # Each id within a severity gets at most 1 result — but since each + # synthetic issue has a unique id, each leaf has exactly 1 item, + # so no truncation notice is expected for unique-id leaves. + # Verify structure is intact. + assert "TEST.issue_0" in r.output + + +@pytest.mark.ai_generated +def test_max_per_group_no_truncation(tmp_path: Path) -> None: + """--max-per-group larger than result count produces no placeholder.""" + jsonl = _make_jsonl(tmp_path, n=3) + r = CliRunner().invoke(validate, ["--load", str(jsonl), "--max-per-group", "100"]) + assert r.exit_code == 1 + assert "more issue" not in r.output + # All issues present + assert "TEST.issue_0" in r.output + assert "TEST.issue_1" in r.output + assert "TEST.issue_2" in r.output + + +@pytest.mark.ai_generated +def test_max_per_group_json_grouped(tmp_path: Path) -> None: + """-g severity -f json_pp --max-per-group 1 emits _truncated in grouped JSON.""" + jsonl = _make_jsonl(tmp_path, n=6) + r = CliRunner().invoke( + validate, + [ + "--load", + str(jsonl), + "-g", + "severity", + "-f", + "json_pp", + "--max-per-group", + "1", + ], + ) + data = json.loads(r.output) + assert isinstance(data, dict) + # Each severity group should have a truncation notice if it has > 1 item + for sev_key, items in data.items(): + assert isinstance(items, list) + truncated = [i for i in items if isinstance(i, dict) and i.get("_truncated")] + if len(items) > 1: + # At least one truncation notice + assert len(truncated) >= 1 + + +@pytest.mark.ai_generated +def test_truncate_leaves_unit() -> None: + """Unit test for _truncate_leaves helper.""" + from collections import OrderedDict + + origin = Origin( + type=OriginType.VALIDATION, + validator=Validator.nwbinspector, + validator_version="", + ) + issues = [ + ValidationResult( + id=f"T.{i}", + origin=origin, + scope=Scope.FILE, + 
message=f"msg{i}", + path=Path(f"f{i}.nwb"), + severity=Severity.ERROR, + ) + for i in range(5) + ] + + # Flat list truncation + truncated = _truncate_leaves(issues, 2) + assert isinstance(truncated, list) + assert len(truncated) == 3 # 2 results + 1 notice + assert isinstance(truncated[-1], TruncationNotice) + assert truncated[-1].omitted_count == 3 + + # Nested dict truncation + grouped: GroupedResults = cast( + "GroupedResults", OrderedDict([("A", issues[:3]), ("B", issues[3:])]) + ) + truncated_grouped = _truncate_leaves(grouped, 1) + assert isinstance(truncated_grouped, OrderedDict) + a_items = truncated_grouped["A"] + assert isinstance(a_items, list) + assert len(a_items) == 2 # 1 result + 1 notice + assert isinstance(a_items[-1], TruncationNotice) + assert a_items[-1].omitted_count == 2 + # B has 2 items → truncated to 1 + notice + b_items = truncated_grouped["B"] + assert isinstance(b_items, list) + assert len(b_items) == 2 + assert isinstance(b_items[-1], TruncationNotice) + + # No truncation when under limit + no_trunc = _truncate_leaves(issues, 100) + assert no_trunc is issues From efe6ec135c21e95a2706b774deb467c2d33c4a67 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Sat, 21 Mar 2026 11:45:55 -0400 Subject: [PATCH 16/16] fix: auto-save validation sidecar for all output formats The _auto_save_sidecar() call was only in the structured-to-stdout branch, so the default human format (the most common usage) never wrote the _validation.jsonl sidecar next to the log file. Move the sidecar write and _exit_if_errors() into a shared path that runs after all rendering branches. The sidecar is now written whenever there are results, unless --output or --load is active. Also update the validate docstring/help text to document the sidecar behavior, and update the design spec (Phase 1b, Phase 3, testing strategy) to reflect the --validation-log CLI option for upload and proper CLI integration testing via CliRunner through main(). 
Co-Authored-By: Claude Code 2.1.81 / Claude Opus 4.6 --- .../specs/validate-machine-readable-output.md | 283 +++++++++++++++--- dandi/cli/cmd_validate.py | 23 +- dandi/cli/tests/test_cmd_validate.py | 87 ++++++ 3 files changed, 347 insertions(+), 46 deletions(-) diff --git a/.specify/specs/validate-machine-readable-output.md b/.specify/specs/validate-machine-readable-output.md index db42361e1..62a5481c7 100644 --- a/.specify/specs/validate-machine-readable-output.md +++ b/.specify/specs/validate-machine-readable-output.md @@ -515,21 +515,34 @@ Validation: #### Auto-save sidecar `dandi validate` writes a `_validation.jsonl` sidecar next to its log file -**by default**, but **skips the sidecar when `--output` is specified** (the -user has already chosen where to save structured results — no need for a -redundant copy): +**by default** whenever there are results (any severity), regardless of +`--format`. The sidecar is **skipped** only when: +- `--output` is specified (user already has structured results in a file) +- `--load` is used (re-rendering existing results, not a fresh run) +- No results are produced (clean validation) ```python -# After collecting all_results: -if all_results and not output: - sidecar = validation_sidecar_path(ctx.obj.logfile) - write_validation_jsonl(all_results, sidecar) - lgr.info("Validation results saved to %s", sidecar) +# After collecting and filtering results, regardless of --format: +if not load and not output_file and filtered: + if hasattr(ctx, "obj") and ctx.obj is not None: + _auto_save_sidecar(filtered, ctx.obj.logfile) ``` -Empty results (clean validation) also skip the sidecar to avoid clutter. +**Important**: The sidecar must be written in ALL output format branches +(human, structured-to-stdout, structured-to-file), not just in the +structured-to-stdout branch. A bug existed where the sidecar was only saved +in the `else` branch (structured output to stdout), missing the most common +case (human output, the default). 
+ The sidecar accumulation rate matches the already-existing `.log` files. +The `validate` command's help text should mention this behavior so users +know where to find saved results: + +> Validation results are automatically saved as a JSONL sidecar next to the +> dandi-cli log file (unless --output is specified). Use ``dandi validate +> --load `` to re-render saved results with different options. + ### Phase 1c: Summary flag (`--summary`) **Goal**: Add summary statistics as a display option, decoupled from the @@ -633,7 +646,36 @@ The `upload()` function in `upload.py` is both a Python API and a CLI backend. The Click context (`ctx.obj.logfile`) is only available via CLI. The upload function's validation happens inside a deeply nested generator (`_upload_item`). -**Solution**: Pass the sidecar path as an optional parameter to `upload()`: +**Solution**: Two layers: + +1. **Python API** (`upload.py`): accepts `validation_log_path: str | Path | None` + parameter. Already implemented. +2. **CLI** (`cmd_upload.py`): add an explicit `--validation-log` option that + makes the feature discoverable and testable. + +#### CLI option: `--validation-log` + +```python +@click.option( + "--validation-log", + help="Path for writing validation results in JSONL format. " + "Defaults to a sidecar file next to the dandi-cli log file. " + "Pass empty string to disable.", + type=click.Path(dir_okay=False), + default=None, +) +``` + +Behavior: +- **Not specified** (default): derive sidecar from `ctx.obj.logfile` via + `validation_sidecar_path()` — same as current implicit behavior +- **Explicit path**: use that path directly as `validation_log_path` +- **Empty string `""`**: pass `None` to `upload()`, disabling validation logging + +This makes the validation sidecar feature visible in `dandi upload --help` +and allows users to control where results are saved or opt out entirely. 
+ +#### Python API parameter ```python def upload( @@ -641,24 +683,31 @@ def upload( existing: UploadExisting = UploadExisting.REFRESH, validation: UploadValidation = UploadValidation.REQUIRE, # ... existing params ... - validation_log_path: str | Path | None = None, # NEW + validation_log_path: str | Path | None = None, ) -> None: ``` -The CLI wrapper (`cmd_upload.py`) derives the sidecar path from `ctx.obj.logfile` -and passes it in. Programmatic callers can pass their own path or leave it as -`None` (no sidecar written). +The CLI wrapper resolves the `--validation-log` option and passes the +resolved path (or `None`) to `upload()`. #### Implementation ```python -# In cmd_upload.py, the CLI function: -sidecar = validation_sidecar_path(ctx.obj.logfile) -upload(..., validation_log_path=sidecar) - -# In upload.py, inside the upload loop: -# Collect all validation results as each file is validated -if validation_log_path is not None: +# In cmd_upload.py: +if validation_log is not None: + # Explicit --validation-log: use as-is (empty string → disable) + sidecar = Path(validation_log) if validation_log else None +else: + # Default: derive from logfile + ctx = click.get_current_context() + sidecar = None + if ctx.obj is not None: + sidecar = validation_sidecar_path(ctx.obj.logfile) + +upload_(..., validation_log_path=sidecar) + +# In upload.py, inside the upload loop (already implemented): +if validation_log_path is not None and validation_statuses: append_validation_jsonl(validation_statuses, validation_log_path) ``` @@ -677,6 +726,22 @@ Uses `append_validation_jsonl()` from `dandi/validate/io.py` — the file is opened in append mode for each batch, allowing incremental writes as files are validated during upload without holding all results in memory. +#### Testing the upload validation sidecar + +See **Step 3** in the Implementation Order section for detailed test descriptions. 
+ +Key insight: to test the default sidecar-next-to-logfile behavior, CLI tests +must invoke `main` (the Click group), not `upload` directly, because `main()` +sets up `ctx.obj.logfile`. Example: + +```python +runner = CliRunner() +r = runner.invoke(main, ["upload", "--dandi-instance", instance_id, str(dspath)]) +``` + +Tests for the explicit `--validation-log` option can invoke either `main` or +`upload` directly since the option bypasses the logfile derivation. + ### Phase 4: Extended grouping options — enhances #1743 work **Goal**: Support additional grouping strategies for human-readable output. @@ -832,11 +897,140 @@ cat results/*.jsonl | jq -s ' ### Step 3: Upload validation sidecar — addresses #1753 -- Add `validation_log_path` parameter to `upload()` -- CLI wrapper passes sidecar path derived from `ctx.obj.logfile` -- Use `append_validation_jsonl()` for incremental writes -- Announce sidecar path on validation errors -- Tests: sidecar creation during upload, content correctness +#### 3a: Add `--validation-log` CLI option to `dandi upload` + +Currently the upload command writes a validation sidecar derived from +`ctx.obj.logfile` (the dandi-cli log file), but this is invisible to the +user — there is no `--help` text about it, no way to control the path, and +no way to disable it. + +Add an explicit `--validation-log` option: + +```python +@click.option( + "--validation-log", + help="Path for writing validation results in JSONL format. " + "Defaults to a sidecar file next to the dandi-cli log file. " + "Pass empty string to disable.", + type=click.Path(dir_okay=False), + default=None, +) +``` + +Behavior: +- **Not specified** (default): derive sidecar from `ctx.obj.logfile` as today +- **Explicit path**: use that path directly +- **Empty string `""`**: disable validation logging entirely + +This makes the feature discoverable via `--help` and testable without +needing to go through `main()` to get a logfile. 
+ +#### 3b: Python API parameter + +- `upload()` already has `validation_log_path: str | Path | None = None` +- CLI wrapper resolves the `--validation-log` option → path or `None` and + passes it through + +#### 3c: Testing strategy for upload validation sidecar + +**Problem**: Existing upload tests call `SampleDandiset.upload()` which +invokes the Python API directly. The Python API's `validation_log_path` +parameter works, but the real end-to-end flow — where `dandi upload` +automatically derives the sidecar from its log file — is untested. Also, +invoking `CliRunner().invoke(upload, ...)` on the subcommand alone does not +set `ctx.obj.logfile` (that is done by `main()`). + +**Approach — two layers of tests**: + +1. **Python API tests** (`dandi/tests/test_upload.py`): Extend existing + tests that already exercise `SampleDandiset.upload()` to pass + `validation_log_path` and verify sidecar contents. These test the core + write logic without CLI overhead. + + - `test_upload_bids_invalid`: pass `validation_log_path`, verify sidecar + exists with ERROR-level results after `UploadError` + - `test_upload_invalid_metadata`: same pattern for NWB validation errors + - New `test_upload_validation_sidecar_clean`: upload valid NWB with + `validation_log_path`, verify sidecar absent or contains only + sub-ERROR results + +2. **CLI integration tests** (`dandi/cli/tests/test_cmd_upload.py` — new file): + Use `CliRunner().invoke(main, ["upload", ...])` to go through `main()` + so `ctx.obj.logfile` is set. These require the docker-compose archive + fixture. 
Tests verify: + + - Default behavior: `_validation.jsonl` sidecar appears next to the log + file (derived via `validation_sidecar_path(ctx.obj.logfile)`) + - `--validation-log /path/to/custom.jsonl`: sidecar at specified path + - `--validation-log ""`: no sidecar written + - Sidecar content is loadable via `load_validation_jsonl()` and contains + expected severity levels + + The log file path can be discovered from the CliRunner output (dandi + logs "Logs saved in ..." at INFO level) or by inspecting the logdir + after invocation. + + ```python + @pytest.mark.ai_generated + def test_upload_cli_validation_sidecar( + new_dandiset: SampleDandiset, tmp_path: Path + ) -> None: + """CLI upload creates validation sidecar next to log file.""" + # ... set up dandiset with a file that has validation warnings/errors ... + runner = CliRunner() + r = runner.invoke(main, [ + "upload", + "--dandi-instance", new_dandiset.api.instance_id, + str(new_dandiset.dspath), + ]) + # Find the log file from the logdir + logdir = Path(platformdirs.user_log_dir("dandi-cli", "dandi")) + log_files = sorted(logdir.glob("*.log")) + assert log_files # at least one log was created + latest_log = log_files[-1] + sidecar = validation_sidecar_path(latest_log) + if sidecar.exists(): + results = load_validation_jsonl(sidecar) + assert len(results) > 0 + ``` + + ```python + @pytest.mark.ai_generated + def test_upload_cli_validation_log_option( + new_dandiset: SampleDandiset, tmp_path: Path + ) -> None: + """--validation-log sends sidecar to explicit path.""" + custom_log = tmp_path / "my-validation.jsonl" + runner = CliRunner() + r = runner.invoke(main, [ + "upload", + "--validation-log", str(custom_log), + "--dandi-instance", new_dandiset.api.instance_id, + str(new_dandiset.dspath), + ]) + # Verify custom path was used + if custom_log.exists(): + results = load_validation_jsonl(custom_log) + assert all(isinstance(r, ValidationResult) for r in results) + ``` + + ```python + @pytest.mark.ai_generated + def 
test_upload_cli_validation_log_disabled( + new_dandiset: SampleDandiset, tmp_path: Path + ) -> None: + """--validation-log '' disables sidecar.""" + runner = CliRunner() + r = runner.invoke(main, [ + "upload", + "--validation-log", "", + "--dandi-instance", new_dandiset.api.instance_id, + str(new_dandiset.dspath), + ]) + # No validation JSONL should exist in logdir + logdir = Path(platformdirs.user_log_dir("dandi-cli", "dandi")) + assert not list(logdir.glob("*_validation.jsonl")) + ``` ### Step 4: Extended grouping (human-only) — enhances #1743 @@ -865,19 +1059,26 @@ cat results/*.jsonl | jq -s ' ## Testing Strategy -| Component | Test type | Approach | -|-----------|-----------|----------| -| Subpackage refactor | Smoke | All existing tests pass after `git mv` + import updates | -| `--format` output | CLI unit | `click.CliRunner`, assert JSON structure, round-trip | -| `dandi/validate/io.py` | Unit | Write → load round-trip, append, empty files, corrupt lines | -| `_record_version` | Unit | Serialization includes field, loader handles missing/unknown | -| `--load` | CLI unit | Load from fixture files, multi-file concat, mutual exclusivity | -| `--output` | CLI unit | Verify file creation, content matches stdout format | -| Sidecar auto-save | CLI unit | Verify `_validation.jsonl` created next to mock logfile | -| Upload sidecar | Integration | Upload with Docker Compose fixture, verify sidecar | -| Extended grouping | CLI unit | Each grouping value, section headers, counts | -| Summary | CLI unit | Verify counts match actual results | -| Edge cases | Unit | Empty results, None severity, very long paths, Unicode messages | +| Component | Test type | Location | Approach | +|-----------|-----------|----------|----------| +| Subpackage refactor | Smoke | existing | All existing tests pass after `git mv` + import updates | +| `--format` output | CLI unit | `cli/tests/test_cmd_validate.py` | `CliRunner`, assert JSON structure, round-trip | +| `dandi/validate/io.py` | 
Unit | `validate/tests/test_io.py` | Write → load round-trip, append, empty files, corrupt lines | +| `_record_version` | Unit | `validate/tests/test_types.py` | Serialization includes field, loader handles missing/unknown | +| `--load` | CLI unit | `cli/tests/test_cmd_validate.py` | Load from fixture files, multi-file concat, mutual exclusivity | +| `--output` | CLI unit | `cli/tests/test_cmd_validate.py` | Verify file creation, content matches stdout format | +| Sidecar auto-save (`validate`) | CLI unit | `cli/tests/test_cmd_validate.py` | Verify `_validation.jsonl` created next to mock logfile | +| Upload sidecar (Python API) | Integration | `tests/test_upload.py` | Pass `validation_log_path` to `SampleDandiset.upload()`, verify file + contents | +| Upload sidecar (CLI, default) | CLI integration | `cli/tests/test_cmd_upload.py` | `CliRunner().invoke(main, ["upload", ...])` — verify `_validation.jsonl` next to log file | +| Upload `--validation-log` option | CLI integration | `cli/tests/test_cmd_upload.py` | Custom path, empty-string disable | +| Extended grouping | CLI unit | `cli/tests/test_cmd_validate.py` | Each grouping value, section headers, counts | +| Summary | CLI unit | `cli/tests/test_cmd_validate.py` | Verify counts match actual results | +| Edge cases | Unit | various | Empty results, None severity, very long paths, Unicode messages | + +**Note on CLI integration tests**: Tests in `cli/tests/test_cmd_upload.py` must invoke +`main` (not `upload` directly) via `CliRunner().invoke(main, ["upload", ...])` so that +`ctx.obj.logfile` is set by the `main()` group callback. Invoking the `upload` +subcommand directly skips the log file setup. All new tests marked `@pytest.mark.ai_generated`. @@ -895,8 +1096,10 @@ All new tests marked `@pytest.mark.ai_generated`. 
| `dandi/validate/tests/test_io.py` | **New** — tests for I/O utilities | | `dandi/cli/cmd_validate.py` | Refactor + add `--format`, `--output`, `--load`, `--summary`, grouping extensions | | `dandi/upload.py` | Add `validation_log_path` parameter, write sidecar | -| `dandi/cli/cmd_upload.py` | Pass sidecar path to `upload()` | +| `dandi/cli/cmd_upload.py` | Add `--validation-log` option, pass sidecar path to `upload()` | +| `dandi/cli/tests/test_cmd_upload.py` | **New** — CLI integration tests for upload validation sidecar | | `dandi/cli/tests/test_cmd_validate.py` | Extend with format/load/output/summary tests | +| `dandi/tests/test_upload.py` | Extend existing tests to pass `validation_log_path` and verify sidecar | | `dandi/pynwb_utils.py` | Update imports | | `dandi/organize.py` | Update imports | | `dandi/files/bases.py` | Update imports | diff --git a/dandi/cli/cmd_validate.py b/dandi/cli/cmd_validate.py index 9d37fe7b1..c0d0ea99b 100644 --- a/dandi/cli/cmd_validate.py +++ b/dandi/cli/cmd_validate.py @@ -231,6 +231,11 @@ def validate( """Validate files for data standards compliance. Exits with non-0 exit code if any file is not compliant. + + Validation results are automatically saved as a JSONL sidecar next to the + dandi-cli log file (unless --output is used or --load is active). Use + ``dandi validate --load `` to re-render saved results later with + different grouping, filtering, or format options. 
""" # Normalize grouping: strip "none" values grouping = tuple(g for g in grouping if g != "none") @@ -269,7 +274,6 @@ def validate( _render_human(filtered, grouping, max_per_group=max_per_group) if summary: _print_summary(filtered, sys.stdout) - _exit_if_errors(filtered) elif output_file is not None: with open(output_file, "w") as fh: _render_structured( @@ -282,7 +286,6 @@ def validate( lgr.info("Validation output written to %s", output_file) if summary: _print_summary(filtered, sys.stderr) - _exit_if_errors(filtered) else: _render_structured( filtered, @@ -293,10 +296,18 @@ def validate( ) if summary: _print_summary(filtered, sys.stderr) - # Auto-save sidecar next to logfile (skip when loading) - if not load and filtered and hasattr(ctx, "obj") and ctx.obj is not None: - _auto_save_sidecar(filtered, ctx.obj.logfile) - _exit_if_errors(filtered) + + # Auto-save sidecar next to logfile (skip when loading or writing to --output file) + if ( + not load + and not output_file + and filtered + and hasattr(ctx, "obj") + and ctx.obj is not None + ): + _auto_save_sidecar(filtered, ctx.obj.logfile) + + _exit_if_errors(filtered) def _auto_save_sidecar(results: list[ValidationResult], logfile: str) -> None: diff --git a/dandi/cli/tests/test_cmd_validate.py b/dandi/cli/tests/test_cmd_validate.py index f804956d3..cb2952f00 100644 --- a/dandi/cli/tests/test_cmd_validate.py +++ b/dandi/cli/tests/test_cmd_validate.py @@ -15,7 +15,9 @@ _truncate_leaves, validate, ) +from ..command import main from ...tests.xfail import mark_xfail_windows_python313_posixsubprocess +from ...validate.io import load_validation_jsonl from ...validate.types import ( Origin, OriginType, @@ -772,3 +774,88 @@ def test_truncate_leaves_unit() -> None: # No truncation when under limit no_trunc = _truncate_leaves(issues, 100) assert no_trunc is issues + + +@pytest.mark.ai_generated +def test_validate_auto_sidecar_human( + simple2_nwb: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + 
"""Default human-format validate auto-saves sidecar next to log file.""" + logdir = tmp_path / "logs" + logdir.mkdir() + monkeypatch.setattr("platformdirs.user_log_dir", lambda *a, **kw: str(logdir)) + + r = CliRunner().invoke(main, ["validate", str(simple2_nwb)]) + assert r.exit_code == 1 # NO_DANDISET_FOUND + + # Find sidecar files + sidecars = list(logdir.glob("*_validation.jsonl")) + assert len(sidecars) == 1 + + # Verify content is loadable + results = load_validation_jsonl(sidecars[0]) + assert len(results) > 0 + + +@pytest.mark.ai_generated +def test_validate_auto_sidecar_skipped_with_output( + simple2_nwb: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """--output suppresses auto-save sidecar.""" + logdir = tmp_path / "logs" + logdir.mkdir() + monkeypatch.setattr("platformdirs.user_log_dir", lambda *a, **kw: str(logdir)) + + outfile = tmp_path / "results.jsonl" + r = CliRunner().invoke(main, ["validate", "-o", str(outfile), str(simple2_nwb)]) + assert r.exit_code == 1 + assert outfile.exists() + + # No sidecar should exist + sidecars = list(logdir.glob("*_validation.jsonl")) + assert len(sidecars) == 0 + + +@pytest.mark.ai_generated +def test_validate_auto_sidecar_skipped_with_load( + simple2_nwb: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """--load suppresses auto-save sidecar.""" + logdir = tmp_path / "logs" + logdir.mkdir() + monkeypatch.setattr("platformdirs.user_log_dir", lambda *a, **kw: str(logdir)) + + # First produce a JSONL to load + outfile = tmp_path / "input.jsonl" + r = CliRunner().invoke( + main, ["validate", "-f", "json_lines", "-o", str(outfile), str(simple2_nwb)] + ) + assert outfile.exists() + + # Clear any sidecars from first run + for s in logdir.glob("*_validation.jsonl"): + s.unlink() + + # Now --load it + r = CliRunner().invoke(main, ["validate", "--load", str(outfile)]) + assert r.exit_code == 1 + + # No new sidecar should exist + sidecars = list(logdir.glob("*_validation.jsonl")) + 
assert len(sidecars) == 0 + + +@pytest.mark.ai_generated +def test_validate_auto_sidecar_structured_stdout( + simple2_nwb: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Structured format to stdout also auto-saves sidecar.""" + logdir = tmp_path / "logs" + logdir.mkdir() + monkeypatch.setattr("platformdirs.user_log_dir", lambda *a, **kw: str(logdir)) + + r = CliRunner().invoke(main, ["validate", "-f", "json", str(simple2_nwb)]) + assert r.exit_code == 1 + + sidecars = list(logdir.glob("*_validation.jsonl")) + assert len(sidecars) == 1