From 382f6c94e7a42a20db45f4aa71f7269c9443ca48 Mon Sep 17 00:00:00 2001 From: Laith Al-Saadoon Date: Sat, 16 May 2026 15:58:07 +0000 Subject: [PATCH] feat!: drop detect-secrets; ship tuned betterleaks default config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `codehub analyze` ran two parallel secret scanners. detect-secrets was the long pole (5+ minute Python walker, sometimes timed out at 300s) and betterleaks was effectively dead — the wrapper passed `--report-path=/dev/stdout`, which fails inside Node's `execFile` with ENXIO because the child's fd 1 is a pipe. Together they emitted 18,893 findings on the OCH self-scan, almost all noise from generic-entropy matchers flagging integrity hashes in lockfiles and SBOMs. Coverage audit (Context7 + DeepWiki + upstream `betterleaks.toml`) confirmed betterleaks ships 276 default rules vs detect-secrets' ~24, including a CEL-filtered `generic-api-key` catch-all that subsumes the older tool's high-entropy + keyword detectors. Only `IPPublicDetector` (low-value, high-FP) is uniquely detect-secrets — not worth keeping the Python dep for. What changed: - Removed detect-secrets entirely: wrapper, converter, catalog spec, index switch case, P1 list, tests, README rows, docs ADR refs, pre-release-gate workflow step, in-tree `.secrets.baseline`. - Fixed betterleaks wrapper: `--report-path=-` instead of `/dev/stdout`, always uses `dir` mode (working-tree state, not git history), auto-detects user `betterleaks.toml`/`.gitleaks.toml` and only injects the vendored default config when none is present. - Shipped `packages/scanners/config/betterleaks.default.toml` — `[extend] useDefault = true` plus `[[allowlists]]` blocks that filter vendored deps, build outputs, lockfiles, SBOMs, binary blobs, and test files via RE2 path regexes. - Pre-release gate now runs `betterleaks dir` with the same config the wrapper injects locally, with `--exit-code=1`. 
Measured on the OCH self-scan: - Wall clock: 12:39 → 5:35 (-56%) - Findings: 18,893 → 45 (-420x) - Betterleaks: 0 (broken, ENXIO) → 0 (clean, tuned config holds) ADR 0017 records the rationale and migration. Users override the shipped config by dropping a `betterleaks.toml` at the project root; the wrapper picks it up via betterleaks' native config-precedence and skips the `--config` injection. --- .github/workflows/pre-release-gate.yml | 31 +- .secrets.baseline | 515 ------------------ ...17-drop-detect-secrets-tune-betterleaks.md | 111 ++++ packages/cli/src/commands/analyze.ts | 2 +- packages/cli/src/commands/scan.test.ts | 3 +- .../content/docs/architecture/monorepo-map.md | 2 +- .../docs/architecture/scanners-and-sarif.md | 11 +- packages/scanners/README.md | 12 +- .../scanners/config/betterleaks.default.toml | 159 ++++++ packages/scanners/package.json | 3 +- packages/scanners/src/catalog.test.ts | 26 +- packages/scanners/src/catalog.ts | 21 - .../detect-secrets-to-sarif.test.ts | 222 -------- .../src/converters/detect-secrets-to-sarif.ts | 200 ------- packages/scanners/src/index.ts | 9 +- packages/scanners/src/wrappers/betterleaks.ts | 123 ++++- .../scanners/src/wrappers/detect-secrets.ts | 76 --- .../scanners/src/wrappers/wrappers.test.ts | 101 +--- 18 files changed, 438 insertions(+), 1189 deletions(-) delete mode 100644 .secrets.baseline create mode 100644 docs/adr/0017-drop-detect-secrets-tune-betterleaks.md create mode 100644 packages/scanners/config/betterleaks.default.toml delete mode 100644 packages/scanners/src/converters/detect-secrets-to-sarif.test.ts delete mode 100644 packages/scanners/src/converters/detect-secrets-to-sarif.ts delete mode 100644 packages/scanners/src/wrappers/detect-secrets.ts diff --git a/.github/workflows/pre-release-gate.yml b/.github/workflows/pre-release-gate.yml index 72a48e4e..df07a295 100644 --- a/.github/workflows/pre-release-gate.yml +++ b/.github/workflows/pre-release-gate.yml @@ -63,8 +63,8 @@ jobs: - name: Install 
with frozen lockfile and no lifecycle scripts run: pnpm install --frozen-lockfile --ignore-scripts - detect-secrets: - name: detect-secrets full sweep + betterleaks: + name: betterleaks full sweep if: startsWith(github.head_ref, 'release-please--') runs-on: ubuntu-latest steps: @@ -72,25 +72,18 @@ jobs: with: fetch-depth: 0 persist-credentials: false - - name: Install detect-secrets - run: pip install --user 'detect-secrets==1.5.0' - - name: Sweep tracked tree + - uses: jdx/mise-action@1648a7812b9aeae629881980618f079932869151 # v4.0.1 + - name: Sweep working tree run: | set -euo pipefail - export PATH="$HOME/.local/bin:$PATH" - # The repo already ships .secrets.baseline (per Track B). The - # release gate re-asserts that no NEW secrets have crept in. - if [ -f .secrets.baseline ]; then - detect-secrets scan --baseline .secrets.baseline - else - detect-secrets scan --all-files > /tmp/scan.json - FOUND=$(python3 -c "import json,sys; d=json.load(open('/tmp/scan.json')); n=sum(len(v) for v in d.get('results',{}).values()); print(n)") - if [ "$FOUND" != "0" ]; then - echo "detect-secrets found $FOUND potential secrets" >&2 - cat /tmp/scan.json - exit 1 - fi - fi + # `dir` mode reflects the current checkout (not the entire git + # log), matching what `codehub analyze` runs locally. The + # vendored config is the same file the wrapper auto-injects. + betterleaks dir \ + --no-banner \ + --config=packages/scanners/config/betterleaks.default.toml \ + --exit-code=1 \ + . 
licenses-reassert: name: License allowlist re-assert diff --git a/.secrets.baseline b/.secrets.baseline deleted file mode 100644 index 48b2405c..00000000 --- a/.secrets.baseline +++ /dev/null @@ -1,515 +0,0 @@ -{ - "version": "1.5.0", - "plugins_used": [ - { - "name": "ArtifactoryDetector" - }, - { - "name": "AWSKeyDetector" - }, - { - "name": "AzureStorageKeyDetector" - }, - { - "name": "Base64HighEntropyString", - "limit": 4.5 - }, - { - "name": "BasicAuthDetector" - }, - { - "name": "CloudantDetector" - }, - { - "name": "DiscordBotTokenDetector" - }, - { - "name": "GitHubTokenDetector" - }, - { - "name": "GitLabTokenDetector" - }, - { - "name": "HexHighEntropyString", - "limit": 3.0 - }, - { - "name": "IbmCloudIamDetector" - }, - { - "name": "IbmCosHmacDetector" - }, - { - "name": "IPPublicDetector" - }, - { - "name": "JwtTokenDetector" - }, - { - "name": "KeywordDetector", - "keyword_exclude": "" - }, - { - "name": "MailchimpDetector" - }, - { - "name": "NpmDetector" - }, - { - "name": "OpenAIDetector" - }, - { - "name": "PrivateKeyDetector" - }, - { - "name": "PypiTokenDetector" - }, - { - "name": "SendGridDetector" - }, - { - "name": "SlackDetector" - }, - { - "name": "SoftlayerDetector" - }, - { - "name": "SquareOAuthDetector" - }, - { - "name": "StripeDetector" - }, - { - "name": "TelegramBotTokenDetector" - }, - { - "name": "TwilioKeyDetector" - } - ], - "filters_used": [ - { - "path": "detect_secrets.filters.allowlist.is_line_allowlisted" - }, - { - "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", - "min_level": 2 - }, - { - "path": "detect_secrets.filters.heuristic.is_indirect_reference" - }, - { - "path": "detect_secrets.filters.heuristic.is_likely_id_string" - }, - { - "path": "detect_secrets.filters.heuristic.is_lock_file" - }, - { - "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string" - }, - { - "path": "detect_secrets.filters.heuristic.is_potential_uuid" - }, - { - "path": 
"detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" - }, - { - "path": "detect_secrets.filters.heuristic.is_sequential_string" - }, - { - "path": "detect_secrets.filters.heuristic.is_swagger_file" - }, - { - "path": "detect_secrets.filters.heuristic.is_templated_secret" - }, - { - "path": "detect_secrets.filters.regex.should_exclude_file", - "pattern": [ - "\\.git/.*", - "node_modules/.*", - "pnpm-lock\\.yaml", - "\\.codehub/.*", - "SBOM\\.cdx\\.json", - ".*\\.lock$", - ".*\\.tsbuildinfo$", - "dist/.*", - ".*\\.min\\.js$", - ".*\\.map$", - "vendor/.*", - "\\.erpaval/.*", - "\\.npm/.*", - "\\.astro/.*", - "\\.secrets\\.baseline" - ] - } - ], - "results": { - "packages/analysis/src/page-rank.test.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/analysis/src/page-rank.test.ts", - "hashed_secret": "e3fdbbc96eaa60786974548a1a09cc078804aa3d", - "is_verified": false, - "line_number": 74 - } - ], - "packages/cli/src/commands/index-repo.test.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/cli/src/commands/index-repo.test.ts", - "hashed_secret": "d23bd10592bd06603057fd3d7a4506743fbdb43b", - "is_verified": false, - "line_number": 39 - } - ], - "packages/cli/src/scip-pins.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/cli/src/scip-pins.ts", - "hashed_secret": "fcdf4195aa3c347737da1e575806a295f9c021be", - "is_verified": false, - "line_number": 130 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/cli/src/scip-pins.ts", - "hashed_secret": "9d0516ba2a61bb12f1451ba9df814d12f52422f1", - "is_verified": false, - "line_number": 154 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/cli/src/scip-pins.ts", - "hashed_secret": "3f56fa719f45798ccc2488540608905ca8fbc0bf", - "is_verified": false, - "line_number": 190 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/cli/src/scip-pins.ts", - "hashed_secret": "9abd1a2bb680580c4f956fa2a47a980f6518c9c5", 
- "is_verified": false, - "line_number": 196 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/cli/src/scip-pins.ts", - "hashed_secret": "92332b62f0fe29bed53eeffe05a88ab31b87315a", - "is_verified": false, - "line_number": 236 - } - ], - "packages/core-types/src/hash.test.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/core-types/src/hash.test.ts", - "hashed_secret": "244f421f896bdcdd2784dccf4eaf7c8dfd5189b5", - "is_verified": false, - "line_number": 6 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/core-types/src/hash.test.ts", - "hashed_secret": "aabc6baa2329644dd4c24ce57ddde468ff64ce8e", - "is_verified": false, - "line_number": 12 - } - ], - "packages/embedder/src/http-embedder.test.ts": [ - { - "type": "Secret Keyword", - "filename": "packages/embedder/src/http-embedder.test.ts", - "hashed_secret": "f10317c27ca4d2ef628b2690766ceca20f8668ac", - "is_verified": false, - "line_number": 325 - }, - { - "type": "Secret Keyword", - "filename": "packages/embedder/src/http-embedder.test.ts", - "hashed_secret": "d17f1ab3e20d30ec8e0ec98bc91e03091f0573e1", - "is_verified": false, - "line_number": 439 - } - ], - "packages/embedder/src/model-pins.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/embedder/src/model-pins.ts", - "hashed_secret": "0207a3773ba173170c1ccc44729c2e346fbb9e12", - "is_verified": false, - "line_number": 15 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/embedder/src/model-pins.ts", - "hashed_secret": "64ca8eac3974a5cbd678de321270a5257edaff08", - "is_verified": false, - "line_number": 43 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/embedder/src/model-pins.ts", - "hashed_secret": "cc5171d4407f12f112ca150a8a5680df8d28d227", - "is_verified": false, - "line_number": 50 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/embedder/src/model-pins.ts", - "hashed_secret": 
"d43204e28b1dbe1ca892f5393cfe86371254b56f", - "is_verified": false, - "line_number": 57 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/embedder/src/model-pins.ts", - "hashed_secret": "470c964a497fd0b112982013b8b1bb1f68523db9", - "is_verified": false, - "line_number": 64 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/embedder/src/model-pins.ts", - "hashed_secret": "507075f422831acf7b62751c7f2d37e93d99e83a", - "is_verified": false, - "line_number": 83 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/embedder/src/model-pins.ts", - "hashed_secret": "0124e7cdec3eb7a4b199ee51a047e383e5fb2bc9", - "is_verified": false, - "line_number": 98 - } - ], - "packages/ingestion/src/pipeline/phases/content-cache.test.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/ingestion/src/pipeline/phases/content-cache.test.ts", - "hashed_secret": "78fe441cba8a3bd1d84351e6f700ee8c494ff98b", - "is_verified": false, - "line_number": 19 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/ingestion/src/pipeline/phases/content-cache.test.ts", - "hashed_secret": "8f8e76aa11d365940289d58f8a589d225ebcc63e", - "is_verified": false, - "line_number": 20 - } - ], - "packages/ingestion/src/pipeline/phases/repo-node.test.ts": [ - { - "type": "Basic Auth Credentials", - "filename": "packages/ingestion/src/pipeline/phases/repo-node.test.ts", - "hashed_secret": "ee977806d7286510da8b9a7492ba58e2484c0ecc", - "is_verified": false, - "line_number": 47 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/ingestion/src/pipeline/phases/repo-node.test.ts", - "hashed_secret": "158b484ae1f6f64f89da22397d25fbdafad02252", - "is_verified": false, - "line_number": 119 - }, - { - "type": "Hex High Entropy String", - "filename": "packages/ingestion/src/pipeline/phases/repo-node.test.ts", - "hashed_secret": "a33bee9914fa4090c8f06504da2d2dbb13754e2d", - "is_verified": false, - "line_number": 149 - } - ], 
- "packages/ingestion/src/pipeline/phases/repo-node.ts": [ - { - "type": "Basic Auth Credentials", - "filename": "packages/ingestion/src/pipeline/phases/repo-node.ts", - "hashed_secret": "ee977806d7286510da8b9a7492ba58e2484c0ecc", - "is_verified": false, - "line_number": 151 - } - ], - "packages/sarif/fixtures/v2.1.0-valid.sarif.json": [ - { - "type": "Hex High Entropy String", - "filename": "packages/sarif/fixtures/v2.1.0-valid.sarif.json", - "hashed_secret": "bc9947c6b98e4298cf1ba280ea4a5830fb64cf4b", - "is_verified": false, - "line_number": 38 - } - ], - "packages/sarif/src/schema-validation.test.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/sarif/src/schema-validation.test.ts", - "hashed_secret": "bc9947c6b98e4298cf1ba280ea4a5830fb64cf4b", - "is_verified": false, - "line_number": 54 - } - ], - "packages/scanners/src/catalog.ts": [ - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/catalog.ts", - "hashed_secret": "f3bbbd66a63d4bf1747940578ec3d0103530e21d", - "is_verified": false, - "line_number": 68 - }, - { - "type": "Basic Auth Credentials", - "filename": "packages/scanners/src/catalog.ts", - "hashed_secret": "9d4e1e23bd5b727046a9e3b4b7db57bd8d6ee684", - "is_verified": false, - "line_number": 69 - } - ], - "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts": [ - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts", - "hashed_secret": "6367c48dd193d56ea7b0baad25b19455e529f5ee", - "is_verified": false, - "line_number": 28 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts", - "hashed_secret": "0b3d8b29493059afd7f9912106279c4643ac4939", - "is_verified": false, - "line_number": 35 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts", - "hashed_secret": "ffcb2deee9131e6cda4faa78bb40423c9b847ff0", - "is_verified": false, - 
"line_number": 44 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts", - "hashed_secret": "ac4ae97285c19b13201deb9b192d921316db3447", - "is_verified": false, - "line_number": 71 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts", - "hashed_secret": "bf1c365741a4bfb5fee5c3150335ab4f867a4d9a", - "is_verified": false, - "line_number": 78 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts", - "hashed_secret": "f49cf6381e322b147053b74e4500af8533ac1e4c", - "is_verified": false, - "line_number": 102 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts", - "hashed_secret": "07c6d85766912f733f8e20e7b639702e383d14e3", - "is_verified": false, - "line_number": 136 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts", - "hashed_secret": "4801dc173acec2ca381ea34fad2258ac32fcc4a2", - "is_verified": false, - "line_number": 142 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts", - "hashed_secret": "7a85f4764bbd6daf1c3545efbbf0f279a6dc0beb", - "is_verified": false, - "line_number": 194 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.test.ts", - "hashed_secret": "aa20c3bf5eca16e978a39965710b45ac9b9b5949", - "is_verified": false, - "line_number": 195 - } - ], - "packages/scanners/src/converters/detect-secrets-to-sarif.ts": [ - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.ts", - "hashed_secret": "49261fcb15a7f165031840707fa7fd42ef7b0921", - "is_verified": false, - "line_number": 73 - }, - { - "type": "Secret Keyword", - "filename": 
"packages/scanners/src/converters/detect-secrets-to-sarif.ts", - "hashed_secret": "da28d1d6117bc94b53b31aee9ee76fa597af9df8", - "is_verified": false, - "line_number": 82 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/converters/detect-secrets-to-sarif.ts", - "hashed_secret": "ecb252044b5ea0f679ee78ec1a12904739e2904d", - "is_verified": false, - "line_number": 154 - } - ], - "packages/scanners/src/wrappers/wrappers.test.ts": [ - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/wrappers/wrappers.test.ts", - "hashed_secret": "ac4ae97285c19b13201deb9b192d921316db3447", - "is_verified": false, - "line_number": 225 - }, - { - "type": "Secret Keyword", - "filename": "packages/scanners/src/wrappers/wrappers.test.ts", - "hashed_secret": "bf1c365741a4bfb5fee5c3150335ab4f867a4d9a", - "is_verified": false, - "line_number": 232 - } - ], - "packages/storage/src/duckdb-adapter.test.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/storage/src/duckdb-adapter.test.ts", - "hashed_secret": "158b484ae1f6f64f89da22397d25fbdafad02252", - "is_verified": false, - "line_number": 953 - } - ], - "packages/storage/src/graph-hash-parity.test.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/storage/src/graph-hash-parity.test.ts", - "hashed_secret": "158b484ae1f6f64f89da22397d25fbdafad02252", - "is_verified": false, - "line_number": 428 - } - ], - "packages/storage/src/graphdb-adapter.test.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/storage/src/graphdb-adapter.test.ts", - "hashed_secret": "158b484ae1f6f64f89da22397d25fbdafad02252", - "is_verified": false, - "line_number": 970 - } - ], - "packages/storage/src/graphdb-roundtrip.test.ts": [ - { - "type": "Hex High Entropy String", - "filename": "packages/storage/src/graphdb-roundtrip.test.ts", - "hashed_secret": "158b484ae1f6f64f89da22397d25fbdafad02252", - "is_verified": false, - "line_number": 481 - } - ] - }, - "generated_at": 
"2026-05-12T15:14:54Z" -} diff --git a/docs/adr/0017-drop-detect-secrets-tune-betterleaks.md b/docs/adr/0017-drop-detect-secrets-tune-betterleaks.md new file mode 100644 index 00000000..5fc8051c --- /dev/null +++ b/docs/adr/0017-drop-detect-secrets-tune-betterleaks.md @@ -0,0 +1,111 @@ +# ADR 0017 — Drop detect-secrets; ship a tuned betterleaks default config + +- Status: **Accepted** — 2026-05-16. +- Authors: Laith Al-Saadoon + Claude. +- Branch: `chore/scanners-dedup-and-tune`. +- Supersedes: the "20th scanner" decision in + [ADR 0010](./0010-dogfood-findings-2026-04-27.md) (PR #72). + +## Context + +`codehub analyze` ran two parallel secret scanners — `betterleaks` +(Go, gitleaks fork) and `detect-secrets` (Python, Yelp). Three problems +showed up in dogfood runs: + +1. **Wall-clock cost.** detect-secrets is a single-process Python + walker; on the OCH repo it took 5+ minutes and sometimes timed out + at the 300 s ceiling. It was the long pole of every analyze run. +2. **The betterleaks integration was broken.** The wrapper passed + `--report-path=/dev/stdout`, which fails inside Node's `execFile` + with `ENXIO` because the child's fd 1 is a pipe, not a char device. + Betterleaks logged the failure to stderr, emitted nothing to stdout, + and the wrapper guard turned that into an empty SARIF. detect-secrets + was effectively the only working secret scanner. +3. **18,893 findings on the OCH self-scan**, the vast majority noise: + detect-secrets' generic Base64HighEntropy / KeywordDetector flagged + integrity hashes in `pnpm-lock.yaml`, hash strings in + `.cdx.json` SBOMs, fixture data, build outputs. + +A coverage audit (Context7 + DeepWiki on +`github.com/betterleaks/betterleaks` + the upstream `betterleaks.toml`) +confirmed betterleaks ships **276 default rules** vs detect-secrets' +~24, including a CEL-filtered `generic-api-key` catch-all that subsumes +detect-secrets' high-entropy + keyword detectors. 
The only detector +unique to detect-secrets is `IPPublicDetector` (low value, high FP); +a handful of named IBM-flavoured rules have no named equivalent and fall +through to `generic-api-key` on betterleaks. + +## Decision + +1. **Remove detect-secrets entirely.** Wrapper, converter, catalog spec, + index switch case, P1 list, tests, README rows, docs ADR refs, + pre-release-gate workflow step, and the in-tree `.secrets.baseline` + file. detect-secrets' threat coverage is a strict subset of + betterleaks for the OCH use case. + +2. **Fix the betterleaks wrapper.** Two changes: + - `--report-path=/dev/stdout` → `--report-path=-`. The dash is + betterleaks' explicit "write SARIF to stdout" idiom and works + under `execFile`. + - Use `dir` mode unconditionally. `git --pre-commit=false` walks the + entire git log and re-flags every secret that ever existed in any + historical commit, which is wrong for a working-tree-state scan. + `dir` mode reflects the current checkout, matching what + `codehub analyze` actually wants. Cost: `dir` mode does not honor + `.gitignore`, so the path filtering moves into the config. + +3. **Ship a vendored default config** at + `packages/scanners/config/betterleaks.default.toml`. It uses + `[extend] useDefault = true` to inherit the 276 upstream rules and + then layers `[[allowlists]]` blocks that filter findings on: + - Vendored deps (`node_modules`, `.venv`, `vendor`, `Pods`, etc.). + - Build outputs (`dist`, `build`, `target`, `.next`, `coverage`). + - Lockfiles (`pnpm-lock.yaml`, `Cargo.lock`, `go.sum`, etc.). + - Generated SBOM / SARIF / `.codehub` artifacts. + - Binary blobs (`.parquet`, `.wasm`, `.so`, `.png`, `.pdf`). + - Test files (`*.test.ts`, `_test.go`, `test/`, `__fixtures__/`). + The wrapper auto-detects user-supplied `betterleaks.toml` / + `.gitleaks.toml` at the project root and only injects the vendored + config when none is present, so user customisation wins. + +4. 
**Update the pre-release CI gate** to run `betterleaks dir` against + the vendored config, with `--exit-code=1` so any new finding fails + the gate. Replaces the previous `detect-secrets scan --baseline` + step. + +## Outcomes (measured on the OCH self-scan) + +| Metric | Before | After | Delta | +|---|---|---|---| +| Wall clock (`codehub analyze .`) | 12:39 | 5:35 | **−56%** | +| Total scanner findings | 18,893 | 45 | **−420×** | +| Betterleaks findings | 0 (broken) | 0 (clean) | n/a | +| Scanner inventory size | 20 | 19 | −1 | + +The remaining 45 are all signal: 26 grype CVEs, 12 vulture dead-code +flags, 3 ruff lint, 3 radon complexity, 1 biome. + +## Tradeoffs + +- **`.gitignore` is no longer a filter for secret scans.** `dir` mode + walks every file the OS shows. The vendored `[allowlists]` is broad + but not exhaustive; users with unusual layouts may need to extend + the config. Accepted: the upside (working-tree-state scans, not + history audits) is the right default for analyze. +- **Loss of named-rule attribution for IBM Cloudant / IAM / COS / + SoftLayer.** Those collapse into `generic-api-key`. Detection still + happens; only the SARIF `ruleId` changes. Acceptable for the OCH + use case (open-source code repos, no enterprise-IBM credential + leaks expected). +- **Loss of `IPPublicDetector`.** Public-IP-as-leak is a high-FP, low- + value heuristic; not worth keeping detect-secrets to retain it. + +## Migration + +Existing users with a `.secrets.baseline` file at their project root +should delete it (no longer consumed). Any project-level overrides +should move to a `betterleaks.toml` at the project root, which the +wrapper will pick up automatically and use instead of the vendored +default. The vendored config is published with the +`@opencodehub/scanners` npm package under `config/` and is read at +runtime via `import.meta.url` resolution. 
diff --git a/packages/cli/src/commands/analyze.ts b/packages/cli/src/commands/analyze.ts index 9c9cf3a4..5cf36ccb 100644 --- a/packages/cli/src/commands/analyze.ts +++ b/packages/cli/src/commands/analyze.ts @@ -100,7 +100,7 @@ export interface AnalyzeOptions { * Run Priority-1 security scanners at the end of `analyze` and write * `.codehub/scan.sarif` + ingest findings into the graph. **Default: * on.** Most scanners are local binaries (semgrep, bandit, ruff, - * vulture, radon, detect-secrets, betterleaks, ty); the network-backed + * vulture, radon, betterleaks, ty); the network-backed * ones (osv-scanner, grype, npm/pip audit) are silently skipped when * `--offline` is set. Pass `false` (CLI: `--no-scan`) to suppress — the * graph pipeline runs unchanged. diff --git a/packages/cli/src/commands/scan.test.ts b/packages/cli/src/commands/scan.test.ts index 86051a04..083937e7 100644 --- a/packages/cli/src/commands/scan.test.ts +++ b/packages/cli/src/commands/scan.test.ts @@ -13,7 +13,7 @@ test("selectScanners: empty profile yields only polyglot P1 scanners", () => { const ids = selectScanners({}, {}) .map((s) => s.id) .sort(); - assert.deepEqual(ids, ["betterleaks", "detect-secrets", "grype", "osv-scanner", "semgrep"]); + assert.deepEqual(ids, ["betterleaks", "grype", "osv-scanner", "semgrep"]); }); test("selectScanners: iacTypes=['terraform'] enables tflint + trivy + checkov", () => { @@ -24,7 +24,6 @@ test("selectScanners: iacTypes=['terraform'] enables tflint + trivy + checkov", assert.deepEqual(ids, [ "betterleaks", "checkov", - "detect-secrets", "grype", "osv-scanner", "semgrep", diff --git a/packages/docs/src/content/docs/architecture/monorepo-map.md b/packages/docs/src/content/docs/architecture/monorepo-map.md index 838bfa53..b0ef8c7c 100644 --- a/packages/docs/src/content/docs/architecture/monorepo-map.md +++ b/packages/docs/src/content/docs/architecture/monorepo-map.md @@ -25,7 +25,7 @@ binary; every other package is a library imported by `cli`, `mcp`, | 
`@opencodehub/pack` | `packages/pack` | Deterministic 9-item code-pack BOM (the artifact attached to every release). | | `@opencodehub/policy` | `packages/policy` | `opencodehub.policy.yaml` loader, validator, evaluator. | | `@opencodehub/sarif` | `packages/sarif` | SARIF 2.1.0 Zod schemas, merge + enrich, suppressions, baseline diffing. | -| `@opencodehub/scanners` | `packages/scanners` | Twenty scanner wrappers (semgrep, osv-scanner, bandit, ruff, grype, vulture, pip-audit, npm-audit, biome, betterleaks, detect-secrets, trivy, checkov, hadolint, tflint, spectral, radon, ty, clamav, och self-scan). | +| `@opencodehub/scanners` | `packages/scanners` | Nineteen scanner wrappers (semgrep, osv-scanner, bandit, ruff, grype, vulture, pip-audit, npm-audit, biome, betterleaks, trivy, checkov, hadolint, tflint, spectral, radon, ty, clamav, och self-scan). | | `@opencodehub/scip-ingest` | `packages/scip-ingest` | `.scip` protobuf reader + per-language indexer runners (TypeScript, Python, Go, Rust, Java, .NET, clang, Kotlin, Ruby). | | `@opencodehub/search` | `packages/search` | Hybrid BM25 + RRF search. | | `@opencodehub/storage` | `packages/storage` | The `IGraphStore` / `ITemporalStore` interface segregation, the LadybugDB and DuckDB adapters, the resolver that picks between them. | diff --git a/packages/docs/src/content/docs/architecture/scanners-and-sarif.md b/packages/docs/src/content/docs/architecture/scanners-and-sarif.md index 1475bb0a..cfc988ab 100644 --- a/packages/docs/src/content/docs/architecture/scanners-and-sarif.md +++ b/packages/docs/src/content/docs/architecture/scanners-and-sarif.md @@ -12,20 +12,21 @@ covers the catalog, the license distinction between bundled and wrapped tools, how SARIF enrichment stays GHAS-compatible, and how baseline diffs get bucketized. -## Scanner inventory (20) +## Scanner inventory (19) The catalog at `packages/scanners/src/catalog.ts` is a flat module: one exported `ScannerSpec` per tool plus aggregate arrays. 
Selection is driven by the project profile (languages, IaC types, API contracts) and can be overridden with an explicit `scanners` list on the `scan` -tool. After PR #72 added `detect-secrets`, the inventory is **20 -scanners**: +tool. The current inventory is **19 scanners** — `detect-secrets` was +removed in favour of `betterleaks`, which ships 276 default rules and +a CEL-filtered `generic-api-key` catch-all that subsumes the older +tool's entropy + keyword detectors. | Scanner | Scope | |---|---| | `semgrep` | Multi-language static analysis. | -| `betterleaks` | Secrets — permissive license. | -| `detect-secrets` | Secrets — entropy + pattern based. | +| `betterleaks` | Secrets — 276 rules + entropy + CEL filters. | | `osv-scanner` | Lockfile vulnerability scan against OSV. | | `bandit` | Python static security. | | `biome` | TS/JS lint + format. | diff --git a/packages/scanners/README.md b/packages/scanners/README.md index 5dd5f789..f429f3a4 100644 --- a/packages/scanners/README.md +++ b/packages/scanners/README.md @@ -35,7 +35,6 @@ fixed in `P1_SPECS` (lines 305-318); P2 ordering in `P2_SPECS` (lines 321-330). | `betterleaks` | all (secrets) | yes | MIT | | `osv-scanner` | all (deps) | yes | Apache-2.0 | | `bandit` | python | yes | Apache-2.0 | -| `detect-secrets` | all (Yelp keyword + basic-auth) | no | Apache-2.0 | | `biome` | typescript / javascript / tsx | yes | MIT | | `pip-audit` | python | no | Apache-2.0 | | `npm-audit` | typescript / javascript | no | Artistic-2.0 bin | @@ -68,9 +67,14 @@ fixed in `P1_SPECS` (lines 305-318); P2 ordering in `P2_SPECS` (lines 321-330). catalog before launch, so scans don't waste time on irrelevant tools. - **SHA256-pinned versions** — every spec carries a `version` and an `installCmd`; CI installs the exact version listed. -- **`detect-secrets` is the 20th scanner** — added to catch keyword and - basic-auth secret shapes that betterleaks structurally cannot see - (`packages/scanners/src/catalog.ts:64-82`). 
+- **`betterleaks` ships a vendored default config** at + `packages/scanners/config/betterleaks.default.toml`. It extends the + upstream 276 default rules and layers an `[allowlists]` block that + drops findings on vendored deps, lockfiles, build outputs, SBOMs, + generated SARIF, and common test-fixture directories. Users override + by placing their own `betterleaks.toml` (or `.gitleaks.toml`) at the + project root. The wrapper auto-detects user configs and only injects + the vendored one when the project doesn't carry its own. - **`optIn` and `beta` flags** — `clamav` is opt-in (off by profile); `ty` is marked beta. Both are excluded from the default `filterSpecsByProfile` output unless asked for explicitly. diff --git a/packages/scanners/config/betterleaks.default.toml b/packages/scanners/config/betterleaks.default.toml new file mode 100644 index 00000000..a6b9617b --- /dev/null +++ b/packages/scanners/config/betterleaks.default.toml @@ -0,0 +1,159 @@ +# OpenCodeHub default betterleaks config. +# +# Shipped inside @opencodehub/scanners and injected via `--config` when the +# user has not placed their own `betterleaks.toml` / `.gitleaks.toml` at the +# project root. Inherits all 276 default rules from betterleaks itself, +# then layers a universal `paths` allowlist that filters out vendored deps, +# generated artifacts, lockfile noise, and common build-output directories. +# +# Why we don't lean on `.gitignore`: +# - betterleaks `dir` mode ignores `.gitignore` entirely. +# - `git` mode honors it transitively (git itself doesn't enumerate +# gitignored paths), but it walks the entire git log and re-flags +# secrets that only exist in historical commits. The wrapper therefore +# always runs `dir` mode (see ADR 0017), and so does the release gate. +# That makes the path allowlist below load-bearing: nothing else keeps +# vendored deps, lockfiles, and build outputs out of the scan. 
+# +# Override discipline: drop your own `betterleaks.toml` at the project root +# and OCH will use it instead of this file. Use `[extend] path = "..."` to +# layer your own allowlists on top of these defaults. + +title = "OpenCodeHub default scan policy (betterleaks)" + +[extend] +useDefault = true + +# ───────────────────────────────────────────────────────────────────────────── +# Universal allowlist — vendored deps, build outputs, lockfiles, SBOMs. +# RE2: no lookarounds, anchor explicitly with `(^|/)` and `$`. +# ───────────────────────────────────────────────────────────────────────────── + +[[allowlists]] +description = "Vendored dependency directories" +paths = [ + '''(^|/)node_modules/''', + '''(^|/)\.pnpm/''', + '''(^|/)\.yarn/''', + '''(^|/)bower_components/''', + '''(^|/)\.venv/''', + '''(^|/)venv/''', + '''(^|/)\.virtualenv/''', + '''(^|/)site-packages/''', + '''(^|/)__pycache__/''', + '''(^|/)\.tox/''', + '''(^|/)\.nox/''', + '''(^|/)vendor/''', + '''(^|/)third_party/''', + '''(^|/)Pods/''', + '''(^|/)Carthage/''', +] + +[[allowlists]] +description = "Build output directories" +paths = [ + '''(^|/)dist/''', + '''(^|/)build/''', + '''(^|/)out/''', + '''(^|/)\.next/''', + '''(^|/)\.nuxt/''', + '''(^|/)\.svelte-kit/''', + '''(^|/)\.turbo/''', + '''(^|/)\.cache/''', + '''(^|/)\.parcel-cache/''', + '''(^|/)target/''', + '''(^|/)bin/''', + '''(^|/)obj/''', + '''(^|/)coverage/''', + '''(^|/)\.nyc_output/''', + '''(^|/)\.gradle/''', + '''(^|/)\.idea/''', + '''(^|/)\.vscode/''', +] + +[[allowlists]] +description = "Lockfiles — managers pin integrity hashes that look like secrets" +paths = [ + '''(^|/)pnpm-lock\.yaml$''', + '''(^|/)package-lock\.json$''', + '''(^|/)yarn\.lock$''', + '''(^|/)npm-shrinkwrap\.json$''', + '''(^|/)Cargo\.lock$''', + '''(^|/)go\.sum$''', + '''(^|/)poetry\.lock$''', + '''(^|/)Pipfile\.lock$''', + '''(^|/)uv\.lock$''', + '''(^|/)pdm\.lock$''', + '''(^|/)composer\.lock$''', + '''(^|/)Gemfile\.lock$''', + '''(^|/)mix\.lock$''', + 
'''(^|/)flake\.lock$''', + '''(^|/)pubspec\.lock$''', + '''(^|/)Podfile\.lock$''', +] + +[[allowlists]] +description = "Generated SBOM and security artifacts" +paths = [ + '''\.cdx\.json$''', + '''\.cdx\.xml$''', + '''\.spdx\.json$''', + '''\.spdx$''', + '''\.sarif$''', + '''(^|/)\.codehub/''', + '''(^|/)\.secrets\.baseline$''', + '''(^|/)\.gitleaks-cache$''', + '''(^|/)\.trufflehog\.json$''', +] + +[[allowlists]] +description = "Binary and large-blob formats — high entropy by definition" +paths = [ + '''\.parquet$''', + '''\.avro$''', + '''\.orc$''', + '''\.pb$''', + '''\.proto\.bin$''', + '''\.onnx$''', + '''\.safetensors$''', + '''\.bin$''', + '''\.wasm$''', + '''\.so$''', + '''\.dylib$''', + '''\.dll$''', + '''\.a$''', + '''\.o$''', + '''\.zip$''', + '''\.tar$''', + '''\.tar\.gz$''', + '''\.tgz$''', + '''\.7z$''', + '''\.jpg$''', + '''\.jpeg$''', + '''\.png$''', + '''\.gif$''', + '''\.webp$''', + '''\.pdf$''', + '''\.mp4$''', + '''\.mov$''', +] + +[[allowlists]] +description = "Test files and fixture directories" +paths = [ + '''\.test\.ts$''', + '''\.test\.tsx$''', + '''\.test\.js$''', + '''\.test\.jsx$''', + '''\.test\.py$''', + '''_test\.go$''', + '''_test\.rs$''', + '''(^|/)test/''', + '''(^|/)tests/''', + '''(^|/)testdata/''', + '''(^|/)__fixtures__/''', + '''(^|/)__snapshots__/''', + '''(^|/)spec/''', + '''(^|/)specs/''', + '''(^|/)e2e/''', +] diff --git a/packages/scanners/package.json b/packages/scanners/package.json index 2fe647eb..eec5b587 100644 --- a/packages/scanners/package.json +++ b/packages/scanners/package.json @@ -29,7 +29,8 @@ "dist/**/*.js.map", "!dist/**/*.test.js.map", "dist/**/*.d.ts.map", - "!dist/**/*.test.d.ts.map" + "!dist/**/*.test.d.ts.map", + "config/*.toml" ], "scripts": { "build": "tsc -b", diff --git a/packages/scanners/src/catalog.test.ts b/packages/scanners/src/catalog.test.ts index 82b59f5b..e0dd586d 100644 --- a/packages/scanners/src/catalog.test.ts +++ b/packages/scanners/src/catalog.test.ts @@ -16,7 +16,6 @@ 
test("P1_SPECS contains the Priority-1 scanners in stable order", () => { "betterleaks", "osv-scanner", "bandit", - "detect-secrets", "biome", "pip-audit", "npm-audit", @@ -41,8 +40,10 @@ test("P2_SPECS contains the Priority-2 scanners in stable order", () => { ]); }); -test("ALL_SPECS has 20 entries (constraint-10 met)", () => { - assert.equal(ALL_SPECS.length, 20); +test("ALL_SPECS has 19 entries", () => { + // Was 20 in the original constraint-10 inventory; detect-secrets was + // dropped in favour of betterleaks (near-superset coverage; ADR 0017). + assert.equal(ALL_SPECS.length, 19); }); test("ty is flagged beta and clamav is optIn", () => { @@ -93,11 +94,10 @@ test("every P2 spec is marked priority 2", () => { test("filterSpecsByLanguages keeps polyglot scanners and language-matching ones", () => { const pythonOnly = filterSpecsByLanguages(P1_SPECS, ["python"]); const ids = pythonOnly.map((s) => s.id).sort(); - // semgrep/betterleaks/osv-scanner/detect-secrets/grype polyglot; bandit/pip-audit/ruff/vulture match python. + // semgrep/betterleaks/osv-scanner/grype polyglot; bandit/pip-audit/ruff/vulture match python.
assert.deepEqual(ids, [ "bandit", "betterleaks", - "detect-secrets", "grype", "osv-scanner", "pip-audit", @@ -110,28 +110,20 @@ test("filterSpecsByLanguages keeps polyglot scanners and language-matching ones" test("filterSpecsByLanguages returns only polyglot scanners for empty input", () => { const empty = filterSpecsByLanguages(P1_SPECS, []); const ids = empty.map((s) => s.id).sort(); - assert.deepEqual(ids, ["betterleaks", "detect-secrets", "grype", "osv-scanner", "semgrep"]); + assert.deepEqual(ids, ["betterleaks", "grype", "osv-scanner", "semgrep"]); }); test("filterSpecsByLanguages includes biome + npm-audit for TypeScript projects", () => { const ts = filterSpecsByLanguages(P1_SPECS, ["typescript"]); const ids = ts.map((s) => s.id).sort(); - assert.deepEqual(ids, [ - "betterleaks", - "biome", - "detect-secrets", - "grype", - "npm-audit", - "osv-scanner", - "semgrep", - ]); + assert.deepEqual(ids, ["betterleaks", "biome", "grype", "npm-audit", "osv-scanner", "semgrep"]); }); test("filterSpecsByProfile: empty profile yields polyglot P1 scanners", () => { const ids = filterSpecsByProfile(ALL_SPECS, {}) .map((s) => s.id) .sort(); - assert.deepEqual(ids, ["betterleaks", "detect-secrets", "grype", "osv-scanner", "semgrep"]); + assert.deepEqual(ids, ["betterleaks", "grype", "osv-scanner", "semgrep"]); }); test("filterSpecsByProfile: Python + Terraform project enables python + IaC scanners", () => { @@ -146,7 +138,6 @@ test("filterSpecsByProfile: Python + Terraform project enables python + IaC scan "bandit", "betterleaks", "checkov", - "detect-secrets", "grype", "osv-scanner", "pip-audit", @@ -171,7 +162,6 @@ test("filterSpecsByProfile: Docker-only project enables hadolint + trivy + check assert.deepEqual(ids, [ "betterleaks", "checkov", - "detect-secrets", "grype", "hadolint", "osv-scanner", diff --git a/packages/scanners/src/catalog.ts b/packages/scanners/src/catalog.ts index 4ef9e72a..a71605c5 100644 --- a/packages/scanners/src/catalog.ts +++ 
b/packages/scanners/src/catalog.ts @@ -61,26 +61,6 @@ export const BANDIT_SPEC: ScannerSpec = { license: "Apache-2.0", }; -// detect-secrets — Yelp's polyglot secret scanner. The 20th scanner per -// ROADMAP constraint 10. v1.5.0 shipped 2024-05-06; master is still -// active but no new tag in ~24 months — stale-since flag captured here -// rather than in a dedicated field. Unique value over betterleaks comes -// from KeywordDetector (`admin_password = "hunter2"`) and -// BasicAuthDetector (`https://user:pass@host`) — classes of secrets a -// regex-shape scanner structurally cannot see. -export const DETECT_SECRETS_SPEC: ScannerSpec = { - id: "detect-secrets", - name: "detect-secrets", - languages: "all", - iacTypes: [], - sarifNative: false, - installCmd: "pipx install detect-secrets==1.5.0", - version: "1.5.0", - offlineCapable: true, - priority: 1, - license: "Apache-2.0", -}; - export const BIOME_SPEC: ScannerSpec = { id: "biome", name: "Biome", @@ -307,7 +287,6 @@ export const P1_SPECS: readonly ScannerSpec[] = [ BETTERLEAKS_SPEC, OSV_SCANNER_SPEC, BANDIT_SPEC, - DETECT_SECRETS_SPEC, BIOME_SPEC, PIP_AUDIT_SPEC, NPM_AUDIT_SPEC, diff --git a/packages/scanners/src/converters/detect-secrets-to-sarif.test.ts b/packages/scanners/src/converters/detect-secrets-to-sarif.test.ts deleted file mode 100644 index 129ae873..00000000 --- a/packages/scanners/src/converters/detect-secrets-to-sarif.test.ts +++ /dev/null @@ -1,222 +0,0 @@ -/** - * detect-secrets JSON → SARIF v2.1.0 converter tests. - * - * Every generated SARIF log is validated against `SarifLogSchema` from - * @opencodehub/sarif so schema drift is caught at the conversion boundary. 
- */ - -import assert from "node:assert/strict"; -import { test } from "node:test"; -import { SarifLogSchema } from "@opencodehub/sarif"; -import { detectSecretsJsonToSarif } from "./detect-secrets-to-sarif.js"; - -function assertValidSarif(log: unknown): void { - const result = SarifLogSchema.safeParse(log); - assert.ok(result.success, `expected valid SARIF: ${result.success ? "" : result.error.message}`); -} - -test("detectSecretsJsonToSarif emits one result per finding across files", () => { - const json = { - version: "1.5.0", - plugins_used: [], - filters_used: [], - results: { - "src/config.ts": [ - { - type: "AWS Access Key", - filename: "src/config.ts", - hashed_secret: "abc123", - is_verified: false, - line_number: 10, - }, - { - type: "Secret_Keyword", - filename: "src/config.ts", - hashed_secret: "def456", - is_verified: false, - line_number: 11, - }, - ], - "src/db.ts": [ - { - type: "Basic Auth Credentials", - filename: "src/db.ts", - hashed_secret: "ghi789", - is_verified: true, - line_number: 4, - }, - ], - }, - generated_at: "2026-05-09T19:00:00Z", - }; - const log = detectSecretsJsonToSarif(json); - assertValidSarif(log); - assert.equal(log.runs.length, 1); - assert.equal(log.runs[0]?.tool.driver.name, "detect-secrets"); - assert.equal(log.runs[0]?.tool.driver.version, "1.5.0"); - const results = log.runs[0]?.results ?? 
[]; - assert.equal(results.length, 3); - assert.equal(results[0]?.ruleId, "AWSKeyDetector"); - assert.equal(results[1]?.ruleId, "KeywordDetector"); - assert.equal(results[2]?.ruleId, "BasicAuthDetector"); -}); - -test("detectSecretsJsonToSarif marks verified findings as error", () => { - const json = { - results: { - "x.ts": [ - { - type: "AWS Access Key", - filename: "x.ts", - hashed_secret: "h1", - is_verified: true, - line_number: 1, - }, - { - type: "AWS Access Key", - filename: "x.ts", - hashed_secret: "h2", - is_verified: false, - line_number: 2, - }, - ], - }, - }; - const log = detectSecretsJsonToSarif(json); - assertValidSarif(log); - const results = log.runs[0]?.results ?? []; - assert.equal(results[0]?.level, "error"); - assert.equal(results[1]?.level, "warning"); - const props0 = (results[0]?.properties as { opencodehub?: Record } | undefined) - ?.opencodehub; - assert.equal(props0?.["is_verified"], true); -}); - -test("detectSecretsJsonToSarif stamps hashed_secret on partialFingerprints (not as crypto fingerprint)", () => { - const json = { - results: { - "x.ts": [ - { - type: "AWS Access Key", - filename: "x.ts", - hashed_secret: "deadbeef", - line_number: 1, - }, - ], - }, - }; - const log = detectSecretsJsonToSarif(json); - const r = log.runs[0]?.results?.[0]; - // SARIF §3.27.18: partialFingerprints are plugin-defined identifiers, - // NOT a security claim. The slot is named `detect_secrets_sha1` to - // make the (non-cryptographic) algorithm explicit. 
- assert.equal(r?.partialFingerprints?.["detect_secrets_sha1"], "deadbeef"); -}); - -test("detectSecretsJsonToSarif uses 1-indexed startLine matching SARIF", () => { - const json = { - results: { - "x.ts": [{ type: "AWS Access Key", filename: "x.ts", hashed_secret: "h", line_number: 42 }], - }, - }; - const log = detectSecretsJsonToSarif(json); - const region = log.runs[0]?.results?.[0]?.locations?.[0]?.physicalLocation?.region; - assert.equal(region?.startLine, 42); -}); - -test("detectSecretsJsonToSarif passes overlapping findings through", () => { - // Two detectors fire on the same line — both must pass through and let - // OCH's downstream SARIF dedupe handle merging. - const json = { - results: { - "secret.py": [ - { - type: "AWS Access Key", - filename: "secret.py", - hashed_secret: "h-aws", - line_number: 7, - }, - { - type: "Secret_Keyword", - filename: "secret.py", - hashed_secret: "h-keyword", - line_number: 7, - }, - ], - }, - }; - const log = detectSecretsJsonToSarif(json); - const results = log.runs[0]?.results ?? 
[]; - assert.equal(results.length, 2); - assert.equal(results[0]?.ruleId, "AWSKeyDetector"); - assert.equal(results[1]?.ruleId, "KeywordDetector"); - assert.equal( - results[0]?.locations?.[0]?.physicalLocation?.region?.startLine, - results[1]?.locations?.[0]?.physicalLocation?.region?.startLine, - ); -}); - -test("detectSecretsJsonToSarif slugs unknown detector types instead of dropping", () => { - const json = { - results: { - "x.ts": [ - { - type: "Future Detector v2", - filename: "x.ts", - hashed_secret: "h", - line_number: 1, - }, - ], - }, - }; - const log = detectSecretsJsonToSarif(json); - const r = log.runs[0]?.results?.[0]; - assert.equal(r?.ruleId, "Future-Detector-v2"); -}); - -test("detectSecretsJsonToSarif emits empty (but valid) SARIF for garbage input", () => { - assertValidSarif(detectSecretsJsonToSarif({})); - assertValidSarif(detectSecretsJsonToSarif(null)); - assertValidSarif(detectSecretsJsonToSarif({ results: "not an object" })); - assertValidSarif(detectSecretsJsonToSarif({ results: [] })); - assert.equal(detectSecretsJsonToSarif({}).runs[0]?.results?.length, 0); - assert.equal( - detectSecretsJsonToSarif(null).runs[0]?.tool.driver.name, - "detect-secrets", - "tool.driver.name must be preserved on empty SARIF", - ); -}); - -test("detectSecretsJsonToSarif skips findings without a type", () => { - const json = { - results: { - "x.ts": [ - { type: "AWS Access Key", filename: "x.ts", hashed_secret: "ok", line_number: 1 }, - { filename: "x.ts", hashed_secret: "drop", line_number: 2 }, // no type - { type: "", filename: "x.ts", hashed_secret: "drop", line_number: 3 }, // empty type - ], - }, - }; - const log = detectSecretsJsonToSarif(json); - const results = log.runs[0]?.results ?? 
[]; - assert.equal(results.length, 1); - assert.equal(results[0]?.ruleId, "AWSKeyDetector"); -}); - -test("detectSecretsJsonToSarif tolerates findings without hashed_secret", () => { - const json = { - results: { - "x.ts": [ - { - type: "AWS Access Key", - filename: "x.ts", - line_number: 1, - }, - ], - }, - }; - const log = detectSecretsJsonToSarif(json); - const r = log.runs[0]?.results?.[0]; - assert.equal(r?.ruleId, "AWSKeyDetector"); - assert.equal(r?.partialFingerprints, undefined); -}); diff --git a/packages/scanners/src/converters/detect-secrets-to-sarif.ts b/packages/scanners/src/converters/detect-secrets-to-sarif.ts deleted file mode 100644 index c9208c03..00000000 --- a/packages/scanners/src/converters/detect-secrets-to-sarif.ts +++ /dev/null @@ -1,200 +0,0 @@ -/** - * detect-secrets JSON → SARIF v2.1.0 converter. - * - * detect-secrets does not emit SARIF natively (Yelp/detect-secrets#488 is - * still open as P4/help-wanted). Its `scan` subcommand writes JSON on - * stdout shaped like: - * - * { - * "version": "1.5.0", - * "plugins_used": [...], - * "filters_used": [...], - * "results": { - * "": [ - * { - * "type": "AWS Access Key", - * "filename": "", - * "hashed_secret": "", - * "is_verified": false, - * "line_number": 42 - * } - * ] - * }, - * "generated_at": "..." - * } - * - * We emit one SARIF result per finding: - * - ruleId = type-string slug (e.g. "AWSKeyDetector") - * - level = "warning" (verified=true → "error") - * - message = " detected in " - * - location = artifactLocation { uri: "" }, region.startLine - * - properties.opencodehub.is_verified = boolean - * - partialFingerprints.detect_secrets_sha1 = hashed_secret - * - * We do NOT advertise hashed_secret as a cryptographic fingerprint — - * SHA-1 is not collision-resistant. The - * `partialFingerprints.detect_secrets_sha1` slot is documented as a - * plugin-defined identifier per SARIF §3.27.18, not a security claim. 
- * - * Overlapping findings (KeywordDetector + AWSKeyDetector on the same - * line) are NOT deduplicated here — both pass through and rely on - * OCH's downstream SARIF dedupe at merge time. - * - * The output is validated against `SarifLogSchema` from @opencodehub/sarif - * before being returned, so malformed emissions never leak downstream. - */ - -import type { SarifLog, SarifResult, SarifRun } from "@opencodehub/sarif"; -import { SarifLogSchema } from "@opencodehub/sarif"; -import { DETECT_SECRETS_SPEC } from "../catalog.js"; - -/** - * Stable detect-secrets `type` → SARIF ruleId map. Each detector class - * is referenced by the spaced human-readable name detect-secrets emits in - * its JSON output. Source: `detect-secrets --list-all-plugins` (v1.5.0). - * - * Unknown types fall back to a slug derived from the type string, so - * future detector additions in detect-secrets do not break the converter - * — they just emit a generic ruleId until this table is updated. - */ -const TYPE_TO_RULE_ID: Readonly> = { - "Artifactory Credentials": "ArtifactoryDetector", - "AWS Access Key": "AWSKeyDetector", - "Azure Storage Account access key": "AzureStorageKeyDetector", - "Basic Auth Credentials": "BasicAuthDetector", - "Cloudant Credentials": "CloudantDetector", - "Discord Bot Token": "DiscordBotTokenDetector", - "GitHub Token": "GitHubTokenDetector", - "GitLab Token": "GitLabTokenDetector", - "Base64 High Entropy String": "Base64HighEntropyString", - "Hex High Entropy String": "HexHighEntropyString", - "IBM Cloud IAM Key": "IbmCloudIamDetector", - "IBM COS HMAC Credentials": "IbmCosHmacDetector", - Secret_Keyword: "KeywordDetector", - "Mailchimp Access Key": "MailchimpDetector", - "NPM tokens": "NpmDetector", - "OpenAI Token": "OpenAIDetector", - "Private Key": "PrivateKeyDetector", - "PyPI upload token": "PypiTokenDetector", - "SendGrid API Key": "SendGridDetector", - "Slack Token": "SlackDetector", - "SoftLayer Credentials": "SoftlayerDetector", - "Square OAuth 
Secret": "SquareOAuthDetector", - "Stripe Access Key": "StripeDetector", - "Telegram Bot Token": "TelegramBotTokenDetector", - "Twilio API Key": "TwilioKeyDetector", -}; - -interface DetectSecretsFinding { - readonly type?: string; - readonly filename?: string; - readonly hashed_secret?: string; - readonly is_verified?: boolean; - readonly line_number?: number; -} - -interface DetectSecretsReport { - readonly results?: Readonly>; -} - -/** - * Convert a detect-secrets JSON object (already parsed) to a SARIF - * v2.1.0 log. Unknown / malformed input → an empty (but schema-valid) - * SARIF log attributed to detect-secrets. - */ -export function detectSecretsJsonToSarif(json: unknown): SarifLog { - const results: SarifResult[] = []; - const report = asReport(json); - - for (const [filename, findings] of Object.entries(report.results ?? {})) { - for (const finding of findings) { - const result = findingToResult(filename, finding); - if (result !== undefined) results.push(result); - } - } - - const run: SarifRun = { - tool: { driver: { name: DETECT_SECRETS_SPEC.id, version: DETECT_SECRETS_SPEC.version } }, - results, - }; - const log: SarifLog = { version: "2.1.0", runs: [run] }; - - // Defensive — the shape above is pure and should always validate. - // Returning the unvalidated log is safer than throwing. - const parsed = SarifLogSchema.safeParse(log); - if (!parsed.success) return { version: "2.1.0", runs: [run] }; - return parsed.data; -} - -function findingToResult(filename: string, finding: DetectSecretsFinding): SarifResult | undefined { - if (typeof finding.type !== "string" || finding.type.length === 0) return undefined; - const ruleId = TYPE_TO_RULE_ID[finding.type] ?? slugForUnknownType(finding.type); - // detect-secrets uses 1-indexed line numbers, which matches SARIF. - const startLine = - typeof finding.line_number === "number" && finding.line_number >= 1 ? 
finding.line_number : 1; - const isVerified = finding.is_verified === true; - const result: SarifResult = { - ruleId, - level: isVerified ? "error" : "warning", - message: { text: `${finding.type} detected in ${filename}` }, - locations: [ - { - physicalLocation: { - artifactLocation: { uri: filename }, - region: { startLine }, - }, - }, - ], - properties: { - opencodehub: { - is_verified: isVerified, - }, - }, - }; - if (typeof finding.hashed_secret === "string" && finding.hashed_secret.length > 0) { - return { - ...result, - partialFingerprints: { detect_secrets_sha1: finding.hashed_secret }, - }; - } - return result; -} - -function slugForUnknownType(type: string): string { - // Drop non-alphanumerics, preserve word boundaries. - return type.replace(/[^A-Za-z0-9]+/g, "-").replace(/^-+|-+$/g, ""); -} - -function asReport(json: unknown): DetectSecretsReport { - if (typeof json !== "object" || json === null) return {}; - const obj = json as Record; - const rawResults = obj["results"]; - if (typeof rawResults !== "object" || rawResults === null || Array.isArray(rawResults)) { - return {}; - } - const out: Record = {}; - for (const [filename, findings] of Object.entries(rawResults as Record)) { - if (!Array.isArray(findings)) continue; - const list: DetectSecretsFinding[] = []; - for (const f of findings) { - if (typeof f !== "object" || f === null) continue; - const row = f as Record; - const finding: DetectSecretsFinding = { - ...(typeof row["type"] === "string" ? { type: row["type"] as string } : {}), - ...(typeof row["filename"] === "string" ? { filename: row["filename"] as string } : {}), - ...(typeof row["hashed_secret"] === "string" - ? { hashed_secret: row["hashed_secret"] as string } - : {}), - ...(typeof row["is_verified"] === "boolean" - ? { is_verified: row["is_verified"] as boolean } - : {}), - ...(typeof row["line_number"] === "number" - ? 
{ line_number: row["line_number"] as number } - : {}), - }; - list.push(finding); - } - out[filename] = list; - } - return { results: out }; -} diff --git a/packages/scanners/src/index.ts b/packages/scanners/src/index.ts index d0c74b8b..1f740cd2 100644 --- a/packages/scanners/src/index.ts +++ b/packages/scanners/src/index.ts @@ -11,7 +11,7 @@ * `filterSpecsByProfile`, `findSpec`. * - Runner: `runScanners(path, wrappers, opts)` — concurrent runner. * - P1 wrappers: createSemgrepWrapper / createBetterleaksWrapper / - * createOsvScannerWrapper / createBanditWrapper / createDetectSecretsWrapper / + * createOsvScannerWrapper / createBanditWrapper / * createBiomeWrapper / createPipAuditWrapper / createNpmAuditWrapper. * - P2 wrappers: createTrivyWrapper / createCheckovWrapper / * createHadolintWrapper / createTflintWrapper / createSpectralWrapper. @@ -30,7 +30,6 @@ export { CHECKOV_DOCKER_COMPOSE_SPEC, CHECKOV_SPEC, CLAMAV_SPEC, - DETECT_SECRETS_SPEC, filterSpecsByLanguages, filterSpecsByProfile, findSpec, @@ -50,7 +49,6 @@ export { TY_SPEC, VULTURE_SPEC, } from "./catalog.js"; -export { detectSecretsJsonToSarif } from "./converters/detect-secrets-to-sarif.js"; export type { NpmAuditConvertOptions } from "./converters/npm-audit-to-sarif.js"; export { npmAuditJsonToSarif } from "./converters/npm-audit-to-sarif.js"; export type { PipAuditConvertOptions } from "./converters/pip-audit-to-sarif.js"; @@ -75,7 +73,6 @@ export { createBiomeWrapper } from "./wrappers/biome.js"; export type { CheckovWrapperOptions } from "./wrappers/checkov.js"; export { createCheckovWrapper } from "./wrappers/checkov.js"; export { createClamAvWrapper } from "./wrappers/clamav.js"; -export { createDetectSecretsWrapper } from "./wrappers/detect-secrets.js"; export type { CheckovDockerComposeWrapperOptions } from "./wrappers/docker-compose.js"; export { createCheckovDockerComposeWrapper } from "./wrappers/docker-compose.js"; export { createGrypeWrapper } from "./wrappers/grype.js"; @@ -103,7 +100,6 
@@ import { CHECKOV_DOCKER_COMPOSE_SPEC, CHECKOV_SPEC, CLAMAV_SPEC, - DETECT_SECRETS_SPEC, GRYPE_SPEC, HADOLINT_SPEC, NPM_AUDIT_SPEC, @@ -124,7 +120,6 @@ import { createBetterleaksWrapper } from "./wrappers/betterleaks.js"; import { createBiomeWrapper } from "./wrappers/biome.js"; import { type CheckovWrapperOptions, createCheckovWrapper } from "./wrappers/checkov.js"; import { createClamAvWrapper } from "./wrappers/clamav.js"; -import { createDetectSecretsWrapper } from "./wrappers/detect-secrets.js"; import { type CheckovDockerComposeWrapperOptions, createCheckovDockerComposeWrapper, @@ -200,8 +195,6 @@ function createWrapperFor( return deps ? createOsvScannerWrapper(deps) : createOsvScannerWrapper(); case BANDIT_SPEC.id: return deps ? createBanditWrapper(deps) : createBanditWrapper(); - case DETECT_SECRETS_SPEC.id: - return deps ? createDetectSecretsWrapper(deps) : createDetectSecretsWrapper(); case BIOME_SPEC.id: return deps ? createBiomeWrapper(deps) : createBiomeWrapper(); case PIP_AUDIT_SPEC.id: diff --git a/packages/scanners/src/wrappers/betterleaks.ts b/packages/scanners/src/wrappers/betterleaks.ts index fae69fc4..3cc8a230 100644 --- a/packages/scanners/src/wrappers/betterleaks.ts +++ b/packages/scanners/src/wrappers/betterleaks.ts @@ -1,31 +1,120 @@ /** * Betterleaks wrapper — secret detection (gitleaks fork by the original - * gitleaks author). + * gitleaks author, maintained by Aikido Security). * - * Invocation: `betterleaks dir --report-format=sarif --report-path=/dev/stdout .` + * Invocation: * - * Betterleaks emits SARIF natively. We route the report to stdout - * (`/dev/stdout` on POSIX; on Windows we skip the flag and let - * betterleaks write to the default `gitleaks.report.json` file — but - * v1.0 targets POSIX CI runners). + * betterleaks dir --report-format=sarif --report-path=- --no-banner \ + * [--config ] . + * + * We use `dir` mode (not `git`) so the scan reflects the current working + * tree, not historical commits. 
`betterleaks git --pre-commit=false` + * walks the entire git log and re-flags every secret that ever existed + * in any commit — useful for a one-shot history audit, wrong for the + * "what's in this checkout right now" signal `codehub analyze` wants. + * + * `dir` mode does NOT honor `.gitignore`. The shipped default config + * (`config/betterleaks.default.toml`) compensates by allowlisting common + * vendored / generated / lockfile paths via RE2 regexes. Users can drop + * their own `betterleaks.toml` at the project root to customize. + * + * Output: + * - `--report-path=-` writes SARIF to stdout. The earlier `=/dev/stdout` + * value broke under Node's `execFile` because the child's fd 1 is a + * pipe, not a char device — `open("/dev/stdout")` returned ENXIO and + * betterleaks printed nothing. + * + * Config resolution: + * - If the project root has its own `betterleaks.toml`, `.betterleaks.toml`, + * `gitleaks.toml`, or `.gitleaks.toml`, betterleaks picks it up by + * default (per its config precedence) and we DO NOT pass `--config`. + * - Otherwise we inject `--config /config/betterleaks.default.toml` + * so every consumer gets a sensible vendored-deps / lockfile / build-output + * allowlist out of the box. Users override by dropping a config at the + * project root. */ +import { existsSync } from "node:fs"; +import { dirname, join, resolve } from "node:path"; +import { fileURLToPath } from "node:url"; import { BETTERLEAKS_SPEC } from "../catalog.js"; -import type { ScannerRunContext, ScannerRunResult, ScannerWrapper } from "../spec.js"; -import { DEFAULT_DEPS, invokeScanner, type WrapperDeps } from "./shared.js"; +import { + emptySarifFor, + type ScannerRunContext, + type ScannerRunResult, + type ScannerWrapper, +} from "../spec.js"; +import { DEFAULT_DEPS, parseSarifOrEmpty, type WrapperDeps } from "./shared.js"; + +/** Filenames betterleaks itself recognises at the project root. 
*/ +const USER_CONFIG_NAMES = [ + "betterleaks.toml", + ".betterleaks.toml", + "gitleaks.toml", + ".gitleaks.toml", +] as const; + +/** + * Resolve the path to the vendored default config inside this package. + * `import.meta.url` points at the compiled `dist/wrappers/betterleaks.js`, + * so `../../config/betterleaks.default.toml` lands at the package root. + */ +function defaultConfigPath(): string { + const here = dirname(fileURLToPath(import.meta.url)); + return resolve(here, "..", "..", "config", "betterleaks.default.toml"); +} -const BETTERLEAKS_ARGS: readonly string[] = [ - "dir", - "--report-format=sarif", - "--report-path=/dev/stdout", - "--no-banner", - ".", -]; +function userConfigInProject(projectPath: string): string | undefined { + for (const name of USER_CONFIG_NAMES) { + const p = join(projectPath, name); + if (existsSync(p)) return p; + } + return undefined; +} + +function buildArgs(projectPath: string): readonly string[] { + const args: string[] = ["dir", "--report-format=sarif", "--report-path=-", "--no-banner"]; + if (userConfigInProject(projectPath) === undefined) { + args.push(`--config=${defaultConfigPath()}`); + } + args.push("."); + return args; +} export function createBetterleaksWrapper(deps: WrapperDeps = DEFAULT_DEPS): ScannerWrapper { return { spec: BETTERLEAKS_SPEC, - run: (ctx: ScannerRunContext): Promise => - invokeScanner(BETTERLEAKS_SPEC, ctx, "betterleaks", BETTERLEAKS_ARGS, deps), + run: async (ctx: ScannerRunContext): Promise => { + const started = performance.now(); + const probe = await deps.which("betterleaks"); + if (!probe.found) { + const msg = `${BETTERLEAKS_SPEC.id}: binary 'betterleaks' not found on PATH (install: ${BETTERLEAKS_SPEC.installCmd}).`; + ctx.onWarn?.(msg); + return { + spec: BETTERLEAKS_SPEC, + sarif: emptySarifFor(BETTERLEAKS_SPEC), + skipped: msg, + durationMs: performance.now() - started, + }; + } + const args = buildArgs(ctx.projectPath); + const result = await deps.runBinary("betterleaks", args, { + 
timeoutMs: ctx.timeoutMs, + cwd: ctx.projectPath, + }); + const sarif = parseSarifOrEmpty(result.stdout, BETTERLEAKS_SPEC, ctx.onWarn); + // Betterleaks exits 1 when it finds leaks, 0 when clean. Anything + // else is a genuine failure. + if (result.exitCode !== 0 && result.exitCode !== 1) { + ctx.onWarn?.( + `${BETTERLEAKS_SPEC.id}: exit code ${result.exitCode}; stderr: ${result.stderr.slice(0, 200).trim()}`, + ); + } + return { + spec: BETTERLEAKS_SPEC, + sarif, + durationMs: performance.now() - started, + }; + }, }; } diff --git a/packages/scanners/src/wrappers/detect-secrets.ts b/packages/scanners/src/wrappers/detect-secrets.ts deleted file mode 100644 index caf94b19..00000000 --- a/packages/scanners/src/wrappers/detect-secrets.ts +++ /dev/null @@ -1,76 +0,0 @@ -/** - * detect-secrets wrapper — Yelp's polyglot secret scanner. The 20th - * scanner per ROADMAP constraint 10. - * - * Invocation: - * - * detect-secrets scan . --all-files - * - * `--all-files` matches betterleaks's posture (scan non-git-tracked - * files too) and is the ergonomic default for monorepo scans. The - * `scan` subcommand always emits JSON on stdout — there is no `--json` - * flag at this entry point. (The `--json` flag exists only on the - * separate `detect-secrets-hook` pre-commit entry point.) - * - * Output is JSON, NOT SARIF — we post-process stdout through - * `detectSecretsJsonToSarif` before returning. detect-secrets exits 0 - * on findings, so `invokeScanner`'s default exit-code tolerance is fine. 
- */ - -import { DETECT_SECRETS_SPEC } from "../catalog.js"; -import { detectSecretsJsonToSarif } from "../converters/detect-secrets-to-sarif.js"; -import { tryParseJson } from "../exec.js"; -import type { ScannerRunContext, ScannerRunResult, ScannerWrapper } from "../spec.js"; -import { emptySarifFor } from "../spec.js"; -import { DEFAULT_DEPS, type WrapperDeps } from "./shared.js"; - -const DETECT_SECRETS_ARGS: readonly string[] = ["scan", ".", "--all-files"]; - -export function createDetectSecretsWrapper(deps: WrapperDeps = DEFAULT_DEPS): ScannerWrapper { - return { - spec: DETECT_SECRETS_SPEC, - run: async (ctx: ScannerRunContext): Promise => { - const started = performance.now(); - const probe = await deps.which("detect-secrets"); - if (!probe.found) { - const msg = `${DETECT_SECRETS_SPEC.id}: binary 'detect-secrets' not found on PATH (install: ${DETECT_SECRETS_SPEC.installCmd}).`; - ctx.onWarn?.(msg); - return { - spec: DETECT_SECRETS_SPEC, - sarif: emptySarifFor(DETECT_SECRETS_SPEC), - skipped: msg, - durationMs: performance.now() - started, - }; - } - const result = await deps.runBinary("detect-secrets", DETECT_SECRETS_ARGS, { - timeoutMs: ctx.timeoutMs, - cwd: ctx.projectPath, - }); - const json = tryParseJson(result.stdout); - if (json === undefined) { - ctx.onWarn?.( - `${DETECT_SECRETS_SPEC.id}: stdout was not valid JSON (stderr: ${truncate( - result.stderr, - 200, - )}); emitting empty SARIF.`, - ); - return { - spec: DETECT_SECRETS_SPEC, - sarif: emptySarifFor(DETECT_SECRETS_SPEC), - durationMs: performance.now() - started, - }; - } - const sarif = detectSecretsJsonToSarif(json); - return { - spec: DETECT_SECRETS_SPEC, - sarif, - durationMs: performance.now() - started, - }; - }, - }; -} - -function truncate(s: string, max: number): string { - if (s.length <= max) return s.trim(); - return `${s.slice(0, max).trim()}…`; -} diff --git a/packages/scanners/src/wrappers/wrappers.test.ts b/packages/scanners/src/wrappers/wrappers.test.ts index 
1335311f..784c0ea1 100644 --- a/packages/scanners/src/wrappers/wrappers.test.ts +++ b/packages/scanners/src/wrappers/wrappers.test.ts @@ -14,7 +14,6 @@ import type { ScannerRunContext } from "../spec.js"; import { createBanditWrapper } from "./bandit.js"; import { createBetterleaksWrapper } from "./betterleaks.js"; import { createBiomeWrapper } from "./biome.js"; -import { createDetectSecretsWrapper } from "./detect-secrets.js"; import { createOsvScannerWrapper } from "./osv-scanner.js"; import { createSemgrepWrapper } from "./semgrep.js"; import type { WrapperDeps } from "./shared.js"; @@ -100,13 +99,34 @@ test("semgrep wrapper returns empty SARIF + skipped when binary missing", async assert.ok(out.skipped?.includes("not found on PATH")); }); -test("betterleaks wrapper passes --report-format=sarif", async () => { +test("betterleaks wrapper passes --report-path=- (avoids /dev/stdout ENXIO bug)", async () => { const sarif = fakeSarif("betterleaks", "aws-access-token"); const { deps, calls } = makeFakeDeps(() => ({ stdout: JSON.stringify(sarif) })); const wrapper = createBetterleaksWrapper(deps); await wrapper.run(ctx); assert.equal(calls[0]?.cmd, "betterleaks"); assert.ok(calls[0]?.args.includes("--report-format=sarif")); + assert.ok( + calls[0]?.args.includes("--report-path=-"), + "must use '-' for stdout — '/dev/stdout' fails ENXIO inside execFile", + ); + assert.ok(calls[0]?.args.includes("--no-banner")); +}); + +test("betterleaks wrapper uses `dir` mode and injects vendored config", async () => { + const sarif = fakeSarif("betterleaks", "aws-access-token"); + const { deps, calls } = makeFakeDeps(() => ({ stdout: JSON.stringify(sarif) })); + const wrapper = createBetterleaksWrapper(deps); + // ctx.projectPath = "/tmp/fake-repo" (no user betterleaks.toml). Wrapper + // uses `dir` mode so the scan reflects working-tree state, not git + // history. + await wrapper.run(ctx); + const args = calls[0]?.args ?? 
[]; + assert.equal(args[0], "dir", "wrapper always uses `dir` subcommand"); + // Vendored config should be injected when no user config is present. + const cfgArg = [...args].find((a) => a.startsWith("--config=")); + assert.ok(cfgArg, "vendored default config must be injected when project has none"); + assert.match(cfgArg ?? "", /betterleaks\.default\.toml$/); }); test("osv-scanner wrapper sends --offline-vulnerabilities", async () => { @@ -167,80 +187,3 @@ test("wrappers emit empty SARIF when stdout is malformed", async () => { const out = await wrapper.run(ctx); assert.equal(out.sarif.runs[0]?.results?.length, 0); }); - -// ---------- detect-secrets ------------------------------------------------ - -test("detect-secrets wrapper invokes `scan . --all-files`", async () => { - const json = { - version: "1.5.0", - results: { - "src/x.ts": [ - { - type: "AWS Access Key", - filename: "src/x.ts", - hashed_secret: "h", - is_verified: false, - line_number: 5, - }, - ], - }, - }; - const { deps, calls } = makeFakeDeps(() => ({ stdout: JSON.stringify(json), exitCode: 0 })); - const wrapper = createDetectSecretsWrapper(deps); - const out = await wrapper.run(ctx); - assert.equal(calls.length, 1); - assert.equal(calls[0]?.cmd, "detect-secrets"); - assert.deepEqual([...(calls[0]?.args ?? [])], ["scan", ".", "--all-files"]); - assert.equal(out.sarif.runs[0]?.tool.driver.name, "detect-secrets"); - assert.equal(out.sarif.runs[0]?.results?.[0]?.ruleId, "AWSKeyDetector"); -}); - -test("detect-secrets wrapper returns empty SARIF + skipped when binary missing", async () => { - const { deps } = makeFakeDeps(() => ({ stdout: "" }), { missing: ["detect-secrets"] }); - const wrapper = createDetectSecretsWrapper(deps); - const out = await wrapper.run(ctx); - // tool.driver.name must be preserved even when skipped. 
- assert.equal(out.sarif.runs[0]?.tool.driver.name, "detect-secrets"); - assert.equal(out.sarif.runs[0]?.results?.length, 0); - assert.ok(out.skipped?.includes("not found on PATH")); -}); - -test("detect-secrets wrapper emits empty SARIF when stdout is malformed", async () => { - const { deps } = makeFakeDeps(() => ({ stdout: "this is not json", exitCode: 0 })); - const wrapper = createDetectSecretsWrapper(deps); - const out = await wrapper.run(ctx); - assert.equal(out.sarif.runs[0]?.tool.driver.name, "detect-secrets"); - assert.equal(out.sarif.runs[0]?.results?.length, 0); -}); - -test("detect-secrets wrapper passes overlapping findings through", async () => { - // KeywordDetector + AWSKeyDetector firing on the same line: both must - // appear in the SARIF output; OCH's downstream merge handles dedupe. - const json = { - results: { - "src/secret.py": [ - { - type: "AWS Access Key", - filename: "src/secret.py", - hashed_secret: "h1", - is_verified: false, - line_number: 7, - }, - { - type: "Secret_Keyword", - filename: "src/secret.py", - hashed_secret: "h2", - is_verified: false, - line_number: 7, - }, - ], - }, - }; - const { deps } = makeFakeDeps(() => ({ stdout: JSON.stringify(json), exitCode: 0 })); - const wrapper = createDetectSecretsWrapper(deps); - const out = await wrapper.run(ctx); - const results = out.sarif.runs[0]?.results ?? []; - assert.equal(results.length, 2); - assert.equal(results[0]?.ruleId, "AWSKeyDetector"); - assert.equal(results[1]?.ruleId, "KeywordDetector"); -});