From e53f3e4901a7d69765943fe4242cbf97986a813a Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Fri, 12 Jun 2026 18:27:03 -0500 Subject: [PATCH 1/6] fix(node): close under-withholding via full ref scope and reachable-only pin set blob_paths walked only refs/heads/* and refs/tags/* and skipped silently on a failed git ls-tree, so a blob reachable only through another namespace, or a ref that failed to traverse, could fall out of the withheld set and ship in cleartext. Walk every ref and fail closed on traversal error. The pin enumerators (ipfs_pin, pinata) used git cat-file --batch-all-objects, which includes unreachable/dangling objects that have no path and cannot be classified for withholding. Switch them to git rev-list --objects --all so the pin set matches the reachable graph blob_paths evaluates. --- .../gitlawb-node/src/git/visibility_pack.rs | 76 +++++++++++++++-- crates/gitlawb-node/src/ipfs_pin.rs | 84 ++++++++++++++++--- crates/gitlawb-node/src/pinata.rs | 73 ++++++++++++++-- 3 files changed, 206 insertions(+), 27 deletions(-) diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index 90ca772..a670dfc 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -10,25 +10,32 @@ use anyhow::{Context, Result}; use std::collections::{BTreeSet, HashMap, HashSet}; use std::path::Path; -/// List every (blob_oid, "/repo/relative/path") pair reachable from any branch -/// ref in `repo_path`. Uses `git ls-tree -r` per ref so each path a blob lives -/// at is represented (the same blob content can appear at several paths). Paths -/// are returned with a leading "/" to match the glob form used by visibility -/// rules ("/secret/**"). +/// List every (blob_oid, "/repo/relative/path") pair reachable from any ref in +/// `repo_path`. Walks every ref, not just `refs/heads/*` and `refs/tags/*`, so +/// the withheld set covers the same object graph the pack and pin paths expose; +/// a blob reachable only through another namespace (e.g. `refs/notes/*`) must not +/// escape withholding. Uses `git ls-tree -r` per ref so each path a blob lives +/// at is represented (the same blob content can appear at several paths). This is +/// why it is not `git rev-list --objects`, which reports only one path per object. +/// Paths carry a leading "/" to match the glob form used by visibility rules +/// ("/secret/**"). +/// +/// Fails closed: if a ref cannot be traversed, returns an error so the caller +/// aborts the serve/pin rather than producing a partial (under-withheld) set. fn blob_paths(repo_path: &Path) -> Result> { let refs = store::list_refs(repo_path).context("list_refs failed")?; let mut out = Vec::new(); for (refname, _oid) in refs { - if !refname.starts_with("refs/heads/") && !refname.starts_with("refs/tags/") { - continue; - } let listing = std::process::Command::new("git") .args(["ls-tree", "-r", &refname]) .current_dir(repo_path) .output() .context("git ls-tree -r failed")?; if !listing.status.success() { - continue; + anyhow::bail!( + "git ls-tree -r {refname} failed: {}", + String::from_utf8_lossy(&listing.stderr) + ); } for line in String::from_utf8_lossy(&listing.stdout).lines() { // " blob \t" @@ -295,4 +302,55 @@ mod tests { let env = seal_blob(&bytes, &[reader.verifying_key()]).unwrap(); assert_eq!(open_blob(&env, &reader).unwrap(), bytes); } + + #[test] + fn withholds_blob_reachable_only_via_nonstandard_ref() { + let (_td, bare, secret_oid, _public) = fixture(); + // Move the sole ref out of refs/heads/* into a custom namespace so the + // secret blob is reachable only through a ref the old heads/tags filter + // skipped. It must still be withheld. + let head_ref = { + let out = Command::new("git") + .args(["symbolic-ref", "HEAD"]) + .current_dir(&bare) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let run = |args: &[&str]| { + assert!( + Command::new("git") + .args(args) + .current_dir(&bare) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["update-ref", "refs/custom/snap", "HEAD"]); + run(&["update-ref", "-d", &head_ref]); + + let rules = [rule("/secret/**", &[])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "blob reachable only via refs/custom/* must still be withheld" + ); + } + + #[test] + fn fails_closed_when_a_ref_cannot_be_traversed() { + let (_td, bare, secret, _public) = fixture(); + // Point a ref at a blob (a valid object that is not tree-ish). `ls-tree -r` + // fails on it; that must propagate as Err rather than silently dropping the + // ref and under-withholding. + std::fs::write(bare.join("refs/heads/blobref"), format!("{secret}\n")).unwrap(); + let rules = [rule("/secret/**", &[])]; + let result = withheld_blob_oids(&bare, &rules, true, OWNER, None); + assert!( + result.is_err(), + "a ref that cannot be traversed must fail closed (Err)" + ); + } } diff --git a/crates/gitlawb-node/src/ipfs_pin.rs b/crates/gitlawb-node/src/ipfs_pin.rs index 9bdaade..89b500b 100644 --- a/crates/gitlawb-node/src/ipfs_pin.rs +++ b/crates/gitlawb-node/src/ipfs_pin.rs @@ -151,30 +151,94 @@ pub async fn pin_new_objects( pinned } -/// Run `git cat-file --batch-all-objects --batch-check='%(objectname)'` -/// to get all object SHA-256 hashes in the repository. +/// Names of every object reachable from any ref, via `git rev-list --objects --all`. +/// Reachable-only on purpose (not `cat-file --batch-all-objects`): an unreachable +/// or dangling object has no ref and no path, so visibility rules cannot classify +/// it for withholding, so it must not be pinned in cleartext. This keeps the pin set +/// aligned with what `blob_paths` can evaluate. fn list_all_objects(repo_path: &std::path::Path) -> Result> { let output = std::process::Command::new("git") - .args([ - "cat-file", - "--batch-all-objects", - "--batch-check=%(objectname)", - ]) + .args(["rev-list", "--objects", "--all"]) .current_dir(repo_path) .output() - .map_err(|e| anyhow::anyhow!("failed to run git cat-file: {e}"))?; + .map_err(|e| anyhow::anyhow!("failed to run git rev-list: {e}"))?; if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); - return Err(anyhow::anyhow!("git cat-file failed: {stderr}")); + return Err(anyhow::anyhow!("git rev-list failed: {stderr}")); } + // `rev-list --objects` lines are "" or " "; keep the oid. let stdout = String::from_utf8_lossy(&output.stdout); let hashes = stdout .lines() - .map(|l| l.trim().to_string()) + .filter_map(|l| l.split_whitespace().next().map(str::to_string)) .filter(|l| !l.is_empty()) .collect(); Ok(hashes) } + +#[cfg(test)] +mod tests { + use super::list_all_objects; + use std::process::Command; + use tempfile::TempDir; + + #[test] + fn list_all_objects_excludes_unreachable_blobs() { + let td = TempDir::new().unwrap(); + let work = td.path(); + let run = |args: &[&str]| { + assert!( + Command::new("git") + .args(args) + .current_dir(work) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["init", "-q"]); + run(&["config", "user.email", "t@t"]); + run(&["config", "user.name", "t"]); + std::fs::write(work.join("a.txt"), b"reachable\n").unwrap(); + run(&["add", "."]); + run(&["commit", "-qm", "init"]); + + let reachable = String::from_utf8_lossy( + &Command::new("git") + .args(["rev-parse", "HEAD:a.txt"]) + .current_dir(work) + .output() + .unwrap() + .stdout, + ) + .trim() + .to_string(); + + // Write a loose blob that no ref reaches (dangling). + std::fs::write(work.join("dangling.txt"), b"DANGLING SECRET\n").unwrap(); + let dangling = String::from_utf8_lossy( + &Command::new("git") + .args(["hash-object", "-w", "dangling.txt"]) + .current_dir(work) + .output() + .unwrap() + .stdout, + ) + .trim() + .to_string(); + + let objs = list_all_objects(work).unwrap(); + assert!( + objs.contains(&reachable), + "the committed (reachable) blob must be listed" + ); + assert!( + !objs.contains(&dangling), + "an unreachable/dangling blob must NOT be listed" + ); + } +} diff --git a/crates/gitlawb-node/src/pinata.rs b/crates/gitlawb-node/src/pinata.rs index 90bddad..1021d77 100644 --- a/crates/gitlawb-node/src/pinata.rs +++ b/crates/gitlawb-node/src/pinata.rs @@ -134,25 +134,26 @@ pub async fn pin_new_objects( pinned } +/// Names of every object reachable from any ref, via `git rev-list --objects --all`. +/// Reachable-only on purpose (not `cat-file --batch-all-objects`): an unreachable +/// or dangling object has no path, cannot be classified for withholding, and must +/// not be pinned in cleartext. fn list_all_objects(repo_path: &std::path::Path) -> Result> { let out = std::process::Command::new("git") - .args([ - "cat-file", - "--batch-all-objects", - "--batch-check=%(objectname)", - ]) + .args(["rev-list", "--objects", "--all"]) .current_dir(repo_path) .output() - .map_err(|e| anyhow::anyhow!("failed to run git cat-file: {e}"))?; + .map_err(|e| anyhow::anyhow!("failed to run git rev-list: {e}"))?; if !out.status.success() { let stderr = String::from_utf8_lossy(&out.stderr); - return Err(anyhow::anyhow!("git cat-file failed: {stderr}")); + return Err(anyhow::anyhow!("git rev-list failed: {stderr}")); } + // `rev-list --objects` lines are "" or " "; keep the oid. Ok(String::from_utf8_lossy(&out.stdout) .lines() - .map(|l| l.trim().to_string()) + .filter_map(|l| l.split_whitespace().next().map(str::to_string)) .filter(|l| !l.is_empty()) .collect()) } @@ -163,6 +164,62 @@ fn list_all_objects(repo_path: &std::path::Path) -> Result> { mod tests { use super::*; + #[test] + fn list_all_objects_excludes_unreachable_blobs() { + use std::process::Command; + use tempfile::TempDir; + + let td = TempDir::new().unwrap(); + let work = td.path(); + let run = |args: &[&str]| { + assert!( + Command::new("git") + .args(args) + .current_dir(work) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["init", "-q"]); + run(&["config", "user.email", "t@t"]); + run(&["config", "user.name", "t"]); + std::fs::write(work.join("a.txt"), b"reachable\n").unwrap(); + run(&["add", "."]); + run(&["commit", "-qm", "init"]); + + let reachable = String::from_utf8_lossy( + &Command::new("git") + .args(["rev-parse", "HEAD:a.txt"]) + .current_dir(work) + .output() + .unwrap() + .stdout, + ) + .trim() + .to_string(); + + std::fs::write(work.join("dangling.txt"), b"DANGLING SECRET\n").unwrap(); + let dangling = String::from_utf8_lossy( + &Command::new("git") + .args(["hash-object", "-w", "dangling.txt"]) + .current_dir(work) + .output() + .unwrap() + .stdout, + ) + .trim() + .to_string(); + + let objs = list_all_objects(work).unwrap(); + assert!(objs.contains(&reachable), "reachable blob must be listed"); + assert!( + !objs.contains(&dangling), + "unreachable/dangling blob must NOT be listed" + ); + } + #[tokio::test] async fn test_pin_skipped_when_jwt_empty() { let client = reqwest::Client::new(); From 3e047eb6122be47673310c409c9d70f00b914b93 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:59:16 -0500 Subject: [PATCH 2/6] fix(node): include HEAD in the withheld-blob walk to close pin/serve asymmetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit blob_paths classified objects from for-each-ref only, but the pin set (git rev-list --objects --all) and git upload-pack both also expose objects reachable from HEAD. A detached HEAD (or any HEAD pointing outside the ref-covered set) therefore put a blob into the pin/serve path that the withholding walk never classified — a latent cleartext leak of a withheld blob. Walk HEAD too (guarded for unborn HEAD on an empty repo) so the withheld set is a superset of every exposure path. Adds a regression test: a blob reachable only via a detached HEAD must still be withheld (fails without this change). --- .../gitlawb-node/src/git/visibility_pack.rs | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index a670dfc..bd3e2d4 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -23,7 +23,19 @@ use std::path::Path; /// Fails closed: if a ref cannot be traversed, returns an error so the caller /// aborts the serve/pin rather than producing a partial (under-withheld) set. fn blob_paths(repo_path: &Path) -> Result> { - let refs = store::list_refs(repo_path).context("list_refs failed")?; + let mut refs = store::list_refs(repo_path).context("list_refs failed")?; + // `for-each-ref` omits HEAD, but the pin set (`git rev-list --objects --all`) + // and `git upload-pack` both expose objects reachable from HEAD. If HEAD is + // ever detached (or otherwise points outside the ref-covered set), those + // objects must still be classified here, or a withheld blob could escape into + // a pin/serve path that this walk never saw. HEAD is normally symbolic to a + // branch already in `refs`, so this just adds a duplicate the downstream set + // arithmetic collapses; in the detached case it is the safety net. `None` + // means HEAD does not resolve to a commit (e.g. an unborn branch on an empty + // repo) — nothing to walk. + if let Some(head_oid) = store::head_commit(repo_path).context("resolve HEAD failed")? { + refs.push(("HEAD".to_string(), head_oid)); + } let mut out = Vec::new(); for (refname, _oid) in refs { let listing = std::process::Command::new("git") @@ -339,6 +351,51 @@ mod tests { ); } + #[test] + fn withholds_blob_reachable_only_via_detached_head() { + let (_td, bare, secret_oid, _public) = fixture(); + // Detach HEAD onto the only commit, then delete the branch it pointed to, + // so the secret blob is reachable ONLY through HEAD. `for-each-ref` omits + // HEAD, but `rev-list --all` (pin) and upload-pack (serve) reach it, so it + // must still be withheld. + let head_ref = { + let out = Command::new("git") + .args(["symbolic-ref", "HEAD"]) + .current_dir(&bare) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let head_oid = { + let out = Command::new("git") + .args(["rev-parse", "HEAD"]) + .current_dir(&bare) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let run = |args: &[&str]| { + assert!( + Command::new("git") + .args(args) + .current_dir(&bare) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["update-ref", "--no-deref", "HEAD", &head_oid]); + run(&["update-ref", "-d", &head_ref]); + + let rules = [rule("/secret/**", &[])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "blob reachable only via detached HEAD must still be withheld" + ); + } + #[test] fn fails_closed_when_a_ref_cannot_be_traversed() { let (_td, bare, secret, _public) = fixture(); From 687b06883f8bd5cc983ee974f6616f5d6a8e9dac Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Tue, 23 Jun 2026 13:26:24 -0500 Subject: [PATCH 3/6] fix(node): classify blobs across full history, not just ref tips blob_paths walked only ref-tip trees (git ls-tree -r ), so a blob that existed in an older commit and was deleted or rotated at the tip was never classified, while the pin set (git rev-list --objects --all) and upload-pack still expose it from history. That under-withheld the blob into the IPFS/Pinata replication paths in cleartext. Enumerate all reachable commits (git rev-list --all + HEAD for the detached case) and walk each commit tree, deduping (oid, path) pairs. ls-tree per commit is kept over rev-list --objects so every path a blob lives at is still represented for the per-path visibility check. A ref that resolves to a non-commit now fails closed explicitly, since rev-list --all silently skips it where the old per-ref ls-tree errored. Regression test: a secret blob deleted at the tip but reachable in history is withheld. --- .../gitlawb-node/src/git/visibility_pack.rs | 188 +++++++++++++++--- 1 file changed, 159 insertions(+), 29 deletions(-) diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index bd3e2d4..85f7348 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -10,42 +10,105 @@ use anyhow::{Context, Result}; use std::collections::{BTreeSet, HashMap, HashSet}; use std::path::Path; -/// List every (blob_oid, "/repo/relative/path") pair reachable from any ref in -/// `repo_path`. Walks every ref, not just `refs/heads/*` and `refs/tags/*`, so -/// the withheld set covers the same object graph the pack and pin paths expose; -/// a blob reachable only through another namespace (e.g. `refs/notes/*`) must not -/// escape withholding. Uses `git ls-tree -r` per ref so each path a blob lives -/// at is represented (the same blob content can appear at several paths). This is -/// why it is not `git rev-list --objects`, which reports only one path per object. -/// Paths carry a leading "/" to match the glob form used by visibility rules -/// ("/secret/**"). +/// List every (blob_oid, "/repo/relative/path") pair reachable from any commit in +/// `repo_path` — every ref *and* every historical commit those refs reach, not just +/// the ref tips. The pin set (`git rev-list --objects --all`) and `git upload-pack` +/// expose the full reachable object graph, including a blob that only ever existed +/// in an older commit (a since-deleted file, a rotated secret whose previous version +/// is still in history). Classifying only ref-tip trees would leave those blobs +/// unwithheld while pin/serve still hand them out in cleartext, so we enumerate all +/// reachable commits and walk each commit's tree. /// -/// Fails closed: if a ref cannot be traversed, returns an error so the caller -/// aborts the serve/pin rather than producing a partial (under-withheld) set. +/// `--all` covers every ref namespace (a blob reachable only through `refs/notes/*` +/// must not escape withholding); HEAD is added explicitly for the detached case, +/// where HEAD reaches commits that no ref does. `git ls-tree -r ` per commit +/// keeps every path a blob lives at (the same blob content can appear at several +/// paths, and the per-path visibility check needs all of them). This is why it is +/// not `git rev-list --objects`, which reports only one path per object. Pairs are +/// de-duplicated across commits. Paths carry a leading "/" to match the glob form +/// used by visibility rules ("/secret/**"). +/// +/// Fails closed: if commit enumeration or any tree walk fails, returns an error so +/// the caller aborts the serve/pin rather than producing a partial (under-withheld) +/// set. fn blob_paths(repo_path: &Path) -> Result> { - let mut refs = store::list_refs(repo_path).context("list_refs failed")?; - // `for-each-ref` omits HEAD, but the pin set (`git rev-list --objects --all`) - // and `git upload-pack` both expose objects reachable from HEAD. If HEAD is - // ever detached (or otherwise points outside the ref-covered set), those - // objects must still be classified here, or a withheld blob could escape into - // a pin/serve path that this walk never saw. HEAD is normally symbolic to a - // branch already in `refs`, so this just adds a duplicate the downstream set - // arithmetic collapses; in the detached case it is the safety net. `None` - // means HEAD does not resolve to a commit (e.g. an unborn branch on an empty - // repo) — nothing to walk. - if let Some(head_oid) = store::head_commit(repo_path).context("resolve HEAD failed")? { - refs.push(("HEAD".to_string(), head_oid)); + // Fail closed on any ref that does not resolve to a commit (a ref pointing + // directly at a blob or tree, or an annotated tag of one). `git rev-list --all` + // silently *skips* such refs, but the pin set (`git rev-list --objects --all`) + // still exposes their target object, so a tolerant walk would under-withhold. + // Refuse rather than leak — this is the same guarantee the per-ref `ls-tree` + // walk gave before, which errored on a non-tree-ish ref. + let refs = std::process::Command::new("git") + .args([ + "for-each-ref", + "--format=%(objecttype) %(*objecttype) %(refname)", + ]) + .current_dir(repo_path) + .output() + .context("git for-each-ref failed")?; + if !refs.status.success() { + anyhow::bail!( + "git for-each-ref failed: {}", + String::from_utf8_lossy(&refs.stderr) + ); + } + for line in String::from_utf8_lossy(&refs.stdout).lines() { + // " [] "; an annotated tag carries + // the peeled type, a lightweight ref does not. Refnames cannot contain + // whitespace, so split_whitespace is unambiguous. + let toks: Vec<&str> = line.split_whitespace().collect(); + let Some(&objtype) = toks.first() else { + continue; + }; + let effective = if objtype == "tag" && toks.len() >= 3 { + toks[1] + } else { + objtype + }; + if effective != "commit" { + let refname = toks.last().copied().unwrap_or(""); + anyhow::bail!( + "ref {refname} resolves to a {effective}, not a commit; \ + refusing to produce a partial (under-withheld) set" + ); + } } - let mut out = Vec::new(); - for (refname, _oid) in refs { + + // Enumerate every reachable commit, not just ref tips. `--all` walks all refs; + // append HEAD so a detached HEAD (reachable by rev-list/upload-pack but in no + // ref) is still classified. When HEAD does not resolve (unborn branch on an + // empty repo) `--all` alone yields nothing, which is correct — no objects exist. + let head = store::head_commit(repo_path).context("resolve HEAD failed")?; + let mut rev_args = vec!["rev-list", "--all"]; + if head.is_some() { + rev_args.push("HEAD"); + } + let commits = std::process::Command::new("git") + .args(&rev_args) + .current_dir(repo_path) + .output() + .context("git rev-list --all failed")?; + if !commits.status.success() { + anyhow::bail!( + "git rev-list --all failed: {}", + String::from_utf8_lossy(&commits.stderr) + ); + } + let commits_stdout = String::from_utf8_lossy(&commits.stdout); + let mut out: HashSet<(String, String)> = HashSet::new(); + for commit in commits_stdout.lines() { + let commit = commit.trim(); + if commit.is_empty() { + continue; + } let listing = std::process::Command::new("git") - .args(["ls-tree", "-r", &refname]) + .args(["ls-tree", "-r", commit]) .current_dir(repo_path) .output() .context("git ls-tree -r failed")?; if !listing.status.success() { anyhow::bail!( - "git ls-tree -r {refname} failed: {}", + "git ls-tree -r {commit} failed: {}", String::from_utf8_lossy(&listing.stderr) ); } @@ -60,12 +123,12 @@ fn blob_paths(repo_path: &Path) -> Result> { let oid = parts.next(); if kind == Some("blob") { if let Some(oid) = oid { - out.push((oid.to_string(), format!("/{path}"))); + out.insert((oid.to_string(), format!("/{path}"))); } } } } - Ok(out) + Ok(out.into_iter().collect()) } /// Blob OIDs the caller may not read. A blob is withheld only if visibility @@ -396,6 +459,73 @@ mod tests { ); } + #[test] + fn withholds_secret_blob_deleted_at_tip_but_reachable_in_history() { + // commit 1 adds secret/b.txt; commit 2 deletes it. The secret blob is no + // longer in any ref-tip tree, but `rev-list --objects --all` (pin) and + // upload-pack (serve) still expose it from history, so it must be withheld. + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::write(work.join("public.txt"), b"public\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"TOP SECRET\n").unwrap(); + let run = |args: &[&str], dir: &Path| { + assert!( + Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + run(&["add", "."], &work); + run(&["commit", "-qm", "c1"], &work); + let secret_oid = { + let out = Command::new("git") + .args(["rev-parse", "HEAD:secret/b.txt"]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + run(&["rm", "-q", "secret/b.txt"], &work); + run(&["commit", "-qm", "c2 delete secret"], &work); + run( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + // Sanity: the blob is gone from the tip tree but still in the pin set. + let tip = Command::new("git") + .args(["ls-tree", "-r", "HEAD"]) + .current_dir(&bare) + .output() + .unwrap(); + assert!( + !String::from_utf8_lossy(&tip.stdout).contains(&secret_oid), + "precondition: secret blob is absent from the tip tree" + ); + + let rules = [rule("/secret/**", &[])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "secret blob deleted at the tip but reachable in history must be withheld" + ); + } + #[test] fn fails_closed_when_a_ref_cannot_be_traversed() { let (_td, bare, secret, _public) = fixture(); From 9333e4ea66095cc0a773cf19698e0065c563a14f Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 24 Jun 2026 14:01:39 -0500 Subject: [PATCH 4/6] fix(node): share one history walk and fully peel ref types in withheld classifier Addresses two review findings on the under-withholding classifier: - withheld_blob_recipients walked blob_paths twice (once via withheld_blob_oids, once directly). Extract withheld_from_pairs so a single history walk feeds both the withheld set and the recipient map. - The ref-type fail-closed guard used for-each-ref %(*objecttype), which peels only one tag level and so misclassified a nested annotated tag of a commit as a non-commit, refusing the whole walk (over-withhold). List refnames and peel each fully with ^{} through one cat-file --batch-check; reap the child even if the stdin write fails. Fail closed on a short read or any non-commit/missing target. Adds tests: annotated + nested-tag of a commit does not fail closed, annotated tag of a blob still fails closed, and a blob at both an allowed and a denied path is not withheld. --- .../gitlawb-node/src/git/visibility_pack.rs | 260 +++++++++++++++--- 1 file changed, 224 insertions(+), 36 deletions(-) diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index 73c1fc2..566bf4c 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -34,17 +34,20 @@ use std::path::Path; /// set. fn blob_paths(repo_path: &Path) -> Result> { // Fail closed on any ref that does not resolve to a commit (a ref pointing - // directly at a blob or tree, or an annotated tag of one). `git rev-list --all` - // silently *skips* such refs, but `git upload-pack` (serve) and the whole-repo - // pin fallback (`git cat-file --batch-all-objects`) still expose their target - // object, so a tolerant walk would under-withhold. - // Refuse rather than leak — this is the same guarantee the per-ref `ls-tree` - // walk gave before, which errored on a non-tree-ish ref. + // directly at a blob or tree, or an annotated tag — even a nested one — of + // such an object). `git rev-list --all` silently *skips* such refs, but + // `git upload-pack` (serve) and the whole-repo pin fallback + // (`git cat-file --batch-all-objects`) still expose their target object, so + // a tolerant walk would under-withhold. Refuse rather than leak — this is + // the same guarantee the per-ref `ls-tree` walk gave before. + // + // We list refnames (which never contain whitespace or newlines) and peel + // each one fully with `^{}` through a single `git cat-file + // --batch-check`. Full peeling is why this is not `for-each-ref + // %(*objecttype)`, which dereferences only one tag level and so misclassifies + // a tag-of-a-tag-of-a-commit as a non-commit. let refs = std::process::Command::new("git") - .args([ - "for-each-ref", - "--format=%(objecttype) %(*objecttype) %(refname)", - ]) + .args(["for-each-ref", "--format=%(refname)"]) .current_dir(repo_path) .output() .context("git for-each-ref failed")?; @@ -54,26 +57,72 @@ fn blob_paths(repo_path: &Path) -> Result> { String::from_utf8_lossy(&refs.stderr) ); } - for line in String::from_utf8_lossy(&refs.stdout).lines() { - // " [] "; an annotated tag carries - // the peeled type, a lightweight ref does not. Refnames cannot contain - // whitespace, so split_whitespace is unambiguous. - let toks: Vec<&str> = line.split_whitespace().collect(); - let Some(&objtype) = toks.first() else { - continue; - }; - let effective = if objtype == "tag" && toks.len() >= 3 { - toks[1] - } else { - objtype + let refs_stdout = String::from_utf8_lossy(&refs.stdout); + let refnames: Vec<&str> = refs_stdout + .lines() + .map(str::trim) + .filter(|l| !l.is_empty()) + .collect(); + if !refnames.is_empty() { + // One `^{}` peel query per line; `cat-file` emits one output + // line per input line, in order. + let queries = refnames + .iter() + .map(|r| format!("{r}^{{}}")) + .collect::>() + .join("\n"); + use std::io::Write; + let mut child = std::process::Command::new("git") + .args(["cat-file", "--batch-check=%(objecttype)"]) + .current_dir(repo_path) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("failed to spawn git cat-file")?; + // Always reap the child even if the stdin write fails, so a write error + // can't drop the Child unwaited and leak a zombie (#53). + let write_result = match child.stdin.take() { + Some(mut stdin) => stdin.write_all(queries.as_bytes()), + None => Err(std::io::Error::new( + std::io::ErrorKind::BrokenPipe, + "git cat-file stdin unavailable", + )), }; - if effective != "commit" { - let refname = toks.last().copied().unwrap_or(""); + let peel = child.wait_with_output().context("git cat-file failed")?; + write_result.context("failed to write to git cat-file stdin")?; + if !peel.status.success() { anyhow::bail!( - "ref {refname} resolves to a {effective}, not a commit; \ - refusing to produce a partial (under-withheld) set" + "git cat-file --batch-check failed: {}", + String::from_utf8_lossy(&peel.stderr) ); } + let peel_stdout = String::from_utf8_lossy(&peel.stdout); + let types: Vec<&str> = peel_stdout.lines().map(str::trim).collect(); + // A short read means at least one ref went unclassified — fail closed. + if types.len() != refnames.len() { + anyhow::bail!( + "git cat-file returned {} lines for {} refs; \ + refusing to produce a partial (under-withheld) set", + types.len(), + refnames.len() + ); + } + for (refname, kind) in refnames.iter().zip(types.iter()) { + // An unresolvable query prints ` missing`. + if kind.ends_with("missing") { + anyhow::bail!( + "ref {refname} does not resolve to an object; \ + refusing to produce a partial (under-withheld) set" + ); + } + if *kind != "commit" { + anyhow::bail!( + "ref {refname} resolves to a {kind}, not a commit; \ + refusing to produce a partial (under-withheld) set" + ); + } + } } // Enumerate every reachable commit, not just ref tips. `--all` walks all refs; @@ -146,19 +195,37 @@ pub fn withheld_blob_oids( owner_did: &str, caller: Option<&str>, ) -> Result> { + let pairs = blob_paths(repo_path)?; + Ok(withheld_from_pairs( + &pairs, rules, is_public, owner_did, caller, + )) +} + +/// Withheld set from an already-computed (oid, "/path") listing: a blob is +/// withheld only when visibility denies the caller at *every* path it appears +/// at. Split out so a caller that already walked `blob_paths` (e.g. +/// `withheld_blob_recipients`) reuses the listing instead of walking history +/// again. +fn withheld_from_pairs( + pairs: &[(String, String)], + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> HashSet { let mut denied: HashSet = HashSet::new(); let mut allowed: HashSet = HashSet::new(); - for (oid, path) in blob_paths(repo_path)? { - match visibility_check(rules, is_public, owner_did, caller, &path) { + for (oid, path) in pairs { + match visibility_check(rules, is_public, owner_did, caller, path) { Decision::Deny => { - denied.insert(oid); + denied.insert(oid.clone()); } Decision::Allow => { - allowed.insert(oid); + allowed.insert(oid.clone()); } } } - Ok(denied.difference(&allowed).cloned().collect()) + denied.difference(&allowed).cloned().collect() } /// Objects that may replicate to the public: everything not in `withheld`. @@ -181,7 +248,9 @@ pub fn withheld_blob_recipients( is_public: bool, owner_did: &str, ) -> Result>> { - let withheld = withheld_blob_oids(repo_path, rules, is_public, owner_did, None)?; + // One history walk feeds both the withheld set and the recipient mapping. + let pairs = blob_paths(repo_path)?; + let withheld = withheld_from_pairs(&pairs, rules, is_public, owner_did, None); if withheld.is_empty() { return Ok(HashMap::new()); } @@ -192,14 +261,14 @@ pub fn withheld_blob_recipients( } } let mut out: HashMap> = HashMap::new(); - for (oid, path) in blob_paths(repo_path)? { - if !withheld.contains(&oid) { + for (oid, path) in &pairs { + if !withheld.contains(oid) { continue; } - let entry = out.entry(oid).or_default(); + let entry = out.entry(oid.clone()).or_default(); entry.insert(owner_did.to_string()); for did in &candidates { - if visibility_check(rules, is_public, owner_did, Some(did), &path) == Decision::Allow { + if visibility_check(rules, is_public, owner_did, Some(did), path) == Decision::Allow { entry.insert(did.clone()); } } @@ -542,4 +611,123 @@ mod tests { "a ref that cannot be traversed must fail closed (Err)" ); } + + #[test] + fn annotated_tag_to_commit_does_not_fail_closed() { + let (_td, bare, secret_oid, _public) = fixture(); + // An annotated tag — even one nested over another annotated tag — + // ultimately resolves to a commit, so it must NOT trip the non-commit + // fail-closed guard. A one-level `%(*objecttype)` peel would misread the + // nested tag as a non-commit and refuse the whole walk. + let run = |args: &[&str]| { + assert!( + Command::new("git") + .args(args) + .current_dir(&bare) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["config", "user.email", "t@t"]); + run(&["config", "user.name", "t"]); + run(&["tag", "-a", "-m", "inner", "v1", "HEAD"]); + run(&["tag", "-a", "-m", "outer", "v2", "v1"]); + + let rules = [rule("/secret/**", &[])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "secret blob must still be withheld with annotated and nested tags present" + ); + } + + #[test] + fn fails_closed_on_annotated_tag_of_a_blob() { + let (_td, bare, secret, _public) = fixture(); + // An annotated tag whose target peels to a blob is not a commit; the + // guard must fail closed rather than skip the ref. + let run = |args: &[&str]| { + assert!( + Command::new("git") + .args(args) + .current_dir(&bare) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["config", "user.email", "t@t"]); + run(&["config", "user.name", "t"]); + run(&["tag", "-a", "-m", "blobtag", "blobtag", &secret]); + + let rules = [rule("/secret/**", &[])]; + let result = withheld_blob_oids(&bare, &rules, true, OWNER, None); + assert!( + result.is_err(), + "an annotated tag of a blob must fail closed (Err)" + ); + } + + #[test] + fn same_blob_at_allowed_and_denied_path_is_not_withheld() { + // Identical content at a denied and an allowed path shares one blob OID. + // A blob reachable through ANY allowed path must not be withheld. + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let run = |args: &[&str], dir: &Path| { + assert!( + Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("secret/shared.txt"), b"SHARED\n").unwrap(); + std::fs::write(work.join("public/shared.txt"), b"SHARED\n").unwrap(); + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + run(&["add", "."], &work); + run(&["commit", "-qm", "init"], &work); + let oid = |path: &str| { + let out = Command::new("git") + .args(["rev-parse", &format!("HEAD:{path}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let shared_oid = oid("secret/shared.txt"); + assert_eq!( + shared_oid, + oid("public/shared.txt"), + "precondition: identical content shares one blob OID" + ); + run( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let rules = [rule("/secret/**", &[])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + !withheld.contains(&shared_oid), + "a blob also reachable via an allowed path must not be withheld" + ); + } } From 3266eaf8ea03da9d12b4fa55db2718baabbba721 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 24 Jun 2026 14:25:32 -0500 Subject: [PATCH 5/6] fix(node): drain cat-file stdout concurrently to close the peel deadlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ref-type guard peeled every ^{} through git cat-file --batch-check, writing all queries to stdin before draining stdout. On a repo with enough output the child's stdout pipe fills, it stops reading stdin, and write_all blocks forever — hanging every serve/pin that classifies the repo. Batching the queries does NOT bound this: cat-file echoes the full query on a ' missing' line, so output scales with refname length, and a few hundred long-named dangling refs already exceed the pipe buffer. Run the stdin write on a separate thread so the main thread drains stdout concurrently via wait_with_output, which reaps the child unconditionally before any error is surfaced; the writer is joined after the drain so it cannot deadlock, and a vanished stdin pipe becomes a broken-pipe write error. One pass, no batch bound, deadlock-free regardless of ref count or refname length. Extracted as assert_all_refs_are_commits with corrected doc attribution. Detect the missing-object line by its trailing status word. Adds a timeout-bounded regression test with 500 long-named dangling refs (>64 KiB of cat-file output) that fails rather than hangs if the deadlock returns. --- .../gitlawb-node/src/git/visibility_pack.rs | 252 +++++++++++------- 1 file changed, 163 insertions(+), 89 deletions(-) diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index 566bf4c..cd8017a 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -10,42 +10,18 @@ use anyhow::{Context, Result}; use std::collections::{BTreeSet, HashMap, HashSet}; use std::path::Path; -/// List every (blob_oid, "/repo/relative/path") pair reachable from any commit in -/// `repo_path` — every ref *and* every historical commit those refs reach, not just -/// the ref tips. `git upload-pack` (serve) and the whole-repo pin fallback -/// (`git cat-file --batch-all-objects`) expose the full reachable object graph, -/// including a blob that only ever existed -/// in an older commit (a since-deleted file, a rotated secret whose previous version -/// is still in history). Classifying only ref-tip trees would leave those blobs -/// unwithheld while pin/serve still hand them out in cleartext, so we enumerate all -/// reachable commits and walk each commit's tree. -/// -/// `--all` covers every ref namespace (a blob reachable only through `refs/notes/*` -/// must not escape withholding); HEAD is added explicitly for the detached case, -/// where HEAD reaches commits that no ref does. `git ls-tree -r ` per commit -/// keeps every path a blob lives at (the same blob content can appear at several -/// paths, and the per-path visibility check needs all of them). This is why it is -/// not `git rev-list --objects`, which reports only one path per object. Pairs are -/// de-duplicated across commits. Paths carry a leading "/" to match the glob form -/// used by visibility rules ("/secret/**"). +/// Fail closed unless every ref ultimately resolves to a commit (a ref pointing +/// directly at a blob or tree, or an annotated tag — even a nested one — of such +/// an object is refused). `git rev-list --all` silently *skips* such refs, but +/// `git upload-pack` (serve) and the whole-repo pin fallback +/// (`git cat-file --batch-all-objects`) still expose their target object, so a +/// tolerant walk would under-withhold. Refuse rather than leak. /// -/// Fails closed: if commit enumeration or any tree walk fails, returns an error so -/// the caller aborts the serve/pin rather than producing a partial (under-withheld) -/// set. -fn blob_paths(repo_path: &Path) -> Result> { - // Fail closed on any ref that does not resolve to a commit (a ref pointing - // directly at a blob or tree, or an annotated tag — even a nested one — of - // such an object). `git rev-list --all` silently *skips* such refs, but - // `git upload-pack` (serve) and the whole-repo pin fallback - // (`git cat-file --batch-all-objects`) still expose their target object, so - // a tolerant walk would under-withhold. Refuse rather than leak — this is - // the same guarantee the per-ref `ls-tree` walk gave before. - // - // We list refnames (which never contain whitespace or newlines) and peel - // each one fully with `^{}` through a single `git cat-file - // --batch-check`. Full peeling is why this is not `for-each-ref - // %(*objecttype)`, which dereferences only one tag level and so misclassifies - // a tag-of-a-tag-of-a-commit as a non-commit. +/// Each ref is peeled fully with `^{}` through `git cat-file --batch-check`. +/// Full peeling is why this is not `for-each-ref %(*objecttype)`, which +/// dereferences only one tag level and so misclassifies a tag-of-a-tag-of-a- +/// commit as a non-commit. +fn assert_all_refs_are_commits(repo_path: &Path) -> Result<()> { let refs = std::process::Command::new("git") .args(["for-each-ref", "--format=%(refname)"]) .current_dir(repo_path) @@ -63,67 +39,116 @@ fn blob_paths(repo_path: &Path) -> Result> { .map(str::trim) .filter(|l| !l.is_empty()) .collect(); - if !refnames.is_empty() { - // One `^{}` peel query per line; `cat-file` emits one output - // line per input line, in order. - let queries = refnames - .iter() - .map(|r| format!("{r}^{{}}")) - .collect::>() - .join("\n"); - use std::io::Write; - let mut child = std::process::Command::new("git") - .args(["cat-file", "--batch-check=%(objecttype)"]) - .current_dir(repo_path) - .stdin(std::process::Stdio::piped()) - .stdout(std::process::Stdio::piped()) - .stderr(std::process::Stdio::piped()) - .spawn() - .context("failed to spawn git cat-file")?; - // Always reap the child even if the stdin write fails, so a write error - // can't drop the Child unwaited and leak a zombie (#53). - let write_result = match child.stdin.take() { - Some(mut stdin) => stdin.write_all(queries.as_bytes()), - None => Err(std::io::Error::new( - std::io::ErrorKind::BrokenPipe, - "git cat-file stdin unavailable", - )), - }; - let peel = child.wait_with_output().context("git cat-file failed")?; - write_result.context("failed to write to git cat-file stdin")?; - if !peel.status.success() { + if refnames.is_empty() { + return Ok(()); + } + + // Peel every ref in one `git cat-file --batch-check` pass: one + // `^{}` query per line, one output line per input line, in order. + // The stdin write runs on a separate thread so this thread can drain stdout + // concurrently. cat-file echoes the full query on a ` missing` line, + // so output scales with refname length (not a fixed size per ref); writing + // all of stdin before reading any stdout would deadlock both pipes once the + // child's stdout buffer fills. Dropping `stdin` at the end of the closure + // sends EOF. + let queries = refnames + .iter() + .map(|r| format!("{r}^{{}}")) + .collect::>() + .join("\n"); + use std::io::Write; + let mut child = std::process::Command::new("git") + .args(["cat-file", "--batch-check=%(objecttype)"]) + .current_dir(repo_path) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .context("failed to spawn git cat-file")?; + // Feed stdin on a writer thread so this thread can drain stdout via + // wait_with_output concurrently; a None handle (the pipe vanished) becomes a + // broken-pipe write error. wait_with_output reaps the child unconditionally + // before any error is surfaced, so no path drops it unwaited (#53), and the + // writer is joined only after the drain so the join cannot deadlock. + let writer = child + .stdin + .take() + .map(|mut stdin| std::thread::spawn(move || stdin.write_all(queries.as_bytes()))); + let peel_result = child.wait_with_output(); + let write_result = match writer { + Some(handle) => handle + .join() + .map_err(|_| anyhow::anyhow!("git cat-file stdin writer thread panicked"))?, + None => Err(std::io::Error::new( + std::io::ErrorKind::BrokenPipe, + "git cat-file stdin unavailable", + )), + }; + // Surface a write error only if the process didn't already fail with a + // clearer status. + let peel = peel_result.context("git cat-file failed")?; + if !peel.status.success() { + anyhow::bail!( + "git cat-file --batch-check failed: {}", + String::from_utf8_lossy(&peel.stderr) + ); + } + write_result.context("failed to write to git cat-file stdin")?; + + let peel_stdout = String::from_utf8_lossy(&peel.stdout); + let types: Vec<&str> = peel_stdout.lines().map(str::trim).collect(); + // A short read means at least one ref went unclassified — fail closed. + if types.len() != refnames.len() { + anyhow::bail!( + "git cat-file returned {} lines for {} refs; \ + refusing to produce a partial (under-withheld) set", + types.len(), + refnames.len() + ); + } + for (refname, kind) in refnames.iter().zip(types.iter()) { + // git emits ` missing` (not the objecttype) when the peel target + // is absent; the status word is the last token. + if kind.split_ascii_whitespace().last() == Some("missing") { anyhow::bail!( - "git cat-file --batch-check failed: {}", - String::from_utf8_lossy(&peel.stderr) + "ref {refname} does not resolve to an object; \ + refusing to produce a partial (under-withheld) set" ); } - let peel_stdout = String::from_utf8_lossy(&peel.stdout); - let types: Vec<&str> = peel_stdout.lines().map(str::trim).collect(); - // A short read means at least one ref went unclassified — fail closed. - if types.len() != refnames.len() { + if *kind != "commit" { anyhow::bail!( - "git cat-file returned {} lines for {} refs; \ - refusing to produce a partial (under-withheld) set", - types.len(), - refnames.len() + "ref {refname} resolves to a {kind}, not a commit; \ + refusing to produce a partial (under-withheld) set" ); } - for (refname, kind) in refnames.iter().zip(types.iter()) { - // An unresolvable query prints ` missing`. - if kind.ends_with("missing") { - anyhow::bail!( - "ref {refname} does not resolve to an object; \ - refusing to produce a partial (under-withheld) set" - ); - } - if *kind != "commit" { - anyhow::bail!( - "ref {refname} resolves to a {kind}, not a commit; \ - refusing to produce a partial (under-withheld) set" - ); - } - } } + Ok(()) +} + +/// List every (blob_oid, "/repo/relative/path") pair reachable from any commit in +/// `repo_path` — every ref *and* every historical commit those refs reach, not just +/// the ref tips. `git upload-pack` (serve) and the whole-repo pin fallback +/// (`git cat-file --batch-all-objects`) expose the full reachable object graph, +/// including a blob that only ever existed +/// in an older commit (a since-deleted file, a rotated secret whose previous version +/// is still in history). Classifying only ref-tip trees would leave those blobs +/// unwithheld while pin/serve still hand them out in cleartext, so we enumerate all +/// reachable commits and walk each commit's tree. +/// +/// `--all` covers every ref namespace (a blob reachable only through `refs/notes/*` +/// must not escape withholding); HEAD is added explicitly for the detached case, +/// where HEAD reaches commits that no ref does. `git ls-tree -r ` per commit +/// keeps every path a blob lives at (the same blob content can appear at several +/// paths, and the per-path visibility check needs all of them). This is why it is +/// not `git rev-list --objects`, which reports only one path per object. Pairs are +/// de-duplicated across commits. Paths carry a leading "/" to match the glob form +/// used by visibility rules ("/secret/**"). +/// +/// Fails closed: if commit enumeration or any tree walk fails, returns an error so +/// the caller aborts the serve/pin rather than producing a partial (under-withheld) +/// set. +fn blob_paths(repo_path: &Path) -> Result> { + assert_all_refs_are_commits(repo_path)?; // Enumerate every reachable commit, not just ref tips. `--all` walks all refs; // append HEAD so a detached HEAD (reachable by rev-list/upload-pack but in no @@ -671,6 +696,55 @@ mod tests { ); } + #[test] + fn fails_closed_when_a_ref_points_at_a_missing_object() { + let (_td, bare, _secret, _public) = fixture(); + // A ref whose target object does not exist (pruned object, corrupt ref) + // peels to ` missing`. for-each-ref still lists it, so the guard + // must fail closed rather than skip the unclassifiable ref. + std::fs::write( + bare.join("refs/heads/dangling"), + "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef\n", + ) + .unwrap(); + let rules = [rule("/secret/**", &[])]; + let result = withheld_blob_oids(&bare, &rules, true, OWNER, None); + assert!( + result.is_err(), + "a ref pointing at a missing object must fail closed (Err)" + ); + } + + #[test] + fn many_long_named_unresolvable_refs_do_not_deadlock() { + // Regression guard for the cat-file stdin/stdout deadlock. cat-file + // echoes the full query on a ` missing` line, so a few hundred + // long-named dangling refs emit >64 KiB of stdout — enough to fill the + // pipe buffer and hang a write-all-before-drain implementation. The + // concurrent stdin writer must keep it live and fail closed. Bounded by + // a timeout so a regression fails the test instead of hanging the suite. + let (_td, bare, _secret, _public) = fixture(); + let longname = "z".repeat(200); + let mut packed = String::new(); + for i in 0..500 { + packed.push_str(&format!( + "deadbeefdeadbeefdeadbeefdeadbeefdeadbeef refs/heads/{longname}-{i}\n" + )); + } + std::fs::write(bare.join("packed-refs"), packed).unwrap(); + + let (tx, rx) = std::sync::mpsc::channel(); + std::thread::spawn(move || { + let rules = [rule("/secret/**", &[])]; + let is_err = withheld_blob_oids(&bare, &rules, true, OWNER, None).is_err(); + let _ = tx.send(is_err); + }); + match rx.recv_timeout(std::time::Duration::from_secs(10)) { + Ok(is_err) => assert!(is_err, "refs pointing at missing objects must fail closed"), + Err(_) => panic!("withheld_blob_oids did not return within 10s (deadlock?)"), + } + } + #[test] fn same_blob_at_allowed_and_denied_path_is_not_withheld() { // Identical content at a denied and an allowed path shares one blob OID. From 836bf7b41834896f6922d6d69e44d5d522f1aac6 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Wed, 24 Jun 2026 21:45:44 -0500 Subject: [PATCH 6/6] fix(node): parse ls-tree -z raw paths and fail closed on non-UTF-8 to stop under-withholding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plain `git ls-tree -r` C-quotes any path with non-ASCII or special bytes (café.txt becomes "secret/caf\303\251.txt"), so the stored path no longer matched a visibility rule like /secret/** and the blob was served/pinned in cleartext. Switch blob_paths to `git ls-tree -rz` and split the NUL-delimited records, which emit paths raw; the TAB field separator survives -z so the per-record parse is otherwise unchanged. Parse strictly with str::from_utf8 and fail closed on invalid UTF-8 rather than from_utf8_lossy: a lossy decode would rewrite an invalid byte in a denied directory name to U+FFFD, and the mangled path would again miss its deny rule (the same under-withholding class one layer down). Tests: a non-ASCII path stays withheld (and the public blob still replicates), and a non-UTF-8 path fails closed. --- .../gitlawb-node/src/git/visibility_pack.rs | 250 +++++++++++++++++- 1 file changed, 244 insertions(+), 6 deletions(-) diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs index cd8017a..4fcbdab 100644 --- a/crates/gitlawb-node/src/git/visibility_pack.rs +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -137,7 +137,7 @@ fn assert_all_refs_are_commits(repo_path: &Path) -> Result<()> { /// /// `--all` covers every ref namespace (a blob reachable only through `refs/notes/*` /// must not escape withholding); HEAD is added explicitly for the detached case, -/// where HEAD reaches commits that no ref does. `git ls-tree -r ` per commit +/// where HEAD reaches commits that no ref does. `git ls-tree -rz ` per commit /// keeps every path a blob lives at (the same blob content can appear at several /// paths, and the per-path visibility check needs all of them). This is why it is /// not `git rev-list --objects`, which reports only one path per object. Pairs are @@ -178,19 +178,35 @@ fn blob_paths(repo_path: &Path) -> Result> { continue; } let listing = std::process::Command::new("git") - .args(["ls-tree", "-r", commit]) + .args(["ls-tree", "-rz", commit]) .current_dir(repo_path) .output() - .context("git ls-tree -r failed")?; + .context("git ls-tree -rz failed")?; if !listing.status.success() { anyhow::bail!( - "git ls-tree -r {commit} failed: {}", + "git ls-tree -rz {commit} failed: {}", String::from_utf8_lossy(&listing.stderr) ); } - for line in String::from_utf8_lossy(&listing.stdout).lines() { + // `-z` NUL-delimits records and emits paths raw; plain `git ls-tree -r` + // C-quotes any path with non-ASCII or special bytes (e.g. café.txt becomes + // "secret/caf\303\251.txt"), and that quoted literal would not match a + // visibility rule like "/secret/**", under-withholding the blob. The TAB + // field separator survives `-z`, so the per-record parse is unchanged. + // + // Parse strictly: a lossy decode would replace an invalid byte in a denied + // path (e.g. a non-UTF-8 directory name) with U+FFFD, and the mangled string + // would no longer match its deny rule — the same under-withholding class, one + // layer down. Fail closed instead so the caller aborts rather than leaks. + let Ok(listing_stdout) = std::str::from_utf8(&listing.stdout) else { + anyhow::bail!( + "git ls-tree -rz {commit} returned a non-UTF-8 path; \ + refusing to produce a partial (under-withheld) set" + ); + }; + for record in listing_stdout.split('\0') { // " blob \t" - let Some((meta, path)) = line.split_once('\t') else { + let Some((meta, path)) = record.split_once('\t') else { continue; }; let mut parts = meta.split_whitespace(); @@ -622,6 +638,228 @@ mod tests { ); } + #[test] + fn withholds_secret_blob_at_non_ascii_path() { + // A secret blob under a non-ASCII path inside a denied subtree must be + // withheld. Plain `git ls-tree -r` C-quotes the path (café.txt becomes + // "secret/caf\303\251.txt"), which would not match "/secret/**" and would + // leak the blob in cleartext; `-rz` emits the raw path so the rule matches. + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::write(work.join("public.txt"), b"public\n").unwrap(); + std::fs::write(work.join("secret/café.txt"), b"TOP SECRET\n").unwrap(); + let run = |args: &[&str], dir: &Path| { + assert!( + Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + run(&["add", "."], &work); + run(&["commit", "-qm", "init"], &work); + let oid = |path: &str| { + let out = Command::new("git") + .args(["rev-parse", &format!("HEAD:{path}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let secret_oid = oid("secret/café.txt"); + let public_oid = oid("public.txt"); + run( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let rules = [rule("/secret/**", &[])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "secret blob at a non-ASCII path must be withheld" + ); + // Guard against an over-withholding (deny-all) regression: the public blob + // must still replicate. + assert!( + !withheld.contains(&public_oid), + "public blob must NOT be withheld" + ); + } + + // TAB/newline are legal filename bytes on unix but rejected by the Windows + // filesystem, so building the fixture only makes sense (and only compiles the + // OsStr handling) under cfg(unix), matching fails_closed_on_non_utf8_path. + #[cfg(unix)] + #[test] + fn withholds_secret_blob_at_path_with_tab_and_newline() { + // A path containing literal TAB and newline bytes must still be withheld. + // This pins two parse choices: `-rz` emits the path raw (plain `-r` would + // C-quote the TAB/newline and break the "/secret/**" match), and splitting + // records on NUL rather than newline keeps the embedded newline from + // splitting one record into two and truncating the path. A revert to + // `git ls-tree -r` or to `.lines()` would regress this case. + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::write(work.join("public.txt"), b"public\n").unwrap(); + let weird = "secret/a\tb\nc.txt"; + std::fs::write(work.join(weird), b"TOP SECRET\n").unwrap(); + let run = |args: &[&str], dir: &Path| { + assert!( + Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + run(&["add", "."], &work); + run(&["commit", "-qm", "init"], &work); + let oid = |path: &str| { + let out = Command::new("git") + .args(["rev-parse", &format!("HEAD:{path}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let secret_oid = oid(weird); + let public_oid = oid("public.txt"); + // Guard against a vacuous pass: if git ever failed to store the oddly-named + // file, rev-parse would yield an empty/garbage string and the withholding + // assert could trivially hold. A real blob OID is a 40-char (SHA-1) or + // 64-char (SHA-256) hex id. + assert!( + matches!(secret_oid.len(), 40 | 64), + "fixture did not store the TAB/newline path (got oid {secret_oid:?})" + ); + run( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let rules = [rule("/secret/**", &[])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "secret blob at a path with TAB/newline must be withheld" + ); + assert!( + !withheld.contains(&public_oid), + "public blob must NOT be withheld" + ); + } + + #[cfg(unix)] + #[test] + fn fails_closed_on_non_utf8_path() { + // A path with a non-UTF-8 byte (here an invalid 0xFF in the denied + // directory name) must not be lossy-decoded: U+FFFD substitution would stop + // the path matching its deny rule and leak the blob. blob_paths must fail + // closed (Err) instead. git stores raw path bytes, so we write the tree by + // hand via `git update-index --cacheinfo` to embed the invalid byte. + use std::os::unix::ffi::OsStrExt; + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(&work).unwrap(); + let run = |args: &[&str], dir: &Path| { + assert!( + Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + // Hash a blob, then index it at a path whose directory byte is invalid UTF-8. + let blob_oid = { + let out = Command::new("git") + .args(["hash-object", "-w", "--stdin"]) + .current_dir(&work) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .spawn() + .and_then(|mut c| { + use std::io::Write; + c.stdin.take().unwrap().write_all(b"TOP SECRET\n")?; + c.wait_with_output() + }) + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let mut bad_path = std::ffi::OsString::from("s"); + bad_path.push(std::ffi::OsStr::from_bytes(&[0xFF])); + bad_path.push("cret/b.txt"); + let cacheinfo = { + let mut s = std::ffi::OsString::from(format!("100644,{blob_oid},")); + s.push(&bad_path); + s + }; + assert!( + Command::new("git") + .arg("update-index") + .arg("--add") + .arg("--cacheinfo") + .arg(&cacheinfo) + .current_dir(&work) + .status() + .unwrap() + .success(), + "git update-index failed" + ); + run(&["commit", "-qm", "init"], &work); + run( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let rules = [rule("/s\u{fffd}cret/**", &[])]; + let result = withheld_blob_oids(&bare, &rules, true, OWNER, None); + assert!( + result.is_err(), + "a non-UTF-8 path must fail closed (Err), not be lossy-decoded and leaked" + ); + } + #[test] fn fails_closed_when_a_ref_cannot_be_traversed() { let (_td, bare, secret, _public) = fixture();