From 2f3a138edddb464825e7f5bb5a2225e7be88fb23 Mon Sep 17 00:00:00 2001 From: hyperpolymath <6759885+hyperpolymath@users.noreply.github.com> Date: Wed, 27 May 2026 14:29:59 +0100 Subject: [PATCH] feat(query): facts-backed (crosslang :from :to) evaluator (issue #33 follow-up) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tightens `(crosslang :from :to)` from a same-repo co-occurrence proxy to true FFI/cross-language reachability against persisted kanren-derived facts, while preserving the proxy as a fall-back for users who have not opted into crosslang hexad persistence. Mode selection happens once per query run in `load_context`: - If `/hexads/crosslang/` has ≥ 1 hexad with `CrosslangSemantic`, the evaluator builds a per-repo `(source_file, target_file)` endpoint index and a per-repo, per-category file index. A `TO`-category finding matches iff there is an interaction in the same repo whose source or target file equals the file of a `FROM`-category finding — the "real" reachability check. - Otherwise it falls through to the existing same-repo category co-occurrence proxy. This preserves S3b semantics for callers that haven't enabled `PANIC_ATTACK_STORE_CROSSLANG_HEXADS=1` yet. The pruning matters because the proxy admits cross-finding co-occurrences inside one repo even when no FFI boundary actually connects them; the facts-backed mode rejects such ghost-pairs. Tests (+3 in `query::tests`, 255 lib tests pass): - `run_crosslang_facts_backed_matches`: plant a synthetic interaction with one endpoint at the UnsafeCode finding's file; CryptoMisuse finding in the same repo now matches via facts. - `run_crosslang_falls_back_to_co_occurrence_when_no_facts`: no hexads on disk → legacy proxy path, same hit set as before. - `run_crosslang_facts_backed_no_match_when_endpoint_misses`: hexads present but no interaction touches the FROM-finding's file → reject. This is the false-positive the proxy can't prune. 
Also drops the obsolete `#[allow(dead_code)]` on `storage::load_crosslang_hexads` (now actively used) and refreshes the `Query::Crosslang` doc to describe both modes. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/query/mod.rs | 250 ++++++++++++++++++++++++++++++++++++++++----- src/storage/mod.rs | 5 - 2 files changed, 223 insertions(+), 32 deletions(-) diff --git a/src/query/mod.rs b/src/query/mod.rs index 9381f75..05e1b5d 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -31,26 +31,32 @@ //! lexicographically. RFC-3339 / ISO-8601 strings sort correctly under //! string comparison, which is what we use. //! -//! - `(crosslang :from X :to Y)` is a **co-occurrence proxy** for -//! FFI/cross-language reachability: it matches a `Y`-category finding -//! in a repo that also has at least one `X`-category finding. This is -//! the operationally useful case for the estate sweep — most -//! FFI-driven proof drift surfaces in the same repository. A future -//! slice will persist `kanren::crosslang` derived facts as hexads -//! and tighten this to true reachability over the FFI boundary graph. +//! - `(crosslang :from X :to Y)` is evaluated in two modes: +//! * **Facts-backed** (`/hexads/crosslang/` is non-empty): +//! matches a `Y`-category finding when there exists a persisted +//! kanren-derived `CrossLangInteraction` in the same repo where one +//! endpoint of the interaction is the file of an `X`-category +//! finding. This is the "real" FFI/cross-language reachability +//! semantics. +//! * **Co-occurrence proxy** (fallback when no crosslang hexads are +//! on disk): matches a `Y`-category finding in any repo that also +//! has ≥ 1 `X`-category finding. Preserves the historical +//! co-occurrence behaviour for users who haven't enabled crosslang +//! persistence yet (`PANIC_ATTACK_STORE_CROSSLANG_HEXADS=1`). //! -//! ## Deferred to later follow-ups -//! -//! - True kanren-derived `(crosslang ...)` evaluation backed by -//! 
persisted FFI-boundary facts (rather than the current -//! co-occurrence proxy). +//! Most FFI-driven proof drift surfaces in the same repo, so both +//! modes converge on the operationally common case, but the +//! facts-backed mode prunes same-repo false-positive co-occurrences +//! (e.g. an `UnsafeFFI`-bearing repo that contains an unrelated +//! `ProofDrift` finding in a non-FFI module). use crate::storage::{ - load_campaign_hexads, load_finding_hexads, CampaignSemantic, FindingSemantic, + load_campaign_hexads, load_crosslang_hexads, load_finding_hexads, CampaignSemantic, + FindingSemantic, }; use anyhow::{anyhow, bail, Result}; use serde::Serialize; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::Path; // =========================================================================== @@ -73,13 +79,13 @@ pub enum Query { /// Match by campaign state. `None` means "no campaign hexad yet". PrState(Option), /// `(crosslang :from FROM_CAT :to TO_CAT)` — match a `TO_CAT` finding - /// in a repo that also has at least one `FROM_CAT` finding. + /// reachable from a `FROM_CAT` finding via an FFI boundary. /// - /// Co-occurrence proxy: until kanren-derived cross-language facts are - /// persisted as hexads (S3b follow-up), "the FROM finding is reachable - /// from the TO finding" is approximated by "they live in the same - /// repository", which is the operationally-useful case for the estate - /// sweep — most FFI-driven proof drift surfaces in the same repo. + /// Evaluated in two modes depending on whether crosslang hexads have + /// been persisted (see the module-level doc for the full semantics): + /// facts-backed FFI-endpoint reachability when + /// `/hexads/crosslang/` is populated, same-repo co-occurrence + /// proxy otherwise. Crosslang { from: String, to: String }, /// Match by ISO-8601 / RFC-3339 first-seen timestamp ≥ `since`. 
/// Filed under the `(since ...)` keyword for compactness; combined @@ -395,12 +401,31 @@ struct FindingRow { } /// Index from repo name → set of category Debug-names present in that -/// repo. Used by `(crosslang ...)` to check co-occurrence. -type RepoCategoryIndex = HashMap>; +/// repo. Used by the co-occurrence proxy path of `(crosslang ...)`. +type RepoCategoryIndex = HashMap>; + +/// Index from `(repo_name_lower, category_lower)` → set of files in that +/// repo that carry a finding of that category. Used by the facts-backed +/// `(crosslang ...)` path so we can check whether a candidate +/// `from`-category finding's file is an endpoint of any persisted +/// `CrossLangInteraction` in the same repo. +type RepoCategoryFileIndex = HashMap<(String, String), HashSet>; + +/// Index from repo name (lowercased) → list of `(source_file, +/// target_file)` pairs derived from persisted crosslang hexads. Used by +/// the facts-backed `(crosslang ...)` path. +type RepoInteractionIndex = HashMap>; struct EvalContext { rows: Vec, repo_categories: RepoCategoryIndex, + /// Per-repo, per-category file index. Populated unconditionally; only + /// consulted by the facts-backed crosslang path. + repo_category_files: RepoCategoryFileIndex, + /// Per-repo crosslang interaction endpoints. `None` when + /// `/hexads/crosslang/` is empty (signal to the evaluator that + /// it should fall back to the co-occurrence proxy). 
+ crosslang_interactions: Option, } fn load_context(base_dir: &Path) -> Result { @@ -417,13 +442,20 @@ fn load_context(base_dir: &Path) -> Result { let mut rows = Vec::new(); let mut repo_categories: RepoCategoryIndex = HashMap::new(); + let mut repo_category_files: RepoCategoryFileIndex = HashMap::new(); for h in finding_hexads { let created_at = h.created_at.clone(); if let Some(f) = h.semantic.finding { + let repo_lower = f.repo_name.to_ascii_lowercase(); + let cat_lower = f.category.to_ascii_lowercase(); repo_categories - .entry(f.repo_name.to_ascii_lowercase()) + .entry(repo_lower.clone()) .or_default() - .insert(f.category.to_ascii_lowercase()); + .insert(cat_lower.clone()); + repo_category_files + .entry((repo_lower, cat_lower)) + .or_default() + .insert(f.file.clone()); let campaign = latest.get(&f.finding_id).cloned(); rows.push(FindingRow { finding: f, @@ -432,12 +464,55 @@ fn load_context(base_dir: &Path) -> Result { }); } } + + // Crosslang facts: load hexads; treat empty dir as "fall back to + // co-occurrence proxy" by leaving `crosslang_interactions = None`. + let crosslang_hexads = load_crosslang_hexads(base_dir)?; + let crosslang_interactions = if crosslang_hexads.is_empty() { + None + } else { + let mut idx: RepoInteractionIndex = HashMap::new(); + for h in crosslang_hexads { + let Some(cl) = h.semantic.crosslang else { + continue; + }; + idx.entry(cl.repo_name.to_ascii_lowercase()) + .or_default() + .push((cl.source_file.clone(), cl.target_file.clone())); + } + Some(idx) + }; + Ok(EvalContext { rows, repo_categories, + repo_category_files, + crosslang_interactions, }) } +/// Facts-backed `(crosslang :from F :to T)` check for one candidate row. +/// +/// Pre-condition: `row.finding.category` already matches `to`. Returns +/// `true` when a persisted `CrossLangInteraction` in the same repo has +/// one endpoint equal to a file carrying an `F`-category finding. 
+fn crosslang_facts_match(row: &FindingRow, from: &str, ctx: &EvalContext) -> bool { + let Some(by_repo) = ctx.crosslang_interactions.as_ref() else { + return false; + }; + let repo_lower = row.finding.repo_name.to_ascii_lowercase(); + let from_lower = from.to_ascii_lowercase(); + let Some(pairs) = by_repo.get(&repo_lower) else { + return false; + }; + let Some(from_files) = ctx.repo_category_files.get(&(repo_lower, from_lower)) else { + return false; + }; + pairs + .iter() + .any(|(src, tgt)| from_files.contains(src) || from_files.contains(tgt)) +} + fn matches(query: &Query, row: &FindingRow, ctx: &EvalContext) -> bool { match query { Query::Category(target) => row.finding.category.eq_ignore_ascii_case(target), @@ -473,12 +548,22 @@ fn matches(query: &Query, row: &FindingRow, ctx: &EvalContext) -> bool { candidate >= since.as_str() } Query::Crosslang { from, to } => { - // `to`-matching finding in a repo that also has at least one - // `from`-category finding. The current finding must be the - // `to` side (so callers can wrap with `and`/`or`). + // The current finding must be the `to` side (so callers can + // wrap with `and`/`or`). if !row.finding.category.eq_ignore_ascii_case(to) { return false; } + // Mode 1 — facts-backed: `/hexads/crosslang/` has hexads. + // Match when there is a persisted `CrossLangInteraction` in + // the same repo whose source or target file is the location of + // an `F`-category finding. This is true FFI reachability. + if ctx.crosslang_interactions.is_some() { + return crosslang_facts_match(row, from, ctx); + } + // Mode 2 — co-occurrence proxy fallback (no crosslang hexads + // on disk yet): same-repo co-occurrence of categories. + // Preserves S3b semantics for users who haven't enabled + // `PANIC_ATTACK_STORE_CROSSLANG_HEXADS`. 
let from_lower = from.to_ascii_lowercase(); ctx.repo_categories .get(&row.finding.repo_name.to_ascii_lowercase()) @@ -890,6 +975,117 @@ mod tests { assert!(run(&q, dir.path()).unwrap().is_empty()); } + // ----- Issue #33 kanren-crosslang: facts-backed crosslang tests --- + + /// Write a synthetic crosslang hexad into + /// `/hexads/crosslang/`. Tests use this to simulate persisted + /// `CrossLangInteraction` facts without driving the full kanren + /// pipeline. + fn write_synthetic_crosslang_hexad( + dir: &std::path::Path, + idx: usize, + repo: &str, + source_file: &str, + target_file: &str, + ) { + use crate::storage::{CrosslangSemantic, HexadProvenance, HexadSemantic, PanicAttackHexad}; + let h = PanicAttackHexad { + schema: "verisimdb.hexad.v1".to_string(), + id: format!("pa-crosslang-test-{}", idx), + created_at: "2026-05-26T00:00:00Z".to_string(), + provenance: HexadProvenance { + tool: "panic-attack".to_string(), + version: env!("CARGO_PKG_VERSION").to_string(), + program_path: format!("/tmp/{}", repo), + language: "Rust".to_string(), + attestation_hash: None, + }, + semantic: HexadSemantic { + total_weak_points: 0, + critical_count: 0, + high_count: 0, + total_crashes: 0, + robustness_score: 0.85, + categories: Vec::new(), + migration: None, + finding: None, + campaign: None, + crosslang: Some(CrosslangSemantic { + interaction_id: format!( + "crosslang:{}:{}:Rust:{}:Unknown:CFfi", + repo, source_file, target_file + ), + source_lang: "Rust".to_string(), + target_lang: "Unknown".to_string(), + mechanism: "CFfi".to_string(), + source_file: source_file.to_string(), + source_line: None, + target_file: target_file.to_string(), + target_line: None, + repo_name: repo.to_string(), + }), + }, + document: serde_json::Value::Null, + }; + let cl_dir = dir.join("hexads").join("crosslang"); + std::fs::create_dir_all(&cl_dir).unwrap(); + std::fs::write( + cl_dir.join(format!("h-{}.json", idx)), + serde_json::to_string_pretty(&h).unwrap(), + ) + .unwrap(); + } + + #[test] + 
fn run_crosslang_facts_backed_matches() { + let dir = tempdir().unwrap(); + write_test_findings(dir.path()); + // alpha has UnsafeCode finding at src/a.rs:1 and CryptoMisuse at + // src/a.rs:7. Plant a crosslang interaction in alpha with one + // endpoint at src/a.rs (the UnsafeCode-finding's file). The + // CryptoMisuse finding must now match + // `(crosslang :from UnsafeCode :to CryptoMisuse)` via the + // facts-backed path. + write_synthetic_crosslang_hexad(dir.path(), 0, "alpha", "src/a.rs", "foreign"); + let q = parse("(crosslang :from UnsafeCode :to CryptoMisuse)").unwrap(); + let hits = run(&q, dir.path()).unwrap(); + assert_eq!(hits.len(), 1); + assert_eq!(hits[0].repo_name, "alpha"); + assert_eq!(hits[0].category, "CryptoMisuse"); + } + + #[test] + fn run_crosslang_falls_back_to_co_occurrence_when_no_facts() { + // No crosslang hexads written → evaluator must take the legacy + // co-occurrence proxy path. alpha has both UnsafeCode and + // CryptoMisuse findings so the CryptoMisuse finding matches. + let dir = tempdir().unwrap(); + write_test_findings(dir.path()); + let q = parse("(crosslang :from UnsafeCode :to CryptoMisuse)").unwrap(); + let hits = run(&q, dir.path()).unwrap(); + assert_eq!(hits.len(), 1); + assert_eq!(hits[0].repo_name, "alpha"); + } + + #[test] + fn run_crosslang_facts_backed_no_match_when_endpoint_misses() { + // Mixed setup: crosslang hexads ARE present (so we're on the + // facts-backed path), but no interaction in alpha touches the + // file that carries the UnsafeCode finding. The CryptoMisuse + // finding must NOT match — facts-backed mode strictly requires + // an FFI endpoint at a `from`-finding's file. This is the + // pruning the co-occurrence proxy can't do. 
+ let dir = tempdir().unwrap(); + write_test_findings(dir.path()); + write_synthetic_crosslang_hexad(dir.path(), 0, "alpha", "src/unrelated.rs", "foreign"); + let q = parse("(crosslang :from UnsafeCode :to CryptoMisuse)").unwrap(); + let hits = run(&q, dir.path()).unwrap(); + assert!( + hits.is_empty(), + "facts-backed mode must reject co-occurrences without a real FFI endpoint" + ); + } + #[test] fn render_table_empty() { let s = render_table(&[]); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 200b302..851530a 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -869,11 +869,6 @@ pub fn load_campaign_hexads(base_dir: &Path) -> Result> { /// directory doesn't exist — callers (notably the `(crosslang :from :to)` /// query evaluator's facts-backed path) treat the empty case as "fall back /// to co-occurrence proxy". -/// -/// `#[allow(dead_code)]` because the next stack PR (query evaluator -/// facts-backed mode) is the first caller — kept public so that consumer -/// can pick it up without further plumbing. -#[allow(dead_code)] pub fn load_crosslang_hexads(base_dir: &Path) -> Result> { load_hexad_dir(&base_dir.join("hexads").join("crosslang")) }