diff --git a/agents/feedback.md b/agents/feedback.md index 57a59f9..b0afbcd 100644 --- a/agents/feedback.md +++ b/agents/feedback.md @@ -2,21 +2,26 @@ ## Purpose The Feedback Agent helps Iris's agents learn from real signals instead of -repeating the same mistake. It does two jobs: +repeating the same mistake. It does three jobs: - **VERIFY** — judge whether an agent's HTML output faithfully and accessibly captures its source image. Used at build time to check each page the page agent produces, and reused by the regression gate before any agent change ships (PRD §7.5 / §7.12). +- **CLASSIFY** — decide whether a user-feedback correction is a one-off (specific + to this document, must not change the agent), a generalizable lesson, or an + accessibility-policy rule, and distill it into a reusable instruction plus a + localized before/after example. - **TRAIN** — propose an improved version of an agent's prompt so it avoids a recurring issue, driven either by a user-feedback correction or by the problems found during VERIFY (PRD §7.12 / §7.13). -A proposed improvement to a library agent (e.g. the page agent) is gated on that -agent's regression fixtures and filed as a GitHub issue for a maintainer to -review; a session-built agent is trained in place so its contribution carries the -fix. The goal is that agents improve from real signals rather than repeating the -same mistake. +Generalizable and accessibility lessons are accumulated as an example bank that is +injected into the agent's prompt at run time (so the agent file stays stable); +only a well-corroborated, higher-impact lesson becomes a prompt change — gated on +the agent's regression fixtures and an eval over those fixtures, then filed as a +GitHub issue for a maintainer to review. A session-built agent is trained in place +so its contribution carries the fix. ## Required capability vision, text @@ -25,8 +30,9 @@ deployment's configured providers for these capabilities determine which concret models run. 
See PRD §10.3.) ## System prompt -You are the Feedback Agent. The user message begins with `TASK: verify` or -`TASK: train`. Do ONLY that task and return ONLY its JSON (no code fences). +You are the Feedback Agent. The user message begins with `TASK: verify`, +`TASK: classify`, or `TASK: train`. Do ONLY that task and return ONLY its JSON +(no code fences). TASK: verify You are given an agent's purpose/contract, the HTML it produced for one source @@ -39,6 +45,22 @@ part the agent is responsible for. List concrete, actionable problems (empty whe there are none). Respond with ONLY: { "faithful": true|false, "accessible": true|false, "problems": ["..."] } +TASK: classify +You are given a user-feedback message and a diff of how the document changed in +response. Decide what KIND of signal this is for the agent: +- "one_off": specific to this one document (a particular name, date, or value, or + a fix that would not recur). Do NOT generalize it; it must not change the agent. +- "generalizable": a mistake the agent would likely repeat on similar documents. +- "a11y_policy": an accessibility rule the agent should always follow. +For generalizable or a11y_policy, write a single, reusable "instruction" (one +sentence, no document-specific text or values), and extract the SMALLEST +"before"/"after" snippets that show the correction (use empty strings if not +clear). For one_off, leave instruction/before/after empty. Respond with ONLY: +{ "kind": "one_off"|"generalizable"|"a11y_policy", + "instruction": "reusable lesson, or empty for one_off", + "before": "localized wrong snippet, or empty", + "after": "localized corrected snippet, or empty" } + TASK: train You are given an agent's full markdown and either a user-feedback correction or a list of verification problems. 
Propose an improved version of the agent's markdown diff --git a/config.example.yaml b/config.example.yaml index d378b53..fa17b3c 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -55,6 +55,11 @@ providers: image_analysis: openrouter # table: { model: anthropic/claude-opus-4.7 } # stronger model, same provider # reader: { provider: bedrock, model: us.anthropic.claude-haiku-4-5-20251001-v1:0 } + # Run the Feedback Agent (VERIFY/CLASSIFY/TRAIN) on a DIFFERENT or stronger + # model than the page agent, so verification doesn't share the generator's + # blind spots (recommended): + # feedback: { model: anthropic/claude-opus-4.7 } + # page: { model: anthropic/claude-sonnet-4.5 } openrouter: api_key: ${OPENROUTER_API_KEY} diff --git a/package.json b/package.json index 9a6018d..5f81bc4 100644 --- a/package.json +++ b/package.json @@ -11,7 +11,7 @@ "start": "node --use-system-ca --env-file-if-exists=.env --experimental-sqlite src/index.ts", "dev": "node --use-system-ca --env-file-if-exists=.env --experimental-sqlite --watch src/index.ts", "typecheck": "tsc --noEmit", - "test": "node --test test/feedback.test.ts" + "test": "node --test test/feedback.test.ts test/memory.test.ts" }, "dependencies": { "@aws-sdk/client-bedrock-runtime": "^3.682.0", diff --git a/src/pipeline/extraction.ts b/src/pipeline/extraction.ts index 54067f8..d87994c 100644 --- a/src/pipeline/extraction.ts +++ b/src/pipeline/extraction.ts @@ -5,6 +5,7 @@ import { loadAgent, type AgentSpec } from "../agents/loader.ts"; import { feedbackPreamble, loadImage, type InputImage, type PipelineContext } from "./context.ts"; import { ACCESSIBILITY_REQUIREMENTS } from "./accessibility.ts"; import { verifyAgentOutput } from "./feedback.ts"; +import { examplesForPrompt } from "./memory.ts"; import type { Fragment } from "./fragment.ts"; const PAGE_AGENT = "page"; @@ -78,10 +79,15 @@ interface PageRender { suggestion?: { name: string; reason: string }; } -async function renderPage(ctx: PipelineContext, agent: 
AgentSpec, img: InputImage): Promise { +async function renderPage( + ctx: PipelineContext, + agent: AgentSpec, + img: InputImage, + lessons: string, +): Promise { const user = `Convert this document page image (filename: ${img.name}, page ${img.order} of ${ctx.images.length}) ` + - `to accessible HTML.\n\n${ACCESSIBILITY_REQUIREMENTS}${feedbackPreamble(ctx)}`; + `to accessible HTML.\n\n${ACCESSIBILITY_REQUIREMENTS}${feedbackPreamble(ctx)}${lessons}`; const res = await ctx.router.complete( PAGE_AGENT, "vision", @@ -138,11 +144,15 @@ async function correctPage( // warrants a specialist agent, collected as `suggestions` for the contribution step. export async function runExtraction(ctx: PipelineContext): Promise { const pageAgent = loadPageAgent(ctx); + // Inject corroborated lessons learned from past feedback into the page agent + // prompt (#1), so it improves without rewriting agents/page.md. + const lessons = examplesForPrompt(ctx.paths, pageAgent.file); + if (lessons) ctx.log.event("page_lessons_injected", { chars: lessons.length }); const fragments: Fragment[] = []; const suggestions: ExtractionResult["suggestions"] = []; for (const img of ctx.images) { - const { html, log, suggestion } = await renderPage(ctx, pageAgent, img); + const { html, log, suggestion } = await renderPage(ctx, pageAgent, img, lessons); let innerHtml = html; let logNote = log; diff --git a/src/pipeline/feedback.ts b/src/pipeline/feedback.ts index a2c299f..239ad9f 100644 --- a/src/pipeline/feedback.ts +++ b/src/pipeline/feedback.ts @@ -6,6 +6,7 @@ import { ACCESSIBILITY_REQUIREMENTS } from "./accessibility.ts"; import { loadImage, type InputImage, type PipelineContext } from "./context.ts"; import { flatten } from "./flatten.ts"; import { createAgentUpdateIssue } from "../github/issue.ts"; +import { recordExample, type LessonKind } from "./memory.ts"; import type { FixtureCase } from "./regression.ts"; // Previously imported from github/contributions.ts, which was removed when the @@ -33,6 
+34,13 @@ interface VerifyOutput { problems?: string[]; } +interface ClassifyOutput { + kind?: string; + instruction?: string; + before?: string; + after?: string; +} + export interface VerifyVerdict { ok: boolean; problems: string[]; @@ -121,10 +129,14 @@ export const MIN_CONTENT_COVERAGE = 0.85; // Skip the coverage check for very short outputs, where one dropped word swings // the ratio — rely on the model verdict alone there. const MIN_COVERAGE_WORDS = 8; +// A proposed prompt change may not drop the agent's mean fixture coverage by more +// than this versus the current prompt (the holds-or-improves eval gate, #3). +const EVAL_REGRESSION_EPS = 0.02; export interface RegressionResult { passed: boolean; failures: string[]; + meanCoverage: number | null; // mean content coverage of the candidate over fixtures } // Fraction of the accepted output's distinct words that still appear in the @@ -211,6 +223,7 @@ export async function regressionGate( }; const failures: string[] = []; + const coverages: number[] = []; for (const caseFile of caseFiles) { let c: FixtureCase; try { @@ -224,6 +237,7 @@ export async function regressionGate( const blocks = await reRunAgentOnImage(ctx, updatedAgent, img); if (blocks.length === 0) { failures.push(`${c.image_file}: updated agent produced no output`); + coverages.push(0); continue; } // Content-preservation check: the updated agent must still reproduce the @@ -232,6 +246,7 @@ export async function regressionGate( // a large drop means the change regressed a use we already shipped. 
const candidateHtml = blocks.map((b) => b.html).join("\n\n"); const coverage = contentCoverage(c.accepted_html, candidateHtml); + if (coverage !== null) coverages.push(coverage); if (coverage !== null && coverage < MIN_CONTENT_COVERAGE) { failures.push(`${c.image_file}: only ${(coverage * 100).toFixed(0)}% of the accepted content remained`); continue; @@ -241,8 +256,43 @@ export async function regressionGate( } const passed = failures.length === 0; - ctx.log.event("regression_gate", { agent: file, cases: caseFiles.length, passed, failures: failures.length }); - return { passed, failures }; + const meanCoverage = coverages.length ? coverages.reduce((a, b) => a + b, 0) / coverages.length : null; + ctx.log.event("regression_gate", { agent: file, cases: caseFiles.length, passed, failures: failures.length, meanCoverage }); + return { passed, failures, meanCoverage }; +} + +// Mean content coverage of the prompt `content` over the agent's regression fixtures (eval set, #3), scored on the same basis as regressionGate's meanCoverage: +// no output scores 0 and fixtures too short to score are skipped, so the two means are comparable. Returns null when nothing could be scored. +export async function evalAgent(ctx: PipelineContext, agentFile: string, content: string): Promise<number | null> { + const dir = ctx.paths.agentFixtures(agentFile); + if (!existsSync(dir)) return null; + const caseFiles = readdirSync(dir).filter((f) => f.endsWith(".json")).sort().reverse().slice(0, MAX_GATE_FIXTURES); + if (caseFiles.length === 0) return null; + const file = agentFile.endsWith(".md") ? agentFile : `${agentFile}.md`; + const agent: AgentSpec = { + name: file.replace(/\.md$/, ""), + file, + content, + capabilities: /\bvision\b/i.test(content) ? 
["vision"] : ["text"], + sha: null, + sessionBuilt: false, + }; + const scores: number[] = []; + for (const caseFile of caseFiles) { + let c: FixtureCase; + try { + c = JSON.parse(readFileSync(join(dir, caseFile), "utf8")) as FixtureCase; + } catch { + continue; + } + const imgPath = join(dir, c.image_file); + if (!existsSync(imgPath)) continue; + const img: InputImage = { name: c.source_image, order: 0, path: imgPath }; + const blocks = await reRunAgentOnImage(ctx, agent, img); + const cov = blocks.length === 0 ? 0 : contentCoverage(c.accepted_html, blocks.map((b) => b.html).join("\n\n")); + if (cov !== null) scores.push(cov); + } + return scores.length ? scores.reduce((a, b) => a + b, 0) / scores.length : null; +} + + // --------------------------------------------------------------------------- @@ -310,6 +360,19 @@ export async function proposeAgentUpdatesFromFeedback( return []; } + // Eval gate (#3): the proposed prompt must hold-or-improve the agent's mean + // coverage over its fixtures versus the current prompt — not just pass the floor. 
+ const currentScore = await evalAgent(ctx, target.file, target.content); + if (currentScore !== null && gate.meanCoverage !== null && gate.meanCoverage < currentScore - EVAL_REGRESSION_EPS) { + ctx.log.event("agent_update_blocked", { + agent: target.file, + reason: "eval_regression", + current: Number(currentScore.toFixed(3)), + candidate: Number(gate.meanCoverage.toFixed(3)), + }); + return []; + } + const proposal: AgentUpdateContribution = { agent_name: target.file, summary: parsed.summary?.trim() || `Improved ${target.name} from user feedback.`, @@ -359,3 +422,47 @@ export async function proposeAgentUpdatesFromFeedback( return [proposal]; } + +// Primary, low-rot learning path (#1/#2/#4/#5): classify a feedback correction and, +// when it's a generalizable or accessibility lesson (not a one-off specific to this +// document), distill it into a reusable instruction + localized before/after example +// and record it to the agent's example bank (memory.ts). Recorded lessons are +// corroborated across sessions and injected into the agent's prompt at run time — +// the agent file itself stays stable. 
+export async function learnFromFeedback( + ctx: PipelineContext, + args: { agentFile: string; before: string; after: string; feedback: string }, +): Promise<void> { + const fb = loadFeedbackAgent(ctx); + if (!fb || !args.feedback.trim()) return; + if (args.before.trim() === args.after.trim()) return; // nothing changed this run + + const correction = diffPreview(args.before, args.after); + const user = + `TASK: classify\n\n` + + `## User feedback\n${args.feedback}\n\n` + + `## How the document changed this run (diff)\n\`\`\`diff\n${correction}\n\`\`\``; + const res = await ctx.router.complete(FEEDBACK_AGENT, "text", [ + { role: "system", content: fb.content }, + { role: "user", content: user }, + ]); + ctx.log.agentCall({ agent: fb, phase: "review", output: res.text }); + + const parsed = extractJson<ClassifyOutput>(res.text); + const raw = parsed?.kind; + if (!parsed || !parsed.instruction?.trim() || (raw !== "generalizable" && raw !== "a11y_policy")) { + ctx.log.event("feedback_classified", { kind: raw ?? "unknown", recorded: false }); + return; + } + const kind: LessonKind = raw; + const entry = recordExample(ctx.paths, { + agent: args.agentFile, + kind, + instruction: parsed.instruction.trim(), + before: (parsed.before ?? "").trim(), + after: (parsed.after ?? 
"").trim(), + feedback: args.feedback.trim(), + session: ctx.sessionId, + }); + ctx.log.event("feedback_learned", { agent: entry.agent, kind: entry.kind, count: entry.count, instruction: entry.instruction }); +} diff --git a/src/pipeline/memory.ts b/src/pipeline/memory.ts new file mode 100644 index 0000000..82473a8 --- /dev/null +++ b/src/pipeline/memory.ts @@ -0,0 +1,135 @@ +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import { dirname } from "node:path"; +import type { Paths } from "../store/paths.ts"; + +// Agent memory (PRD §7.12, extended): instead of rewriting an agent's prompt when +// it makes a mistake, we accumulate generalized "lessons" learned from real user +// feedback and inject the corroborated ones into the agent's prompt at run time. +// Examples are easy to add, audit, and remove — and they don't rot the prompt. + +export type LessonKind = "generalizable" | "a11y_policy"; + +export interface CorrectionExample { + agent: string; // agent file, e.g. "page.md" + kind: LessonKind; + instruction: string; // the generalized lesson (one sentence) + before: string; // localized wrong output this targets (may be "") + after: string; // localized corrected output (may be "") + feedback: string; // the user feedback that produced it + sessions: string[]; // distinct sessions that surfaced this lesson + count: number; // = sessions.length (denormalized for convenience) + created_at: string; + updated_at: string; +} + +const MAX_EXAMPLES_PER_AGENT = 20; +// How many examples to inject into a single prompt at most. +const MAX_INJECTED = 6; +// A "generalizable" lesson must be seen in at least this many distinct sessions +// before it is injected or proposed — one user's idiosyncratic correction should +// not steer a shared agent (corroboration). Accessibility-policy lessons are +// exempt: a WCAG rule shouldn't need to recur to be worth applying. 
+export const CORROBORATION_THRESHOLD = 2; +// Keep injected before/after snippets short so the prompt stays lean. +const SNIPPET_CAP = 280; + +function normKey(instruction: string): string { + return instruction.toLowerCase().replace(/[^a-z0-9\s]/g, " ").replace(/\s+/g, " ").trim(); +} + +function cap(s: string): string { + const t = s.replace(/\s+/g, " ").trim(); + return t.length > SNIPPET_CAP ? `${t.slice(0, SNIPPET_CAP)}…` : t; +} + +export function loadExamples(paths: Paths, agentFile: string): CorrectionExample[] { + const path = paths.agentMemory(agentFile); + if (!existsSync(path)) return []; + try { + const arr = JSON.parse(readFileSync(path, "utf8")); + return Array.isArray(arr) ? (arr as CorrectionExample[]) : []; + } catch { + return []; + } +} + +function saveExamples(paths: Paths, agentFile: string, examples: CorrectionExample[]): void { + const path = paths.agentMemory(agentFile); + mkdirSync(dirname(path), { recursive: true }); + writeFileSync(path, JSON.stringify(examples, null, 2)); +} + +export interface RecordInput { + agent: string; + kind: LessonKind; + instruction: string; + before: string; + after: string; + feedback: string; + session: string; +} + +// Record (or corroborate) a lesson. Dedupe by normalized instruction: a matching +// lesson bumps its distinct-session count (corroboration) and refreshes its +// example; a new lesson is appended. The bank is pruned to the most-corroborated, +// most-recent MAX_EXAMPLES_PER_AGENT entries. Returns the stored example. +export function recordExample(paths: Paths, input: RecordInput): CorrectionExample { + const agent = input.agent.endsWith(".md") ? 
input.agent : `${input.agent}.md`; + const examples = loadExamples(paths, agent); + const key = normKey(input.instruction); + const now = new Date().toISOString(); + + let entry = examples.find((e) => normKey(e.instruction) === key); + if (entry) { + if (!entry.sessions.includes(input.session)) entry.sessions.push(input.session); + entry.count = entry.sessions.length; + entry.updated_at = now; + entry.kind = input.kind; + entry.before = input.before; + entry.after = input.after; + entry.feedback = input.feedback; + } else { + entry = { + agent, + kind: input.kind, + instruction: input.instruction.trim(), + before: input.before, + after: input.after, + feedback: input.feedback, + sessions: [input.session], + count: 1, + created_at: now, + updated_at: now, + }; + examples.push(entry); + } + + examples.sort((a, b) => b.count - a.count || b.updated_at.localeCompare(a.updated_at)); + saveExamples(paths, agent, examples.slice(0, MAX_EXAMPLES_PER_AGENT)); + return entry; +} + +// The lessons eligible to act on: a11y-policy lessons always, generalizable +// lessons once corroborated across enough sessions. +export function eligibleExamples(paths: Paths, agentFile: string, kinds?: LessonKind[]): CorrectionExample[] { + return loadExamples(paths, agentFile) + .filter((e) => !kinds || kinds.includes(e.kind)) + .filter((e) => e.kind === "a11y_policy" || e.count >= CORROBORATION_THRESHOLD) + .sort((a, b) => b.count - a.count || b.updated_at.localeCompare(a.updated_at)) + .slice(0, MAX_INJECTED); +} + +// Render the eligible lessons as a few-shot block to append to an agent prompt. +// Returns "" when there is nothing eligible (so callers can append unconditionally). +export function examplesForPrompt(paths: Paths, agentFile: string, kinds?: LessonKind[]): string { + const eligible = eligibleExamples(paths, agentFile, kinds); + if (eligible.length === 0) return ""; + const lines = eligible.map((e, i) => { + const demo = e.before && e.after ? 
`\n - was: ${cap(e.before)}\n - fix: ${cap(e.after)}` : ""; + return `${i + 1}. ${e.instruction}${demo}`; + }); + return ( + `\n\n## Lessons from past corrections (apply when they're relevant to THIS page)\n` + + `${lines.join("\n")}\n` + ); +} diff --git a/src/pipeline/orchestrator.ts b/src/pipeline/orchestrator.ts index 572bee2..bd8f281 100644 --- a/src/pipeline/orchestrator.ts +++ b/src/pipeline/orchestrator.ts @@ -10,7 +10,7 @@ import { runExtraction } from "./extraction.ts"; import { runAssembly, assembleBody, wrapDocument } from "./assembly.ts"; import { runReview, type ReviewResult } from "./review.ts"; import { runAxe } from "./lint.ts"; -import { proposeAgentUpdatesFromFeedback } from "./feedback.ts"; +import { learnFromFeedback, proposeAgentUpdatesFromFeedback } from "./feedback.ts"; import { runContribution } from "./contribute.ts"; import type { Fragment } from "./fragment.ts"; @@ -153,12 +153,12 @@ export async function runPipeline(args: { // page agent, recorded (gated by its regression fixtures) for review; or // in-place training if a session-built page agent is in use. if (args.feedback) { - await proposeAgentUpdatesFromFeedback(ctx, { - agentFile: "page.md", - before: beforeBody, - after: review.body, - feedback: args.feedback, - }); + const learnArgs = { agentFile: "page.md", before: beforeBody, after: review.body, feedback: args.feedback }; + // Primary: record a corroborated, generalized lesson to the agent's example + // bank (injected into future runs). Secondary: a well-corroborated, higher- + // impact lesson may also be proposed as a gated prompt change (issue). 
+ await learnFromFeedback(ctx, learnArgs); + await proposeAgentUpdatesFromFeedback(ctx, learnArgs); } store.updateSession(sessionId, { diff --git a/src/pipeline/review.ts b/src/pipeline/review.ts index 35997b5..f4d383d 100644 --- a/src/pipeline/review.ts +++ b/src/pipeline/review.ts @@ -3,6 +3,7 @@ import { feedbackPreamble, loadImage, type PipelineContext } from "./context.ts" import { wrapDocument } from "./assembly.ts"; import { runAxe, type LintResult } from "./lint.ts"; import { flatten } from "./flatten.ts"; +import { examplesForPrompt } from "./memory.ts"; export interface ReviewIssue { issue: string; @@ -67,7 +68,8 @@ async function runReader(ctx: PipelineContext, body: string, lint: LintResult): for (const c of chunk(body)) { const user = `## HTML\n\`\`\`html\n${c}\n\`\`\`\n\n## Flattened screen-reader view\n${flatten(c)}\n\n## axe-core lint\n${lintSummary(lint)}` + - feedbackPreamble(ctx); + feedbackPreamble(ctx) + + examplesForPrompt(ctx.paths, "page.md", ["a11y_policy"]); const res = await ctx.router.complete("reader", "text", [ { role: "system", content: READER_SYSTEM }, { role: "user", content: user }, diff --git a/src/store/paths.ts b/src/store/paths.ts index 5f9e5e7..eef5c45 100644 --- a/src/store/paths.ts +++ b/src/store/paths.ts @@ -73,6 +73,16 @@ export class Paths { return join(this.fixturesDir(), agentName.replace(/\.md$/, "")); } + // Per-agent "memory": the example bank of generalized corrections learned from + // user feedback, injected into the agent's prompt at run time instead of + // rewriting the agent file. Lives under data_dir (per-instance, not committed). 
+ memoryDir(): string { + return join(this.cfg.storage.data_dir, "memory"); + } + agentMemory(agentName: string): string { + return join(this.memoryDir(), `${agentName.replace(/\.md$/, "")}.json`); + } + tmpDir(id: string): string { return join(this.cfg.storage.data_dir, "tmp", id); } diff --git a/test/memory.test.ts b/test/memory.test.ts new file mode 100644 index 0000000..7419267 --- /dev/null +++ b/test/memory.test.ts @@ -0,0 +1,85 @@ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { + recordExample, + eligibleExamples, + examplesForPrompt, + CORROBORATION_THRESHOLD, +} from "../src/pipeline/memory.ts"; +import type { Paths } from "../src/store/paths.ts"; + +// memory.ts only ever calls paths.agentMemory(agentFile), so a duck-typed stub is +// enough to exercise it against a temp directory. +function fakePaths(dir: string): Paths { + return { agentMemory: (agent: string) => join(dir, `${agent.replace(/\.md$/, "")}.json`) } as unknown as Paths; +} + +function withTemp(fn: (paths: Paths) => void): void { + const dir = mkdtempSync(join(tmpdir(), "iris-mem-")); + try { + fn(fakePaths(dir)); + } finally { + rmSync(dir, { recursive: true, force: true }); + } +} + +const LESSON = "Mark decorative images with empty alt text"; + +test("a generalizable lesson needs corroboration before it is eligible to inject", () => { + withTemp((paths) => { + recordExample(paths, { + agent: "page.md", + kind: "generalizable", + instruction: LESSON, + before: "", + after: "", + feedback: "this image is decorative", + session: "ses_1", + }); + assert.equal(eligibleExamples(paths, "page.md").length, 0, "one session: not yet corroborated"); + assert.equal(examplesForPrompt(paths, "page.md"), "", "nothing eligible -> empty injection"); + + // Same lesson, a different session -> corroborated. 
+ const entry = recordExample(paths, { + agent: "page.md", + kind: "generalizable", + instruction: LESSON, + before: "", + after: "", + feedback: "another decorative image", + session: "ses_2", + }); + assert.equal(entry.count, CORROBORATION_THRESHOLD, "two distinct sessions counted"); + const eligible = eligibleExamples(paths, "page.md"); + assert.equal(eligible.length, 1, "corroborated -> eligible"); + assert.match(examplesForPrompt(paths, "page.md"), /decorative|alt/i); + }); +}); + +test("re-recording within the same session does not inflate the corroboration count", () => { + withTemp((paths) => { + recordExample(paths, { agent: "page.md", kind: "generalizable", instruction: LESSON, before: "", after: "", feedback: "x", session: "ses_1" }); + const entry = recordExample(paths, { agent: "page.md", kind: "generalizable", instruction: LESSON, before: "", after: "", feedback: "x again", session: "ses_1" }); + assert.equal(entry.count, 1, "same session should count once"); + assert.equal(eligibleExamples(paths, "page.md").length, 0); + }); +}); + +test("an a11y_policy lesson is eligible immediately, without corroboration", () => { + withTemp((paths) => { + recordExample(paths, { + agent: "page.md", + kind: "a11y_policy", + instruction: "Headings must not skip levels", + before: "", + after: "", + feedback: "h1 jumped to h3", + session: "ses_1", + }); + assert.equal(eligibleExamples(paths, "page.md", ["a11y_policy"]).length, 1); + assert.match(examplesForPrompt(paths, "page.md", ["a11y_policy"]), /Headings must not skip levels/); + }); +});