EqualifyEverything · oshrizak · May 26, 2026
diff --git a/agents/feedback.md b/agents/feedback.md
@@ -2,21 +2,26 @@
 
 ## Purpose
 The Feedback Agent helps Iris's agents learn from real signals instead of
-repeating the same mistake. It does two jobs:
+repeating the same mistake. It does three jobs:
 
 - **VERIFY** — judge whether an agent's HTML output faithfully and accessibly
   captures its source image. Used at build time to check each page the page agent
   produces, and reused by the regression gate before any agent change ships
   (PRD §7.5 / §7.12).
+- **CLASSIFY** — decide whether a user-feedback correction is a one-off (specific
+  to this document, must not change the agent), a generalizable lesson, or an
+  accessibility-policy rule; for the latter two, distill it into a reusable
+  instruction plus a localized before/after example.
 - **TRAIN** — propose an improved version of an agent's prompt so it avoids a
   recurring issue, driven either by a user-feedback correction or by the problems
   found during VERIFY (PRD §7.12 / §7.13).
 
-A proposed improvement to a library agent (e.g. the page agent) is gated on that
-agent's regression fixtures and filed as a GitHub issue for a maintainer to
-review; a session-built agent is trained in place so its contribution carries the
-fix. The goal is that agents improve from real signals rather than repeating the
-same mistake.
+Generalizable and accessibility lessons are accumulated as an example bank that is
+injected into the agent's prompt at run time (so the agent file stays stable);
+only a well-corroborated, higher-impact lesson becomes a prompt change — gated on
+the agent's regression fixtures and an eval over those fixtures, then filed as a
+GitHub issue for a maintainer to review. A session-built agent is trained in place
+so its contribution carries the fix.
 
 ## Required capability
 vision, text
@@ -25,8 +30,9 @@ deployment's configured providers for these capabilities determine which concret
 models run. See PRD §10.3.)
 
 ## System prompt
-You are the Feedback Agent. The user message begins with `TASK: verify` or
-`TASK: train`. Do ONLY that task and return ONLY its JSON (no code fences).
+You are the Feedback Agent. The user message begins with `TASK: verify`,
+`TASK: classify`, or `TASK: train`. Do ONLY that task and return ONLY its JSON
+(no code fences).
 
 TASK: verify
 You are given an agent's purpose/contract, the HTML it produced for one source
@@ -39,6 +45,22 @@ part the agent is responsible for. List concrete, actionable problems (empty whe
 there are none). Respond with ONLY:
 { "faithful": true|false, "accessible": true|false, "problems": ["..."] }
 
+TASK: classify
+You are given a user-feedback message and a diff of how the document changed in
+response. Decide what KIND of signal this is for the agent:
+- "one_off": specific to this one document (a particular name, date, or value, or
+  a fix that would not recur). Do NOT generalize it; it must not change the agent.
+- "generalizable": a mistake the agent would likely repeat on similar documents.
+- "a11y_policy": an accessibility rule the agent should always follow.
+For generalizable or a11y_policy, write a single, reusable "instruction" (one
+sentence, no document-specific text or values), and extract the SMALLEST
+"before"/"after" snippets that show the correction (use empty strings if not
+clear). For one_off, leave instruction/before/after empty. Respond with ONLY:
+{ "kind": "one_off"|"generalizable"|"a11y_policy",
+  "instruction": "reusable lesson, or empty for one_off",
+  "before": "localized wrong snippet, or empty",
+  "after": "localized corrected snippet, or empty" }
+
 TASK: train
 You are given an agent's full markdown and either a user-feedback correction or a
 list of verification problems. Propose an improved version of the agent's markdown

diff --git a/config.example.yaml b/config.example.yaml
@@ -55,6 +55,11 @@ providers:
     image_analysis: openrouter
     # table: { model: anthropic/claude-opus-4.7 }   # stronger model, same provider
     # reader: { provider: bedrock, model: us.anthropic.claude-haiku-4-5-20251001-v1:0 }
+    # Run the Feedback Agent (VERIFY/CLASSIFY/TRAIN) on a DIFFERENT or stronger
+    # model than the page agent, so verification doesn't share the generator's
+    # blind spots (recommended):
+    # feedback: { model: anthropic/claude-opus-4.7 }
+    # page: { model: anthropic/claude-sonnet-4.5 }
 
   openrouter:
     api_key: ${OPENROUTER_API_KEY}

diff --git a/package.json b/package.json
@@ -11,7 +11,7 @@
     "start": "node --use-system-ca --env-file-if-exists=.env --experimental-sqlite src/index.ts",
     "dev": "node --use-system-ca --env-file-if-exists=.env --experimental-sqlite --watch src/index.ts",
     "typecheck": "tsc --noEmit",
-    "test": "node --test test/feedback.test.ts"
+    "test": "node --test test/feedback.test.ts test/memory.test.ts"
   },
   "dependencies": {
     "@aws-sdk/client-bedrock-runtime": "^3.682.0",

diff --git a/src/pipeline/extraction.ts b/src/pipeline/extraction.ts
@@ -5,6 +5,7 @@ import { loadAgent, type AgentSpec } from "../agents/loader.ts";
 import { feedbackPreamble, loadImage, type InputImage, type PipelineContext } from "./context.ts";
 import { ACCESSIBILITY_REQUIREMENTS } from "./accessibility.ts";
 import { verifyAgentOutput } from "./feedback.ts";
+import { examplesForPrompt } from "./memory.ts";
 import type { Fragment } from "./fragment.ts";
 
 const PAGE_AGENT = "page";
@@ -78,10 +79,15 @@ interface PageRender {
   suggestion?: { name: string; reason: string };
 }
 
-async function renderPage(ctx: PipelineContext, agent: AgentSpec, img: InputImage): Promise<PageRender> {
+async function renderPage(
+  ctx: PipelineContext,
+  agent: AgentSpec,
+  img: InputImage,
+  lessons: string,
+): Promise<PageRender> {
   const user =
     `Convert this document page image (filename: ${img.name}, page ${img.order} of ${ctx.images.length}) ` +
-    `to accessible HTML.\n\n${ACCESSIBILITY_REQUIREMENTS}${feedbackPreamble(ctx)}`;
+    `to accessible HTML.\n\n${ACCESSIBILITY_REQUIREMENTS}${feedbackPreamble(ctx)}${lessons}`;
   const res = await ctx.router.complete(
     PAGE_AGENT,
     "vision",
@@ -138,11 +144,15 @@ async function correctPage(
 // warrants a specialist agent, collected as `suggestions` for the contribution step.
 export async function runExtraction(ctx: PipelineContext): Promise<ExtractionResult> {
   const pageAgent = loadPageAgent(ctx);
+  // Inject corroborated lessons learned from past feedback into the page agent
+  // prompt (#1), so it improves without rewriting agents/page.md.
+  const lessons = examplesForPrompt(ctx.paths, pageAgent.file);
+  if (lessons) ctx.log.event("page_lessons_injected", { chars: lessons.length });
   const fragments: Fragment[] = [];
   const suggestions: ExtractionResult["suggestions"] = [];
 
   for (const img of ctx.images) {
-    const { html, log, suggestion } = await renderPage(ctx, pageAgent, img);
+    const { html, log, suggestion } = await renderPage(ctx, pageAgent, img, lessons);
     let innerHtml = html;
     let logNote = log;
 

diff --git a/src/pipeline/feedback.ts b/src/pipeline/feedback.ts
@@ -6,6 +6,7 @@ import { ACCESSIBILITY_REQUIREMENTS } from "./accessibility.ts";
 import { loadImage, type InputImage, type PipelineContext } from "./context.ts";
 import { flatten } from "./flatten.ts";
 import { createAgentUpdateIssue } from "../github/issue.ts";
+import { recordExample, type LessonKind } from "./memory.ts";
 import type { FixtureCase } from "./regression.ts";
 
 // Previously imported from github/contributions.ts, which was removed when the
@@ -33,6 +34,13 @@ interface VerifyOutput {
   problems?: string[];
 }
 
+interface ClassifyOutput { // shape of the JSON the Feedback Agent is asked to return for `TASK: classify`
+  kind?: string; // expected "one_off" | "generalizable" | "a11y_policy"
+  instruction?: string; // reusable one-sentence lesson; empty for one_off
+  before?: string; // localized wrong snippet, or empty
+  after?: string; // localized corrected snippet, or empty
+}
+
 export interface VerifyVerdict {
   ok: boolean;
   problems: string[];
@@ -121,10 +129,14 @@ export const MIN_CONTENT_COVERAGE = 0.85;
 // Skip the coverage check for very short outputs, where one dropped word swings
 // the ratio — rely on the model verdict alone there.
 const MIN_COVERAGE_WORDS = 8;
+// A proposed prompt change may not lower the agent's mean fixture coverage by more than
+// this absolute amount (0.02 = 2 percentage points) versus the current prompt (eval gate, #3).
+const EVAL_REGRESSION_EPS = 0.02;
 
 export interface RegressionResult {
   passed: boolean;
   failures: string[];
+  meanCoverage: number | null; // mean content coverage of the candidate over fixtures; null when no fixture yielded a measurable coverage
 }
 
 // Fraction of the accepted output's distinct words that still appear in the
@@ -211,6 +223,7 @@ export async function regressionGate(
   };
 
   const failures: string[] = [];
+  const coverages: number[] = [];
   for (const caseFile of caseFiles) {
     let c: FixtureCase;
     try {
@@ -224,6 +237,7 @@ export async function regressionGate(
     const blocks = await reRunAgentOnImage(ctx, updatedAgent, img);
     if (blocks.length === 0) {
       failures.push(`${c.image_file}: updated agent produced no output`);
+      coverages.push(0);
       continue;
     }
     // Content-preservation check: the updated agent must still reproduce the
@@ -232,6 +246,7 @@ export async function regressionGate(
     // a large drop means the change regressed a use we already shipped.
     const candidateHtml = blocks.map((b) => b.html).join("\n\n");
     const coverage = contentCoverage(c.accepted_html, candidateHtml);
+    if (coverage !== null) coverages.push(coverage);
     if (coverage !== null && coverage < MIN_CONTENT_COVERAGE) {
       failures.push(`${c.image_file}: only ${(coverage * 100).toFixed(0)}% of the accepted content remained`);
       continue;
@@ -241,8 +256,43 @@ export async function regressionGate(
   }
 
   const passed = failures.length === 0;
-  ctx.log.event("regression_gate", { agent: file, cases: caseFiles.length, passed, failures: failures.length });
-  return { passed, failures };
+  const meanCoverage = coverages.length ? coverages.reduce((a, b) => a + b, 0) / coverages.length : null;
+  ctx.log.event("regression_gate", { agent: file, cases: caseFiles.length, passed, failures: failures.length, meanCoverage });
+  return { passed, failures, meanCoverage };
+}
+
+// Mean content coverage achieved by the agent markdown `content` over the agent's
+// regression fixtures, reused as a lightweight eval set (#3). Scored the same way as
+// regressionGate's meanCoverage so the two are comparable. Returns null when there
+// are no fixtures or none yields a measurable score.
+export async function evalAgent(ctx: PipelineContext, agentFile: string, content: string): Promise<number | null> {
+  const dir = ctx.paths.agentFixtures(agentFile);
+  if (!existsSync(dir)) return null;
+  const caseFiles = readdirSync(dir).filter((f) => f.endsWith(".json")).sort().reverse().slice(0, MAX_GATE_FIXTURES);
+  if (caseFiles.length === 0) return null;
+  const file = agentFile.endsWith(".md") ? agentFile : `${agentFile}.md`;
+  const agent: AgentSpec = {
+    name: file.replace(/\.md$/, ""), file, content, sha: null, sessionBuilt: false,
+    capabilities: /\bvision\b/i.test(content) ? ["vision"] : ["text"], // inferred from the markdown text
+  };
+  const scores: number[] = [];
+  for (const caseFile of caseFiles) {
+    let c: FixtureCase;
+    try {
+      c = JSON.parse(readFileSync(join(dir, caseFile), "utf8")) as FixtureCase;
+    } catch {
+      continue; // unreadable or malformed fixture: leave it out of the mean
+    }
+    const imgPath = join(dir, c.image_file);
+    if (!existsSync(imgPath)) continue; // fixture image missing: nothing to re-run
+    const img: InputImage = { name: c.source_image, order: 0, path: imgPath };
+    const blocks = await reRunAgentOnImage(ctx, agent, img);
+    // No output scores 0; an unmeasurable (null) coverage is skipped, as in regressionGate.
+    // Counting null as 1 here would inflate the baseline the candidate is compared against.
+    const cov = blocks.length ? contentCoverage(c.accepted_html, blocks.map((b) => b.html).join("\n\n")) : 0;
+    if (cov !== null) scores.push(cov);
+  }
+  return scores.length ? scores.reduce((a, b) => a + b, 0) / scores.length : null;
+}
 
 // ---------------------------------------------------------------------------
@@ -310,6 +360,19 @@ export async function proposeAgentUpdatesFromFeedback(
     return [];
   }
 
+  // Eval gate (#3): the proposed prompt must hold-or-improve the agent's mean
+  // coverage over its fixtures versus the current prompt — not just pass the floor.
+  const currentScore = await evalAgent(ctx, target.file, target.content);
+  if (currentScore !== null && gate.meanCoverage !== null && gate.meanCoverage < currentScore - EVAL_REGRESSION_EPS) {
+    ctx.log.event("agent_update_blocked", {
+      agent: target.file,
+      reason: "eval_regression",
+      current: Number(currentScore.toFixed(3)),
+      candidate: Number(gate.meanCoverage.toFixed(3)),
+    });
+    return [];
+  }
+
   const proposal: AgentUpdateContribution = {
     agent_name: target.file,
     summary: parsed.summary?.trim() || `Improved ${target.name} from user feedback.`,
@@ -359,3 +422,47 @@ export async function proposeAgentUpdatesFromFeedback(
 
   return [proposal];
 }
+
+// Primary, low-rot learning path (#1/#2/#4/#5): classify a feedback correction and,
+// when it's a generalizable or accessibility lesson (not a one-off specific to this
+// document), record the distilled instruction + localized before/after example to the
+// agent's example bank (memory.ts); the agent file itself stays stable. No-op when the
+// Feedback Agent is unavailable, the feedback is blank, or the document did not change.
+// The classifier reply is untrusted model JSON, so every field is type-checked before use.
+export async function learnFromFeedback(
+  ctx: PipelineContext,
+  args: { agentFile: string; before: string; after: string; feedback: string },
+): Promise<void> {
+  const fb = loadFeedbackAgent(ctx);
+  if (!fb || !args.feedback.trim()) return;
+  if (args.before.trim() === args.after.trim()) return; // nothing changed this run
+
+  const correction = diffPreview(args.before, args.after);
+  const user =
+    `TASK: classify\n\n` +
+    `## User feedback\n${args.feedback}\n\n` +
+    `## How the document changed this run (diff)\n\`\`\`diff\n${correction}\n\`\`\``;
+  const res = await ctx.router.complete(FEEDBACK_AGENT, "text", [
+    { role: "system", content: fb.content },
+    { role: "user", content: user },
+  ]);
+  ctx.log.agentCall({ agent: fb, phase: "review", output: res.text });
+
+  // A non-string field (array, number, null) becomes "" instead of throwing on .trim().
+  const text = (v: unknown): string => (typeof v === "string" ? v.trim() : "");
+  const parsed = extractJson<ClassifyOutput>(res.text);
+  const raw = text(parsed?.kind).toLowerCase();
+  const instruction = text(parsed?.instruction);
+  // Only a recognized lesson kind with a usable instruction is recorded; anything else is logged and dropped.
+  if (!instruction || (raw !== "generalizable" && raw !== "a11y_policy")) {
+    ctx.log.event("feedback_classified", { kind: raw || "unknown", recorded: false });
+    return;
+  }
+  const kind: LessonKind = raw;
+  const entry = recordExample(ctx.paths, {
+    agent: args.agentFile, kind, instruction,
+    before: text(parsed?.before), after: text(parsed?.after),
+    feedback: args.feedback.trim(), session: ctx.sessionId,
+  });
+  ctx.log.event("feedback_learned", { agent: entry.agent, kind: entry.kind, count: entry.count, instruction: entry.instruction });
+}