From 9adce24767aae91dd741b987c34dfddb53bae63e Mon Sep 17 00:00:00 2001
From: Andrew Beltrano <anbeltra@microsoft.com>
Date: Wed, 1 Apr 2026 15:24:14 -0600
Subject: [PATCH 1/2] Add vibe-vs-structured demo with 3 scenarios
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds docs/demo/ with complete presenter materials for a live demo
comparing plain 'vibe' prompts against PromptKit-assembled structured
prompts on the same task.

Three scenarios with pre-built code samples and scorecards:
1. Code review — C echo server with 5 planted bugs (demo_server.c)
2. Requirements authoring — rate limiter from 3-sentence description
3. Bug investigation — TOCTOU race condition with red herring (demo_queue.c)

Includes README, full presenter script with timing notes, talking
points, and appendix with planted bug reference.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 docs/demo/README.md                   |  94 ++++++++
 docs/demo/demo-script.md              | 294 ++++++++++++++++++++++++++
 docs/demo/demo_queue.c                | 133 ++++++++++++
 docs/demo/demo_server.c               | 154 ++++++++++++++
 docs/demo/rate_limiter_description.md |  13 ++
 5 files changed, 688 insertions(+)
 create mode 100644 docs/demo/README.md
 create mode 100644 docs/demo/demo-script.md
 create mode 100644 docs/demo/demo_queue.c
 create mode 100644 docs/demo/demo_server.c
 create mode 100644 docs/demo/rate_limiter_description.md

diff --git a/docs/demo/README.md b/docs/demo/README.md
new file mode 100644
index 0000000..18ca7ff
--- /dev/null
+++ b/docs/demo/README.md
@@ -0,0 +1,94 @@
+# PromptKit Demo: Vibe Prompt vs. Structured Prompt
+
+This demo compares LLM output quality when tackling the same task with:
+
+1. A **plain "vibe" prompt** — the kind most developers type
+2. A **PromptKit-assembled structured prompt** — with persona, protocols,
+   taxonomy, and output format
+
+The goal is to show that prompt engineering isn't about clever wording —
+it's about systematic composition of identity, reasoning methodology,
+and output structure.
+
+---
+
+## What's in This Directory
+
+| File | Purpose |
+|------|---------|
+| `demo_server.c` | Code review sample — C echo server with 5 planted bugs |
+| `demo_queue.c` | Bug investigation sample — producer-consumer with a TOCTOU race |
+| `rate_limiter_description.md` | Requirements authoring sample — 3-sentence project description |
+| `demo-script.md` | Presenter script with timing, talking points, and scorecards |
+
+## Quick Start
+
+### Prerequisites
+
+- GitHub Copilot CLI (`copilot` command available)
+- Access to a Copilot Chat model (any tier)
+- This repository cloned locally
+
+### Running a Demo Scenario
+
+Each scenario follows the same pattern:
+
+1. **Vibe run** — paste the vibe prompt into Copilot CLI with the sample
+   code/description as context
+2. **PromptKit run** — use `bootstrap.md` to assemble and execute the
+   structured prompt with the same context
+3. **Compare** — score both outputs on the scorecard from `demo-script.md`
+
+See `demo-script.md` for the full presenter walkthrough.
+
+## Scenarios at a Glance
+
+### Scenario 1: Code Review (Recommended First)
+
+**Task:** Review `demo_server.c` for bugs.
+
+| Approach | Prompt |
+|----------|--------|
+| Vibe | *"Review this C code for bugs."* |
+| PromptKit | `review-cpp-code` template with `systems-engineer` persona, `memory-safety-c` + `cpp-best-practices` protocols |
+
+**What to watch for:** Detection rate (5 planted bugs), false positives,
+severity classification, specificity of fixes, epistemic honesty.
+
+### Scenario 2: Requirements Authoring
+
+**Task:** Write requirements from `rate_limiter_description.md`.
+
+| Approach | Prompt |
+|----------|--------|
+| Vibe | *"Write requirements for a rate limiter for our REST API."* |
+| PromptKit | `author-requirements-doc` template with `software-architect` persona, `requirements-elicitation` protocol |
+
+**What to watch for:** Testability, atomicity, completeness (edge cases),
+precision (RFC 2119 keywords), implicit requirements surfaced.
+
+### Scenario 3: Bug Investigation
+
+**Task:** Find the root cause of intermittent crashes in `demo_queue.c`.
+
+| Approach | Prompt |
+|----------|--------|
+| Vibe | *"This code has a bug that causes intermittent crashes under load. Find it."* |
+| PromptKit | `investigate-bug` template with `systems-engineer` persona, `root-cause-analysis` protocol |
+
+**What to watch for:** Root cause correctness (TOCTOU race), red herring
+rejection (`strdup`/`free` pairing is correct), hypothesis rigor, causal chain
+completeness, confidence labeling.
+
+---
+
+## Tips for Presenters
+
+- **Don't reveal the planted bugs** before running both approaches.
+- **Let the vibe output speak for itself** — don't critique it during the
+  run. The scorecard does the talking.
+- **Highlight the anti-hallucination effect** — PromptKit outputs label
+  confidence (KNOWN / INFERRED / ASSUMED); vibe outputs state guesses as facts.
+- **End with the multiplier pitch** — "Now imagine doing this 50 times
+  across a codebase. The vibe prompt is different every time. The PromptKit
+  prompt is version-controlled, tested, and consistent."
diff --git a/docs/demo/demo-script.md b/docs/demo/demo-script.md
new file mode 100644
index 0000000..ef2349b
--- /dev/null
+++ b/docs/demo/demo-script.md
@@ -0,0 +1,294 @@
+# PromptKit Demo — Presenter Script
+
+> **Estimated total time: 15–20 minutes for 2 scenarios, 25–30 for all 3.**
+>
+> Pick **Scenario 1 (Code Review)** plus either Scenario 2 or 3 for a
+> focused demo. Run all 3 only if you have 30 minutes.
+
+---
+
+## Opening (2 minutes)
+
+### Talking Points
+
+- "Every developer asks LLMs for help — code reviews, requirements,
+  debugging. But how much does the *quality of the prompt* affect the
+  *quality of the answer*?"
+- "Today I'll run the **exact same task** through two approaches:
+  a quick, natural-language prompt — what we'll call a *vibe prompt* —
+  and a PromptKit-assembled prompt with an engineered persona, reasoning
+  protocols, and output format."
+- "We'll score the results on the same rubric so you can see the
+  difference objectively."
+
+### Setup Checklist
+
+- [ ] Terminal with `copilot` CLI ready
+- [ ] `docs/demo/` files open or accessible
+- [ ] Scorecard template visible (printed or second screen)
+- [ ] Audience cannot see the planted-bug comments in source files
+
+---
+
+## Scenario 1: Code Review (8 minutes)
+
+### 1a. Show the Code (1 min)
+
+Open `demo_server.c` and briefly walk through it:
+
+> "This is a simple TCP echo server in C — about 120 lines. It accepts
+> connections, reads data, echoes it back. Pretty typical systems code.
+> Let's see what an LLM finds when we ask it to review this."
+
+**⚠️ Scroll past the comment block at the top** — it lists the planted bugs.
+Start showing from the `#include` lines.
+
+### 1b. Vibe Run (2 min)
+
+Type into Copilot CLI:
+
+```
+Review this C code for bugs.
+
+<paste demo_server.c content, excluding the top comment block>
+```
+
+Let the output render. **Don't comment yet** — just say:
+
+> "Okay, let's note what it found. We'll come back to this."
+
+### 1c. PromptKit Run (3 min)
+
+```
+copilot -i "Read bootstrap.md and execute the prompt"
+```
+
+When prompted for the task:
+
+> "I need to review C code for correctness and safety issues."
+
+The bootstrap engine will select `review-cpp-code` and assemble the prompt.
+Provide `demo_server.c` as the code context.
+
+As the output renders, highlight:
+
+> "Notice it's using named pattern families — CPP-1 for memory safety,
+> CPP-5 for error handling. Each finding has a severity, exact line
+> number, and a specific fix."
+>
+> "And see this self-verification step at the end — it re-checks its
+> own findings before presenting them."
+
+### 1d. Scorecard (2 min)
+
+#### Scoring Rubric
+
+| # | Factor | Vibe Score (0–2) | PromptKit Score (0–2) |
+|---|--------|:---:|:---:|
+| 1 | **Detection rate** — How many of the 5 bugs found? | ___ / 5 | ___ / 5 |
+| 2 | **False positives** — Non-issues flagged as bugs? (0 = many, 2 = none) | | |
+| 3 | **Severity accuracy** — Critical bugs ranked above medium? | | |
+| 4 | **Specificity** — Exact line cited + explains *why* it's wrong? | | |
+| 5 | **Actionability** — Concrete, correct fix provided? | | |
+| 6 | **Epistemic honesty** — Distinguishes confirmed vs. suspected? | | |
+| | **Total** | ___ / 12 | ___ / 12 |
+
+#### What to Highlight
+
+- **Detection gap:** The vibe prompt typically finds 2–3 bugs; PromptKit
+  finds 4–5. The off-by-one (Bug 4) and use-after-free (Bug 1) are most
+  commonly missed by vibe prompts.
+- **False positives:** Vibe prompts often flag the `create_client` malloc
+  as a leak (it's freed in `destroy_client` — red herring). PromptKit's
+  memory-safety protocol traces allocation/deallocation pairs and
+  correctly dismisses this.
+- **Structure:** Vibe output is a flat list mixing bugs with style
+  suggestions. PromptKit output is severity-classified with pattern IDs.
+
+---
+
+## Scenario 2: Requirements Authoring (8 minutes)
+
+### 2a. Show the Description (1 min)
+
+Read `rate_limiter_description.md` aloud:
+
+> "Here's what the product manager gave us: 'We need a rate limiter
+> for our REST API. It should limit each authenticated user to a
+> configurable number of requests per time window. When the limit is
+> exceeded, return HTTP 429.' That's it — three sentences."
+
+### 2b. Vibe Run (2 min)
+
+```
+Write requirements for a rate limiter for our REST API. It should limit
+each authenticated user to a configurable number of requests per time
+window. When the limit is exceeded, return HTTP 429 Too Many Requests.
+```
+
+Let the output render. Note how many "requirements" it produces and
+whether they have acceptance criteria.
+
+### 2c. PromptKit Run (3 min)
+
+Use `bootstrap.md` → select `author-requirements-doc`. Provide the same
+3-sentence description as the `{{description}}` parameter.
+
+As the output renders, highlight:
+
+> "Look at the REQ-IDs — each requirement is numbered and atomic.
+> Every one has acceptance criteria with specific pass/fail conditions."
+>
+> "And here — it surfaced requirements we didn't even mention:
+> `Retry-After` header, distributed counting, clock skew, observability,
+> graceful degradation. These are the implicit requirements that vibe
+> prompts always miss."
+
+### 2d. Scorecard (2 min)
+
+#### Scoring Rubric
+
+| # | Factor | Vibe Score (0–2) | PromptKit Score (0–2) |
+|---|--------|:---:|:---:|
+| 1 | **Testability** — Each req has pass/fail acceptance criteria? | | |
+| 2 | **Atomicity** — Each req is a single verifiable statement? | | |
+| 3 | **Completeness** — Edge cases covered? (burst, distributed, clock skew) | | |
+| 4 | **Precision** — RFC 2119 keywords (MUST/SHOULD/MAY)? | | |
+| 5 | **Structure** — REQ-IDs, constraints, assumptions, risks, non-goals? | | |
+| 6 | **Implicit reqs** — Surfaced things user didn't mention? | | |
+| | **Total** | ___ / 12 | ___ / 12 |
+
+#### What to Highlight
+
+- **Quantity gap:** Vibe prompts produce 5–8 vague bullets. PromptKit
+  produces 15–25 atomic requirements.
+- **"I didn't think of that" moment:** Call out 2–3 implicit requirements
+  the audience wouldn't have considered (e.g., `Retry-After`, behavior
+  when the time window boundary falls mid-request, what happens when
+  the backing store is unavailable).
+- **Downstream value:** "This document can now feed into PromptKit's
+  design → validation → audit pipeline. Try doing that with bullet points."
+
+---
+
+## Scenario 3: Bug Investigation (8 minutes)
+
+### 3a. Show the Code + Symptom (1 min)
+
+Open `demo_queue.c` and briefly walk through it:
+
+> "Producer-consumer queue — producers enqueue strings, consumers
+> dequeue and process them. Works fine in single-threaded tests."
+
+Then read the symptom:
+
+> "But it crashes intermittently under load with 4 producer threads
+> and 2 consumer threads. The crash is a segfault in `process_item()`,
+> but that function looks correct. AddressSanitizer shows no heap issues."
+
+### 3b. Vibe Run (2 min)
+
+```
+This code has a bug that causes intermittent crashes under load with
+4 producer threads and 2 consumer threads. The crash is a segfault
+inside process_item(), but that function looks correct. ASan reports
+no heap issues. Find the root cause.
+
+<paste demo_queue.c content, excluding the top comment block>
+```
+
+### 3c. PromptKit Run (3 min)
+
+Use `bootstrap.md` → select `investigate-bug`. Provide the symptom
+description and `demo_queue.c` as context.
+
+As the output renders, highlight:
+
+> "See the hypothesis table — it generated three potential causes and
+> is evaluating evidence for each one, not just jumping to a conclusion."
+>
+> "It correctly identified the TOCTOU race: the count check at the top of
+> `dequeue()` is outside the lock. And look — it explicitly dismissed the
+> strdup/free pattern as a red herring with evidence: 'freed in
+> `consumer()`, matching the `strdup` in `enqueue()`.'"
+>
+> "Notice the confidence labels — KNOWN for the code structure, INFERRED
+> for the race condition trigger, ASSUMED for the crash mechanism."
+
+### 3d. Scorecard (2 min)
+
+#### Scoring Rubric
+
+| # | Factor | Vibe Score (0–2) | PromptKit Score (0–2) |
+|---|--------|:---:|:---:|
+| 1 | **Root cause correct** — Identified the TOCTOU race? | | |
+| 2 | **Red herring rejected** — Did NOT flag the `strdup`/`free` pairing as a leak? | | |
+| 3 | **Hypothesis rigor** — Generated ≥3 hypotheses with evidence? | | |
+| 4 | **Causal chain** — Traced check → context switch → stale → crash? | | |
+| 5 | **Confidence labeling** — KNOWN vs. INFERRED vs. ASSUMED? | | |
+| 6 | **Remediation quality** — Fix is correct and complete? | | |
+| | **Total** | ___ / 12 | ___ / 12 |
+
+#### What to Highlight
+
+- **Reasoning depth:** Vibe prompts often jump straight to "the mutex
+  is wrong" without explaining the interleaving. PromptKit's root-cause-
+  analysis protocol forces a causal chain.
+- **Red herring:** Vibe prompts frequently flag the `strdup` as a memory
+  leak. PromptKit's memory-safety protocol traces the alloc/free pair
+  across functions and dismisses it.
+- **Epistemic honesty:** Vibe prompts state "The bug is X" as fact.
+  PromptKit labels confidence and distinguishes root cause from
+  proximate cause (the crash is in `process_item`, but the *root cause*
+  is in `dequeue`).
+
+---
+
+## Closing (2 minutes)
+
+### The Multiplier Pitch
+
+> "You saw the difference on one file. Now imagine doing this across
+> 50 files, or 200 PRs, or every sprint's requirements."
+>
+> "With a vibe prompt, the quality varies every time — it depends on
+> your mood, your wording, what you remembered to ask for."
+>
+> "With PromptKit, the quality is **consistent, composable, and
+> version-controlled**. The same persona, the same protocols, the same
+> output format — every time."
+
+### The Agent Instruction Hook
+
+> "And you don't have to run this manually every time. PromptKit can
+> output persistent agent instruction files — for GitHub Copilot,
+> Claude Code, or Cursor — so these protocols run automatically in
+> every session."
+
+### Call to Action
+
+> "PromptKit is open source. The prompts you saw today are in the
+> repository. You can use them as-is, customize them for your domain,
+> or add new ones. The library has [X] templates, [Y] protocols, and
+> [Z] personas — all composable."
+
+---
+
+## Appendix: Planted Bug Reference
+
+### demo_server.c (5 bugs)
+
+| # | Bug | Severity | Location | Description |
+|---|-----|----------|----------|-------------|
+| 1 | Use-after-free | Critical | L65, L73 | `client->buf` freed on disconnect, then read if caller loops |
+| 2 | Buffer overflow | Critical | L52 | `strcpy`/`strcat` into 64-byte buffer, no bounds check |
+| 3 | Unchecked return | High | L61 | `recv()` can return -1; code treats it as valid length |
+| 4 | Off-by-one | Medium | L46 | `i <= len` writes one past buffer end |
+| 5 | Resource leak | Medium | L83 | `client_fd` not closed on `send()` failure path |
+
+### demo_queue.c (1 root cause + 1 red herring)
+
+| # | Issue | Type | Location | Description |
+|---|-------|------|----------|-------------|
+| 1 | TOCTOU race | Root cause | L55–60 | `count` checked outside lock; another thread can drain queue between check and lock |
+| 2 | strdup/free | Red herring | L42, L81 | Allocation in enqueue, free in consumer — this is correct |
diff --git a/docs/demo/demo_queue.c b/docs/demo/demo_queue.c
new file mode 100644
index 0000000..11bef3d
--- /dev/null
+++ b/docs/demo/demo_queue.c
@@ -0,0 +1,133 @@
+/* demo_queue.c — Lock-free-looking producer-consumer queue with a
+ *                subtle TOCTOU race condition.
+ *
+ * PURPOSE: PromptKit demo. Give this code to an LLM with the symptom
+ * "intermittent crashes under load" and compare how a vibe prompt
+ * vs. a PromptKit-assembled prompt investigates the root cause.
+ *
+ * PLANTED ISSUE (do NOT reveal during the demo):
+ *
+ *   Root Cause — TOCTOU race in dequeue() (lines 55-60):
+ *       `count` is checked OUTSIDE the lock, then the lock is acquired
+ *       and the item is dequeued. Between the check and the lock
+ *       acquisition, another thread can dequeue the last item, causing
+ *       a read from an empty queue (head == tail, stale data or
+ *       segfault if items[head] was already consumed/freed).
+ *
+ *   Red Herring — malloc in enqueue() (line 42):
+ *       `strdup(item)` allocates memory. It is correctly freed in the
+ *       consumer (line 81). A shallow review may flag this as a leak.
+ *
+ * SYMPTOM DESCRIPTION (give this to the LLM):
+ *
+ *   "This producer-consumer queue works fine in our single-threaded
+ *   tests but crashes intermittently under load when we run 4 producer
+ *   threads and 2 consumer threads. The crash is a segfault inside
+ *   process_item(), but that function looks correct. We've checked for
+ *   memory corruption with AddressSanitizer — no heap issues reported."
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+
+#define QUEUE_CAP 64
+
+typedef struct {
+    char *items[QUEUE_CAP];
+    int   head;
+    int   tail;
+    int   count;
+    pthread_mutex_t lock;
+} queue_t;
+
+void queue_init(queue_t *q)
+{
+    memset(q, 0, sizeof(*q));
+    pthread_mutex_init(&q->lock, NULL);
+}
+
+/* Red herring: strdup allocates, but consumer frees — this is correct */
+int enqueue(queue_t *q, const char *item)
+{
+    pthread_mutex_lock(&q->lock);
+    if (q->count >= QUEUE_CAP) {
+        pthread_mutex_unlock(&q->lock);
+        return -1;  /* full */
+    }
+    q->items[q->tail] = strdup(item);
+    q->tail = (q->tail + 1) % QUEUE_CAP;
+    q->count++;
+    pthread_mutex_unlock(&q->lock);
+    return 0;
+}
+
+/* BUG: TOCTOU race — count checked OUTSIDE lock, then lock acquired.
+ * Between the check and the lock, another consumer can drain the queue. */
+char *dequeue(queue_t *q)
+{
+    if (q->count == 0)       /* <-- CHECK outside lock */
+        return NULL;
+
+    pthread_mutex_lock(&q->lock);   /* <-- another thread may dequeue here */
+    char *item = q->items[q->head]; /* stale: head may now be invalid */
+    q->items[q->head] = NULL;
+    q->head = (q->head + 1) % QUEUE_CAP;
+    q->count--;                     /* count can go negative! */
+    pthread_mutex_unlock(&q->lock);
+    return item;
+}
+
+/* This function is correct — the crash is NOT here */
+void process_item(const char *item)
+{
+    printf("Processing: %s\n", item);
+}
+
+/* Consumer thread */
+void *consumer(void *arg)
+{
+    queue_t *q = (queue_t *)arg;
+    while (1) {
+        char *item = dequeue(q);
+        if (item) {
+            process_item(item);
+            free(item);   /* correctly frees the strdup from enqueue */
+        }
+    }
+    return NULL;
+}
+
+/* Producer thread */
+void *producer(void *arg)
+{
+    queue_t *q = (queue_t *)arg;
+    for (int i = 0; i < 10000; i++) {
+        char msg[32];
+        snprintf(msg, sizeof(msg), "msg-%d", i);
+        while (enqueue(q, msg) < 0)
+            ;  /* spin until space available */
+    }
+    return NULL;
+}
+
+int main(void)
+{
+    queue_t q;
+    queue_init(&q);
+
+    pthread_t producers[4], consumers[2];
+
+    for (int i = 0; i < 4; i++)
+        pthread_create(&producers[i], NULL, producer, &q);
+    for (int i = 0; i < 2; i++)
+        pthread_create(&consumers[i], NULL, consumer, &q);
+
+    for (int i = 0; i < 4; i++)
+        pthread_join(producers[i], NULL);
+
+    /* Note: consumers run forever — in production we'd signal them to stop */
+    printf("All producers finished.\n");
+    return 0;
+}
diff --git a/docs/demo/demo_server.c b/docs/demo/demo_server.c
new file mode 100644
index 0000000..6eb7bcb
--- /dev/null
+++ b/docs/demo/demo_server.c
@@ -0,0 +1,154 @@
+/* demo_server.c — Minimal TCP echo server with 5 planted bugs.
+ *
+ * PURPOSE: PromptKit demo. Ask an LLM to review this file for bugs,
+ * first with a plain "vibe" prompt, then with a PromptKit-assembled
+ * prompt, and compare the findings.
+ *
+ * PLANTED ISSUES (do NOT reveal these to the LLM during the demo):
+ *
+ *   Bug 1 (Critical) — Use-after-free on line 65/73:
+ *       `client->buf` is freed, then read on the next iteration.
+ *
+ *   Bug 2 (Critical) — Buffer overflow on line 52:
+ *       `strcpy` into a fixed-size buffer with no bounds check.
+ *
+ *   Bug 3 (High) — Unchecked return value on line 61:
+ *       `recv()` can return -1 on error; code treats it as a valid length.
+ *
+ *   Bug 4 (Medium) — Off-by-one on line 46:
+ *       `<=` should be `<`; writes one byte past `msg` buffer.
+ *
+ *   Bug 5 (Medium) — Resource leak on line 83:
+ *       Early return on `send()` failure does not close `client_fd`.
+ *
+ * There is also a RED HERRING: `create_client` allocates memory that
+ * is freed in `destroy_client` (correct). A shallow review might flag
+ * this as a leak because the free is in a different function.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <unistd.h>
+
+#define MAX_MSG    256
+#define BACKLOG    10
+
+typedef struct {
+    int   fd;
+    char *buf;
+    int   buf_size;
+} client_t;
+
+/* Bug 4 (Off-by-one): loop condition uses <= instead of < */
+static void sanitize(char *msg, int len)
+{
+    for (int i = 0; i <= len; i++) {   /* BUG: should be i < len */
+        if (msg[i] == '\n' || msg[i] == '\r')
+            msg[i] = ' ';
+    }
+}
+
+/* Bug 2 (Buffer overflow): no bounds check on incoming name */
+static void log_connection(const char *client_name)
+{
+    char log_entry[64];
+    strcpy(log_entry, "CONNECT: ");
+    strcat(log_entry, client_name);          /* BUG: unbounded copy */
+    printf("%s\n", log_entry);
+}
+
+/* Red herring: allocation here is correctly freed in destroy_client */
+static client_t *create_client(int fd, int buf_size)
+{
+    client_t *c = malloc(sizeof(client_t));
+    if (!c) return NULL;
+    c->fd       = fd;
+    c->buf      = malloc(buf_size);
+    c->buf_size = buf_size;
+    return c;
+}
+
+/* Bug 3 (Unchecked return): recv() error not handled */
+/* Bug 1 (Use-after-free): buf freed then used in next call */
+static int handle_echo(client_t *client)
+{
+    int n = recv(client->fd, client->buf, client->buf_size, 0);
+    /* BUG 3: n could be -1 (error); code falls through */
+
+    if (n == 0) {
+        free(client->buf);             /* BUG 1: frees buf ... */
+        return 0;  /* client disconnected */
+    }
+
+    sanitize(client->buf, n);
+
+    /* ... but if the caller loops and calls handle_echo again
+     * after a partial read/reconnect, client->buf is dangling. */
+
+    return n;
+}
+
+/* Bug 5 (Resource leak): client_fd not closed on send failure */
+static void serve_client(int client_fd, const char *client_name)
+{
+    log_connection(client_name);
+
+    client_t *client = create_client(client_fd, MAX_MSG);
+    if (!client) {
+        close(client_fd);
+        return;
+    }
+
+    int n;
+    while ((n = handle_echo(client)) > 0) {
+        if (send(client_fd, client->buf, n, 0) < 0) {
+            perror("send");
+            free(client->buf);
+            free(client);
+            return;  /* BUG 5: client_fd never closed */
+        }
+    }
+
+    close(client_fd);
+    /* destroy_client not called — but buf was freed in handle_echo
+     * on the n==0 path, so only the client_t struct leaks here.
+     * (This is a secondary consequence of Bug 1's design.) */
+    free(client);
+}
+
+static void destroy_client(client_t *c)
+{
+    if (!c) return;
+    close(c->fd);
+    free(c->buf);
+    free(c);
+}
+
+int main(void)
+{
+    int server_fd = socket(AF_INET, SOCK_STREAM, 0);
+    struct sockaddr_in addr = {
+        .sin_family = AF_INET,
+        .sin_addr.s_addr = INADDR_ANY,
+        .sin_port = htons(8080),
+    };
+
+    bind(server_fd, (struct sockaddr *)&addr, sizeof(addr));
+    listen(server_fd, BACKLOG);
+
+    printf("Listening on :8080\n");
+
+    for (;;) {
+        struct sockaddr_in client_addr;
+        socklen_t addr_len = sizeof(client_addr);
+        int client_fd = accept(server_fd, (struct sockaddr *)&client_addr,
+                               &addr_len);
+        /* For demo simplicity, handle one client at a time */
+        serve_client(client_fd, "unknown");
+    }
+
+    return 0;
+}
diff --git a/docs/demo/rate_limiter_description.md b/docs/demo/rate_limiter_description.md
new file mode 100644
index 0000000..5d714b7
--- /dev/null
+++ b/docs/demo/rate_limiter_description.md
@@ -0,0 +1,13 @@
+# Rate Limiter — Project Description
+
+> **Give this description to the LLM for both the vibe prompt and the
+> PromptKit prompt. Do NOT add any extra context — the point is to see
+> how much structure each approach extracts from the same sparse input.**
+
+---
+
+## The Ask
+
+We need a rate limiter for our REST API. It should limit each
+authenticated user to a configurable number of requests per time window.
+When the limit is exceeded, return HTTP 429 Too Many Requests.

From ca52bd1dbc6e9dfcec8daee5d110345bfe46b1f7 Mon Sep 17 00:00:00 2001
From: Andrew Beltrano <anbeltra@microsoft.com>
Date: Wed, 1 Apr 2026 15:36:35 -0600
Subject: [PATCH 2/2] Move answer keys out of source files into separate
 answer-key.md

Strip all planted-bug comments, hints, and annotations from
demo_server.c and demo_queue.c so the LLM cannot see the answers
when the code is provided as context during the demo.

All answer details (bug descriptions, interleavings, red herrings,
and the implicit-requirements checklist) are now in answer-key.md,
which the presenter references but never includes in LLM context.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 docs/demo/README.md      |   5 +-
 docs/demo/answer-key.md  | 102 +++++++++++++++++++++++++++++++++++++++
 docs/demo/demo-script.md |  20 ++------
 docs/demo/demo_queue.c   |  43 +++--------------
 docs/demo/demo_server.c  |  49 +++----------------
 5 files changed, 122 insertions(+), 97 deletions(-)
 create mode 100644 docs/demo/answer-key.md

diff --git a/docs/demo/README.md b/docs/demo/README.md
index 18ca7ff..082b56b 100644
--- a/docs/demo/README.md
+++ b/docs/demo/README.md
@@ -16,10 +16,11 @@ and output structure.
 
 | File | Purpose |
 |------|---------|
-| `demo_server.c` | Code review sample — C echo server with 5 planted bugs |
-| `demo_queue.c` | Bug investigation sample — producer-consumer with a TOCTOU race |
+| `demo_server.c` | Code review sample — C echo server |
+| `demo_queue.c` | Bug investigation sample — producer-consumer queue |
 | `rate_limiter_description.md` | Requirements authoring sample — 3-sentence project description |
 | `demo-script.md` | Presenter script with timing, talking points, and scorecards |
+| `answer-key.md` | **Presenter only** — planted bug reference and scoring guide |
 
 ## Quick Start
 
diff --git a/docs/demo/answer-key.md b/docs/demo/answer-key.md
new file mode 100644
index 0000000..5633ba4
--- /dev/null
+++ b/docs/demo/answer-key.md
@@ -0,0 +1,102 @@
+# Demo Answer Key
+
+> **⚠️ PRESENTER ONLY — do not share this file or include it in LLM context.**
+>
+> This file documents the planted issues in the demo code samples.
+> Use it to score LLM outputs after each run.
+
+---
+
+## demo_server.c — Code Review (5 bugs + 1 red herring)
+
+| # | Severity | Category | Location | Description |
+|---|----------|----------|----------|-------------|
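+| 1 | Critical | Use-after-free | `handle_echo()` | `client->buf` is freed when `n == 0` (client disconnects) but the pointer is left dangling in `client`. The `while (... > 0)` loop in `serve_client()` exits on that return today, so the hazard is latent: any later `handle_echo()` call, or cleanup through `destroy_client()`, would read, write, or double-free the freed buffer. |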
+| 2 | Critical | Buffer overflow | `log_connection()` | `strcpy`/`strcat` copies `client_name` into a 64-byte `log_entry` with no bounds check. If `client_name` exceeds ~54 characters, this overflows the stack buffer. |
+| 3 | High | Unchecked return | `handle_echo()` | `recv()` can return -1 on error. The code only checks for `n == 0` (disconnect) and falls through to `sanitize()` with `n == -1`, passing a negative length. |
+| 4 | Medium | Off-by-one | `sanitize()` | Loop condition `i <= len` iterates one past the valid data. Should be `i < len`. Reads (and potentially writes) one byte beyond the received data. |
+| 5 | Medium | Resource leak | `serve_client()` | When `send()` fails, the function frees `client->buf` and `client` but returns without closing `client_fd`, leaking the file descriptor. |
+
+### Red Herring
+
+`create_client()` calls `malloc()` for both the `client_t` struct and
+`client->buf`. A shallow review might flag this as a memory leak because
+`destroy_client()` (which properly frees both) is defined but never called
+in the normal code path. However, the code does free `client->buf` and
+the `client_t` struct through other paths — the design is just unusual,
+not leaking (aside from the bugs above that create leak-like consequences).
+
+---
+
+## demo_queue.c — Bug Investigation (1 root cause + 1 red herring)
+
+### Root Cause: TOCTOU Race in `dequeue()`
+
+```c
+char *dequeue(queue_t *q)
+{
+    if (q->count == 0)          // CHECK — outside the lock
+        return NULL;
+
+    pthread_mutex_lock(&q->lock);   // ACQUIRE — another thread may act here
+    char *item = q->items[q->head]; // USE — head/count may now be stale
+```
+
+**The interleaving:**
+
+1. Thread A calls `dequeue()`, reads `q->count == 1`, passes the check.
+2. Thread A is preempted before acquiring the lock.
+3. Thread B calls `dequeue()`, also reads `q->count == 1`, passes the check.
+4. Thread B acquires the lock, dequeues the last item, decrements count to 0.
+5. Thread A resumes, acquires the lock, reads `q->items[q->head]` — but
+   the item was already consumed by Thread B. The pointer is either NULL
+   (if B set it to NULL) or stale/freed memory.
+6. Thread A passes this to `process_item()` → segfault on NULL dereference
+   or use-after-free.
+
+**Why ASan doesn't catch it:** The segfault is a NULL dereference (reading
+`items[head]` which was set to NULL by Thread B), not a heap corruption.
+ASan's heap checks don't flag NULL pointer reads.
+
+**Correct fix:** Move the count check inside the lock:
+
+```c
+char *dequeue(queue_t *q)
+{
+    pthread_mutex_lock(&q->lock);
+    if (q->count == 0) {
+        pthread_mutex_unlock(&q->lock);
+        return NULL;
+    }
+    char *item = q->items[q->head];
+    // ... rest unchanged
+```
+
+### Red Herring: strdup/free Pattern
+
+`enqueue()` calls `strdup(item)` to allocate a copy of each string.
+`consumer()` calls `free(item)` after `process_item()`. This is a
+correct allocate-in-producer / free-in-consumer pattern. It is NOT a
+memory leak.
+
+---
+
+## rate_limiter_description.md — Requirements Authoring
+
+There are no planted bugs — this scenario measures **completeness and
+structure**. Score by counting how many of these the LLM surfaces:
+
+### Implicit Requirements Most Developers Miss
+
+| Category | Requirement | Why It Matters |
+|----------|-------------|----------------|
+| HTTP semantics | Include `Retry-After` header in 429 response | RFC 6585 explicitly allows it (MAY); clients need it for backoff |
+| Distributed | Behavior with multiple API server instances | Single-node counters don't work behind a load balancer |
+| Clock skew | Window boundary behavior | Servers with unsynchronized clocks disagree on where a window starts and ends; what happens to a request that arrives at the exact window boundary? |
+| Persistence | Rate limit state durability | What happens to counts when the service restarts? |
+| Observability | Metrics / logging for rate limit events | Ops team needs visibility into throttling patterns |
+| Graceful degradation | Behavior when rate limit store is unavailable | Fail-open (allow all) or fail-closed (deny all)? |
+| Burst handling | Sliding window vs. fixed window vs. token bucket | Fixed windows allow 2x burst at boundaries |
+| Identity | What counts as "a user"? | API key? OAuth token? IP fallback for unauthenticated? |
+| Configurability | Per-endpoint or global limits? | Different endpoints may need different thresholds |
+| Response body | What information to include in the 429 body | Current usage, limit, reset time |
diff --git a/docs/demo/demo-script.md b/docs/demo/demo-script.md
index ef2349b..71bb05e 100644
--- a/docs/demo/demo-script.md
+++ b/docs/demo/demo-script.md
@@ -276,19 +276,7 @@ As the output renders, highlight:
 
 ## Appendix: Planted Bug Reference
 
-### demo_server.c (5 bugs)
-
-| # | Bug | Severity | Location | Description |
-|---|-----|----------|----------|-------------|
-| 1 | Use-after-free | Critical | L65, L73 | `client->buf` freed on disconnect, then read if caller loops |
-| 2 | Buffer overflow | Critical | L52 | `strcpy`/`strcat` into 64-byte buffer, no bounds check |
-| 3 | Unchecked return | High | L61 | `recv()` can return -1; code treats it as valid length |
-| 4 | Off-by-one | Medium | L46 | `i <= len` writes one past buffer end |
-| 5 | Resource leak | Medium | L83 | `client_fd` not closed on `send()` failure path |
-
-### demo_queue.c (1 root cause + 1 red herring)
-
-| # | Issue | Type | Location | Description |
-|---|-------|------|----------|-------------|
-| 1 | TOCTOU race | Root cause | L55–60 | `count` checked outside lock; another thread can drain queue between check and lock |
-| 2 | strdup/free | Red herring | L42, L81 | Allocation in enqueue, free in consumer — this is correct |
+See [`answer-key.md`](answer-key.md) for the full planted bug reference,
+including expected findings, interleavings, and the implicit requirement checklist.
+
+**⚠️ Do not include `answer-key.md` in any LLM context during the demo.**
diff --git a/docs/demo/demo_queue.c b/docs/demo/demo_queue.c
index 11bef3d..f0be8ac 100644
--- a/docs/demo/demo_queue.c
+++ b/docs/demo/demo_queue.c
@@ -1,30 +1,5 @@
-/* demo_queue.c — Lock-free-looking producer-consumer queue with a
- *                subtle TOCTOU race condition.
- *
- * PURPOSE: PromptKit demo. Give this code to an LLM with the symptom
- * "intermittent crashes under load" and compare how a vibe prompt
- * vs. a PromptKit-assembled prompt investigates the root cause.
- *
- * PLANTED ISSUE (do NOT reveal during the demo):
- *
- *   Root Cause — TOCTOU race in dequeue() (lines 55-60):
- *       `count` is checked OUTSIDE the lock, then the lock is acquired
- *       and the item is dequeued. Between the check and the lock
- *       acquisition, another thread can dequeue the last item, causing
- *       a read from an empty queue (head == tail, stale data or
- *       segfault if items[head] was already consumed/freed).
- *
- *   Red Herring — malloc in enqueue() (line 42):
- *       `strdup(item)` allocates memory. It is correctly freed in the
- *       consumer (line 81). A shallow review may flag this as a leak.
- *
- * SYMPTOM DESCRIPTION (give this to the LLM):
- *
- *   "This producer-consumer queue works fine in our single-threaded
- *   tests but crashes intermittently under load when we run 4 producer
- *   threads and 2 consumer threads. The crash is a segfault inside
- *   process_item(), but that function looks correct. We've checked for
- *   memory corruption with AddressSanitizer — no heap issues reported."
+/* demo_queue.c — Producer-consumer queue using a mutex.
+ * Producers enqueue strings, consumers dequeue and process them.
  */
 
 #include <stdio.h>
@@ -48,7 +23,6 @@ void queue_init(queue_t *q)
     pthread_mutex_init(&q->lock, NULL);
 }
 
-/* Red herring: strdup allocates, but consumer frees — this is correct */
 int enqueue(queue_t *q, const char *item)
 {
     pthread_mutex_lock(&q->lock);
@@ -63,23 +37,20 @@ int enqueue(queue_t *q, const char *item)
     return 0;
 }
 
-/* BUG: TOCTOU race — count checked OUTSIDE lock, then lock acquired.
- * Between the check and the lock, another consumer can drain the queue. */
 char *dequeue(queue_t *q)
 {
-    if (q->count == 0)       /* <-- CHECK outside lock */
+    if (q->count == 0)
         return NULL;
 
-    pthread_mutex_lock(&q->lock);   /* <-- another thread may dequeue here */
-    char *item = q->items[q->head]; /* stale: head may now be invalid */
+    pthread_mutex_lock(&q->lock);
+    char *item = q->items[q->head];
     q->items[q->head] = NULL;
     q->head = (q->head + 1) % QUEUE_CAP;
-    q->count--;                     /* count can go negative! */
+    q->count--;
     pthread_mutex_unlock(&q->lock);
     return item;
 }
 
-/* This function is correct — the crash is NOT here */
 void process_item(const char *item)
 {
     printf("Processing: %s\n", item);
@@ -93,7 +64,7 @@ void *consumer(void *arg)
         char *item = dequeue(q);
         if (item) {
             process_item(item);
-            free(item);   /* correctly frees the strdup from enqueue */
+            free(item);
         }
     }
     return NULL;
diff --git a/docs/demo/demo_server.c b/docs/demo/demo_server.c
index 6eb7bcb..4e3a505 100644
--- a/docs/demo/demo_server.c
+++ b/docs/demo/demo_server.c
@@ -1,29 +1,5 @@
-/* demo_server.c — Minimal TCP echo server with 5 planted bugs.
- *
- * PURPOSE: PromptKit demo. Ask an LLM to review this file for bugs,
- * first with a plain "vibe" prompt, then with a PromptKit-assembled
- * prompt, and compare the findings.
- *
- * PLANTED ISSUES (do NOT reveal these to the LLM during the demo):
- *
- *   Bug 1 (Critical) — Use-after-free on line 65/73:
- *       `client->buf` is freed, then read on the next iteration.
- *
- *   Bug 2 (Critical) — Buffer overflow on line 52:
- *       `strcpy` into a fixed-size buffer with no bounds check.
- *
- *   Bug 3 (High) — Unchecked return value on line 61:
- *       `recv()` can return -1 on error; code treats it as a valid length.
- *
- *   Bug 4 (Medium) — Off-by-one on line 46:
- *       `<=` should be `<`; writes one byte past `msg` buffer.
- *
- *   Bug 5 (Medium) — Resource leak on line 83:
- *       Early return on `send()` failure does not close `client_fd`.
- *
- * There is also a RED HERRING: `create_client` allocates memory that
- * is freed in `destroy_client` (correct). A shallow review might flag
- * this as a leak because the free is in a different function.
+/* demo_server.c — Minimal TCP echo server.
+ * Accepts connections, reads data, sanitizes it, and echoes it back.
  */
 
 #include <stdio.h>
@@ -42,25 +18,22 @@ typedef struct {
     int   buf_size;
 } client_t;
 
-/* Bug 4 (Off-by-one): loop condition uses <= instead of < */
 static void sanitize(char *msg, int len)
 {
-    for (int i = 0; i <= len; i++) {   /* BUG: should be i < len */
+    for (int i = 0; i <= len; i++) {
         if (msg[i] == '\n' || msg[i] == '\r')
             msg[i] = ' ';
     }
 }
 
-/* Bug 2 (Buffer overflow): no bounds check on incoming name */
 static void log_connection(const char *client_name)
 {
     char log_entry[64];
     strcpy(log_entry, "CONNECT: ");
-    strcat(log_entry, client_name);          /* BUG: unbounded copy */
+    strcat(log_entry, client_name);
     printf("%s\n", log_entry);
 }
 
-/* Red herring: allocation here is correctly freed in destroy_client */
 static client_t *create_client(int fd, int buf_size)
 {
     client_t *c = malloc(sizeof(client_t));
@@ -71,27 +44,20 @@ static client_t *create_client(int fd, int buf_size)
     return c;
 }
 
-/* Bug 3 (Unchecked return): recv() error not handled */
-/* Bug 1 (Use-after-free): buf freed then used in next call */
 static int handle_echo(client_t *client)
 {
     int n = recv(client->fd, client->buf, client->buf_size, 0);
-    /* BUG 3: n could be -1 (error); code falls through */
 
     if (n == 0) {
-        free(client->buf);             /* BUG 1: frees buf ... */
+        free(client->buf);
         return 0;  /* client disconnected */
     }
 
     sanitize(client->buf, n);
 
-    /* ... but if the caller loops and calls handle_echo again
-     * after a partial read/reconnect, client->buf is dangling. */
-
     return n;
 }
 
-/* Bug 5 (Resource leak): client_fd not closed on send failure */
 static void serve_client(int client_fd, const char *client_name)
 {
     log_connection(client_name);
@@ -108,14 +74,11 @@ static void serve_client(int client_fd, const char *client_name)
             perror("send");
             free(client->buf);
             free(client);
-            return;  /* BUG 5: client_fd never closed */
+            return;
         }
     }
 
     close(client_fd);
-    /* destroy_client not called — but buf was freed in handle_echo
-     * on the n==0 path, so only the client_t struct leaks here.
-     * (This is a secondary consequence of Bug 1's design.) */
     free(client);
 }