From 9adce24767aae91dd741b987c34dfddb53bae63e Mon Sep 17 00:00:00 2001 From: Andrew Beltrano Date: Wed, 1 Apr 2026 15:24:14 -0600 Subject: [PATCH 1/2] Add vibe-vs-structured demo with 3 scenarios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds docs/demo/ with complete presenter materials for a live demo comparing plain 'vibe' prompts against PromptKit-assembled structured prompts on the same task. Three scenarios with pre-built code samples and scorecards: 1. Code review — C echo server with 5 planted bugs (demo_server.c) 2. Requirements authoring — rate limiter from 3-sentence description 3. Bug investigation — TOCTOU race condition with red herring (demo_queue.c) Includes README, full presenter script with timing notes, talking points, and appendix with planted bug reference. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/demo/README.md | 94 ++++++++ docs/demo/demo-script.md | 294 ++++++++++++++++++++++++++ docs/demo/demo_queue.c | 133 ++++++++++++ docs/demo/demo_server.c | 154 ++++++++++++++ docs/demo/rate_limiter_description.md | 13 ++ 5 files changed, 688 insertions(+) create mode 100644 docs/demo/README.md create mode 100644 docs/demo/demo-script.md create mode 100644 docs/demo/demo_queue.c create mode 100644 docs/demo/demo_server.c create mode 100644 docs/demo/rate_limiter_description.md diff --git a/docs/demo/README.md b/docs/demo/README.md new file mode 100644 index 0000000..18ca7ff --- /dev/null +++ b/docs/demo/README.md @@ -0,0 +1,94 @@ +# PromptKit Demo: Vibe Prompt vs. Structured Prompt + +This demo compares LLM output quality when tackling the same task with: + +1. A **plain "vibe" prompt** — the kind most developers type +2. 
A **PromptKit-assembled structured prompt** — with persona, protocols, + taxonomy, and output format + +The goal is to show that prompt engineering isn't about clever wording — +it's about systematic composition of identity, reasoning methodology, +and output structure. + +--- + +## What's in This Directory + +| File | Purpose | +|------|---------| +| `demo_server.c` | Code review sample — C echo server with 5 planted bugs | +| `demo_queue.c` | Bug investigation sample — producer-consumer with a TOCTOU race | +| `rate_limiter_description.md` | Requirements authoring sample — 3-sentence project description | +| `demo-script.md` | Presenter script with timing, talking points, and scorecards | + +## Quick Start + +### Prerequisites + +- GitHub Copilot CLI (`copilot` command available) +- Access to a Copilot Chat model (any tier) +- This repository cloned locally + +### Running a Demo Scenario + +Each scenario follows the same pattern: + +1. **Vibe run** — paste the vibe prompt into Copilot CLI with the sample + code/description as context +2. **PromptKit run** — use `bootstrap.md` to assemble and execute the + structured prompt with the same context +3. **Compare** — score both outputs on the scorecard from `demo-script.md` + +See `demo-script.md` for the full presenter walkthrough. + +## Scenarios at a Glance + +### Scenario 1: Code Review (Recommended First) + +**Task:** Review `demo_server.c` for bugs. + +| Approach | Prompt | +|----------|--------| +| Vibe | *"Review this C code for bugs."* | +| PromptKit | `review-cpp-code` template with `systems-engineer` persona, `memory-safety-c` + `cpp-best-practices` protocols | + +**What to watch for:** Detection rate (5 planted bugs), false positives, +severity classification, specificity of fixes, epistemic honesty. + +### Scenario 2: Requirements Authoring + +**Task:** Write requirements from `rate_limiter_description.md`. 
+ +| Approach | Prompt | +|----------|--------| +| Vibe | *"Write requirements for a rate limiter for our REST API."* | +| PromptKit | `author-requirements-doc` template with `software-architect` persona, `requirements-elicitation` protocol | + +**What to watch for:** Testability, atomicity, completeness (edge cases), +precision (RFC 2119 keywords), implicit requirements surfaced. + +### Scenario 3: Bug Investigation + +**Task:** Find the root cause of intermittent crashes in `demo_queue.c`. + +| Approach | Prompt | +|----------|--------| +| Vibe | *"This code has a bug that causes intermittent crashes under load. Find it."* | +| PromptKit | `investigate-bug` template with `systems-engineer` persona, `root-cause-analysis` protocol | + +**What to watch for:** Root cause correctness (TOCTOU race), red herring +rejection (malloc/free is correct), hypothesis rigor, causal chain +completeness, confidence labeling. + +--- + +## Tips for Presenters + +- **Don't reveal the planted bugs** before running both approaches. +- **Let the vibe output speak for itself** — don't critique it during the + run. The scorecard does the talking. +- **Highlight the anti-hallucination effect** — PromptKit outputs label + confidence (KNOWN / INFERRED / ASSUMED); vibe outputs state guesses as facts. +- **End with the multiplier pitch** — "Now imagine doing this 50 times + across a codebase. The vibe prompt is different every time. The PromptKit + prompt is version-controlled, tested, and consistent." diff --git a/docs/demo/demo-script.md b/docs/demo/demo-script.md new file mode 100644 index 0000000..ef2349b --- /dev/null +++ b/docs/demo/demo-script.md @@ -0,0 +1,294 @@ +# PromptKit Demo — Presenter Script + +> **Estimated total time: 15–20 minutes for 2 scenarios, 25–30 for all 3.** +> +> Pick **Scenario 1 (Code Review)** plus either Scenario 2 or 3 for a +> focused demo. Run all 3 only if you have 30 minutes. 
+ +--- + +## Opening (2 minutes) + +### Talking Points + +- "Every developer asks LLMs for help — code reviews, requirements, + debugging. But how much does the *quality of the prompt* affect the + *quality of the answer*?" +- "Today I'll run the **exact same task** through two approaches: + a quick, natural-language prompt — what we'll call a *vibe prompt* — + and a PromptKit-assembled prompt with an engineered persona, reasoning + protocols, and output format." +- "We'll score the results on the same rubric so you can see the + difference objectively." + +### Setup Checklist + +- [ ] Terminal with `copilot` CLI ready +- [ ] `docs/demo/` files open or accessible +- [ ] Scorecard template visible (printed or second screen) +- [ ] Audience cannot see the planted-bug comments in source files + +--- + +## Scenario 1: Code Review (8 minutes) + +### 1a. Show the Code (1 min) + +Open `demo_server.c` and briefly walk through it: + +> "This is a simple TCP echo server in C — about 120 lines. It accepts +> connections, reads data, echoes it back. Pretty typical systems code. +> Let's see what an LLM finds when we ask it to review this." + +**⚠️ Scroll past the comment block at the top** — it lists the planted bugs. +Start showing from the `#include` lines. + +### 1b. Vibe Run (2 min) + +Type into Copilot CLI: + +``` +Review this C code for bugs. + + +``` + +Let the output render. **Don't comment yet** — just say: + +> "Okay, let's note what it found. We'll come back to this." + +### 1c. PromptKit Run (3 min) + +``` +copilot -i "Read bootstrap.md and execute the prompt" +``` + +When prompted for the task: + +> "I need to review C code for correctness and safety issues." + +The bootstrap engine will select `review-cpp-code` and assemble the prompt. +Provide `demo_server.c` as the code context. + +As the output renders, highlight: + +> "Notice it's using named pattern families — CPP-1 for memory safety, +> CPP-5 for error handling. 
Each finding has a severity, exact line +> number, and a specific fix." +> +> "And see this self-verification step at the end — it re-checks its +> own findings before presenting them." + +### 1d. Scorecard (2 min) + +#### Scoring Rubric + +| # | Factor | Vibe Score (0–2) | PromptKit Score (0–2) | +|---|--------|:---:|:---:| +| 1 | **Detection rate** — How many of the 5 bugs found? | ___ / 5 | ___ / 5 | +| 2 | **False positives** — Non-issues flagged as bugs? (0 = many, 2 = none) | | | +| 3 | **Severity accuracy** — Critical bugs ranked above medium? | | | +| 4 | **Specificity** — Exact line cited + explains *why* it's wrong? | | | +| 5 | **Actionability** — Concrete, correct fix provided? | | | +| 6 | **Epistemic honesty** — Distinguishes confirmed vs. suspected? | | | +| | **Total** | ___ / 12 | ___ / 12 | + +#### What to Highlight + +- **Detection gap:** The vibe prompt typically finds 2–3 bugs; PromptKit + finds 4–5. The off-by-one (Bug 4) and use-after-free (Bug 1) are most + commonly missed by vibe prompts. +- **False positives:** Vibe prompts often flag the `create_client` malloc + as a leak (it's freed in `destroy_client` — red herring). PromptKit's + memory-safety protocol traces allocation/deallocation pairs and + correctly dismisses this. +- **Structure:** Vibe output is a flat list mixing bugs with style + suggestions. PromptKit output is severity-classified with pattern IDs. + +--- + +## Scenario 2: Requirements Authoring (8 minutes) + +### 2a. Show the Description (1 min) + +Read `rate_limiter_description.md` aloud: + +> "Here's what the product manager gave us: 'We need a rate limiter +> for our REST API. It should limit each authenticated user to a +> configurable number of requests per time window. When the limit is +> exceeded, return HTTP 429.' That's it — three sentences." + +### 2b. Vibe Run (2 min) + +``` +Write requirements for a rate limiter for our REST API. 
It should limit +each authenticated user to a configurable number of requests per time +window. When the limit is exceeded, return HTTP 429 Too Many Requests. +``` + +Let the output render. Note how many "requirements" it produces and +whether they have acceptance criteria. + +### 2c. PromptKit Run (3 min) + +Use `bootstrap.md` → select `author-requirements-doc`. Provide the same +3-sentence description as the `{{description}}` parameter. + +As the output renders, highlight: + +> "Look at the REQ-IDs — each requirement is numbered and atomic. +> Every one has acceptance criteria with specific pass/fail conditions." +> +> "And here — it surfaced requirements we didn't even mention: +> `Retry-After` header, distributed counting, clock skew, observability, +> graceful degradation. These are the implicit requirements that vibe +> prompts always miss." + +### 2d. Scorecard (2 min) + +#### Scoring Rubric + +| # | Factor | Vibe Score (0–2) | PromptKit Score (0–2) | +|---|--------|:---:|:---:| +| 1 | **Testability** — Each req has pass/fail acceptance criteria? | | | +| 2 | **Atomicity** — Each req is a single verifiable statement? | | | +| 3 | **Completeness** — Edge cases covered? (burst, distributed, clock skew) | | | +| 4 | **Precision** — RFC 2119 keywords (MUST/SHOULD/MAY)? | | | +| 5 | **Structure** — REQ-IDs, constraints, assumptions, risks, non-goals? | | | +| 6 | **Implicit reqs** — Surfaced things user didn't mention? | | | +| | **Total** | ___ / 12 | ___ / 12 | + +#### What to Highlight + +- **Quantity gap:** Vibe prompts produce 5–8 vague bullets. PromptKit + produces 15–25 atomic requirements. +- **"I didn't think of that" moment:** Call out 2–3 implicit requirements + the audience wouldn't have considered (e.g., `Retry-After`, behavior + when the time window boundary falls mid-request, what happens when + the backing store is unavailable). +- **Downstream value:** "This document can now feed into PromptKit's + design → validation → audit pipeline. 
Try doing that with bullet points." + +--- + +## Scenario 3: Bug Investigation (8 minutes) + +### 3a. Show the Code + Symptom (1 min) + +Open `demo_queue.c` and briefly walk through it: + +> "Producer-consumer queue — producers enqueue strings, consumers +> dequeue and process them. Works fine in single-threaded tests." + +Then read the symptom: + +> "But it crashes intermittently under load with 4 producer threads +> and 2 consumer threads. The crash is a segfault in `process_item()`, +> but that function looks correct. AddressSanitizer shows no heap issues." + +### 3b. Vibe Run (2 min) + +``` +This code has a bug that causes intermittent crashes under load with +4 producer threads and 2 consumer threads. The crash is a segfault +inside process_item(), but that function looks correct. ASan reports +no heap issues. Find the root cause. + + +``` + +### 3c. PromptKit Run (3 min) + +Use `bootstrap.md` → select `investigate-bug`. Provide the symptom +description and `demo_queue.c` as context. + +As the output renders, highlight: + +> "See the hypothesis table — it generated three potential causes and +> is evaluating evidence for each one, not just jumping to a conclusion." +> +> "It correctly identified the TOCTOU race: the count check on line 55 +> is outside the lock. And look — it explicitly dismissed the strdup/free +> pattern as a red herring with evidence: 'freed on line 81 in consumer, +> matching the strdup on line 42.'" +> +> "Notice the confidence labels — KNOWN for the code structure, INFERRED +> for the race condition trigger, ASSUMED for the crash mechanism." + +### 3d. Scorecard (2 min) + +#### Scoring Rubric + +| # | Factor | Vibe Score (0–2) | PromptKit Score (0–2) | +|---|--------|:---:|:---:| +| 1 | **Root cause correct** — Identified the TOCTOU race? | | | +| 2 | **Red herring rejected** — Did NOT flag malloc/free as a leak? | | | +| 3 | **Hypothesis rigor** — Generated ≥3 hypotheses with evidence? 
| | | +| 4 | **Causal chain** — Traced check → context switch → stale → crash? | | | +| 5 | **Confidence labeling** — KNOWN vs. INFERRED vs. ASSUMED? | | | +| 6 | **Remediation quality** — Fix is correct and complete? | | | +| | **Total** | ___ / 12 | ___ / 12 | + +#### What to Highlight + +- **Reasoning depth:** Vibe prompts often jump straight to "the mutex + is wrong" without explaining the interleaving. PromptKit's root-cause- + analysis protocol forces a causal chain. +- **Red herring:** Vibe prompts frequently flag the `strdup` as a memory + leak. PromptKit's memory-safety protocol traces the alloc/free pair + across functions and dismisses it. +- **Epistemic honesty:** Vibe prompts state "The bug is X" as fact. + PromptKit labels confidence and distinguishes root cause from + proximate cause (the crash is in `process_item`, but the *root cause* + is in `dequeue`). + +--- + +## Closing (2 minutes) + +### The Multiplier Pitch + +> "You saw the difference on one file. Now imagine doing this across +> 50 files, or 200 PRs, or every sprint's requirements." +> +> "With a vibe prompt, the quality varies every time — it depends on +> your mood, your wording, what you remembered to ask for." +> +> "With PromptKit, the quality is **consistent, composable, and +> version-controlled**. The same persona, the same protocols, the same +> output format — every time." + +### The Agent Instruction Hook + +> "And you don't have to run this manually every time. PromptKit can +> output persistent agent instruction files — for GitHub Copilot, +> Claude Code, or Cursor — so these protocols run automatically in +> every session." + +### Call to Action + +> "PromptKit is open source. The prompts you saw today are in the +> repository. You can use them as-is, customize them for your domain, +> or add new ones. The library has [X] templates, [Y] protocols, and +> [Z] personas — all composable." 
+ +--- + +## Appendix: Planted Bug Reference + +### demo_server.c (5 bugs) + +| # | Bug | Severity | Location | Description | +|---|-----|----------|----------|-------------| +| 1 | Use-after-free | Critical | L65, L73 | `client->buf` freed on disconnect, then read if caller loops | +| 2 | Buffer overflow | Critical | L52 | `strcpy`/`strcat` into 64-byte buffer, no bounds check | +| 3 | Unchecked return | High | L61 | `recv()` can return -1; code treats it as valid length | +| 4 | Off-by-one | Medium | L46 | `i <= len` writes one past buffer end | +| 5 | Resource leak | Medium | L83 | `client_fd` not closed on `send()` failure path | + +### demo_queue.c (1 root cause + 1 red herring) + +| # | Issue | Type | Location | Description | +|---|-------|------|----------|-------------| +| 1 | TOCTOU race | Root cause | L55–60 | `count` checked outside lock; another thread can drain queue between check and lock | +| 2 | strdup/free | Red herring | L42, L81 | Allocation in enqueue, free in consumer — this is correct | diff --git a/docs/demo/demo_queue.c b/docs/demo/demo_queue.c new file mode 100644 index 0000000..11bef3d --- /dev/null +++ b/docs/demo/demo_queue.c @@ -0,0 +1,133 @@ +/* demo_queue.c — Lock-free-looking producer-consumer queue with a + * subtle TOCTOU race condition. + * + * PURPOSE: PromptKit demo. Give this code to an LLM with the symptom + * "intermittent crashes under load" and compare how a vibe prompt + * vs. a PromptKit-assembled prompt investigates the root cause. + * + * PLANTED ISSUE (do NOT reveal during the demo): + * + * Root Cause — TOCTOU race in dequeue() (lines 55-60): + * `count` is checked OUTSIDE the lock, then the lock is acquired + * and the item is dequeued. Between the check and the lock + * acquisition, another thread can dequeue the last item, causing + * a read from an empty queue (head == tail, stale data or + * segfault if items[head] was already consumed/freed). 
+ * + * Red Herring — malloc in enqueue() (line 42): + * `strdup(item)` allocates memory. It is correctly freed in the + * consumer (line 81). A shallow review may flag this as a leak. + * + * SYMPTOM DESCRIPTION (give this to the LLM): + * + * "This producer-consumer queue works fine in our single-threaded + * tests but crashes intermittently under load when we run 4 producer + * threads and 2 consumer threads. The crash is a segfault inside + * process_item(), but that function looks correct. We've checked for + * memory corruption with AddressSanitizer — no heap issues reported." + */ + +#include +#include +#include +#include + +#define QUEUE_CAP 64 + +typedef struct { + char *items[QUEUE_CAP]; + int head; + int tail; + int count; + pthread_mutex_t lock; +} queue_t; + +void queue_init(queue_t *q) +{ + memset(q, 0, sizeof(*q)); + pthread_mutex_init(&q->lock, NULL); +} + +/* Red herring: strdup allocates, but consumer frees — this is correct */ +int enqueue(queue_t *q, const char *item) +{ + pthread_mutex_lock(&q->lock); + if (q->count >= QUEUE_CAP) { + pthread_mutex_unlock(&q->lock); + return -1; /* full */ + } + q->items[q->tail] = strdup(item); + q->tail = (q->tail + 1) % QUEUE_CAP; + q->count++; + pthread_mutex_unlock(&q->lock); + return 0; +} + +/* BUG: TOCTOU race — count checked OUTSIDE lock, then lock acquired. + * Between the check and the lock, another consumer can drain the queue. */ +char *dequeue(queue_t *q) +{ + if (q->count == 0) /* <-- CHECK outside lock */ + return NULL; + + pthread_mutex_lock(&q->lock); /* <-- another thread may dequeue here */ + char *item = q->items[q->head]; /* stale: head may now be invalid */ + q->items[q->head] = NULL; + q->head = (q->head + 1) % QUEUE_CAP; + q->count--; /* count can go negative! 
*/ + pthread_mutex_unlock(&q->lock); + return item; +} + +/* This function is correct — the crash is NOT here */ +void process_item(const char *item) +{ + printf("Processing: %s\n", item); +} + +/* Consumer thread */ +void *consumer(void *arg) +{ + queue_t *q = (queue_t *)arg; + while (1) { + char *item = dequeue(q); + if (item) { + process_item(item); + free(item); /* correctly frees the strdup from enqueue */ + } + } + return NULL; +} + +/* Producer thread */ +void *producer(void *arg) +{ + queue_t *q = (queue_t *)arg; + for (int i = 0; i < 10000; i++) { + char msg[32]; + snprintf(msg, sizeof(msg), "msg-%d", i); + while (enqueue(q, msg) < 0) + ; /* spin until space available */ + } + return NULL; +} + +int main(void) +{ + queue_t q; + queue_init(&q); + + pthread_t producers[4], consumers[2]; + + for (int i = 0; i < 4; i++) + pthread_create(&producers[i], NULL, producer, &q); + for (int i = 0; i < 2; i++) + pthread_create(&consumers[i], NULL, consumer, &q); + + for (int i = 0; i < 4; i++) + pthread_join(producers[i], NULL); + + /* Note: consumers run forever — in production we'd signal them to stop */ + printf("All producers finished.\n"); + return 0; +} diff --git a/docs/demo/demo_server.c b/docs/demo/demo_server.c new file mode 100644 index 0000000..6eb7bcb --- /dev/null +++ b/docs/demo/demo_server.c @@ -0,0 +1,154 @@ +/* demo_server.c — Minimal TCP echo server with 5 planted bugs. + * + * PURPOSE: PromptKit demo. Ask an LLM to review this file for bugs, + * first with a plain "vibe" prompt, then with a PromptKit-assembled + * prompt, and compare the findings. + * + * PLANTED ISSUES (do NOT reveal these to the LLM during the demo): + * + * Bug 1 (Critical) — Use-after-free on line 65/73: + * `client->buf` is freed, then read on the next iteration. + * + * Bug 2 (Critical) — Buffer overflow on line 52: + * `strcpy` into a fixed-size buffer with no bounds check. 
+ * + * Bug 3 (High) — Unchecked return value on line 61: + * `recv()` can return -1 on error; code treats it as a valid length. + * + * Bug 4 (Medium) — Off-by-one on line 46: + * `<=` should be `<`; writes one byte past `msg` buffer. + * + * Bug 5 (Medium) — Resource leak on line 83: + * Early return on `send()` failure does not close `client_fd`. + * + * There is also a RED HERRING: `create_client` allocates memory that + * is freed in `destroy_client` (correct). A shallow review might flag + * this as a leak because the free is in a different function. + */ + +#include +#include +#include +#include +#include +#include + +#define MAX_MSG 256 +#define BACKLOG 10 + +typedef struct { + int fd; + char *buf; + int buf_size; +} client_t; + +/* Bug 4 (Off-by-one): loop condition uses <= instead of < */ +static void sanitize(char *msg, int len) +{ + for (int i = 0; i <= len; i++) { /* BUG: should be i < len */ + if (msg[i] == '\n' || msg[i] == '\r') + msg[i] = ' '; + } +} + +/* Bug 2 (Buffer overflow): no bounds check on incoming name */ +static void log_connection(const char *client_name) +{ + char log_entry[64]; + strcpy(log_entry, "CONNECT: "); + strcat(log_entry, client_name); /* BUG: unbounded copy */ + printf("%s\n", log_entry); +} + +/* Red herring: allocation here is correctly freed in destroy_client */ +static client_t *create_client(int fd, int buf_size) +{ + client_t *c = malloc(sizeof(client_t)); + if (!c) return NULL; + c->fd = fd; + c->buf = malloc(buf_size); + c->buf_size = buf_size; + return c; +} + +/* Bug 3 (Unchecked return): recv() error not handled */ +/* Bug 1 (Use-after-free): buf freed then used in next call */ +static int handle_echo(client_t *client) +{ + int n = recv(client->fd, client->buf, client->buf_size, 0); + /* BUG 3: n could be -1 (error); code falls through */ + + if (n == 0) { + free(client->buf); /* BUG 1: frees buf ... */ + return 0; /* client disconnected */ + } + + sanitize(client->buf, n); + + /* ... 
but if the caller loops and calls handle_echo again + * after a partial read/reconnect, client->buf is dangling. */ + + return n; +} + +/* Bug 5 (Resource leak): client_fd not closed on send failure */ +static void serve_client(int client_fd, const char *client_name) +{ + log_connection(client_name); + + client_t *client = create_client(client_fd, MAX_MSG); + if (!client) { + close(client_fd); + return; + } + + int n; + while ((n = handle_echo(client)) > 0) { + if (send(client_fd, client->buf, n, 0) < 0) { + perror("send"); + free(client->buf); + free(client); + return; /* BUG 5: client_fd never closed */ + } + } + + close(client_fd); + /* destroy_client not called — but buf was freed in handle_echo + * on the n==0 path, so only the client_t struct leaks here. + * (This is a secondary consequence of Bug 1's design.) */ + free(client); +} + +static void destroy_client(client_t *c) +{ + if (!c) return; + close(c->fd); + free(c->buf); + free(c); +} + +int main(void) +{ + int server_fd = socket(AF_INET, SOCK_STREAM, 0); + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_addr.s_addr = INADDR_ANY, + .sin_port = htons(8080), + }; + + bind(server_fd, (struct sockaddr *)&addr, sizeof(addr)); + listen(server_fd, BACKLOG); + + printf("Listening on :8080\n"); + + for (;;) { + struct sockaddr_in client_addr; + socklen_t addr_len = sizeof(client_addr); + int client_fd = accept(server_fd, (struct sockaddr *)&client_addr, + &addr_len); + /* For demo simplicity, handle one client at a time */ + serve_client(client_fd, "unknown"); + } + + return 0; +} diff --git a/docs/demo/rate_limiter_description.md b/docs/demo/rate_limiter_description.md new file mode 100644 index 0000000..5d714b7 --- /dev/null +++ b/docs/demo/rate_limiter_description.md @@ -0,0 +1,13 @@ +# Rate Limiter — Project Description + +> **Give this description to the LLM for both the vibe prompt and the +> PromptKit prompt. 
Do NOT add any extra context — the point is to see +> how much structure each approach extracts from the same sparse input.** + +--- + +## The Ask + +We need a rate limiter for our REST API. It should limit each +authenticated user to a configurable number of requests per time window. +When the limit is exceeded, return HTTP 429 Too Many Requests. From ca52bd1dbc6e9dfcec8daee5d110345bfe46b1f7 Mon Sep 17 00:00:00 2001 From: Andrew Beltrano Date: Wed, 1 Apr 2026 15:36:35 -0600 Subject: [PATCH 2/2] Move answer keys out of source files into separate answer-key.md Strip all planted-bug comments, hints, and annotations from demo_server.c and demo_queue.c so the LLM cannot see the answers when the code is provided as context during the demo. All answer details (bug descriptions, interleavings, red herrings, and the implicit-requirements checklist) are now in answer-key.md, which the presenter references but never includes in LLM context. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/demo/README.md | 5 +- docs/demo/answer-key.md | 102 +++++++++++++++++++++++++++++++++++++++ docs/demo/demo-script.md | 20 ++------ docs/demo/demo_queue.c | 43 +++-------------- docs/demo/demo_server.c | 49 +++---------------- 5 files changed, 122 insertions(+), 97 deletions(-) create mode 100644 docs/demo/answer-key.md diff --git a/docs/demo/README.md b/docs/demo/README.md index 18ca7ff..082b56b 100644 --- a/docs/demo/README.md +++ b/docs/demo/README.md @@ -16,10 +16,11 @@ and output structure. 
| File | Purpose | |------|---------| -| `demo_server.c` | Code review sample — C echo server with 5 planted bugs | -| `demo_queue.c` | Bug investigation sample — producer-consumer with a TOCTOU race | +| `demo_server.c` | Code review sample — C echo server | +| `demo_queue.c` | Bug investigation sample — producer-consumer queue | | `rate_limiter_description.md` | Requirements authoring sample — 3-sentence project description | | `demo-script.md` | Presenter script with timing, talking points, and scorecards | +| `answer-key.md` | **Presenter only** — planted bug reference and scoring guide | ## Quick Start diff --git a/docs/demo/answer-key.md b/docs/demo/answer-key.md new file mode 100644 index 0000000..5633ba4 --- /dev/null +++ b/docs/demo/answer-key.md @@ -0,0 +1,102 @@ +# Demo Answer Key + +> **⚠️ PRESENTER ONLY — do not share this file or include it in LLM context.** +> +> This file documents the planted issues in the demo code samples. +> Use it to score LLM outputs after each run. + +--- + +## demo_server.c — Code Review (5 bugs + 1 red herring) + +| # | Severity | Category | Location | Description | +|---|----------|----------|----------|-------------| +| 1 | Critical | Use-after-free | `handle_echo()` | `client->buf` is freed when `n == 0` (client disconnects) but the pointer is left dangling in `client`. Any later `handle_echo()` or `destroy_client()` call on the same `client` is a use-after-free or double free. The current `serve_client()` loop stops as soon as `handle_echo()` returns 0, so the bug is latent rather than triggered on every disconnect. | +| 2 | Critical | Buffer overflow | `log_connection()` | `strcpy`/`strcat` copies `client_name` into a 64-byte `log_entry` with no bounds check. If `client_name` exceeds ~54 characters, this overflows the stack buffer. | +| 3 | High | Unchecked return | `handle_echo()` | `recv()` can return -1 on error. The code only checks for `n == 0` (disconnect) and falls through to `sanitize()` with `n == -1`, passing a negative length. | +| 4 | Medium | Off-by-one | `sanitize()` | Loop condition `i <= len` iterates one past the valid data. Should be `i < len`. 
Reads (and potentially writes) one byte beyond the received data. | +| 5 | Medium | Resource leak | `serve_client()` | When `send()` fails, the function frees `client->buf` and `client` but returns without closing `client_fd`, leaking the file descriptor. | + +### Red Herring + +`create_client()` calls `malloc()` for both the `client_t` struct and +`client->buf`. A shallow review might flag this as a memory leak because +`destroy_client()` (which properly frees both) is defined but never called +in the normal code path. However, the code does free `client->buf` and +the `client_t` struct through other paths — the design is just unusual, +not leaking (aside from the bugs above that create leak-like consequences). + +--- + +## demo_queue.c — Bug Investigation (1 root cause + 1 red herring) + +### Root Cause: TOCTOU Race in `dequeue()` + +```c +char *dequeue(queue_t *q) +{ + if (q->count == 0) // CHECK — outside the lock + return NULL; + + pthread_mutex_lock(&q->lock); // ACQUIRE — another thread may act here + char *item = q->items[q->head]; // USE — head/count may now be stale +``` + +**The interleaving:** + +1. Thread A calls `dequeue()`, reads `q->count == 1`, passes the check. +2. Thread A is preempted before acquiring the lock. +3. Thread B calls `dequeue()`, also reads `q->count == 1`, passes the check. +4. Thread B acquires the lock, dequeues the last item, decrements count to 0. +5. Thread A resumes, acquires the lock, reads `q->items[q->head]` — but + Thread B already consumed the last item and advanced `head`, so this slot + holds no queued item (NULL, unless a producer has just refilled it). +6. Thread A still decrements `count` (now -1) and returns the slot's contents; + `head`/`count` are now out of step with the queue and an item is later lost. + +**Why ASan doesn't catch it:** The race corrupts queue bookkeeping (`head` +and `count`), not the heap, so ASan's heap checks have nothing to report. +`consumer()` skips NULL returns, so confirm the crash path before presenting. 
+ +**Correct fix:** Move the count check inside the lock: + +```c +char *dequeue(queue_t *q) +{ + pthread_mutex_lock(&q->lock); + if (q->count == 0) { + pthread_mutex_unlock(&q->lock); + return NULL; + } + char *item = q->items[q->head]; + // ... rest unchanged +``` + +### Red Herring: strdup/free Pattern + +`enqueue()` calls `strdup(item)` to allocate a copy of each string. +`consumer()` calls `free(item)` after `process_item()`. This is a +correct allocate-in-producer / free-in-consumer pattern. It is NOT a +memory leak. + +--- + +## rate_limiter_description.md — Requirements Authoring + +There are no planted bugs — this scenario measures **completeness and +structure**. Score by counting how many of these the LLM surfaces: + +### Implicit Requirements Most Developers Miss + +| Category | Requirement | Why It Matters | +|----------|-------------|----------------| +| HTTP semantics | Include `Retry-After` header in 429 response | RFC 6585 allows it (MAY); clients need it for backoff | +| Distributed | Behavior with multiple API server instances | Single-node counters don't work behind a load balancer | +| Clock skew | Window boundary behavior | Which window counts a request that arrives exactly at the boundary, or when server clocks disagree? | +| Persistence | Rate limit state durability | What happens to counts when the service restarts? | +| Observability | Metrics / logging for rate limit events | Ops team needs visibility into throttling patterns | +| Graceful degradation | Behavior when rate limit store is unavailable | Fail-open (allow all) or fail-closed (deny all)? | +| Burst handling | Sliding window vs. fixed window vs. token bucket | Fixed windows allow 2x burst at boundaries | +| Identity | What counts as "a user"? | API key? OAuth token? IP fallback for unauthenticated? | +| Configurability | Per-endpoint or global limits? 
| Different endpoints may need different thresholds | +| Response body | What information to include in the 429 body | Current usage, limit, reset time | diff --git a/docs/demo/demo-script.md b/docs/demo/demo-script.md index ef2349b..71bb05e 100644 --- a/docs/demo/demo-script.md +++ b/docs/demo/demo-script.md @@ -276,19 +276,7 @@ As the output renders, highlight: ## Appendix: Planted Bug Reference -### demo_server.c (5 bugs) - -| # | Bug | Severity | Location | Description | -|---|-----|----------|----------|-------------| -| 1 | Use-after-free | Critical | L65, L73 | `client->buf` freed on disconnect, then read if caller loops | -| 2 | Buffer overflow | Critical | L52 | `strcpy`/`strcat` into 64-byte buffer, no bounds check | -| 3 | Unchecked return | High | L61 | `recv()` can return -1; code treats it as valid length | -| 4 | Off-by-one | Medium | L46 | `i <= len` writes one past buffer end | -| 5 | Resource leak | Medium | L83 | `client_fd` not closed on `send()` failure path | - -### demo_queue.c (1 root cause + 1 red herring) - -| # | Issue | Type | Location | Description | -|---|-------|------|----------|-------------| -| 1 | TOCTOU race | Root cause | L55–60 | `count` checked outside lock; another thread can drain queue between check and lock | -| 2 | strdup/free | Red herring | L42, L81 | Allocation in enqueue, free in consumer — this is correct | +See [`answer-key.md`](answer-key.md) for the full planted bug reference +with expected findings, interleavings, and implicit requirement checklist. + +**⚠️ Do not include `answer-key.md` in any LLM context during the demo.** diff --git a/docs/demo/demo_queue.c b/docs/demo/demo_queue.c index 11bef3d..f0be8ac 100644 --- a/docs/demo/demo_queue.c +++ b/docs/demo/demo_queue.c @@ -1,30 +1,5 @@ -/* demo_queue.c — Lock-free-looking producer-consumer queue with a - * subtle TOCTOU race condition. - * - * PURPOSE: PromptKit demo. 
Give this code to an LLM with the symptom - * "intermittent crashes under load" and compare how a vibe prompt - * vs. a PromptKit-assembled prompt investigates the root cause. - * - * PLANTED ISSUE (do NOT reveal during the demo): - * - * Root Cause — TOCTOU race in dequeue() (lines 55-60): - * `count` is checked OUTSIDE the lock, then the lock is acquired - * and the item is dequeued. Between the check and the lock - * acquisition, another thread can dequeue the last item, causing - * a read from an empty queue (head == tail, stale data or - * segfault if items[head] was already consumed/freed). - * - * Red Herring — malloc in enqueue() (line 42): - * `strdup(item)` allocates memory. It is correctly freed in the - * consumer (line 81). A shallow review may flag this as a leak. - * - * SYMPTOM DESCRIPTION (give this to the LLM): - * - * "This producer-consumer queue works fine in our single-threaded - * tests but crashes intermittently under load when we run 4 producer - * threads and 2 consumer threads. The crash is a segfault inside - * process_item(), but that function looks correct. We've checked for - * memory corruption with AddressSanitizer — no heap issues reported." +/* demo_queue.c — Producer-consumer queue using a mutex. + * Producers enqueue strings, consumers dequeue and process them. */ #include @@ -48,7 +23,6 @@ void queue_init(queue_t *q) pthread_mutex_init(&q->lock, NULL); } -/* Red herring: strdup allocates, but consumer frees — this is correct */ int enqueue(queue_t *q, const char *item) { pthread_mutex_lock(&q->lock); @@ -63,23 +37,20 @@ int enqueue(queue_t *q, const char *item) return 0; } -/* BUG: TOCTOU race — count checked OUTSIDE lock, then lock acquired. - * Between the check and the lock, another consumer can drain the queue. 
*/ char *dequeue(queue_t *q) { - if (q->count == 0) /* <-- CHECK outside lock */ + if (q->count == 0) return NULL; - pthread_mutex_lock(&q->lock); /* <-- another thread may dequeue here */ - char *item = q->items[q->head]; /* stale: head may now be invalid */ + pthread_mutex_lock(&q->lock); + char *item = q->items[q->head]; q->items[q->head] = NULL; q->head = (q->head + 1) % QUEUE_CAP; - q->count--; /* count can go negative! */ + q->count--; pthread_mutex_unlock(&q->lock); return item; } -/* This function is correct — the crash is NOT here */ void process_item(const char *item) { printf("Processing: %s\n", item); @@ -93,7 +64,7 @@ void *consumer(void *arg) char *item = dequeue(q); if (item) { process_item(item); - free(item); /* correctly frees the strdup from enqueue */ + free(item); } } return NULL; diff --git a/docs/demo/demo_server.c b/docs/demo/demo_server.c index 6eb7bcb..4e3a505 100644 --- a/docs/demo/demo_server.c +++ b/docs/demo/demo_server.c @@ -1,29 +1,5 @@ -/* demo_server.c — Minimal TCP echo server with 5 planted bugs. - * - * PURPOSE: PromptKit demo. Ask an LLM to review this file for bugs, - * first with a plain "vibe" prompt, then with a PromptKit-assembled - * prompt, and compare the findings. - * - * PLANTED ISSUES (do NOT reveal these to the LLM during the demo): - * - * Bug 1 (Critical) — Use-after-free on line 65/73: - * `client->buf` is freed, then read on the next iteration. - * - * Bug 2 (Critical) — Buffer overflow on line 52: - * `strcpy` into a fixed-size buffer with no bounds check. - * - * Bug 3 (High) — Unchecked return value on line 61: - * `recv()` can return -1 on error; code treats it as a valid length. - * - * Bug 4 (Medium) — Off-by-one on line 46: - * `<=` should be `<`; writes one byte past `msg` buffer. - * - * Bug 5 (Medium) — Resource leak on line 83: - * Early return on `send()` failure does not close `client_fd`. 
- * - * There is also a RED HERRING: `create_client` allocates memory that - * is freed in `destroy_client` (correct). A shallow review might flag - * this as a leak because the free is in a different function. +/* demo_server.c — Minimal TCP echo server. + * Accepts connections, reads data, sanitizes it, and echoes it back. */ #include @@ -42,25 +18,22 @@ typedef struct { int buf_size; } client_t; -/* Bug 4 (Off-by-one): loop condition uses <= instead of < */ static void sanitize(char *msg, int len) { - for (int i = 0; i <= len; i++) { /* BUG: should be i < len */ + for (int i = 0; i <= len; i++) { if (msg[i] == '\n' || msg[i] == '\r') msg[i] = ' '; } } -/* Bug 2 (Buffer overflow): no bounds check on incoming name */ static void log_connection(const char *client_name) { char log_entry[64]; strcpy(log_entry, "CONNECT: "); - strcat(log_entry, client_name); /* BUG: unbounded copy */ + strcat(log_entry, client_name); printf("%s\n", log_entry); } -/* Red herring: allocation here is correctly freed in destroy_client */ static client_t *create_client(int fd, int buf_size) { client_t *c = malloc(sizeof(client_t)); @@ -71,27 +44,20 @@ static client_t *create_client(int fd, int buf_size) return c; } -/* Bug 3 (Unchecked return): recv() error not handled */ -/* Bug 1 (Use-after-free): buf freed then used in next call */ static int handle_echo(client_t *client) { int n = recv(client->fd, client->buf, client->buf_size, 0); - /* BUG 3: n could be -1 (error); code falls through */ if (n == 0) { - free(client->buf); /* BUG 1: frees buf ... */ + free(client->buf); return 0; /* client disconnected */ } sanitize(client->buf, n); - /* ... but if the caller loops and calls handle_echo again - * after a partial read/reconnect, client->buf is dangling. 
*/ - return n; } -/* Bug 5 (Resource leak): client_fd not closed on send failure */ static void serve_client(int client_fd, const char *client_name) { log_connection(client_name); @@ -108,14 +74,11 @@ static void serve_client(int client_fd, const char *client_name) perror("send"); free(client->buf); free(client); - return; /* BUG 5: client_fd never closed */ + return; } } close(client_fd); - /* destroy_client not called — but buf was freed in handle_echo - * on the n==0 path, so only the client_t struct leaks here. - * (This is a secondary consequence of Bug 1's design.) */ free(client); }