diff --git a/docs/test-generation.md b/docs/test-generation.md index 6b732d38a..0bb876417 100644 --- a/docs/test-generation.md +++ b/docs/test-generation.md @@ -41,6 +41,63 @@ This category "reverses" the SWE-Bench workflow: instead of generating a fix, th
No results available yet. Check back soon!
{% endif %} +## Tooling Experiments + +This section compares GitHub Copilot CLI runs that enable AL developer tooling—the **AL MCP server** (`altool`), the **AL LSP server**, or both—against the matching no-tooling **Default** baseline for the same model. + +{% if site.data.test-generation.aggregate %} +{%- assign tooling_models = "" -%} +{%- for agg in site.data.test-generation.aggregate -%} + {%- if agg.experiment -%} + {%- assign is_tooling = false -%} + {%- if agg.experiment.mcp_servers.size > 0 %}{% assign is_tooling = true %}{% endif -%} + {%- if agg.experiment.al_lsp_enabled %}{% assign is_tooling = true %}{% endif -%} + {%- if is_tooling %}{% assign tooling_models = tooling_models | append: "|" | append: agg.model | append: "|" %}{% endif -%} + {%- endif -%} +{%- endfor -%} +| Model | +MCP Servers | +AL LSP | +mean (95% CI) | +pass^5 | +Avg Time | +Ver | +
|---|---|---|---|---|---|---|
| {{ agg.model }} | +{% if is_tooling %}{% if agg.experiment.mcp_servers.size > 0 %}{{ agg.experiment.mcp_servers | join: ", " }}{% else %}—{% endif %}{% else %}Default{% endif %} | +{% if is_tooling and agg.experiment.al_lsp_enabled %}✓{% else %}—{% endif %} | +{{ agg.average | times: 100.0 | round: 1 }}%{% if agg.ci_low %} ({{ agg.ci_low | times: 100.0 | round: 1 }}-{{ agg.ci_high | times: 100.0 | round: 1 }}%){% endif %} | +{% if agg.pass_hat_5 %}{{ agg.pass_hat_5 | times: 100.0 | round: 1 }}%{% endif %} | +{{ agg.average_duration | round: 1 }}s | +{{ agg.benchmark_version }} | +
No results available yet. Check back soon!
+{% endif %} + ## ALTest Custom Agent Comparing experimental configurations for GitHub Copilot CLI with `ALTest` custom agent using **claude-opus-4-6**.