From c29e94cc1303193d3324f413cf2bbc5b8654de49 Mon Sep 17 00:00:00 2001 From: ventselartur Date: Fri, 19 Jun 2026 17:04:35 +0200 Subject: [PATCH] update --- docs/test-generation.md | 57 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/docs/test-generation.md b/docs/test-generation.md index 6b732d38a..0bb876417 100644 --- a/docs/test-generation.md +++ b/docs/test-generation.md @@ -41,6 +41,63 @@ This category "reverses" the SWE-Bench workflow: instead of generating a fix, th

No results available yet. Check back soon!

{% endif %} +## Tooling Experiments + +Comparing GitHub Copilot CLI runs that enable AL developer tooling—the **AL MCP server** (`altool`) and the **AL LSP server**—against the matching no-tooling **Default** baseline for the same model. + +{% if site.data.test-generation.aggregate %} +{%- assign tooling_models = "" -%}{%- comment -%} Pass 1: build tooling_models, a string of "|model|" tokens, one for each aggregate whose experiment lists at least one MCP server or has al_lsp_enabled set. The "|" delimiter on both sides lets the later contains check match a complete model name rather than a substring of a longer one. {%- endcomment -%} +{%- for agg in site.data.test-generation.aggregate -%} + {%- if agg.experiment -%} + {%- assign is_tooling = false -%} + {%- if agg.experiment.mcp_servers.size > 0 %}{% assign is_tooling = true %}{% endif -%} + {%- if agg.experiment.al_lsp_enabled %}{% assign is_tooling = true %}{% endif -%} + {%- if is_tooling %}{% assign tooling_models = tooling_models | append: "|" | append: agg.model | append: "|" %}{% endif -%} + {%- endif -%} +{%- endfor -%} + + + + + + + + + + + + + + {%- assign sorted_results = site.data.test-generation.aggregate | sort: "average" | reverse -%}{%- comment -%} Pass 2: walk the aggregates from highest to lowest average and set show_row for (a) tooling runs, detected with the same MCP-server / AL-LSP test as pass 1, and (b) aggregates that have no experiment entry, but only when their model appears in tooling_models. Aggregates whose experiment entry enables neither tool are not shown. {%- endcomment -%} + {%- for agg in sorted_results -%} + {%- assign is_tooling = false -%} + {%- assign show_row = false -%} + {%- if agg.experiment -%} + {%- if agg.experiment.mcp_servers.size > 0 %}{% assign is_tooling = true %}{% endif -%} + {%- if agg.experiment.al_lsp_enabled %}{% assign is_tooling = true %}{% endif -%} + {%- if is_tooling %}{% assign show_row = true %}{% endif -%} + {%- else -%} + {%- assign model_key = agg.model | prepend: "|" | append: "|" -%} + {%- if tooling_models contains model_key %}{% assign show_row = true %}{% endif -%} + {%- endif -%} + {%- if show_row %} + + + + + + + + + + {%- endif -%} + {%- endfor %} + +
ModelMCP ServersAL LSPmean (95% CI)pass^5Avg TimeVer
{{ agg.model }}{% if is_tooling %}{% if agg.experiment.mcp_servers.size > 0 %}{{ agg.experiment.mcp_servers | join: ", " }}{% else %}—{% endif %}{% else %}Default{% endif %}{% if is_tooling and agg.experiment.al_lsp_enabled %}✓{% else %}—{% endif %}{{ agg.average | times: 100.0 | round: 1 }}%{% if agg.ci_low %} ({{ agg.ci_low | times: 100.0 | round: 1 }}-{{ agg.ci_high | times: 100.0 | round: 1 }}%){% endif %}{% if agg.pass_hat_5 %}{{ agg.pass_hat_5 | times: 100.0 | round: 1 }}%{% endif %}{{ agg.average_duration | round: 1 }}s{{ agg.benchmark_version }}
+{% else %} +

No results available yet. Check back soon!

+{% endif %} + ## ALTest Custom Agent Comparing experimental configurations for GitHub Copilot CLI with `ALTest` custom agent using **claude-opus-4-6**.