diff --git a/src/components/Sidebar.astro b/src/components/Sidebar.astro
index 7f30287a..2c55dae4 100644
--- a/src/components/Sidebar.astro
+++ b/src/components/Sidebar.astro
@@ -94,7 +94,7 @@ const isApiTab = activeTab?.tab === 'API';
function inferApiMethod(title: string): { method: string; css: string } | null {
const t = title.toLowerCase();
- if (/\b(list|get|retrieve|health|find|export|progress|analytics|agreement|compare|stats|summary|voices|tts)\b/.test(t)) {
+ if (/\b(list|get|retrieve|health|find|export|progress|analytics|agreement|compare|stats|summary|voices|tts|aggregat\w*)\b/.test(t)) {
return { method: 'GET', css: 'api-method-get' };
}
if (/\b(create|add|generate|execute|submit|assign|bulk|complete|skip|release|pause|unpause|check|upload|start|duplicate|fetch|run|rerun|cancel|clone|merge)\b/.test(t)) {
diff --git a/src/lib/api-navigation.ts b/src/lib/api-navigation.ts
index f5ae3e9d..c71153bf 100644
--- a/src/lib/api-navigation.ts
+++ b/src/lib/api-navigation.ts
@@ -294,7 +294,8 @@ export const apiNavigation: ApiNavGroup[] = [
{ "title": "Delete Eval Task", "href": "/docs/api/eval-tasks/delete-eval-task", "method": "DELETE" },
{ "title": "Bulk Delete Eval Tasks", "href": "/docs/api/eval-tasks/bulk-delete-eval-tasks", "method": "POST" },
{ "title": "Pause Eval Task", "href": "/docs/api/eval-tasks/pause-eval-task", "method": "POST" },
- { "title": "Unpause Eval Task", "href": "/docs/api/eval-tasks/unpause-eval-task", "method": "POST" }
+ { "title": "Unpause Eval Task", "href": "/docs/api/eval-tasks/unpause-eval-task", "method": "POST" },
+ { "title": "Eval Task Aggregations", "href": "/docs/api/eval-tasks/eval-task-aggregations", "method": "GET" }
]
},
{
diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts
index f8c26b39..869c1c3c 100644
--- a/src/lib/navigation.ts
+++ b/src/lib/navigation.ts
@@ -1015,6 +1015,7 @@ export const tabNavigation: NavTab[] = [
{ title: 'Bulk Delete Eval Tasks', href: '/docs/api/eval-tasks/bulk-delete-eval-tasks' },
{ title: 'Pause Eval Task', href: '/docs/api/eval-tasks/pause-eval-task' },
{ title: 'Unpause Eval Task', href: '/docs/api/eval-tasks/unpause-eval-task' },
+ { title: 'Eval Task Aggregations', href: '/docs/api/eval-tasks/eval-task-aggregations' },
]
},
{
diff --git a/src/pages/docs/api/eval-tasks/eval-task-aggregations.mdx b/src/pages/docs/api/eval-tasks/eval-task-aggregations.mdx
new file mode 100644
index 00000000..2edf3144
--- /dev/null
+++ b/src/pages/docs/api/eval-tasks/eval-task-aggregations.mdx
@@ -0,0 +1,128 @@
+---
+title: "Eval Task Aggregations"
+description: "Aggregate eval-task results in two complementary views — per-eval rollups (avg score, pass rate, per-choice proportions) and per-span pivots of raw eval values. Driven by two boolean flags, returned in a single GET."
+---
+
+
+
+
+
+ Your Future AGI API key, used to authenticate requests. You can find and manage your API keys in the [Dashboard](https://app.futureagi.com) under Settings.
+
+
+ Your Future AGI secret key, used alongside the API key for request authentication. This is generated when you create an API key in the [Dashboard](https://app.futureagi.com).
+
+
+
+
+
+ The eval task whose runs should be aggregated.
+
+
+ When `true`, the response includes the `eval_aggregation` object — one rollup per `CustomEvalConfig` that ran in the task, keyed by eval name. Defaults to `false`.
+
+
+ When `true`, the response includes the `span_aggregation` object — one entry per span the task evaluated, keyed by `span_id`, with the raw value of every eval that touched it. Defaults to `false`.
+
+
+
+
+ UUID of the eval task that was aggregated. Echoed back from the request.
+
+
+ Per-eval rollup. Present only when `eval_aggregation=true`. Keys are `CustomEvalConfig` names; values are one rollup object per eval.
+
+
+ UUID of the eval config.
+ Eval config name (same as the parent key).
+ Normalised output type for the eval: `percentage`, `pass_fail`, or `deterministic`. Drives the shape of `aggregated_score`.
+
+ The eval-level rollup. Shape depends on `output_type`:
+
+ • **`percentage`** — `number` (average of `output_float` across non-error runs, to 4 decimal places, e.g. `0.7421`).
+
+ • **`pass_fail`** — `number` (pass rate as a percentage from `0` to `100`, to 2 decimal places, e.g. `87.5`).
+
+ • **`deterministic`** — `object` mapping each observed choice to its occurrence percentage (`0`–`100`, to 2 decimal places), e.g. `{"positive": 62.5, "neutral": 25.0}`. Only choices that actually appeared in the data are included.
+
+ `null` when no aggregatable rows exist (all rows are errors, or there are no rows).
+
+
+
+
+ Per-span pivot. Present only when `span_aggregation=true`. Outer keys are `span_id` (one per span the task evaluated); inner keys are eval names; inner values are one entry per eval that touched the span.
+
+
+ UUID of the eval config.
+ Eval config name.
+ Normalised output type for the eval: `percentage`, `pass_fail`, or `deterministic`. Drives the shape of `value`.
+
+ The raw per-row eval result — **no averaging**. Shape depends on `output_type`:
+
+ • **`percentage`** — `number` (the row's `output_float`, e.g. `0.82`).
+
+ • **`pass_fail`** — `boolean` (the row's `output_bool`).
+
+ • **`deterministic`** — `array` of choice strings (the row's `output_str_list`, e.g. `["positive"]`).
+
+ When the same `(span, eval)` pair has multiple runs (re-runs), the run with the latest `created_at` wins.
+
+
+
+
+
+
+ The aggregation response is only returned when `eval_aggregation` or `span_aggregation` (or both) is `true`. Both flags can be combined in a single request; the response will contain both top-level keys.
+
+
+ Eval runs that have been soft-deleted are skipped in both aggregations so the rollups reflect the user's current view of the data.
+
+
+ Session- and trace-target eval runs (where there is no underlying span) are not included in `span_aggregation`.
+
+
+
+
+
+ `eval_task_id` is missing, or no eval task with that ID exists in the caller's organization.
+
+
+ Invalid or missing API credentials.
+
+