diff --git a/src/components/Sidebar.astro b/src/components/Sidebar.astro index 7f30287a..2c55dae4 100644 --- a/src/components/Sidebar.astro +++ b/src/components/Sidebar.astro @@ -94,7 +94,7 @@ const isApiTab = activeTab?.tab === 'API'; function inferApiMethod(title: string): { method: string; css: string } | null { const t = title.toLowerCase(); - if (/\b(list|get|retrieve|health|find|export|progress|analytics|agreement|compare|stats|summary|voices|tts)\b/.test(t)) { + if (/\b(list|get|retrieve|health|find|export|progress|analytics|agreement|compare|stats|summary|voices|tts|aggregat\w*)\b/.test(t)) { return { method: 'GET', css: 'api-method-get' }; } if (/\b(create|add|generate|execute|submit|assign|bulk|complete|skip|release|pause|unpause|check|upload|start|duplicate|fetch|run|rerun|cancel|clone|merge)\b/.test(t)) { diff --git a/src/lib/api-navigation.ts b/src/lib/api-navigation.ts index f5ae3e9d..c71153bf 100644 --- a/src/lib/api-navigation.ts +++ b/src/lib/api-navigation.ts @@ -294,7 +294,8 @@ export const apiNavigation: ApiNavGroup[] = [ { "title": "Delete Eval Task", "href": "/docs/api/eval-tasks/delete-eval-task", "method": "DELETE" }, { "title": "Bulk Delete Eval Tasks", "href": "/docs/api/eval-tasks/bulk-delete-eval-tasks", "method": "POST" }, { "title": "Pause Eval Task", "href": "/docs/api/eval-tasks/pause-eval-task", "method": "POST" }, - { "title": "Unpause Eval Task", "href": "/docs/api/eval-tasks/unpause-eval-task", "method": "POST" } + { "title": "Unpause Eval Task", "href": "/docs/api/eval-tasks/unpause-eval-task", "method": "POST" }, + { "title": "Eval Task Aggregations", "href": "/docs/api/eval-tasks/eval-task-aggregations", "method": "GET" } ] }, { diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts index f8c26b39..869c1c3c 100644 --- a/src/lib/navigation.ts +++ b/src/lib/navigation.ts @@ -1015,6 +1015,7 @@ export const tabNavigation: NavTab[] = [ { title: 'Bulk Delete Eval Tasks', href: '/docs/api/eval-tasks/bulk-delete-eval-tasks' }, { 
title: 'Pause Eval Task', href: '/docs/api/eval-tasks/pause-eval-task' }, { title: 'Unpause Eval Task', href: '/docs/api/eval-tasks/unpause-eval-task' }, + { title: 'Eval Task Aggregations', href: '/docs/api/eval-tasks/eval-task-aggregations' }, ] }, { diff --git a/src/pages/docs/api/eval-tasks/eval-task-aggregations.mdx b/src/pages/docs/api/eval-tasks/eval-task-aggregations.mdx new file mode 100644 index 00000000..2edf3144 --- /dev/null +++ b/src/pages/docs/api/eval-tasks/eval-task-aggregations.mdx @@ -0,0 +1,128 @@ +--- +title: "Eval Task Aggregations" +description: "Aggregate eval-task results in two complementary views — per-eval rollups (avg score, pass rate, per-choice proportions) and per-span pivots of raw eval values. Driven by two boolean flags, returned in a single GET." +--- + + + + + + Your Future AGI API key used to authenticate requests. You can find and manage your API keys in the [Dashboard](https://app.futureagi.com) under Settings. + + + Your Future AGI secret key, used alongside the API key for request authentication. This is generated when you create an API key in the [Dashboard](https://app.futureagi.com). + + + + + + The eval task whose runs should be aggregated. + + + When `true`, the response includes the `eval_aggregation` object — one rollup per `CustomEvalConfig` that ran in the task, keyed by eval name. Defaults to `false`. + + + When `true`, the response includes the `span_aggregation` object — one entry per span the task evaluated, keyed by `span_id`, with the raw value of every eval that touched it. Defaults to `false`. + + + + + UUID of the eval task that was aggregated. Echoed back from the request. + + + Per-eval rollup. Present only when `eval_aggregation=true`. Keys are `CustomEvalConfig` names; values are one rollup object per eval. + + + UUID of the eval config. + Eval config name (same as the parent key). + Normalised output type for the eval: `percentage`, `pass_fail`, or `deterministic`. 
Drives the shape of `aggregated_score`. + + The eval-level rollup. Shape depends on `output_type`: +
• **`percentage`** — `number` (average of `output_float` across non-error runs, to 4 decimal places, e.g. `0.7421`). +
• **`pass_fail`** — `number` (pass rate as a percentage from `0` to `100`, to 2 decimal places, e.g. `87.5`). +
• **`deterministic`** — `object` mapping each observed choice to its occurrence percentage (`0–100`, to 2 decimal places), e.g. `{"positive": 62.5, "neutral": 25.0}`. Only choices that actually appeared in the data are included. +
`null` when there are no rows to aggregate (every run errored, or there are no runs). +
+
+ + + Per-span pivot. Present only when `span_aggregation=true`. Outer keys are `span_id` (one per span the task evaluated); inner keys are eval names; inner values are one entry per eval that touched the span. + + + UUID of the eval config. + Eval config name. + Normalised output type for the eval: `percentage`, `pass_fail`, or `deterministic`. Drives the shape of `value`. + + The raw per-row eval result — **no averaging**. Shape depends on `output_type`: +
• **`percentage`** — `number` (the row's `output_float`, e.g. `0.82`). +
• **`pass_fail`** — `boolean` (the row's `output_bool`). +
• **`deterministic`** — `array` of choice strings (the row's `output_str_list`, e.g. `["positive"]`). +
When the same `(span, eval)` pair has multiple runs (re-runs), the latest by `created_at` wins. +
+
+
+ + + + The aggregation response is only returned when `eval_aggregation` or `span_aggregation` (or both) is `true`. Both flags can be combined in a single request; the response will contain both top-level keys. + + + Eval runs that have been soft-deleted are skipped in both aggregations so the rollups reflect the user's current view of the data. + + + Session- and trace-target eval runs (where there is no underlying span) are not included in `span_aggregation`. + + + + + + `eval_task_id` is missing, or no eval task with that ID exists in the caller's organization. + + + Invalid or missing API credentials. + +