future-agi · KarthikAvinashFI · May 8, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
diff --git a/public/images/docs/evaluation/custom/agents-tab.png b/public/images/docs/evaluation/custom/agents-tab.png
diff --git a/public/images/docs/evaluation/custom/llm-judge-tab.png b/public/images/docs/evaluation/custom/llm-judge-tab.png
diff --git a/public/images/docs/evaluation/custom/output-type-scoring.png b/public/images/docs/evaluation/custom/output-type-scoring.png
diff --git a/public/images/docs/evaluation/error-localization/toggle-on.png b/public/images/docs/evaluation/error-localization/toggle-on.png
diff --git a/public/images/docs/evaluation/evaluate/add-evaluation.png b/public/images/docs/evaluation/evaluate/add-evaluation.png
diff --git a/public/images/docs/evaluation/evaluate/configured-evals-panel.png b/public/images/docs/evaluation/evaluate/configured-evals-panel.png
diff --git a/public/images/docs/evaluation/evaluate/dataset-page.png b/public/images/docs/evaluation/evaluate/dataset-page.png
diff --git a/public/images/docs/evaluation/evaluate/dataset-with-results-running.png b/public/images/docs/evaluation/evaluate/dataset-with-results-running.png
diff --git a/public/images/docs/evaluation/ground-truth/map-variables.png b/public/images/docs/evaluation/ground-truth/map-variables.png
diff --git a/public/images/docs/evaluation/ground-truth/role-mapping-embedding-generation.png b/public/images/docs/evaluation/ground-truth/role-mapping-embedding-generation.png
diff --git a/public/images/docs/evaluation/ground-truth/tab-selected.png b/public/images/docs/evaluation/ground-truth/tab-selected.png
diff --git a/public/images/docs/evaluation/test-playground/custom-tab.png b/public/images/docs/evaluation/test-playground/custom-tab.png
diff --git a/...ic/images/docs/evaluation/test-playground/toxicity-result-audio-inp-dataset.png b/...ic/images/docs/evaluation/test-playground/toxicity-result-audio-inp-dataset.png
diff --git a/public/images/docs/observe/evals/editing-eval-in-tasks-page.png b/public/images/docs/observe/evals/editing-eval-in-tasks-page.png
diff --git a/public/images/docs/observe/evals/evals-results-observe-page.png b/public/images/docs/observe/evals/evals-results-observe-page.png
diff --git a/public/images/docs/observe/evals/task-config-page.png b/public/images/docs/observe/evals/task-config-page.png
diff --git a/public/images/docs/simulation/add-evaluation-button.png b/public/images/docs/simulation/add-evaluation-button.png
diff --git a/public/images/docs/simulation/eval-results-page.png b/public/images/docs/simulation/eval-results-page.png
diff --git a/public/images/docs/simulation/map-variables-for-evals.png b/public/images/docs/simulation/map-variables-for-evals.png
diff --git a/public/images/docs/simulation/search-evals.png b/public/images/docs/simulation/search-evals.png
diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts
@@ -295,8 +295,13 @@ export const tabNavigation: NavTab[] = [
               { title: 'Understanding Evaluation', href: '/docs/evaluation/concepts/understanding-evaluation' },
               { title: 'Eval Types', href: '/docs/evaluation/concepts/eval-types' },
               { title: 'Eval Templates', href: '/docs/evaluation/concepts/eval-templates' },
+              { title: 'Output Types', href: '/docs/evaluation/concepts/output-types' },
+              { title: 'Data Injection', href: '/docs/evaluation/concepts/data-injection' },
+              { title: 'Composite Evals', href: '/docs/evaluation/concepts/composite-evals' },
+              { title: 'Versioning', href: '/docs/evaluation/concepts/versioning' },
               { title: 'Judge Models', href: '/docs/evaluation/concepts/judge-models' },
               { title: 'Eval Results', href: '/docs/evaluation/concepts/eval-results' },
+              { title: 'MCP Connectors in Evaluation', href: '/docs/evaluation/concepts/mcp-connectors' },
             ]
           },
           {
@@ -305,6 +310,10 @@ export const tabNavigation: NavTab[] = [
               { title: 'Built-in Evals', href: '/docs/evaluation/builtin' },
               { title: 'Evaluate via Platform & SDK', href: '/docs/evaluation/features/evaluate' },
               { title: 'Create Custom Evals', href: '/docs/evaluation/features/custom' },
+              { title: 'Test Playground', href: '/docs/evaluation/features/test-playground' },
+              { title: 'Ground Truth', href: '/docs/evaluation/features/ground-truth' },
+              { title: 'Error Localization', href: '/docs/evaluation/features/error-localization' },
+              { title: 'Configure MCP Connectors for an Eval', href: '/docs/evaluation/features/mcp-connectors' },
               { title: 'Use Custom Models', href: '/docs/evaluation/features/custom-models' },
               { title: 'Future AGI Models', href: '/docs/evaluation/features/futureagi-models' },
               { title: 'Evaluate CI/CD Pipeline', href: '/docs/evaluation/features/cicd' },
@@ -846,6 +855,7 @@ export const tabNavigation: NavTab[] = [
             icon: 'check-double',
             items: [
               { title: "Building an Eval Correction Loop: Teaching Your Evaluator What 'Good' Means for Your Domain", href: '/docs/cookbook/evaluation/eval-correction-loop' },
+              { title: 'Score Support Replies Against Live CRM Data Using MCP Connectors', href: '/docs/cookbook/evaluation/eval-with-mcp-connectors' },
             ]
           },
           {

diff --git a/src/pages/docs/cookbook/evaluation/eval-with-mcp-connectors.mdx b/src/pages/docs/cookbook/evaluation/eval-with-mcp-connectors.mdx
@@ -0,0 +1,184 @@
+---
+title: "Score Support Replies Against Live CRM Data Using MCP Connectors"
+description: "Build an Agent-mode eval that calls your CRM through an MCP connector to verify a customer-support reply against the user's real account state, then run it across a dataset."
+---
+
+<TLDR>
+Generic helpfulness evals can't tell you if an agent's reply is *factually* right for *this* customer. Wire your CRM as an MCP connector, attach it to an Agent-mode eval, and the judge will look up the real account state per row and score the reply against it. The result is an eval that fails on confident-sounding lies.
+</TLDR>
+
+| Time | Difficulty | Package |
+|------|-----------|---------|
+| 25 min | Intermediate | `ai-evaluation`, any MCP server |
+
+<Prerequisites>
+- Future AGI account → [app.futureagi.com](https://app.futureagi.com)
+- API keys: `FI_API_KEY` and `FI_SECRET_KEY` (see [Get your API keys](/docs/admin-settings))
+- An MCP server exposing a read-only lookup tool (we use a stub `crm` server below; substitute your own)
+- Python 3.10+ (the `fastmcp` package used for the stub server does not support older versions)
+</Prerequisites>
+
+## Why this matters
+
+Customer support replies often quote things like a user's plan, billing state, or feature flags. A model that hallucinates one of those produces a reply that *reads* correct but is wrong for the user in front of it. A generic eval scores the reply's tone or coherence; it cannot catch a wrong plan name.
+
+An Agent-mode eval with a CRM connector calls `crm.get_user(user_id)` while it scores each row. The judge compares the reply against the real record and fails when they disagree.
+
+---
+
+## Step 1: Stand up the CRM MCP connector
+
+If you already have a CRM with an MCP server (HubSpot, Salesforce, Linear, or an internal one), skip to Step 2. Otherwise, here is a minimal Python stub you can run locally for the walkthrough. It exposes one `get_user(user_id)` tool over MCP's Streamable HTTP transport (spelled `streamable-http` in FastMCP and `streamable_http` in the Future AGI connector form).
+
+```python
+# crm_mcp_server.py
+from fastmcp import FastMCP
+
+mcp = FastMCP("crm")
+
+USERS = {
+    "u-1001": {"plan": "pro", "two_factor": True, "last_invoice": "paid"},
+    "u-1002": {"plan": "free", "two_factor": False, "last_invoice": "unpaid"},
+    "u-1003": {"plan": "team", "two_factor": True, "last_invoice": "paid"},
+}
+
+@mcp.tool()
+def get_user(user_id: str) -> dict:
+    """Return the user's current plan, 2FA state, and last invoice status."""
+    return USERS.get(user_id, {"error": "not found"})
+
+if __name__ == "__main__":
+    mcp.run(transport="streamable-http", host="0.0.0.0", port=8000)
+```
+
+Run it:
+
+```bash
+pip install fastmcp
+python crm_mcp_server.py
+```
+
+Expose it through ngrok or your own tunnel so Future AGI can reach it:
+
+```bash
+ngrok http 8000
+```
+
+Grab the public URL (e.g. `https://abc123.ngrok-free.app`).
+
+---
+
+## Step 2: Register the connector in Future AGI
+
+1. Open **Settings → Connectors → Add Connector**.
+2. Fill in:
+
+   | Field | Value |
+   |---|---|
+   | Name | `crm-stub` |
+   | Server URL | `https://<your-tunnel>/mcp` |
+   | Transport | `streamable_http` |
+   | Auth type | `None` (the stub takes no auth — for a real server use the appropriate option) |
+
+3. Click **Test Connection** and then **Discover Tools**. You should see `get_user` in the list.
+4. Save.
+
+For a production connector, see [Falcon AI MCP Connectors](/docs/falcon-ai/features/mcp-connectors) for OAuth, bearer, and API-key setups.
+
+---
+
+## Step 3: Prepare the eval dataset
+
+Create a small dataset that pairs a user id with the agent's reply. Each row is one customer interaction.
+
+| user_id | reply |
+|---|---|
+| u-1001 | "Since you're on our Pro plan, you already have 2FA enabled. Your last invoice is paid in full." |
+| u-1002 | "I see you're on the Team plan with all premium features. Your invoice is up to date." |
+| u-1003 | "You're on the Team plan, 2FA is on, and your last invoice was settled." |
+
+The first and third rows are correct against the stub data. The second one is a hallucination: `u-1002` is on `free`, not `team`, and the last invoice is `unpaid`.
+
+Upload this dataset under **Datasets → New Dataset** with two text columns: `user_id` and `reply`.
+
+---
+
+## Step 4: Create the Agent-mode eval
+
+1. Open the dataset and click **Evaluate**.
+2. Click **Add Evaluation → Create your own eval**.
+3. Fill in:
+
+   | Field | Value |
+   |---|---|
+   | Name | `support_reply_matches_crm` |
+   | Mode | `Agent` |
+   | Output type | `pass_fail` |
+   | Model | `turing_large` |
+
+4. In **Tools**, enable the `crm-stub` connector. Leave Internet off — the eval should not need it.
+5. Paste this rule prompt:
+
+   ```text
+   You are scoring a customer-support reply against the user's real
+   account state.
+
+   user_id: {{user_id}}
+   reply: "{{reply}}"
+
+   Call `crm.get_user({{user_id}})` to fetch the user's real plan,
+   2FA state, and last_invoice status. Compare every concrete claim
+   in the reply against the live record.
+
+   Pass only if every concrete claim in the reply matches the record.
+   Fail otherwise. Your reason must quote the mismatched fields and
+   show both the claim and the live value.
+   ```
+
+6. Click **Test on Sample** with row 2 (`u-1002`) selected. You should see:
+
+   - A tool call: `crm.get_user("u-1002")`
+   - Tool response: `{"plan": "free", "two_factor": false, "last_invoice": "unpaid"}`
+   - Score: **Failed**
+   - Reason quoting the plan and invoice mismatch.
+
+7. Click **Add & Run** to score every row.
+
+---
+
+## Step 5: Read the results
+
+| user_id | Score | Reason |
+|---|---|---|
+| u-1001 | Passed | Plan matches Pro; 2FA matches enabled; invoice matches paid. |
+| u-1002 | Failed | Reply claimed Team plan but `crm.get_user` returned `free`. Reply said invoice up to date but record shows `unpaid`. |
+| u-1003 | Passed | All three claims match the live record. |
+
+Open any row to see the **Tool Trace** — the exact tool call and response the judge used. The trace is the audit log for why a row scored the way it did.
+
+---
+
+## Going further
+
+- **Add more tools**: Extend the MCP server with `get_subscription`, `get_invoices`, `get_feature_flags`. The judge will pick the right tool per claim.
+- **Use a real CRM**: Replace the stub with HubSpot, Stripe, or Linear MCP servers. Set the connector auth to OAuth or bearer.
+- **Score traces, not datasets**: The same Agent-mode eval can run against [traced](/docs/observe) production conversations — wire it as a [continuous evaluation](/docs/evaluation/features/cicd).
+- **Cross-source evals**: Enable two connectors at once (CRM + Notion docs). The judge calls both when the reply cites a product feature *and* a customer record.
+
+---
+
+## Pitfalls to avoid
+
+- **Vague rule prompts**. If the prompt does not mention which tool to use and what to check, the judge may skip the lookup. Spell it out.
+- **Too many enabled tools**. The judge has to read every tool schema on every row. Enable only the ones this eval needs.
+- **PII in tool responses**. The full tool response is included in the eval reason and trace. Mask or strip fields the eval doesn't need before they leave your MCP server.
+- **Rate limits**. A 100k-row run with two tool calls per row is 200k upstream requests. Throttle the dataset run or batch the eval rows accordingly.
+
+---
+
+## Next steps
+
+- [MCP Connectors concept](/docs/evaluation/concepts/mcp-connectors): The model behind Agent-mode evals.
+- [Configure MCP Connectors for an Eval](/docs/evaluation/features/mcp-connectors): The UI walkthrough.
+- [Create custom evals](/docs/evaluation/features/custom): Building eval templates without connectors.
+- [Falcon AI MCP Connectors](/docs/falcon-ai/features/mcp-connectors): Registering and authenticating a connector.
diff --git a/src/pages/docs/evaluation/builtin/accuracy.mdx b/src/pages/docs/evaluation/builtin/accuracy.mdx
@@ -0,0 +1,51 @@
+---
+title: "Accuracy: Built-in Evaluation"
+description: "Computes classification accuracy by comparing predicted labels against expected labels. Accepts single values or JSON arrays of labels. The comparison is case-insensitive."
+---
+
+<CodeGroup>
+
+```python Python
+result = evaluator.evaluate(
+    eval_templates="accuracy",
+    inputs={
+        "output": "The capital of France is Paris.",
+        "expected": "Paris"
+    },
+)
+
+print(result.eval_results[0].output)
+print(result.eval_results[0].reason)
+```
+
+```typescript JS/TS
+import { Evaluator } from "@future-agi/ai-evaluation";
+
+const evaluator = new Evaluator();
+
+const result = await evaluator.evaluate(
+  "accuracy",
+  {
+    output: "The capital of France is Paris.",
+    expected: "Paris"
+  }
+);
+
+console.log(result);
+```
+
+</CodeGroup>
+
+| **Input** |  |  |  |
+| ------ | --------- | ---- | ----------- |
+| | **Required Input** | **Type** | **Description** |
+| | `output` | `string` | Predicted label(s) - string, number, or JSON array. |
+| | `expected` | `string` | Expected label(s) - same format. |
+
+| **Output** |  |  |
+| ------ | ----- | ----------- |
+| | **Field** | **Description** |
+| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. |
+| | **Reason** | A plain-language explanation of the verdict. |
+
+**Tags:** `NLP Metrics`, `Output Validation`
diff --git a/src/pages/docs/evaluation/builtin/answer-similarity.mdx b/src/pages/docs/evaluation/builtin/answer-similarity.mdx
@@ -0,0 +1,51 @@
+---
+title: "Answer Similarity: Built-in Evaluation"
+description: "Evaluates the similarity between the expected and actual responses"
+---
+
+<CodeGroup>
+
+```python Python
+result = evaluator.evaluate(
+    eval_templates="answer_similarity",
+    inputs={
+        "expected_response": "...",
+        "response": "..."
+    },
+)
+
+print(result.eval_results[0].output)
+print(result.eval_results[0].reason)
+```
+
+```typescript JS/TS
+import { Evaluator } from "@future-agi/ai-evaluation";
+
+const evaluator = new Evaluator();
+
+const result = await evaluator.evaluate(
+  "answer_similarity",
+  {
+    expected_response: "...",
+    response: "..."
+  }
+);
+
+console.log(result);
+```
+
+</CodeGroup>
+
+| **Input** |  |  |  |
+| ------ | --------- | ---- | ----------- |
+| | **Required Input** | **Type** | **Description** |
+| | `expected_response` | `string` | The expected correct response. |
+| | `response` | `string` | The actual response to be evaluated. |
+
+| **Output** |  |  |
+| ------ | ----- | ----------- |
+| | **Field** | **Description** |
+| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. |
+| | **Reason** | A plain-language explanation of the verdict. |
+
+**Tags:** `NLP Metrics`, `Output Validation`
diff --git a/src/pages/docs/evaluation/builtin/api-call.mdx b/src/pages/docs/evaluation/builtin/api-call.mdx
@@ -0,0 +1,48 @@
+---
+title: "API Call: Built-in Evaluation"
+description: "Makes an API call and evaluates the response"
+---
+
+<CodeGroup>
+
+```python Python
+result = evaluator.evaluate(
+    eval_templates="api_call",
+    inputs={
+        "response": "..."
+    },
+)
+
+print(result.eval_results[0].output)
+print(result.eval_results[0].reason)
+```
+
+```typescript JS/TS
+import { Evaluator } from "@future-agi/ai-evaluation";
+
+const evaluator = new Evaluator();
+
+const result = await evaluator.evaluate(
+  "api_call",
+  {
+    response: "..."
+  }
+);
+
+console.log(result);
+```
+
+</CodeGroup>
+
+| **Input** |  |  |  |
+| ------ | --------- | ---- | ----------- |
+| | **Required Input** | **Type** | **Description** |
+| | `response` | `string` | The response to be evaluated. |
+
+| **Output** |  |  |
+| ------ | ----- | ----------- |
+| | **Field** | **Description** |
+| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. |
+| | **Reason** | A plain-language explanation of the verdict. |
+
+**Tags:** `Code`, `Output Validation`