Corbell-AI · himmi-01 · Jun 2, 2026
diff --git a/.env.example b/.env.example
@@ -69,3 +69,32 @@ LANGFUSE_SECRET_KEY="sk-lf-..."
 # Used by: evalmonkey generate-evals --langfuse-dataset <name>
 #          demo_rag_app.sh (automatic if keys are set)
 # LANGFUSE_DATASET="evalmonkey_failures"
+
+# ----------------------------------------
+# 5. Regression Guard (Optional)
+# ----------------------------------------
+# Score drop (in points) that triggers a regression warning after run-benchmark
+# and causes `evalmonkey guard` to exit with code 1.
+# Default: 5  (i.e. a drop of 5+ points vs the previous baseline is flagged)
+EVAL_REGRESSION_THRESHOLD=5
+
+# ----------------------------------------
+# 6. External Dataset Providers (Optional)
+# ----------------------------------------
+# Use EvalMonkey as the chaos + scoring harness on top of datasets you already
+# maintain in eval platforms you subscribe to.
+#
+# Confident AI (DeepEval cloud)
+# Get your key: https://app.confident-ai.com → Settings → API Keys
+# Usage: evalmonkey run-benchmark --scenario confident-ai::<dataset_id>
+CONFIDENT_AI_API_KEY="conf-..."
+
+# Braintrust
+# Get your key: https://www.braintrustdata.com → Settings → API Keys
+# Usage: evalmonkey run-benchmark --scenario braintrust::<project>/<dataset>
+BRAINTRUST_API_KEY="bt-..."
+
+# LangSmith (LangChain)
+# Get your key: https://smith.langchain.com → Settings → API Keys
+# Usage: evalmonkey run-benchmark --scenario langsmith::<dataset_id>
+LANGSMITH_API_KEY="ls__..."
diff --git a/assets/agent_cards/README.md b/assets/agent_cards/README.md
@@ -0,0 +1,20 @@
+# EvalMonkey Agent Cards
+
+Sample benchmark report cards generated by `evalmonkey report`.
+
+## EvalMonkey Sample Apps
+
+- [EvalMonkey RAG App](./rag_app.md) — RAG Agent (Demo)
+- [EvalMonkey Coding Agent](./coding_agent.md) — Coding Agent (Demo)
+
+## Open-Source Agents (from the EvalMonkey Leaderboard)
+
+- [GPT Researcher](./gpt_researcher.md) — Rank #1, Production Reliability: **57**
+- [OpenResearcher](./openresearcher.md) — Rank #2, Production Reliability: **55**
+
+---
+
+Generate your own card:
+```bash
+evalmonkey report --output my_agent_card.md
+```
diff --git a/assets/agent_cards/coding_agent.md b/assets/agent_cards/coding_agent.md
@@ -0,0 +1,73 @@
+# Agent Benchmark Card — EvalMonkey Coding Agent
+
+[![EvalMonkey Score](https://img.shields.io/badge/EvalMonkey-Score%3A73-yellow)](https://github.com/Corbell-AI/evalmonkey)
+[![Production Reliability](https://img.shields.io/badge/Production%20Reliability-Score%3A67-yellow)](https://github.com/Corbell-AI/evalmonkey)
+
+> Evaluated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) · 2026-05-28
+
+## Overview
+
+| Field | Value |
+|-------|-------|
+| Agent | [EvalMonkey Coding Agent](https://github.com/Corbell-AI/evalmonkey) |
+| Type | Coding Agent (Demo) |
+| Framework | LiteLLM + FastAPI |
+| Agent Type Config | `agent_type: coding_agent` |
+| Eval Judge | gpt-4o |
+| Chaos Profiles Tested | 4 (`code_syntax_break`, `code_wrong_language`, `code_context_strip`, `client_prompt_injection`) |
+
+## Scores
+
+| Benchmark | Baseline | Chaos | Production Reliability |
+|-----------|:--------:|:-----:|:----------------------:|
+| `human-eval` | 🟢 **78** | 62 | 71.6 |
+| `mbpp` | 🟢 **82** | 68 | 76.4 |
+| `apps` | 🟡 **59** | 44 | 53.0 |
+| **Overall** | **73** | **58** | **67.0** |
+
+## Production Reliability
+
+```
+Production Reliability = (baseline × 0.6) + (chaos × 0.4)
+                       = (73 × 0.6) + (58 × 0.4)
+                       = 67.0
+```
+
+## Analysis
+
+> EvalMonkey's built-in coding demo. Strong on basic Python, weaker on competitive challenges.
+
+## Reproduce This Benchmark
+
+```bash
+# Clone EvalMonkey
+git clone https://github.com/Corbell-AI/evalmonkey.git
+cd evalmonkey && pip install -e .
+
+# Set up your .env
+cp .env.example .env  # Add your OPENAI_API_KEY or EVAL_MODEL
+
+# Run the Coding Agent (Demo) sample app
+python apps/coding_agent/app.py &
+
+evalmonkey run-benchmark --scenario human-eval --sample-agent coding_agent
+evalmonkey run-benchmark --scenario mbpp --sample-agent coding_agent
+evalmonkey run-benchmark --scenario apps --sample-agent coding_agent
+
+# Chaos test
+evalmonkey run-chaos --scenario human-eval --chaos-profile code_syntax_break --sample-agent coding_agent
+
+# Generate this card
+evalmonkey report --output agent_card.md
+```
+
+## Embed This Badge
+
+```markdown
+[![EvalMonkey Score](https://img.shields.io/badge/EvalMonkey-Score%3A73-yellow)](https://github.com/Corbell-AI/evalmonkey)
+```
+
+---
+
+*Generated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) — the open-source agent benchmarking and chaos framework.*
diff --git a/assets/agent_cards/gpt_researcher.md b/assets/agent_cards/gpt_researcher.md
@@ -0,0 +1,66 @@
+# Agent Benchmark Card — GPT Researcher
+
+[![Production Reliability](https://img.shields.io/badge/Production%20Reliability-Score%3A57-orange)](https://github.com/assafelovic/gpt-researcher)
+[![GitHub](https://img.shields.io/badge/GitHub-View%20Repo-181717?logo=github)](https://github.com/assafelovic/gpt-researcher)
+
+> Evaluated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) · 2026-05-28
+
+## Overview
+
+| Field | Value |
+|-------|-------|
+| Agent | [GPT Researcher](https://github.com/assafelovic/gpt-researcher) |
+| Type | Deep Research Agent |
+| EvalMonkey Rank | 🏅 #1 of 10 open-source agents |
+| Eval Judge | Claude Sonnet 4.5 (AWS Bedrock) |
+| Chaos Profiles | `client_prompt_injection`, `client_schema_mutation` |
+
+## Scores
+
+| Benchmark | Baseline | Chaos | Production Reliability |
+|-----------|:--------:|:-----:|:----------------------:|
+| `hotpotqa` | 🟢 **66** | 17 | 46.4 |
+| `truthfulqa` | 🟢 **65** | 48 | 58.2 |
+| `mmlu` | 🟡 **56** | 16 | 40.0 |
+| **Overall** | **66** | **43** | **57** |
+
+## Production Reliability
+
+```
+Production Reliability = (baseline × 0.6) + (chaos × 0.4)
+                       = (66 × 0.6) + (43 × 0.4)
+                       = 56.8 ≈ 57
+```
+
+> Production Reliability measures how your agent performs under **real-world conditions** —
+> not just clean benchmark inputs, but also adversarial mutations like prompt injection,
+> schema corruption, and typo flooding.
+
+## Analysis
+
+> Highest baseline scorer. Dropped 23 pts under chaos — sensitive to prompt injection.
+
+## How to Re-run This Benchmark
+
+```bash
+# Install EvalMonkey
+pip install git+https://github.com/Corbell-AI/evalmonkey.git
+
+# Start GPT Researcher on port 8000 (see its own README)
+
+# Run the same benchmarks
+evalmonkey run-benchmark --scenario hotpotqa --target-url http://localhost:8000/solve
+evalmonkey run-benchmark --scenario truthfulqa --target-url http://localhost:8000/solve
+evalmonkey run-benchmark --scenario mmlu --target-url http://localhost:8000/solve
+
+# Chaos test
+evalmonkey run-chaos --scenario hotpotqa --chaos-profile client_prompt_injection --target-url http://localhost:8000/solve
+
+# Generate this card
+evalmonkey report --output agent_card.md
+```
+
+---
+
+*Generated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) — the open-source agent benchmarking and chaos framework.*
diff --git a/assets/agent_cards/openresearcher.md b/assets/agent_cards/openresearcher.md
@@ -0,0 +1,66 @@
+# Agent Benchmark Card — OpenResearcher
+
+[![Production Reliability](https://img.shields.io/badge/Production%20Reliability-Score%3A55-orange)](https://github.com/GAIR-NLP/OpenResearcher)
+[![GitHub](https://img.shields.io/badge/GitHub-View%20Repo-181717?logo=github)](https://github.com/GAIR-NLP/OpenResearcher)
+
+> Evaluated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) · 2026-05-28
+
+## Overview
+
+| Field | Value |
+|-------|-------|
+| Agent | [OpenResearcher](https://github.com/GAIR-NLP/OpenResearcher) |
+| Type | Scientific Research Agent |
+| EvalMonkey Rank | 🏅 #2 of 10 open-source agents |
+| Eval Judge | Claude Sonnet 4.5 (AWS Bedrock) |
+| Chaos Profiles | `client_prompt_injection`, `client_schema_mutation` |
+
+## Scores
+
+| Benchmark | Baseline | Chaos | Production Reliability |
+|-----------|:--------:|:-----:|:----------------------:|
+| `hotpotqa` | 🟢 **64** | 19 | 46.0 |
+| `truthfulqa` | 🟢 **63** | 47 | 56.6 |
+| `mmlu` | 🟡 **55** | 18 | 40.2 |
+| **Overall** | **64** | **42** | **55** |
+
+## Production Reliability
+
+```
+Production Reliability = (baseline × 0.6) + (chaos × 0.4)
+                       = (64 × 0.6) + (42 × 0.4)
+                       = 55.2 ≈ 55
+```
+
+> Production Reliability measures how your agent performs under **real-world conditions** —
+> not just clean benchmark inputs, but also adversarial mutations like prompt injection,
+> schema corruption, and typo flooding.
+
+## Analysis
+
+> Strong research synthesis. Stable under schema mutation, weaker under prompt injection.
+
+## How to Re-run This Benchmark
+
+```bash
+# Install EvalMonkey
+pip install git+https://github.com/Corbell-AI/evalmonkey.git
+
+# Start OpenResearcher on port 8000 (see its own README)
+
+# Run the same benchmarks
+evalmonkey run-benchmark --scenario hotpotqa --target-url http://localhost:8000/solve
+evalmonkey run-benchmark --scenario truthfulqa --target-url http://localhost:8000/solve
+evalmonkey run-benchmark --scenario mmlu --target-url http://localhost:8000/solve
+
+# Chaos test
+evalmonkey run-chaos --scenario hotpotqa --chaos-profile client_prompt_injection --target-url http://localhost:8000/solve
+
+# Generate this card
+evalmonkey report --output agent_card.md
+```
+
+---
+
+*Generated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) — the open-source agent benchmarking and chaos framework.*
diff --git a/assets/agent_cards/rag_app.md b/assets/agent_cards/rag_app.md
@@ -0,0 +1,73 @@
+# Agent Benchmark Card — EvalMonkey RAG App
+
+[![EvalMonkey Score](https://img.shields.io/badge/EvalMonkey-Score%3A71-yellow)](https://github.com/Corbell-AI/evalmonkey)
+[![Production Reliability](https://img.shields.io/badge/Production%20Reliability-Score%3A65-yellow)](https://github.com/Corbell-AI/evalmonkey)
+
+> Evaluated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) · 2026-05-28
+
+## Overview
+
+| Field | Value |
+|-------|-------|
+| Agent | [EvalMonkey RAG App](https://github.com/Corbell-AI/evalmonkey) |
+| Type | RAG Agent (Demo) |
+| Framework | LiteLLM + FastAPI |
+| Agent Type Config | `agent_type: rag_agent` |
+| Eval Judge | gpt-4o |
+| Chaos Profiles Tested | 3 (`client_prompt_injection`, `client_typo_injection`, `client_schema_mutation`) |
+
+## Scores
+
+| Benchmark | Baseline | Chaos | Production Reliability |
+|-----------|:--------:|:-----:|:----------------------:|
+| `hotpotqa` | 🟢 **74** | 61 | 68.8 |
+| `natural-questions` | 🟢 **71** | 58 | 65.8 |
+| `truthfulqa` | 🟡 **68** | 55 | 62.8 |
+| **Overall** | **71** | **58** | **65.8** |
+
+## Production Reliability
+
+```
+Production Reliability = (baseline × 0.6) + (chaos × 0.4)
+                       = (71 × 0.6) + (58 × 0.4)
+                       = 65.8
+```
+
+## Analysis
+
+> EvalMonkey's built-in RAG demo agent. Retrieval-augmented, handles multi-hop well.
+
+## Reproduce This Benchmark
+
+```bash
+# Clone EvalMonkey
+git clone https://github.com/Corbell-AI/evalmonkey.git
+cd evalmonkey && pip install -e .
+
+# Set up your .env
+cp .env.example .env  # Add your OPENAI_API_KEY or EVAL_MODEL
+
+# Run the RAG Agent (Demo) sample app
+python apps/rag_app/app.py &
+
+evalmonkey run-benchmark --scenario hotpotqa --sample-agent rag_app
+evalmonkey run-benchmark --scenario natural-questions --sample-agent rag_app
+evalmonkey run-benchmark --scenario truthfulqa --sample-agent rag_app
+
+# Chaos test
+evalmonkey run-chaos --scenario hotpotqa --chaos-profile client_prompt_injection --sample-agent rag_app
+
+# Generate this card
+evalmonkey report --output agent_card.md
+```
+
+## Embed This Badge
+
+```markdown
+[![EvalMonkey Score](https://img.shields.io/badge/EvalMonkey-Score%3A71-yellow)](https://github.com/Corbell-AI/evalmonkey)
+```
+
+---
+
+*Generated by [EvalMonkey](https://github.com/Corbell-AI/evalmonkey) — the open-source agent benchmarking and chaos framework.*
diff --git a/evalmonkey/config/agent_config.py b/evalmonkey/config/agent_config.py
@@ -74,6 +74,18 @@
     },
 }
 
+# Maps each agent_type to the most relevant standard benchmark IDs.
+# Keys mirror the `agent_type` options offered in the generated YAML template;
+# "general" is the default when a config does not set agent_type.
+# Used by `evalmonkey recommend` to surface a curated suite instead of all 22.
+AGENT_TYPE_BENCHMARKS: dict[str, list[str]] = {
+    "research_agent":    ["hotpotqa", "drop", "natural-questions", "gaia-benchmark"],
+    "coding_agent":      ["human-eval", "mbpp", "apps", "swe-bench"],
+    "rag_agent":         ["hotpotqa", "natural-questions", "drop", "truthfulqa"],
+    "customer_support":  ["daily-dialog", "multiwoz", "mt-bench", "alpacaeval"],
+    "voice_agent":       ["daily-dialog", "multiwoz", "spokentext-cleanup"],
+    "safety_agent":      ["truthfulqa", "toxigen", "arc", "bbh"],
+    "general":           ["gsm8k", "mmlu", "arc", "truthfulqa"],
+}
+
 
 @dataclass
 class AgentConfig:
@@ -86,6 +98,8 @@ class AgentConfig:
     eval_model: str = ""
     agent_command: str = ""         # shell command to start the agent server
     agent_startup_wait: int = 3     # seconds to wait after spawning before sending traffic
+    agent_type: str = "general"     # Used by `evalmonkey recommend` to surface relevant benchmarks
+    private_benchmarks: list = field(default_factory=list)  # Custom REST dataset configs
     extra: dict = field(default_factory=dict)
 
 
@@ -116,6 +130,8 @@ def load_config(config_path: Optional[str] = None) -> Optional[AgentConfig]:
                 eval_model=str(raw.get("eval_model", os.getenv("EVAL_MODEL", ""))),
                 agent_command=str(agent_raw.get("agent_command", "")),
                 agent_startup_wait=int(agent_raw.get("agent_startup_wait", 3)),
+                agent_type=str(agent_raw.get("agent_type", "general")),
+                private_benchmarks=list(raw.get("private_benchmarks", [])),
                 extra=raw,
             )
     return None
@@ -154,6 +170,10 @@ def generate_config_yaml(framework: str, name: str, port: int) -> str:
   # How EvalMonkey reads the answer back (dot-notation for nested fields)
   response_path: {preset['response_path']}   # dot-path to extract the answer text
 
+  # Agent type — drives `evalmonkey recommend` to show only relevant benchmarks
+  # Options: general | research_agent | coding_agent | rag_agent | customer_support | voice_agent | safety_agent
+  agent_type: general
+
 # Which LLM EvalMonkey uses as the judge (can also be set via EVAL_MODEL env var)
 eval_model: "gpt-4o"   # or: anthropic.claude-3-haiku-20240307-v1:0, ollama/llama3, etc.
 """