10xHub · Iamsdt · Jun 13, 2026 · Jun 11, 2026
diff --git a/agentflow_cli/cli/commands/eval.py b/agentflow_cli/cli/commands/eval.py
@@ -39,6 +39,7 @@
 class _PendingCase:
     case: Any  # EvalCase
     evaluator: AgentEvaluator
+    config: Any  # EvalConfig — the resolved config for this case
     file_name: str
     eval_set_id: str
     eval_set_name: str
@@ -223,8 +224,8 @@ def _collect_from_file(
         """Load a module and return pending work for every eval case or simulation scenario.
 
         Config priority chain (highest → lowest):
-          1. confeval.py   — global_config (not None when confeval was found)
-          2. Per-file      — EVAL_CONFIG / get_eval_config() inside the file
+          1. Per-file      — EVAL_CONFIG / get_eval_config() inside the file
+          2. confeval.py   — global_config (fallback when no per-file config)
           3. Built-in defaults — _default_config()
 
         Returns _PendingSimulation items when the file exposes get_scenarios() or SCENARIOS.
@@ -253,8 +254,8 @@ def _collect_from_file(
                 file_config = mod.get_eval_config()
             elif hasattr(mod, "EVAL_CONFIG"):
                 file_config = mod.EVAL_CONFIG
-            # Priority: confeval > per-file > defaults
-            config = global_config or file_config or self._default_config()
+            # Priority: per-file > confeval > defaults
+            config = file_config or global_config or self._default_config()
             return self._make_pending(mod, mod.get_eval_set(), config, file_name)
 
         # pytest-style discovery
@@ -267,8 +268,8 @@ def _collect_from_file(
                 if hasattr(mod, "EVAL_CONFIG")
                 else None
             )
-            # Priority: confeval > per-file > defaults
-            config = global_config or file_config or self._default_config()
+            # Priority: per-file > confeval > defaults
+            config = file_config or global_config or self._default_config()
             pending: list[_PendingCase] = []
             for _, es in eval_pairs:
                 pending.extend(self._make_pending(mod, es, config, file_name))
@@ -329,6 +330,7 @@ def _make_pending(
             _PendingCase(
                 case=c,
                 evaluator=evaluator,
+                config=config,
                 file_name=file_name,
                 eval_set_id=eval_set.eval_set_id,
                 eval_set_name=eval_set.name,
@@ -531,7 +533,9 @@ async def _run_one(
     # Report merging
     # ------------------------------------------------------------------
 
-    def _merge_reports(self, reports: list[EvalReport]) -> EvalReport:
+    def _merge_reports(
+        self, reports: list[EvalReport], base_config: Any = None
+    ) -> EvalReport:
         if len(reports) == 1:
             return reports[0]
 
@@ -542,6 +546,7 @@ def _merge_reports(self, reports: list[EvalReport]) -> EvalReport:
             eval_set_id="combined_eval",
             eval_set_name="Combined Evaluation",
             results=all_results,
+            config_used=base_config.model_dump() if base_config else {},
         )
 
     # ------------------------------------------------------------------
@@ -673,6 +678,13 @@ def execute(  # noqa: PLR0912, PLR0915
             self.output.error("No results produced.")
             return 1
 
+        # Build per-eval-set config map from pending before results are consumed
+        group_configs: dict[str, Any] = {
+            pc.eval_set_id: pc.config
+            for pc in pending
+            if isinstance(pc, _PendingCase)
+        }
+
         # 7. Group by eval_set_id → one EvalReport per set
         groups: dict[str, tuple[str, list[EvalCaseResult]]] = defaultdict(lambda: ("", []))
         for _, eval_set_id, eval_set_name, result in quads:
@@ -681,17 +693,18 @@ def execute(  # noqa: PLR0912, PLR0915
 
         reports: list[EvalReport] = []
         for eval_set_id, (eval_set_name, results) in groups.items():
+            group_cfg = group_configs.get(eval_set_id) or confeval_config or self._default_config()
             reports.append(
                 ER.create(
                     eval_set_id=eval_set_id,
                     eval_set_name=eval_set_name,
                     results=results,
-                    config_used=(confeval_config or self._default_config()).model_dump(),
+                    config_used=group_cfg.model_dump(),
                 )
             )
 
         # 8. Merge into a single report
-        merged = self._merge_reports(reports)
+        merged = self._merge_reports(reports, base_config=confeval_config or self._default_config())
 
         # 9. Determine exit code
         if threshold is not None and merged.summary.pass_rate < threshold:

diff --git a/agentflow_cli/cli/commands/skills.py b/agentflow_cli/cli/commands/skills.py
@@ -92,6 +92,11 @@ def kind(self) -> str:
                 source_relpath="agent-skills",
                 manifest=True,
             ),
+            _InstallArtifact(
+                kind="file",
+                install_relpath=".github/skills/agentflow/SKILL.md",
+                source_relpath="copilot/SKILL.md",
+            ),
         ),
     ),
 )

diff --git a/agentflow_cli/cli/templates/skills/copilot/SKILL.md b/agentflow_cli/cli/templates/skills/copilot/SKILL.md
@@ -0,0 +1,82 @@
+---
+name: agentflow
+description: "Expert guidance for building, debugging, and extending applications with AgentFlow (10xscale-agentflow). TRIGGER when: code imports from agentflow (e.g. `from agentflow import`, `StateGraph`, `Agent`, `ToolNode`, `AgentState`); user references `agentflow.json` or CLI commands (`agentflow init`, `agentflow api`, `agentflow play`, `agentflow build`, `agentflow skills`); user is building graph-based multi-agent workflows, tools, memory, checkpointing, or streaming with this framework. SKIP: generic Python or multi-agent questions not referencing agentflow; other frameworks (LangGraph, CrewAI, AutoGen) unless comparing."
+---
+
+# Agentflow Project Skill
+
+Use this skill when working in an Agentflow project. Agentflow is a multi-agent framework that wraps official OpenAI and Google SDK capabilities behind a unified graph, agent, tool, state, storage, API, CLI, and TypeScript client interface.
+
+Treat https://agentflow.10xscale.ai/ as the first source of truth for public package names, install commands, and user-facing behavior. Use implementation source after the docs establish the intended API.
+
+## Workflow
+
+1. Identify the published package or docs surface involved:
+   - PyPI core Python SDK: `10xscale-agentflow` (`pip install 10xscale-agentflow`), source at https://github.com/10xHub/Agentflow/tree/main/agentflow/agentflow
+   - PyPI API/CLI SDK: `10xscale-agentflow-cli` (`pip install 10xscale-agentflow-cli`), source at https://github.com/10xHub/Agentflow/tree/main/agentflow-api/agentflow_cli
+   - npm TypeScript SDK: `@10xscale/agentflow-client` (`npm install @10xscale/agentflow-client`), source at https://github.com/10xHub/Agentflow/tree/main/agentflow-client/src
+   - Main docs: https://agentflow.10xscale.ai/
+   - Playground/UI: run the `agentflow play` command after installing the CLI
+
+2. Read the matching reference file before changing behavior:
+
+   ### Core Python SDK
+   - Architecture and package flow: `.github/skills/agentflow/references/architecture.md`
+   - Agent constructor, provider, reasoning, retry, fallback, output_schema: `.github/skills/agentflow/references/agents-and-tools.md`
+   - Graph construction, nodes, edges, compile, interrupts, config keys: `.github/skills/agentflow/references/state-graph.md`
+   - State, messages, and content blocks: `.github/skills/agentflow/references/state-and-messages.md`
+   - Thread and checkpointing: `.github/skills/agentflow/references/checkpointing-and-threads.md`
+   - Dependency injection (InjectQ): `.github/skills/agentflow/references/dependency-injection.md`
+   - Multimodal files and media stores: `.github/skills/agentflow/references/media-and-files.md`
+   - Long-term memory stores (MemoryConfig, QdrantStore, Mem0Store): `.github/skills/agentflow/references/memory-and-store.md`
+   - Streaming, StreamChunk, SSE, ResponseGranularity: `.github/skills/agentflow/references/streaming.md`
+   - Stream emitter for tool progress updates: `.github/skills/agentflow/references/stream-emitter.md`
+   - Observability hooks, validators, and runtime jumps: `.github/skills/agentflow/references/callbacks-and-command.md`
+   - Prebuilt agents (ReactAgent, PlanActReflectAgent, StructuredOutputAgent, SupervisorTeamAgent, SwarmAgent, RAGAgent) and tools: `.github/skills/agentflow/references/prebuilt-agents-and-tools.md`
+   - Event publishers and A2A/ACP runtime protocols: `.github/skills/agentflow/references/publishers-and-runtime-protocols.md`
+   - Context management, ID generation, and background tasks: `.github/skills/agentflow/references/context-id-background.md`
+   - Provider internals and adapters: `.github/skills/agentflow/references/providers-and-adapters.md`
+   - Prompt-injection and validation safety: `.github/skills/agentflow/references/security-and-validators.md`
+
+   ### API/CLI SDK
+   - CLI commands and generated project files: `.github/skills/agentflow/references/cli-commands.md`
+   - `agentflow.json` and dependency loading: `.github/skills/agentflow/references/api-configuration.md`
+   - API auth and authorization: `.github/skills/agentflow/references/auth-and-authorization.md`
+   - API environment, settings, and middleware: `.github/skills/agentflow/references/api-settings-and-middleware.md`
+   - Rate limiting (config, backends, headers, custom backend): `.github/skills/agentflow/references/rate-limiting.md`
+   - REST routes and error behavior: `.github/skills/agentflow/references/rest-api-and-errors.md`
+   - API Snowflake IDs and thread naming: `.github/skills/agentflow/references/id-and-thread-name-generators.md`
+   - API server and deployment runtime: `.github/skills/agentflow/references/production-runtime.md`
+
+   ### TypeScript client SDK
+   - REST and TypeScript client surface: `.github/skills/agentflow/references/api-client.md`
+   - Browser/client-side tool execution: `.github/skills/agentflow/references/remote-tools.md`
+   - TypeScript auth helpers and structured errors: `.github/skills/agentflow/references/client-auth-and-errors.md`
+   - TypeScript messages, invoke, and stream details: `.github/skills/agentflow/references/client-messages-invoke-stream.md`
+   - TypeScript thread, memory, and file APIs: `.github/skills/agentflow/references/client-threads-memory-files.md`
+
+   ### Testing and QA
+   - Unit testing without LLM calls (TestAgent, QuickTest, MockToolRegistry, `agentflow test`): `.github/skills/agentflow/references/unit-testing.md`
+   - Evaluation framework (EvalSet, criteria, AgentEvaluator, QuickEval, UserSimulator, `agentflow eval`): `.github/skills/agentflow/references/evaluation.md`
+   - Testing helpers overview: `.github/skills/agentflow/references/testing-and-evaluation.md`
+
+3. Prefer existing Agentflow abstractions over new custom wiring:
+   - Build workflows with `StateGraph`, `Agent`, `ToolNode`, `AgentState`, and `Message`.
+   - Use prebuilt agents (`ReactAgent`, `PlanActReflectAgent`, `StructuredOutputAgent`, `SupervisorTeamAgent`, `SwarmAgent`, `RAGAgent`) for common patterns before hand-writing graph loops.
+   - Persist conversation state with checkpointers; use stores only for cross-thread memory.
+   - Put business services in `InjectQ` instead of global variables.
+   - Keep API/CLI graph modules storage-agnostic and wire dependencies through `agentflow.json`.
+
+4. Verify against source when implementation details matter. Public names and expected behavior should match https://agentflow.10xscale.ai/; source under https://github.com/10xHub/Agentflow (core), https://github.com/10xHub/agentflow-cli (API/CLI), and https://github.com/10xHub/agentflow-client (TypeScript) explains how that behavior is implemented.
+
+## Local Conventions
+
+- A compiled graph is normally loaded once by the API server and reused per request.
+- Public package naming matters: use `10xscale-agentflow`, `10xscale-agentflow-cli`, and `@10xscale/agentflow-client` in user-facing docs and examples, not repository folder names.
+- Every persisted interaction should include `config.thread_id`.
+- Tools need docstrings and type annotations so model-facing schemas are useful.
+- Injectable tool and node parameters (`state`, `config`, `tool_call_id`) are hidden from the model schema.
+- For production, avoid process-local storage for shared state; use durable checkpointer/store backends.
+- Add observability or audit side effects by registering a `GraphLifecycleHook` on `CallbackManager` — do not wrap `ainvoke()` / `astream()` calls in application code to achieve the same result.
+- `reasoning_config` is on by default at medium effort; disable explicitly with `reasoning_config=None` when not needed.
+- Provider is auto-detected from the model name; use `base_url` for third-party OpenAI-compatible APIs (Ollama, DeepSeek, OpenRouter).
diff --git a/tests/cli/test_cli_main.py b/tests/cli/test_cli_main.py
@@ -1,7 +1,8 @@
 from typer.testing import CliRunner
-
+import pytest
+from unittest.mock import MagicMock, patch
 import agentflow_cli.cli.main as main_mod
-
+from agentflow_cli.cli.exceptions import PyagenityCLIError
 
 runner = CliRunner()
 
@@ -27,8 +28,172 @@ def fake_execute(self, **kwargs):
     assert called["open_playground"] is True
 
 
+def test_api_command(monkeypatch):
+    called = {}
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    monkeypatch.setattr(
+        main_mod.APICommand,
+        "execute",
+        lambda self, **kwargs: called.update(kwargs) or 0
+    )
+    result = runner.invoke(main_mod.app, ["api", "-c", "custom.json", "-H", "1.2.3.4", "-p", "8080", "--no-reload", "--verbose"])
+    assert result.exit_code == 0
+    assert called["config"] == "custom.json"
+    assert called["host"] == "1.2.3.4"
+    assert called["port"] == 8080
+    assert called["reload"] is False
+
+
+def test_version_command(monkeypatch):
+    called = []
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    monkeypatch.setattr(
+        main_mod.VersionCommand,
+        "execute",
+        lambda self: called.append(True) or 0
+    )
+    result = runner.invoke(main_mod.app, ["version"])
+    assert result.exit_code == 0
+    assert len(called) == 1
+
+
+def test_init_command(monkeypatch):
+    called = {}
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    monkeypatch.setattr(
+        main_mod.InitCommand,
+        "execute",
+        lambda self, **kwargs: called.update(kwargs) or 0
+    )
+    result = runner.invoke(main_mod.app, ["init", "--path", "test_path", "--force"])
+    assert result.exit_code == 0
+    assert called["path"] == "test_path"
+    assert called["force"] is True
+
+
+def test_build_command(monkeypatch):
+    called = {}
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    monkeypatch.setattr(
+        main_mod.BuildCommand,
+        "execute",
+        lambda self, **kwargs: called.update(kwargs) or 0
+    )
+    result = runner.invoke(
+        main_mod.app,
+        ["build", "-o", "Dfile", "--force", "--python-version", "3.12", "-p", "5000", "--docker-compose"]
+    )
+    assert result.exit_code == 0
+    assert called["output_file"] == "Dfile"
+    assert called["force"] is True
+    assert called["python_version"] == "3.12"
+    assert called["port"] == 5000
+    assert called["docker_compose"] is True
+
+
+def test_skills_command(monkeypatch):
+    called = {}
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    monkeypatch.setattr(
+        main_mod.SkillsCommand,
+        "execute",
+        lambda self, **kwargs: called.update(kwargs) or 0
+    )
+    result = runner.invoke(
+        main_mod.app,
+        ["skills", "-a", "codex", "-p", "skills_path", "--force", "--all", "--list"]
+    )
+    assert result.exit_code == 0
+    assert called["agent"] == "codex"
+    assert called["path"] == "skills_path"
+    assert called["force"] is True
+    assert called["all_agents"] is True
+    assert called["list_agents"] is True
+
+
+def test_test_command(monkeypatch):
+    called = {}
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    monkeypatch.setattr(
+        main_mod.TestCommand,
+        "execute",
+        lambda self, **kwargs: called.update(kwargs) or 0
+    )
+    result = runner.invoke(
+        main_mod.app,
+        ["test", "tests/foo.py", "--coverage", "--html", "-k", "foo_test", "--", "--lf", "-vv"]
+    )
+    assert result.exit_code == 0
+    assert called["path"] == "tests/foo.py"
+    assert called["coverage"] is True
+    assert called["html"] is True
+    assert called["keyword"] == "foo_test"
+    assert called["extra_args"] == ("--lf", "-vv")
+
+
+def test_eval_command(monkeypatch):
+    called = {}
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    monkeypatch.setattr(
+        main_mod.EvalCommand,
+        "execute",
+        lambda self, **kwargs: called.update(kwargs) or 0
+    )
+    result = runner.invoke(
+        main_mod.app,
+        ["eval", "target_eval", "-o", "out_dir", "--no-report", "-t", "0.8", "--open", "--parallel", "-c", "8"]
+    )
+    assert result.exit_code == 0
+    assert called["target"] == "target_eval"
+    assert called["output_dir"] == "out_dir"
+    assert called["no_report"] is True
+    assert called["threshold"] == 0.8
+    assert called["open_report"] is True
+    assert called["parallel"] is True
+    assert called["max_concurrency"] == 8
+
+
 def test_a2a_command_is_not_exposed():
     result = runner.invoke(main_mod.app, ["a2a"])
 
     assert result.exit_code != 0
     assert "No such command 'a2a'" in result.output
+
+
+def test_handle_pyagenity_cli_error(monkeypatch):
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    monkeypatch.setattr(
+        main_mod.VersionCommand,
+        "execute",
+        lambda self: (_ for _ in ()).throw(PyagenityCLIError("Custom error message", exit_code=42))
+    )
+    result = runner.invoke(main_mod.app, ["version"])
+    assert result.exit_code == 42
+
+
+def test_handle_generic_exception(monkeypatch):
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    monkeypatch.setattr(
+        main_mod.VersionCommand,
+        "execute",
+        lambda self: (_ for _ in ()).throw(ValueError("Some generic value error"))
+    )
+    result = runner.invoke(main_mod.app, ["version"])
+    assert result.exit_code == 1
+
+
+def test_main_keyboard_interrupt(monkeypatch):
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    with patch("agentflow_cli.cli.main.app", side_effect=KeyboardInterrupt):
+        with pytest.raises(SystemExit) as exc_info:
+            main_mod.main()
+        assert exc_info.value.code == 130
+
+
+def test_main_generic_exception(monkeypatch):
+    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
+    with patch("agentflow_cli.cli.main.app", side_effect=ValueError("Main error")):
+        with pytest.raises(SystemExit) as exc_info:
+            main_mod.main()
+        assert exc_info.value.code == 1
+