Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 22 additions & 9 deletions agentflow_cli/cli/commands/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
class _PendingCase:
case: Any # EvalCase
evaluator: AgentEvaluator
config: Any # EvalConfig — the resolved config for this case
file_name: str
eval_set_id: str
eval_set_name: str
Expand Down Expand Up @@ -223,8 +224,8 @@ def _collect_from_file(
"""Load a module and return pending work for every eval case or simulation scenario.

Config priority chain (highest → lowest):
1. confeval.py — global_config (not None when confeval was found)
2. Per-file — EVAL_CONFIG / get_eval_config() inside the file
1. Per-file — EVAL_CONFIG / get_eval_config() inside the file
2. confeval.py — global_config (fallback when no per-file config)
3. Built-in defaults — _default_config()

Returns _PendingSimulation items when the file exposes get_scenarios() or SCENARIOS.
Expand Down Expand Up @@ -253,8 +254,8 @@ def _collect_from_file(
file_config = mod.get_eval_config()
elif hasattr(mod, "EVAL_CONFIG"):
file_config = mod.EVAL_CONFIG
# Priority: confeval > per-file > defaults
config = global_config or file_config or self._default_config()
# Priority: per-file > confeval > defaults
config = file_config or global_config or self._default_config()
return self._make_pending(mod, mod.get_eval_set(), config, file_name)

# pytest-style discovery
Expand All @@ -267,8 +268,8 @@ def _collect_from_file(
if hasattr(mod, "EVAL_CONFIG")
else None
)
# Priority: confeval > per-file > defaults
config = global_config or file_config or self._default_config()
# Priority: per-file > confeval > defaults
config = file_config or global_config or self._default_config()
pending: list[_PendingCase] = []
for _, es in eval_pairs:
pending.extend(self._make_pending(mod, es, config, file_name))
Expand Down Expand Up @@ -329,6 +330,7 @@ def _make_pending(
_PendingCase(
case=c,
evaluator=evaluator,
config=config,
file_name=file_name,
eval_set_id=eval_set.eval_set_id,
eval_set_name=eval_set.name,
Expand Down Expand Up @@ -531,7 +533,9 @@ async def _run_one(
# Report merging
# ------------------------------------------------------------------

def _merge_reports(self, reports: list[EvalReport]) -> EvalReport:
def _merge_reports(
self, reports: list[EvalReport], base_config: Any = None
) -> EvalReport:
if len(reports) == 1:
return reports[0]

Expand All @@ -542,6 +546,7 @@ def _merge_reports(self, reports: list[EvalReport]) -> EvalReport:
eval_set_id="combined_eval",
eval_set_name="Combined Evaluation",
results=all_results,
config_used=base_config.model_dump() if base_config else {},
)

# ------------------------------------------------------------------
Expand Down Expand Up @@ -673,6 +678,13 @@ def execute( # noqa: PLR0912, PLR0915
self.output.error("No results produced.")
return 1

# Build per-eval-set config map from pending before results are consumed
group_configs: dict[str, Any] = {
pc.eval_set_id: pc.config
for pc in pending
if isinstance(pc, _PendingCase)
}

# 7. Group by eval_set_id → one EvalReport per set
groups: dict[str, tuple[str, list[EvalCaseResult]]] = defaultdict(lambda: ("", []))
for _, eval_set_id, eval_set_name, result in quads:
Expand All @@ -681,17 +693,18 @@ def execute( # noqa: PLR0912, PLR0915

reports: list[EvalReport] = []
for eval_set_id, (eval_set_name, results) in groups.items():
group_cfg = group_configs.get(eval_set_id) or confeval_config or self._default_config()
reports.append(
ER.create(
eval_set_id=eval_set_id,
eval_set_name=eval_set_name,
results=results,
config_used=(confeval_config or self._default_config()).model_dump(),
config_used=group_cfg.model_dump(),
)
)

# 8. Merge into a single report
merged = self._merge_reports(reports)
merged = self._merge_reports(reports, base_config=confeval_config or self._default_config())

# 9. Determine exit code
if threshold is not None and merged.summary.pass_rate < threshold:
Expand Down
5 changes: 5 additions & 0 deletions agentflow_cli/cli/commands/skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ def kind(self) -> str:
source_relpath="agent-skills",
manifest=True,
),
_InstallArtifact(
kind="file",
install_relpath=".github/skills/agentflow/SKILL.md",
source_relpath="copilot/SKILL.md",
),
),
),
)
Expand Down
82 changes: 82 additions & 0 deletions agentflow_cli/cli/templates/skills/copilot/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
---
name: agentflow
description: "Expert guidance for building, debugging, and extending applications with AgentFlow (10xscale-agentflow). TRIGGER when: code imports from agentflow (e.g. `from agentflow import`, `StateGraph`, `Agent`, `ToolNode`, `AgentState`); user references `agentflow.json` or CLI commands (`agentflow init`, `agentflow api`, `agentflow play`, `agentflow build`, `agentflow skills`); user is building graph-based multi-agent workflows, tools, memory, checkpointing, or streaming with this framework. SKIP: generic Python or multi-agent questions not referencing agentflow; other frameworks (LangGraph, CrewAI, AutoGen) unless comparing."
---

# Agentflow Project Skill

Use this skill when working in an Agentflow project. Agentflow is a multi-agent framework that wraps official OpenAI and Google SDK capabilities behind a unified graph, agent, tool, state, storage, API, CLI, and TypeScript client interface.

Treat https://agentflow.10xscale.ai/ as the first source of truth for public package names, install commands, and user-facing behavior. Use implementation source after the docs establish the intended API.

## Workflow

1. Identify the published package or docs surface involved:
- PyPI core Python SDK: `10xscale-agentflow` (`pip install 10xscale-agentflow`), source at https://github.com/10xHub/Agentflow/tree/main/agentflow/agentflow
- PyPI API/CLI SDK: `10xscale-agentflow-cli` (`pip install 10xscale-agentflow-cli`), source at https://github.com/10xHub/Agentflow/tree/main/agentflow-api/agentflow_cli
- npm TypeScript SDK: `@10xscale/agentflow-client` (`npm install @10xscale/agentflow-client`), source at https://github.com/10xHub/Agentflow/tree/main/agentflow-client/src
- Main docs: https://agentflow.10xscale.ai/
- Playground/UI: run the `agentflow play` command after installing the CLI

2. Read the matching reference file before changing behavior:

### Core Python SDK
- Architecture and package flow: `.github/skills/agentflow/references/architecture.md`
- Agent constructor, provider, reasoning, retry, fallback, output_schema: `.github/skills/agentflow/references/agents-and-tools.md`
- Graph construction, nodes, edges, compile, interrupts, config keys: `.github/skills/agentflow/references/state-graph.md`
- State, messages, and content blocks: `.github/skills/agentflow/references/state-and-messages.md`
- Thread and checkpointing: `.github/skills/agentflow/references/checkpointing-and-threads.md`
- Dependency injection (InjectQ): `.github/skills/agentflow/references/dependency-injection.md`
- Multimodal files and media stores: `.github/skills/agentflow/references/media-and-files.md`
- Long-term memory stores (MemoryConfig, QdrantStore, Mem0Store): `.github/skills/agentflow/references/memory-and-store.md`
- Streaming, StreamChunk, SSE, ResponseGranularity: `.github/skills/agentflow/references/streaming.md`
- Stream emitter for tool progress updates: `.github/skills/agentflow/references/stream-emitter.md`
- Observability hooks, validators, and runtime jumps: `.github/skills/agentflow/references/callbacks-and-command.md`
- Prebuilt agents (ReactAgent, PlanActReflectAgent, StructuredOutputAgent, SupervisorTeamAgent, SwarmAgent, RAGAgent) and tools: `.github/skills/agentflow/references/prebuilt-agents-and-tools.md`
- Event publishers and A2A/ACP runtime protocols: `.github/skills/agentflow/references/publishers-and-runtime-protocols.md`
- Context management, ID generation, and background tasks: `.github/skills/agentflow/references/context-id-background.md`
- Provider internals and adapters: `.github/skills/agentflow/references/providers-and-adapters.md`
- Prompt-injection and validation safety: `.github/skills/agentflow/references/security-and-validators.md`

### API/CLI SDK
- CLI commands and generated project files: `.github/skills/agentflow/references/cli-commands.md`
- `agentflow.json` and dependency loading: `.github/skills/agentflow/references/api-configuration.md`
- API auth and authorization: `.github/skills/agentflow/references/auth-and-authorization.md`
- API environment, settings, and middleware: `.github/skills/agentflow/references/api-settings-and-middleware.md`
- Rate limiting (config, backends, headers, custom backend): `.github/skills/agentflow/references/rate-limiting.md`
- REST routes and error behavior: `.github/skills/agentflow/references/rest-api-and-errors.md`
- API Snowflake IDs and thread naming: `.github/skills/agentflow/references/id-and-thread-name-generators.md`
- API server and deployment runtime: `.github/skills/agentflow/references/production-runtime.md`

### TypeScript client SDK
- REST and TypeScript client surface: `.github/skills/agentflow/references/api-client.md`
- Browser/client-side tool execution: `.github/skills/agentflow/references/remote-tools.md`
- TypeScript auth helpers and structured errors: `.github/skills/agentflow/references/client-auth-and-errors.md`
- TypeScript messages, invoke, and stream details: `.github/skills/agentflow/references/client-messages-invoke-stream.md`
- TypeScript thread, memory, and file APIs: `.github/skills/agentflow/references/client-threads-memory-files.md`

### Testing and QA
- Unit testing without LLM calls (TestAgent, QuickTest, MockToolRegistry, `agentflow test`): `.github/skills/agentflow/references/unit-testing.md`
- Evaluation framework (EvalSet, criteria, AgentEvaluator, QuickEval, UserSimulator, `agentflow eval`): `.github/skills/agentflow/references/evaluation.md`
- Testing helpers overview: `.github/skills/agentflow/references/testing-and-evaluation.md`

3. Prefer existing Agentflow abstractions over new custom wiring:
- Build workflows with `StateGraph`, `Agent`, `ToolNode`, `AgentState`, and `Message`.
- Use prebuilt agents (`ReactAgent`, `PlanActReflectAgent`, `StructuredOutputAgent`, `SupervisorTeamAgent`, `SwarmAgent`, `RAGAgent`) for common patterns before hand-writing graph loops.
- Persist conversation state with checkpointers; use stores only for cross-thread memory.
- Put business services in `InjectQ` instead of global variables.
- Keep API/CLI graph modules storage-agnostic and wire dependencies through `agentflow.json`.

4. Verify against source when implementation details matter. Public names and expected behavior should match https://agentflow.10xscale.ai/; source under https://github.com/10xHub/Agentflow (core), https://github.com/10xHub/agentflow-cli (API/CLI), and https://github.com/10xHub/agentflow-client (TypeScript) explains how that behavior is implemented.

## Local Conventions

- A compiled graph is normally loaded once by the API server and reused per request.
- Public package naming matters: use `10xscale-agentflow`, `10xscale-agentflow-cli`, and `@10xscale/agentflow-client` in user-facing docs and examples, not repository folder names.
- Every persisted interaction should include `config.thread_id`.
- Tools need docstrings and type annotations so model-facing schemas are useful.
- Injectable tool and node parameters (`state`, `config`, `tool_call_id`) are hidden from the model schema.
- For production, avoid process-local storage for shared state; use durable checkpointer/store backends.
- Add observability or audit side effects by registering a `GraphLifecycleHook` on `CallbackManager` — do not wrap `ainvoke()` / `astream()` calls in application code to achieve the same result.
- `reasoning_config` is on by default at medium effort; disable explicitly with `reasoning_config=None` when not needed.
- Provider is auto-detected from the model name; use `base_url` for third-party OpenAI-compatible APIs (Ollama, DeepSeek, OpenRouter).
169 changes: 167 additions & 2 deletions tests/cli/test_cli_main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from typer.testing import CliRunner

import pytest
from unittest.mock import MagicMock, patch
import agentflow_cli.cli.main as main_mod

from agentflow_cli.cli.exceptions import PyagenityCLIError

# Shared Typer test runner; the tests below call `runner.invoke(main_mod.app, ...)`
# to drive the CLI in-process and inspect the exit code and output.
runner = CliRunner()

Expand All @@ -27,8 +28,172 @@ def fake_execute(self, **kwargs):
assert called["open_playground"] is True


def test_api_command(monkeypatch):
    """Check that `api` CLI options reach `APICommand.execute` as keyword arguments."""
    captured = {}

    def fake_execute(self, **kwargs):
        # Record whatever the CLI forwards and report success.
        captured.update(kwargs)
        return 0

    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
    monkeypatch.setattr(main_mod.APICommand, "execute", fake_execute)

    argv = [
        "api",
        "-c",
        "custom.json",
        "-H",
        "1.2.3.4",
        "-p",
        "8080",
        "--no-reload",
        "--verbose",
    ]
    outcome = runner.invoke(main_mod.app, argv)

    assert outcome.exit_code == 0
    assert captured["config"] == "custom.json"
    assert captured["host"] == "1.2.3.4"
    assert captured["port"] == 8080
    assert captured["reload"] is False


def test_version_command(monkeypatch):
    """Check that `version` calls `VersionCommand.execute` exactly once and exits 0."""
    invocations = []

    def fake_execute(self):
        # Count the call and report success.
        invocations.append(True)
        return 0

    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
    monkeypatch.setattr(main_mod.VersionCommand, "execute", fake_execute)

    outcome = runner.invoke(main_mod.app, ["version"])

    assert outcome.exit_code == 0
    assert len(invocations) == 1


def test_init_command(monkeypatch):
    """Check that `init --path ... --force` is forwarded to `InitCommand.execute`."""
    captured = {}

    def fake_execute(self, **kwargs):
        # Record whatever the CLI forwards and report success.
        captured.update(kwargs)
        return 0

    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
    monkeypatch.setattr(main_mod.InitCommand, "execute", fake_execute)

    argv = ["init", "--path", "test_path", "--force"]
    outcome = runner.invoke(main_mod.app, argv)

    assert outcome.exit_code == 0
    assert captured["path"] == "test_path"
    assert captured["force"] is True


def test_build_command(monkeypatch):
    """Check that `build` CLI options reach `BuildCommand.execute` as keyword arguments."""
    captured = {}

    def fake_execute(self, **kwargs):
        # Record whatever the CLI forwards and report success.
        captured.update(kwargs)
        return 0

    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
    monkeypatch.setattr(main_mod.BuildCommand, "execute", fake_execute)

    argv = [
        "build",
        "-o",
        "Dfile",
        "--force",
        "--python-version",
        "3.12",
        "-p",
        "5000",
        "--docker-compose",
    ]
    outcome = runner.invoke(main_mod.app, argv)

    assert outcome.exit_code == 0
    assert captured["output_file"] == "Dfile"
    assert captured["force"] is True
    assert captured["python_version"] == "3.12"
    assert captured["port"] == 5000
    assert captured["docker_compose"] is True


def test_skills_command(monkeypatch):
    """Check that `skills` CLI options reach `SkillsCommand.execute` as keyword arguments."""
    captured = {}

    def fake_execute(self, **kwargs):
        # Record whatever the CLI forwards and report success.
        captured.update(kwargs)
        return 0

    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
    monkeypatch.setattr(main_mod.SkillsCommand, "execute", fake_execute)

    argv = [
        "skills",
        "-a",
        "codex",
        "-p",
        "skills_path",
        "--force",
        "--all",
        "--list",
    ]
    outcome = runner.invoke(main_mod.app, argv)

    assert outcome.exit_code == 0
    assert captured["agent"] == "codex"
    assert captured["path"] == "skills_path"
    assert captured["force"] is True
    assert captured["all_agents"] is True
    assert captured["list_agents"] is True


def test_test_command(monkeypatch):
    """Check that `test` CLI options, including args after `--`, reach `TestCommand.execute`."""
    captured = {}

    def fake_execute(self, **kwargs):
        # Record whatever the CLI forwards and report success.
        captured.update(kwargs)
        return 0

    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
    monkeypatch.setattr(main_mod.TestCommand, "execute", fake_execute)

    argv = [
        "test",
        "tests/foo.py",
        "--coverage",
        "--html",
        "-k",
        "foo_test",
        "--",
        "--lf",
        "-vv",
    ]
    outcome = runner.invoke(main_mod.app, argv)

    assert outcome.exit_code == 0
    assert captured["path"] == "tests/foo.py"
    assert captured["coverage"] is True
    assert captured["html"] is True
    assert captured["keyword"] == "foo_test"
    assert captured["extra_args"] == ("--lf", "-vv")


def test_eval_command(monkeypatch):
    """Check that `eval` CLI options reach `EvalCommand.execute` as keyword arguments."""
    captured = {}

    def fake_execute(self, **kwargs):
        # Record whatever the CLI forwards and report success.
        captured.update(kwargs)
        return 0

    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
    monkeypatch.setattr(main_mod.EvalCommand, "execute", fake_execute)

    argv = [
        "eval",
        "target_eval",
        "-o",
        "out_dir",
        "--no-report",
        "-t",
        "0.8",
        "--open",
        "--parallel",
        "-c",
        "8",
    ]
    outcome = runner.invoke(main_mod.app, argv)

    assert outcome.exit_code == 0
    assert captured["target"] == "target_eval"
    assert captured["output_dir"] == "out_dir"
    assert captured["no_report"] is True
    assert captured["threshold"] == 0.8
    assert captured["open_report"] is True
    assert captured["parallel"] is True
    assert captured["max_concurrency"] == 8


def test_a2a_command_is_not_exposed():
    """Check that invoking `a2a` fails with a "No such command" message."""
    outcome = runner.invoke(main_mod.app, ["a2a"])
    expected_message = "No such command 'a2a'"

    assert outcome.exit_code != 0
    assert expected_message in outcome.output


def test_handle_pyagenity_cli_error(monkeypatch):
    """Check that a `PyagenityCLIError` raised by a command sets the CLI exit code."""

    def failing_execute(self):
        # Simulate a command failing with a CLI-specific error and exit code.
        raise PyagenityCLIError("Custom error message", exit_code=42)

    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
    monkeypatch.setattr(main_mod.VersionCommand, "execute", failing_execute)

    outcome = runner.invoke(main_mod.app, ["version"])

    assert outcome.exit_code == 42


def test_handle_generic_exception(monkeypatch):
    """Check that a non-CLI exception raised by a command gives exit code 1."""

    def failing_execute(self):
        # Simulate a command failing with an arbitrary exception.
        raise ValueError("Some generic value error")

    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)
    monkeypatch.setattr(main_mod.VersionCommand, "execute", failing_execute)

    outcome = runner.invoke(main_mod.app, ["version"])

    assert outcome.exit_code == 1


def test_main_keyboard_interrupt(monkeypatch):
    """Check that `main()` exits with code 130 when the app raises KeyboardInterrupt."""
    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)

    # Replace the Typer app so that calling it raises KeyboardInterrupt.
    interrupted_app = patch("agentflow_cli.cli.main.app", side_effect=KeyboardInterrupt)
    with interrupted_app, pytest.raises(SystemExit) as exc_info:
        main_mod.main()

    assert exc_info.value.code == 130


def test_main_generic_exception(monkeypatch):
    """Check that `main()` exits with code 1 when the app raises an unexpected error."""
    monkeypatch.setattr(main_mod, "setup_cli_logging", lambda **kwargs: None)

    # Replace the Typer app so that calling it raises a generic exception.
    broken_app = patch("agentflow_cli.cli.main.app", side_effect=ValueError("Main error"))
    with broken_app, pytest.raises(SystemExit) as exc_info:
        main_mod.main()

    assert exc_info.value.code == 1

Loading
Loading