From 1875cb47e3d8d054ead003589760a6b41d7e1c35 Mon Sep 17 00:00:00 2001 From: Shudipto Trafder Date: Tue, 16 Jun 2026 19:40:55 +0600 Subject: [PATCH 1/2] feat: Enhance eval command to support global confeval.py discovery and improve criteria reporting --- agentflow_cli/cli/commands/eval.py | 189 ++++++++++++++++++++++++++--- tests/cli/test_eval_command.py | 50 ++++++++ tests/cli/test_eval_discovery.py | 83 ++++++++++++- tests/cli/test_eval_flat_pool.py | 12 +- 4 files changed, 312 insertions(+), 22 deletions(-) diff --git a/agentflow_cli/cli/commands/eval.py b/agentflow_cli/cli/commands/eval.py index d23535a..53e926d 100644 --- a/agentflow_cli/cli/commands/eval.py +++ b/agentflow_cli/cli/commands/eval.py @@ -43,6 +43,7 @@ class _PendingCase: file_name: str eval_set_id: str eval_set_name: str + config_source: str = "" # where config came from: per-file / confeval.py / defaults @dataclass @@ -180,6 +181,50 @@ def _load_confeval(self, evals_dir: Path) -> EvalConfig | None: ) return None + def _confeval_search_dirs(self, target_path: Path, evals_dir: Path) -> list[Path]: + """Directories to search for a global confeval.py, nearest-first. + + confeval.py is the GLOBAL criteria config, so its discovery must not + depend on what the user targets. Walk up from the target (its parent + when the target is a file) toward the current working directory, then + append the configured evals directory. This way evals/confeval.py is + found whether the user runs the whole evals/ dir, a subfolder, or a + single file inside it. 
+ """ + start = target_path.parent if target_path.is_file() else target_path + try: + cur = start.resolve() + cwd = Path.cwd().resolve() + except OSError: + return [evals_dir] + + dirs: list[Path] = [] + while True: + dirs.append(cur) + if cur in (cwd, cur.parent): + break + cur = cur.parent + + ev = evals_dir.resolve() + if ev not in dirs: + dirs.append(ev) + return dirs + + def _resolve_confeval( + self, target_path: Path, evals_dir: Path + ) -> tuple[EvalConfig | None, Path | None]: + """Find and load the nearest global confeval.py. + + Returns (config, path_to_confeval) for the first directory that yields a + usable confeval.py, or (None, None) when none is found so callers fall + back to per-file config and then the built-in defaults. + """ + for d in self._confeval_search_dirs(target_path, evals_dir): + cfg = self._load_confeval(d) + if cfg is not None: + return cfg, d / "confeval.py" + return None, None + def _collect_eval_functions(self, mod: Any) -> tuple[list[tuple[str, Any]], Any]: """Pytest-style discovery: functions annotated -> EvalSet are evals, -> EvalConfig is config.""" @@ -254,9 +299,8 @@ def _collect_from_file( file_config = mod.get_eval_config() elif hasattr(mod, "EVAL_CONFIG"): file_config = mod.EVAL_CONFIG - # Priority: per-file > confeval > defaults - config = file_config or global_config or self._default_config() - return self._make_pending(mod, mod.get_eval_set(), config, file_name) + config, source = self._resolve_case_config(file_config, global_config) + return self._make_pending(mod, mod.get_eval_set(), config, file_name, source) # pytest-style discovery eval_pairs, discovered_config = self._collect_eval_functions(mod) @@ -268,16 +312,30 @@ def _collect_from_file( if hasattr(mod, "EVAL_CONFIG") else None ) - # Priority: per-file > confeval > defaults - config = file_config or global_config or self._default_config() + config, source = self._resolve_case_config(file_config, global_config) pending: list[_PendingCase] = [] for _, es in 
eval_pairs: - pending.extend(self._make_pending(mod, es, config, file_name)) + pending.extend(self._make_pending(mod, es, config, file_name, source)) return pending self.output.warning(f"Skipping {file_name} — no eval entry point found.") return [] + def _resolve_case_config( + self, file_config: EvalConfig | None, global_config: EvalConfig | None + ) -> tuple[EvalConfig, str]: + """Resolve the config for a file and label its source. + + Priority (highest first): per-file get_eval_config()/EVAL_CONFIG, then the + global confeval.py, then the built-in defaults. The returned label is shown + in the criteria block so users can see which level each file resolved to. + """ + if file_config is not None: + return file_config, "per-file" + if global_config is not None: + return global_config, "confeval.py" + return self._default_config(), "built-in defaults" + def _collect_simulations( self, mod: Any, scenarios: list[Any], file_name: str ) -> list[_PendingSimulation]: @@ -321,7 +379,12 @@ def _collect_simulations( ] def _make_pending( - self, mod: Any, eval_set: Any, config: EvalConfig, file_name: str + self, + mod: Any, + eval_set: Any, + config: EvalConfig, + file_name: str, + config_source: str = "", ) -> list[_PendingCase]: graph = getattr(mod, "app", None) or self._load_agent_from_config() collector = TrajectoryCollector(capture_all_events=True) @@ -334,6 +397,7 @@ def _make_pending( file_name=file_name, eval_set_id=eval_set.eval_set_id, eval_set_name=eval_set.name, + config_source=config_source, ) for c in eval_set.eval_cases ] @@ -345,11 +409,16 @@ def _make_pending( def _print_criteria_block( self, confeval_config: EvalConfig | None, - target_path: Path, + confeval_path: Path | None, ) -> None: - """Print the active criteria and their source before any case runs.""" + """Print the active global criteria and their source before any case runs. 
+ + This is the global fallback config (confeval.py, else built-in defaults) + applied to files without their own get_eval_config()/EVAL_CONFIG. Files + that define a per-file config override this for their own cases. + """ if confeval_config is not None: - source = str(target_path / "confeval.py") + source = str(confeval_path) if confeval_path else "confeval.py" criteria = confeval_config.criteria else: source = "built-in defaults" @@ -370,6 +439,86 @@ def _print_criteria_block( print(f" {name:<40} {' '.join(parts)}", flush=True) # noqa: T201 print("", flush=True) # noqa: T201 + def _criteria_rows(self, config: EvalConfig) -> list[tuple[str, str, dict[str, Any]]]: + """Return (name, human_summary, machine_dict) for each active criterion.""" + criteria = config.criteria + rows: list[tuple[str, str, dict[str, Any]]] = [] + for name in type(criteria).model_fields: + cfg = getattr(criteria, name) + if cfg is None: + continue + match_type = getattr(getattr(cfg, "match_type", None), "value", None) + parts = [f"threshold={cfg.threshold}"] + detail: dict[str, Any] = {"threshold": cfg.threshold} + if match_type: + parts.append(match_type) + detail["match_type"] = match_type + if cfg.judge_model: + parts.append(f"judge={cfg.judge_model}") + detail["judge_model"] = cfg.judge_model + if cfg.num_samples and cfg.num_samples != 1: + parts.append(f"samples={cfg.num_samples}") + detail["num_samples"] = cfg.num_samples + rows.append((name, " ".join(parts), detail)) + return rows + + def _criteria_by_file( + self, pending: list[_PendingCase | _PendingSimulation] + ) -> dict[str, dict[str, Any]]: + """Map each file to its resolved criteria source and active criteria. + + Used both for the per-file console block and the merged report metadata so + a single combined run still records which criteria applied to which file. 
+ """ + out: dict[str, dict[str, Any]] = {} + for pc in pending: + if pc.file_name in out: + continue + if isinstance(pc, _PendingCase): + rows = self._criteria_rows(pc.config) + out[pc.file_name] = { + "source": pc.config_source, + "criteria": {name: detail for name, _, detail in rows}, + } + elif isinstance(pc, _PendingSimulation): + out[pc.file_name] = {"source": "user-simulator goals", "criteria": {}} + return out + + def _print_criteria_per_file( + self, + pending: list[_PendingCase | _PendingSimulation], + confeval_path: Path | None, + ) -> None: + """Print the criteria each file resolved to, before any case runs. + + Unlike the single global block, this shows per-file criteria so a combined + run makes clear that each file is evaluated against its own config + (per-file > confeval.py > built-in defaults). + """ + seen: set[str] = set() + for pc in pending: + if pc.file_name in seen: + continue + seen.add(pc.file_name) + + if isinstance(pc, _PendingSimulation): + print( # noqa: T201 + f"Criteria {pc.file_name} (source: user-simulator goals)", flush=True + ) + print("", flush=True) # noqa: T201 + continue + + source = pc.config_source + if source == "confeval.py" and confeval_path: + source = str(confeval_path) + print(f"Criteria {pc.file_name} (source: {source})", flush=True) # noqa: T201 + rows = self._criteria_rows(pc.config) + if not rows: + print(" (no active criteria)", flush=True) # noqa: T201 + for name, summary, _ in rows: + print(f" {name:<40} {summary}", flush=True) # noqa: T201 + print("", flush=True) # noqa: T201 + # ------------------------------------------------------------------ # Progress printing # ------------------------------------------------------------------ @@ -627,8 +776,12 @@ def execute( # noqa: PLR0912, PLR0915 ) return 1 - # 4. Load confeval.py → global criteria config (None if not found) - confeval_config = self._load_confeval(target_path) + # 4. Load confeval.py → global criteria config (None if not found). 
+ # confeval.py is the GLOBAL config: discover it independently of the + # target so it applies whether the user evaluates the whole evals/ + # directory, a subfolder, or a single file inside it. + evals_dir = Path.cwd() / json_directory + confeval_config, confeval_path = self._resolve_confeval(target_path, evals_dir) # 5. Discover files files = self._discover(target_path) @@ -637,7 +790,7 @@ def execute( # noqa: PLR0912, PLR0915 return 1 # 6. Collect all pending cases across every file - pending: list[_PendingCase] = [] + pending: list[_PendingCase | _PendingSimulation] = [] for f in files: try: cases = self._collect_from_file(f, confeval_config) @@ -666,8 +819,8 @@ def execute( # noqa: PLR0912, PLR0915 f"Found {', '.join(parts)} across {n_files} file(s) in {target_path}", ) - # 7. Print criteria block so users know what is being evaluated - self._print_criteria_block(confeval_config, target_path) + # 7. Print per-file criteria so users see which config each file resolved to + self._print_criteria_per_file(pending, confeval_path) # 8. Run all cases under a single asyncio event loop quads = asyncio.run(self._run_flat_pool(pending, effective_concurrency, effective_parallel)) @@ -702,6 +855,12 @@ def execute( # noqa: PLR0912, PLR0915 # 8. Merge into a single report merged = self._merge_reports(reports, base_config=confeval_config or self._default_config()) + # Record per-file criteria in the merged report so a single combined run still + # captures which criteria applied to which file (the merged config_used alone + # collapses to one config and loses this). + if isinstance(getattr(merged, "metadata", None), dict): + merged.metadata["criteria_by_file"] = self._criteria_by_file(pending) + # 9. 
Determine exit code if threshold is not None and merged.summary.pass_rate < threshold: self.output.error( diff --git a/tests/cli/test_eval_command.py b/tests/cli/test_eval_command.py index b45c16e..ecd2ddd 100644 --- a/tests/cli/test_eval_command.py +++ b/tests/cli/test_eval_command.py @@ -467,6 +467,56 @@ def test_print_criteria_block_custom(cmd, capsys): assert "judge=gpt-4" in captured.out +def _pending_case(file_name, config, source): + return _PendingCase( + case=MagicMock(), + evaluator=MagicMock(), + config=config, + file_name=file_name, + eval_set_id="id", + eval_set_name="name", + config_source=source, + ) + + +def test_print_criteria_per_file_shows_each_files_criteria(cmd, capsys): + tool_cfg = EvalConfig( + criteria=CriteriaConfig(tool_name_match=CriterionConfig.tool_name_match(threshold=0.6)) + ) + rouge_cfg = EvalConfig( + criteria=CriteriaConfig(rouge_match=CriterionConfig.rouge_match(threshold=0.8)) + ) + pending = [ + _pending_case("eval_tool_agents.py", tool_cfg, "per-file"), + _pending_case("weather_agents_eval.py", rouge_cfg, "confeval.py"), + ] + cmd._print_criteria_per_file(pending, Path("evals/confeval.py")) + out = capsys.readouterr().out + # Each file is listed with its own criteria and resolved source. 
+ assert "eval_tool_agents.py (source: per-file)" in out + assert "tool_name_match" in out + assert "weather_agents_eval.py (source: evals/confeval.py)" in out + assert "rouge_match" in out + + +def test_criteria_by_file_maps_source_and_criteria(cmd): + tool_cfg = EvalConfig( + criteria=CriteriaConfig(tool_name_match=CriterionConfig.tool_name_match(threshold=0.6)) + ) + rouge_cfg = EvalConfig( + criteria=CriteriaConfig(rouge_match=CriterionConfig.rouge_match(threshold=0.8)) + ) + pending = [ + _pending_case("eval_tool_agents.py", tool_cfg, "per-file"), + _pending_case("weather_agents_eval.py", rouge_cfg, "confeval.py"), + ] + by_file = cmd._criteria_by_file(pending) + assert by_file["eval_tool_agents.py"]["source"] == "per-file" + assert "tool_name_match" in by_file["eval_tool_agents.py"]["criteria"] + assert by_file["weather_agents_eval.py"]["source"] == "confeval.py" + assert by_file["weather_agents_eval.py"]["criteria"]["rouge_match"]["threshold"] == 0.8 + + @pytest.mark.asyncio async def test_run_flat_pool_simulation_no_criterion(cmd): from agentflow.qa.evaluation.token_usage import TokenUsage diff --git a/tests/cli/test_eval_discovery.py b/tests/cli/test_eval_discovery.py index a92ae52..520eded 100644 --- a/tests/cli/test_eval_discovery.py +++ b/tests/cli/test_eval_discovery.py @@ -157,6 +157,73 @@ def test_import_error_returns_none(self, tmp_path: Path, cmd: EvalCommand) -> No assert result is None +# ── confeval.py discovery (independent of target) ─────────────────────────────── + + +class TestConfevalDiscovery: + """confeval.py is the global config; discovery must not depend on the target. + + Regression: targeting a single eval file used to look for + `TARGET_FILE/confeval.py`, which never exists, so files without a per-file + config silently fell back to built-in defaults instead of the global + confeval.py. 
+ """ + + def test_search_dirs_walks_up_from_file( + self, tmp_path: Path, cmd: EvalCommand + ) -> None: + evals_dir = tmp_path / "evals" + sub = evals_dir / "sub" + sub.mkdir(parents=True) + target = sub / "weather_eval.py" + target.write_text("") + dirs = cmd._confeval_search_dirs(target, evals_dir) + # Parent of the file is searched first, walking up; the evals dir is included. + assert dirs[0] == sub.resolve() + assert evals_dir.resolve() in dirs + + def test_search_dirs_includes_evals_dir_for_directory_target( + self, tmp_path: Path, cmd: EvalCommand + ) -> None: + evals_dir = tmp_path / "evals" + evals_dir.mkdir() + dirs = cmd._confeval_search_dirs(evals_dir, evals_dir) + assert evals_dir.resolve() in dirs + + def test_resolve_confeval_finds_for_single_file_target( + self, tmp_path: Path, cmd: EvalCommand + ) -> None: + from agentflow.qa.evaluation import EvalConfig + + evals_dir = tmp_path / "evals" + evals_dir.mkdir() + target_file = evals_dir / "weather_eval.py" + target_file.write_text("") + expected = EvalConfig() + + # confeval.py lives in the evals dir, not at a (nonexistent) path under the target file. 
+ def fake_load(d: Path): + return expected if d.resolve() == evals_dir.resolve() else None + + with patch.object(cmd, "_load_confeval", side_effect=fake_load): + cfg, path = cmd._resolve_confeval(target_file, evals_dir) + + assert cfg is expected + assert path == evals_dir / "confeval.py" + + def test_resolve_confeval_returns_none_when_absent( + self, tmp_path: Path, cmd: EvalCommand + ) -> None: + evals_dir = tmp_path / "evals" + evals_dir.mkdir() + target_file = evals_dir / "weather_eval.py" + target_file.write_text("") + with patch.object(cmd, "_load_confeval", return_value=None): + cfg, path = cmd._resolve_confeval(target_file, evals_dir) + assert cfg is None + assert path is None + + # ── _collect_from_file ──────────────────────────────────────────────────────── @@ -193,9 +260,11 @@ def test_file_config_takes_priority_over_global_config( ): result = cmd._collect_from_file(self._dummy_path(tmp_path), global_cfg) - _, _, used_config, _ = mock_make.call_args.args + used_config = mock_make.call_args.args[2] + used_source = mock_make.call_args.args[4] assert used_config is file_cfg assert used_config is not global_cfg + assert used_source == "per-file" assert result == fake_pending def test_file_config_used_when_no_global(self, tmp_path: Path, cmd: EvalCommand) -> None: @@ -215,8 +284,10 @@ def test_file_config_used_when_no_global(self, tmp_path: Path, cmd: EvalCommand) ): result = cmd._collect_from_file(self._dummy_path(tmp_path), None) - _, _, used_config, _ = mock_make.call_args.args + used_config = mock_make.call_args.args[2] + used_source = mock_make.call_args.args[4] assert used_config is file_cfg + assert used_source == "per-file" def test_default_config_used_when_no_configs(self, tmp_path: Path, cmd: EvalCommand) -> None: from agentflow.qa.evaluation import EvalConfig @@ -235,8 +306,10 @@ def test_default_config_used_when_no_configs(self, tmp_path: Path, cmd: EvalComm ): result = cmd._collect_from_file(self._dummy_path(tmp_path), None) - _, _, 
used_config, _ = mock_make.call_args.args + used_config = mock_make.call_args.args[2] + used_source = mock_make.call_args.args[4] assert used_config is default_cfg + assert used_source == "built-in defaults" def test_no_entry_point_returns_empty_and_warns( self, tmp_path: Path, cmd: EvalCommand @@ -267,8 +340,10 @@ def test_pytest_style_discovery_fallback(self, tmp_path: Path, cmd: EvalCommand) ): result = cmd._collect_from_file(self._dummy_path(tmp_path), global_cfg) - _, _, used_config, _ = mock_make.call_args.args + used_config = mock_make.call_args.args[2] + used_source = mock_make.call_args.args[4] assert used_config is global_cfg + assert used_source == "confeval.py" assert result == fake_pending diff --git a/tests/cli/test_eval_flat_pool.py b/tests/cli/test_eval_flat_pool.py index e442042..5970096 100644 --- a/tests/cli/test_eval_flat_pool.py +++ b/tests/cli/test_eval_flat_pool.py @@ -123,9 +123,11 @@ def test_per_file_config_beats_confeval_config( ): cmd._collect_from_file(self._dummy_path(tmp_path), global_cfg) - _, _, used_config, _ = mock_make.call_args.args + used_config = mock_make.call_args.args[2] + used_source = mock_make.call_args.args[4] assert used_config is per_file_cfg assert used_config is not global_cfg + assert used_source == "per-file" def test_per_file_config_used_when_no_confeval( self, tmp_path: Path, cmd: EvalCommand @@ -146,8 +148,10 @@ def test_per_file_config_used_when_no_confeval( ): cmd._collect_from_file(self._dummy_path(tmp_path), None) - _, _, used_config, _ = mock_make.call_args.args + used_config = mock_make.call_args.args[2] + used_source = mock_make.call_args.args[4] assert used_config is per_file_cfg + assert used_source == "per-file" def test_default_config_used_when_no_config_anywhere( self, tmp_path: Path, cmd: EvalCommand @@ -168,5 +172,7 @@ def test_default_config_used_when_no_config_anywhere( ): cmd._collect_from_file(self._dummy_path(tmp_path), None) - _, _, used_config, _ = mock_make.call_args.args + used_config = 
mock_make.call_args.args[2] + used_source = mock_make.call_args.args[4] assert used_config is default_cfg + assert used_source == "built-in defaults" From afa56d6cb82c8b78915a526c0eab0aeffb5e038b Mon Sep 17 00:00:00 2001 From: Shudipto Trafder Date: Tue, 16 Jun 2026 19:45:21 +0600 Subject: [PATCH 2/2] feat: Add initial agentflow configuration file --- agentflow.json | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 agentflow.json diff --git a/agentflow.json b/agentflow.json new file mode 100644 index 0000000..119c138 --- /dev/null +++ b/agentflow.json @@ -0,0 +1,15 @@ +{ + "agent": "graph.react:app", + "thread_name_generator": "graph.thread_name_generator:MyNameGenerator", + "env": ".env", + "auth": null, + "rate_limit": { + "enabled": true, + "backend": "memory", + "requests": 100, + "window": 60, + "by": "ip", + "trusted_proxy_headers": false, + "exclude_paths": ["/health", "/docs", "/redoc", "/openapi.json"] + } +} \ No newline at end of file