From 1875cb47e3d8d054ead003589760a6b41d7e1c35 Mon Sep 17 00:00:00 2001
From: Shudipto Trafder <shudiptotrafder@gmail.com>
Date: Tue, 16 Jun 2026 19:40:55 +0600
Subject: [PATCH 1/2] feat: Enhance eval command to support global confeval.py
 discovery and improve criteria reporting

---
 agentflow_cli/cli/commands/eval.py | 189 ++++++++++++++++++++++++++---
 tests/cli/test_eval_command.py     |  50 ++++++++
 tests/cli/test_eval_discovery.py   |  83 ++++++++++++-
 tests/cli/test_eval_flat_pool.py   |  12 +-
 4 files changed, 312 insertions(+), 22 deletions(-)

diff --git a/agentflow_cli/cli/commands/eval.py b/agentflow_cli/cli/commands/eval.py
index d23535a..53e926d 100644
--- a/agentflow_cli/cli/commands/eval.py
+++ b/agentflow_cli/cli/commands/eval.py
@@ -43,6 +43,7 @@ class _PendingCase:
     file_name: str
     eval_set_id: str
     eval_set_name: str
+    config_source: str = ""  # resolved source: per-file / confeval.py / built-in defaults
 
 
 @dataclass
@@ -180,6 +181,50 @@ def _load_confeval(self, evals_dir: Path) -> EvalConfig | None:
         )
         return None
 
+    def _confeval_search_dirs(self, target_path: Path, evals_dir: Path) -> list[Path]:
+        """Directories to search for a global confeval.py, nearest-first.
+
+        confeval.py is the GLOBAL criteria config, so its discovery must not
+        depend on what the user targets. Walk up from the target (its parent
+        when the target is a file) until reaching the current working directory
+        or the filesystem root, then append the configured evals directory, so
+        evals/confeval.py is found whether the user runs the whole evals/ dir,
+        a subfolder, or a single file inside it.
+        """
+        start = target_path.parent if target_path.is_file() else target_path
+        try:
+            cur = start.resolve()
+            cwd = Path.cwd().resolve()
+        except OSError:
+            return [evals_dir]
+
+        dirs: list[Path] = []
+        while True:
+            dirs.append(cur)
+            if cur in (cwd, cur.parent):
+                break
+            cur = cur.parent
+
+        ev = evals_dir.resolve()
+        if ev not in dirs:
+            dirs.append(ev)
+        return dirs
+
+    def _resolve_confeval(
+        self, target_path: Path, evals_dir: Path
+    ) -> tuple[EvalConfig | None, Path | None]:
+        """Find and load the nearest global confeval.py.
+
+        Returns (config, path_to_confeval) for the first directory that yields a
+        usable confeval.py, or (None, None) when none is found so callers fall
+        back to per-file config and then the built-in defaults.
+        """
+        for d in self._confeval_search_dirs(target_path, evals_dir):
+            cfg = self._load_confeval(d)
+            if cfg is not None:
+                return cfg, d / "confeval.py"
+        return None, None
+
     def _collect_eval_functions(self, mod: Any) -> tuple[list[tuple[str, Any]], Any]:
         """Pytest-style discovery: functions annotated -> EvalSet are evals, ->
         EvalConfig is config."""
@@ -254,9 +299,8 @@ def _collect_from_file(
                 file_config = mod.get_eval_config()
             elif hasattr(mod, "EVAL_CONFIG"):
                 file_config = mod.EVAL_CONFIG
-            # Priority: per-file > confeval > defaults
-            config = file_config or global_config or self._default_config()
-            return self._make_pending(mod, mod.get_eval_set(), config, file_name)
+            config, source = self._resolve_case_config(file_config, global_config)
+            return self._make_pending(mod, mod.get_eval_set(), config, file_name, source)
 
         # pytest-style discovery
         eval_pairs, discovered_config = self._collect_eval_functions(mod)
@@ -268,16 +312,30 @@ def _collect_from_file(
                 if hasattr(mod, "EVAL_CONFIG")
                 else None
             )
-            # Priority: per-file > confeval > defaults
-            config = file_config or global_config or self._default_config()
+            config, source = self._resolve_case_config(file_config, global_config)
             pending: list[_PendingCase] = []
             for _, es in eval_pairs:
-                pending.extend(self._make_pending(mod, es, config, file_name))
+                pending.extend(self._make_pending(mod, es, config, file_name, source))
             return pending
 
         self.output.warning(f"Skipping {file_name} — no eval entry point found.")
         return []
 
+    def _resolve_case_config(
+        self, file_config: EvalConfig | None, global_config: EvalConfig | None
+    ) -> tuple[EvalConfig, str]:
+        """Resolve the config for a file and label its source.
+
+        Priority (highest first): per-file get_eval_config()/EVAL_CONFIG, then the
+        global confeval.py, then the built-in defaults. The returned label is shown
+        in the criteria block so users can see which level each file resolved to.
+        """
+        if file_config is not None:
+            return file_config, "per-file"
+        if global_config is not None:
+            return global_config, "confeval.py"
+        return self._default_config(), "built-in defaults"
+
     def _collect_simulations(
         self, mod: Any, scenarios: list[Any], file_name: str
     ) -> list[_PendingSimulation]:
@@ -321,7 +379,12 @@ def _collect_simulations(
         ]
 
     def _make_pending(
-        self, mod: Any, eval_set: Any, config: EvalConfig, file_name: str
+        self,
+        mod: Any,
+        eval_set: Any,
+        config: EvalConfig,
+        file_name: str,
+        config_source: str = "",
     ) -> list[_PendingCase]:
         graph = getattr(mod, "app", None) or self._load_agent_from_config()
         collector = TrajectoryCollector(capture_all_events=True)
@@ -334,6 +397,7 @@ def _make_pending(
                 file_name=file_name,
                 eval_set_id=eval_set.eval_set_id,
                 eval_set_name=eval_set.name,
+                config_source=config_source,
             )
             for c in eval_set.eval_cases
         ]
@@ -345,11 +409,16 @@ def _make_pending(
     def _print_criteria_block(
         self,
         confeval_config: EvalConfig | None,
-        target_path: Path,
+        confeval_path: Path | None,
     ) -> None:
-        """Print the active criteria and their source before any case runs."""
+        """Print the active global criteria and their source before any case runs.
+
+        This is the global fallback config (confeval.py, else built-in defaults)
+        applied to files without their own get_eval_config()/EVAL_CONFIG. Files
+        that define a per-file config override this for their own cases.
+        """
         if confeval_config is not None:
-            source = str(target_path / "confeval.py")
+            source = str(confeval_path) if confeval_path else "confeval.py"
             criteria = confeval_config.criteria
         else:
             source = "built-in defaults"
@@ -370,6 +439,86 @@ def _print_criteria_block(
             print(f"  {name:<40} {'  '.join(parts)}", flush=True)  # noqa: T201
         print("", flush=True)  # noqa: T201
 
+    def _criteria_rows(self, config: EvalConfig) -> list[tuple[str, str, dict[str, Any]]]:
+        """Return (name, human_summary, machine_dict) for each active criterion."""
+        criteria = config.criteria
+        rows: list[tuple[str, str, dict[str, Any]]] = []
+        for name in type(criteria).model_fields:
+            cfg = getattr(criteria, name)
+            if cfg is None:
+                continue
+            match_type = getattr(getattr(cfg, "match_type", None), "value", None)
+            parts = [f"threshold={cfg.threshold}"]
+            detail: dict[str, Any] = {"threshold": cfg.threshold}
+            if match_type:
+                parts.append(match_type)
+                detail["match_type"] = match_type
+            if cfg.judge_model:
+                parts.append(f"judge={cfg.judge_model}")
+                detail["judge_model"] = cfg.judge_model
+            if cfg.num_samples and cfg.num_samples != 1:
+                parts.append(f"samples={cfg.num_samples}")
+                detail["num_samples"] = cfg.num_samples
+            rows.append((name, "  ".join(parts), detail))
+        return rows
+
+    def _criteria_by_file(
+        self, pending: list[_PendingCase | _PendingSimulation]
+    ) -> dict[str, dict[str, Any]]:
+        """Map each file to its resolved criteria source and active criteria.
+
+        Used both for the per-file console block and the merged report metadata so
+        a single combined run still records which criteria applied to which file.
+        """
+        out: dict[str, dict[str, Any]] = {}
+        for pc in pending:
+            if pc.file_name in out:
+                continue
+            if isinstance(pc, _PendingCase):
+                rows = self._criteria_rows(pc.config)
+                out[pc.file_name] = {
+                    "source": pc.config_source,
+                    "criteria": {name: detail for name, _, detail in rows},
+                }
+            elif isinstance(pc, _PendingSimulation):
+                out[pc.file_name] = {"source": "user-simulator goals", "criteria": {}}
+        return out
+
+    def _print_criteria_per_file(
+        self,
+        pending: list[_PendingCase | _PendingSimulation],
+        confeval_path: Path | None,
+    ) -> None:
+        """Print the criteria each file resolved to, before any case runs.
+
+        Unlike the single global block, this shows per-file criteria so a combined
+        run makes clear that each file is evaluated against its own config
+        (per-file > confeval.py > built-in defaults).
+        """
+        seen: set[str] = set()
+        for pc in pending:
+            if pc.file_name in seen:
+                continue
+            seen.add(pc.file_name)
+
+            if isinstance(pc, _PendingSimulation):
+                print(  # noqa: T201
+                    f"Criteria  {pc.file_name}  (source: user-simulator goals)", flush=True
+                )
+                print("", flush=True)  # noqa: T201
+                continue
+
+            source = pc.config_source
+            if source == "confeval.py" and confeval_path:
+                source = str(confeval_path)
+            print(f"Criteria  {pc.file_name}  (source: {source})", flush=True)  # noqa: T201
+            rows = self._criteria_rows(pc.config)
+            if not rows:
+                print("  (no active criteria)", flush=True)  # noqa: T201
+            for name, summary, _ in rows:
+                print(f"  {name:<40} {summary}", flush=True)  # noqa: T201
+            print("", flush=True)  # noqa: T201
+
     # ------------------------------------------------------------------
     # Progress printing
     # ------------------------------------------------------------------
@@ -627,8 +776,12 @@ def execute(  # noqa: PLR0912, PLR0915
                 )
                 return 1
 
-        # 4. Load confeval.py → global criteria config (None if not found)
-        confeval_config = self._load_confeval(target_path)
+        # 4. Load confeval.py → global criteria config (None if not found).
+        #    confeval.py is the GLOBAL config: discover it independently of the
+        #    target so it applies whether the user evaluates the whole evals/
+        #    directory, a subfolder, or a single file inside it.
+        evals_dir = Path.cwd() / json_directory
+        confeval_config, confeval_path = self._resolve_confeval(target_path, evals_dir)
 
         # 5. Discover files
         files = self._discover(target_path)
@@ -637,7 +790,7 @@ def execute(  # noqa: PLR0912, PLR0915
             return 1
 
         # 6. Collect all pending cases across every file
-        pending: list[_PendingCase] = []
+        pending: list[_PendingCase | _PendingSimulation] = []
         for f in files:
             try:
                 cases = self._collect_from_file(f, confeval_config)
@@ -666,8 +819,8 @@ def execute(  # noqa: PLR0912, PLR0915
             f"Found {', '.join(parts)} across {n_files} file(s) in {target_path}",
         )
 
-        # 7. Print criteria block so users know what is being evaluated
-        self._print_criteria_block(confeval_config, target_path)
+        # 7. Print per-file criteria so users see which config each file resolved to
+        self._print_criteria_per_file(pending, confeval_path)
 
         # 8. Run all cases under a single asyncio event loop
         quads = asyncio.run(self._run_flat_pool(pending, effective_concurrency, effective_parallel))
@@ -702,6 +855,12 @@ def execute(  # noqa: PLR0912, PLR0915
         # 8. Merge into a single report
         merged = self._merge_reports(reports, base_config=confeval_config or self._default_config())
 
+        # Record per-file criteria in the merged report so a single combined run still
+        # captures which criteria applied to which file (the merged config_used alone
+        # collapses to one config and loses this).
+        if isinstance(getattr(merged, "metadata", None), dict):
+            merged.metadata["criteria_by_file"] = self._criteria_by_file(pending)
+
         # 9. Determine exit code
         if threshold is not None and merged.summary.pass_rate < threshold:
             self.output.error(
diff --git a/tests/cli/test_eval_command.py b/tests/cli/test_eval_command.py
index b45c16e..ecd2ddd 100644
--- a/tests/cli/test_eval_command.py
+++ b/tests/cli/test_eval_command.py
@@ -467,6 +467,56 @@ def test_print_criteria_block_custom(cmd, capsys):
     assert "judge=gpt-4" in captured.out
 
 
+def _pending_case(file_name, config, source):
+    return _PendingCase(
+        case=MagicMock(),
+        evaluator=MagicMock(),
+        config=config,
+        file_name=file_name,
+        eval_set_id="id",
+        eval_set_name="name",
+        config_source=source,
+    )
+
+
+def test_print_criteria_per_file_shows_each_files_criteria(cmd, capsys):
+    tool_cfg = EvalConfig(
+        criteria=CriteriaConfig(tool_name_match=CriterionConfig.tool_name_match(threshold=0.6))
+    )
+    rouge_cfg = EvalConfig(
+        criteria=CriteriaConfig(rouge_match=CriterionConfig.rouge_match(threshold=0.8))
+    )
+    pending = [
+        _pending_case("eval_tool_agents.py", tool_cfg, "per-file"),
+        _pending_case("weather_agents_eval.py", rouge_cfg, "confeval.py"),
+    ]
+    cmd._print_criteria_per_file(pending, Path("evals/confeval.py"))
+    out = capsys.readouterr().out
+    # Each file is listed with its own criteria and resolved source.
+    assert "eval_tool_agents.py  (source: per-file)" in out
+    assert "tool_name_match" in out
+    assert "weather_agents_eval.py  (source: evals/confeval.py)" in out
+    assert "rouge_match" in out
+
+
+def test_criteria_by_file_maps_source_and_criteria(cmd):
+    tool_cfg = EvalConfig(
+        criteria=CriteriaConfig(tool_name_match=CriterionConfig.tool_name_match(threshold=0.6))
+    )
+    rouge_cfg = EvalConfig(
+        criteria=CriteriaConfig(rouge_match=CriterionConfig.rouge_match(threshold=0.8))
+    )
+    pending = [
+        _pending_case("eval_tool_agents.py", tool_cfg, "per-file"),
+        _pending_case("weather_agents_eval.py", rouge_cfg, "confeval.py"),
+    ]
+    by_file = cmd._criteria_by_file(pending)
+    assert by_file["eval_tool_agents.py"]["source"] == "per-file"
+    assert "tool_name_match" in by_file["eval_tool_agents.py"]["criteria"]
+    assert by_file["weather_agents_eval.py"]["source"] == "confeval.py"
+    assert by_file["weather_agents_eval.py"]["criteria"]["rouge_match"]["threshold"] == 0.8
+
+
 @pytest.mark.asyncio
 async def test_run_flat_pool_simulation_no_criterion(cmd):
     from agentflow.qa.evaluation.token_usage import TokenUsage
diff --git a/tests/cli/test_eval_discovery.py b/tests/cli/test_eval_discovery.py
index a92ae52..520eded 100644
--- a/tests/cli/test_eval_discovery.py
+++ b/tests/cli/test_eval_discovery.py
@@ -157,6 +157,73 @@ def test_import_error_returns_none(self, tmp_path: Path, cmd: EvalCommand) -> No
         assert result is None
 
 
+# ── confeval.py discovery (independent of target) ───────────────────────────────
+
+
+class TestConfevalDiscovery:
+    """confeval.py is the global config; discovery must not depend on the target.
+
+    Regression: targeting a single eval file used to look for
+    `<file>/confeval.py`, which never exists, so files without a per-file
+    config silently fell back to built-in defaults instead of the global
+    confeval.py.
+    """
+
+    def test_search_dirs_walks_up_from_file(
+        self, tmp_path: Path, cmd: EvalCommand
+    ) -> None:
+        evals_dir = tmp_path / "evals"
+        sub = evals_dir / "sub"
+        sub.mkdir(parents=True)
+        target = sub / "weather_eval.py"
+        target.write_text("")
+        dirs = cmd._confeval_search_dirs(target, evals_dir)
+        # Parent of the file is searched first, walking up; the evals dir is included.
+        assert dirs[0] == sub.resolve()
+        assert evals_dir.resolve() in dirs
+
+    def test_search_dirs_includes_evals_dir_for_directory_target(
+        self, tmp_path: Path, cmd: EvalCommand
+    ) -> None:
+        evals_dir = tmp_path / "evals"
+        evals_dir.mkdir()
+        dirs = cmd._confeval_search_dirs(evals_dir, evals_dir)
+        assert evals_dir.resolve() in dirs
+
+    def test_resolve_confeval_finds_for_single_file_target(
+        self, tmp_path: Path, cmd: EvalCommand
+    ) -> None:
+        from agentflow.qa.evaluation import EvalConfig
+
+        evals_dir = tmp_path / "evals"
+        evals_dir.mkdir()
+        target_file = evals_dir / "weather_eval.py"
+        target_file.write_text("")
+        expected = EvalConfig()
+
+        # confeval.py lives in the evals dir; nothing exists at <file>/confeval.py.
+        def fake_load(d: Path):
+            return expected if d.resolve() == evals_dir.resolve() else None
+
+        with patch.object(cmd, "_load_confeval", side_effect=fake_load):
+            cfg, path = cmd._resolve_confeval(target_file, evals_dir)
+
+        assert cfg is expected
+        assert path == evals_dir / "confeval.py"
+
+    def test_resolve_confeval_returns_none_when_absent(
+        self, tmp_path: Path, cmd: EvalCommand
+    ) -> None:
+        evals_dir = tmp_path / "evals"
+        evals_dir.mkdir()
+        target_file = evals_dir / "weather_eval.py"
+        target_file.write_text("")
+        with patch.object(cmd, "_load_confeval", return_value=None):
+            cfg, path = cmd._resolve_confeval(target_file, evals_dir)
+        assert cfg is None
+        assert path is None
+
+
 # ── _collect_from_file ────────────────────────────────────────────────────────
 
 
@@ -193,9 +260,11 @@ def test_file_config_takes_priority_over_global_config(
         ):
             result = cmd._collect_from_file(self._dummy_path(tmp_path), global_cfg)
 
-        _, _, used_config, _ = mock_make.call_args.args
+        used_config = mock_make.call_args.args[2]
+        used_source = mock_make.call_args.args[4]
         assert used_config is file_cfg
         assert used_config is not global_cfg
+        assert used_source == "per-file"
         assert result == fake_pending
 
     def test_file_config_used_when_no_global(self, tmp_path: Path, cmd: EvalCommand) -> None:
@@ -215,8 +284,10 @@ def test_file_config_used_when_no_global(self, tmp_path: Path, cmd: EvalCommand)
         ):
             result = cmd._collect_from_file(self._dummy_path(tmp_path), None)
 
-        _, _, used_config, _ = mock_make.call_args.args
+        used_config = mock_make.call_args.args[2]
+        used_source = mock_make.call_args.args[4]
         assert used_config is file_cfg
+        assert used_source == "per-file"
 
     def test_default_config_used_when_no_configs(self, tmp_path: Path, cmd: EvalCommand) -> None:
         from agentflow.qa.evaluation import EvalConfig
@@ -235,8 +306,10 @@ def test_default_config_used_when_no_configs(self, tmp_path: Path, cmd: EvalComm
         ):
             result = cmd._collect_from_file(self._dummy_path(tmp_path), None)
 
-        _, _, used_config, _ = mock_make.call_args.args
+        used_config = mock_make.call_args.args[2]
+        used_source = mock_make.call_args.args[4]
         assert used_config is default_cfg
+        assert used_source == "built-in defaults"
 
     def test_no_entry_point_returns_empty_and_warns(
         self, tmp_path: Path, cmd: EvalCommand
@@ -267,8 +340,10 @@ def test_pytest_style_discovery_fallback(self, tmp_path: Path, cmd: EvalCommand)
         ):
             result = cmd._collect_from_file(self._dummy_path(tmp_path), global_cfg)
 
-        _, _, used_config, _ = mock_make.call_args.args
+        used_config = mock_make.call_args.args[2]
+        used_source = mock_make.call_args.args[4]
         assert used_config is global_cfg
+        assert used_source == "confeval.py"
         assert result == fake_pending
 
 
diff --git a/tests/cli/test_eval_flat_pool.py b/tests/cli/test_eval_flat_pool.py
index e442042..5970096 100644
--- a/tests/cli/test_eval_flat_pool.py
+++ b/tests/cli/test_eval_flat_pool.py
@@ -123,9 +123,11 @@ def test_per_file_config_beats_confeval_config(
         ):
             cmd._collect_from_file(self._dummy_path(tmp_path), global_cfg)
 
-        _, _, used_config, _ = mock_make.call_args.args
+        used_config = mock_make.call_args.args[2]
+        used_source = mock_make.call_args.args[4]
         assert used_config is per_file_cfg
         assert used_config is not global_cfg
+        assert used_source == "per-file"
 
     def test_per_file_config_used_when_no_confeval(
         self, tmp_path: Path, cmd: EvalCommand
@@ -146,8 +148,10 @@ def test_per_file_config_used_when_no_confeval(
         ):
             cmd._collect_from_file(self._dummy_path(tmp_path), None)
 
-        _, _, used_config, _ = mock_make.call_args.args
+        used_config = mock_make.call_args.args[2]
+        used_source = mock_make.call_args.args[4]
         assert used_config is per_file_cfg
+        assert used_source == "per-file"
 
     def test_default_config_used_when_no_config_anywhere(
         self, tmp_path: Path, cmd: EvalCommand
@@ -168,5 +172,7 @@ def test_default_config_used_when_no_config_anywhere(
         ):
             cmd._collect_from_file(self._dummy_path(tmp_path), None)
 
-        _, _, used_config, _ = mock_make.call_args.args
+        used_config = mock_make.call_args.args[2]
+        used_source = mock_make.call_args.args[4]
         assert used_config is default_cfg
+        assert used_source == "built-in defaults"

From afa56d6cb82c8b78915a526c0eab0aeffb5e038b Mon Sep 17 00:00:00 2001
From: Shudipto Trafder <shudiptotrafder@gmail.com>
Date: Tue, 16 Jun 2026 19:45:21 +0600
Subject: [PATCH 2/2] feat: Add initial agentflow configuration file

---
 agentflow.json | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 agentflow.json

diff --git a/agentflow.json b/agentflow.json
new file mode 100644
index 0000000..119c138
--- /dev/null
+++ b/agentflow.json
@@ -0,0 +1,15 @@
+{
+  "agent": "graph.react:app",
+  "thread_name_generator": "graph.thread_name_generator:MyNameGenerator",
+  "env": ".env",
+  "auth": null,
+  "rate_limit": {
+    "enabled": true,
+    "backend": "memory",
+    "requests": 100,
+    "window": 60,
+    "by": "ip",
+    "trusted_proxy_headers": false,
+    "exclude_paths": ["/health", "/docs", "/redoc", "/openapi.json"]
+  }
+}
\ No newline at end of file