From 8ae6a3bd6db42a306cf1c537d6ca5ec2007eb651 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 19 Jun 2026 23:02:49 +0800 Subject: [PATCH 1/3] Add agent trajectory evaluation against declarative rubrics --- .../Eng/doc/new_features/v36_features_doc.rst | 56 ++++++++++ .../Zh/doc/new_features/v36_features_doc.rst | 52 +++++++++ je_auto_control/__init__.py | 3 + .../gui/script_builder/command_schema.py | 10 ++ .../utils/executor/action_executor.py | 16 +++ .../utils/mcp_server/tools/_factories.py | 22 ++++ .../utils/mcp_server/tools/_handlers.py | 6 ++ .../utils/trajectory_eval/__init__.py | 6 ++ .../utils/trajectory_eval/trajectory_eval.py | 100 ++++++++++++++++++ .../headless/test_trajectory_eval_batch.py | 94 ++++++++++++++++ 10 files changed, 365 insertions(+) create mode 100644 docs/source/Eng/doc/new_features/v36_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v36_features_doc.rst create mode 100644 je_auto_control/utils/trajectory_eval/__init__.py create mode 100644 je_auto_control/utils/trajectory_eval/trajectory_eval.py create mode 100644 test/unit_test/headless/test_trajectory_eval_batch.py diff --git a/docs/source/Eng/doc/new_features/v36_features_doc.rst b/docs/source/Eng/doc/new_features/v36_features_doc.rst new file mode 100644 index 00000000..35b41373 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v36_features_doc.rst @@ -0,0 +1,56 @@ +Agent Trajectory Evaluation +=========================== + +As automations hand control to LLM agents, "did it still work?" becomes "did +the agent take an acceptable path?". :func:`evaluate_trajectory` scores a +recorded run against a declarative **rubric**, giving a deterministic, +dependency-free signal for agent regression testing. + +A *trajectory* is the ordered list of steps a run took — each a dict with at +least an ``"action"`` name and optionally ``"args"`` / ``"observation"``. 
The +*rubric* is plain data (so it lives happily in a JSON action file or arrives +over MCP): + +================================ =================================================== +Rubric key Meaning +================================ =================================================== +``required_actions`` Actions that must all appear. +``ordered`` With the above, also require that relative order. +``forbidden_actions`` Actions that must never appear. +``max_steps`` Upper bound on trajectory length. +``success_contains`` Substring that must appear in some observation. +================================ =================================================== + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import evaluate_trajectory + + trajectory = [ + {"action": "AC_focus_window", "observation": "focused"}, + {"action": "AC_type_text", "observation": "typed"}, + {"action": "AC_click_mouse", "observation": "Saved successfully"}, + ] + result = evaluate_trajectory(trajectory, { + "required_actions": ["AC_type_text", "AC_click_mouse"], + "forbidden_actions": ["AC_kill_process"], + "max_steps": 10, + "success_contains": "Saved", + }) + assert result["passed"] # every applicable check passed + print(result["score"], result["checks"]) + +``score`` is the fraction of applicable checks that passed; ``passed`` is true +only when all pass; an empty rubric trivially passes. Each entry in ``checks`` +is ``{name, passed, detail}`` so a failure pinpoints the violated expectation. + +Executor command +---------------- + +``AC_evaluate_trajectory`` takes ``trajectory`` and ``rubric`` (each a JSON +string from the visual builder, or already-decoded data from a JSON action file +/ MCP) and returns ``{passed, score, steps, checks}``. The same operation is +exposed as the MCP tool ``ac_evaluate_trajectory`` and as a Script Builder +command under **Agent**. 
diff --git a/docs/source/Zh/doc/new_features/v36_features_doc.rst b/docs/source/Zh/doc/new_features/v36_features_doc.rst new file mode 100644 index 00000000..0fdf7ffb --- /dev/null +++ b/docs/source/Zh/doc/new_features/v36_features_doc.rst @@ -0,0 +1,52 @@ +Agent 軌跡評估 +============== + +當自動化把控制權交給 LLM agent,「它還能運作嗎?」就變成「agent 是否走了可接受的路 +徑?」。:func:`evaluate_trajectory` 依宣告式\ **評分標準(rubric)**\ 為一次記錄的執行評 +分,為 agent 回歸測試提供確定性、無相依的訊號。 + +*軌跡(trajectory)*\ 是該次執行所採取步驟的有序清單 —— 每步是一個至少含 ``"action"`` +名稱、可選含 ``"args"`` / ``"observation"`` 的 dict。*評分標準*\ 為純資料(因此可自在地 +存於 JSON action 檔或經 MCP 傳入): + +================================ =================================================== +Rubric 鍵 意義 +================================ =================================================== +``required_actions`` 必須全部出現的動作。 +``ordered`` 搭配上者,還要求其相對順序。 +``forbidden_actions`` 絕不可出現的動作。 +``max_steps`` 軌跡長度上限。 +``success_contains`` 必須出現在某個 observation 中的子字串。 +================================ =================================================== + +無頭 API +-------- + +.. 
code-block:: python + + from je_auto_control import evaluate_trajectory + + trajectory = [ + {"action": "AC_focus_window", "observation": "focused"}, + {"action": "AC_type_text", "observation": "typed"}, + {"action": "AC_click_mouse", "observation": "Saved successfully"}, + ] + result = evaluate_trajectory(trajectory, { + "required_actions": ["AC_type_text", "AC_click_mouse"], + "forbidden_actions": ["AC_kill_process"], + "max_steps": 10, + "success_contains": "Saved", + }) + assert result["passed"] # 所有適用的檢查都通過 + print(result["score"], result["checks"]) + +``score`` 為通過的適用檢查佔比;``passed`` 僅在全部通過時為真;空 rubric 直接通過。 +``checks`` 中每個項目為 ``{name, passed, detail}``,因此失敗時可精準指出被違反的期望。 + +執行器指令 +---------- + +``AC_evaluate_trajectory`` 接受 ``trajectory`` 與 ``rubric``\ (從視覺化建構器傳入時為 +JSON 字串,從 JSON action 檔 / MCP 傳入時為已解碼資料),回傳 +``{passed, score, steps, checks}``。相同操作亦提供為 MCP 工具 +``ac_evaluate_trajectory``,以及 Script Builder 中 **Agent** 分類下的指令。 diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 6db8b058..a96d6a1c 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -216,6 +216,8 @@ from je_auto_control.utils.approval import ( ApprovalResult, approve_artifact, pending_artifacts, verify_artifact, ) +# Agent trajectory evaluation: score a recorded run against a rubric +from je_auto_control.utils.trajectory_eval import evaluate_trajectory # Background popup/interrupt watchdog (unattended automation) from je_auto_control.utils.watchdog import ( PopupWatchdog, WatchdogRule, default_popup_watchdog, @@ -656,6 +658,7 @@ def start_autocontrol_gui(*args, **kwargs): "EgressBlocked", "EgressPolicy", "get_egress_policy", "set_egress_policy", "ApprovalResult", "approve_artifact", "pending_artifacts", "verify_artifact", + "evaluate_trajectory", # MCP server "AuditLogger", "HttpMCPServer", "MCPContent", "MCPPrompt", "MCPPromptArgument", "MCPResource", "MCPServer", "MCPTool", diff --git a/je_auto_control/gui/script_builder/command_schema.py 
b/je_auto_control/gui/script_builder/command_schema.py index 60ba6b0f..6cfb0b08 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -801,6 +801,16 @@ def _add_misc_specs(specs: List[CommandSpec]) -> None: default=".approvals"),), description="List artifacts awaiting approval.", )) + specs.append(CommandSpec( + "AC_evaluate_trajectory", "Agent", "Evaluate Trajectory", + fields=( + FieldSpec("trajectory", FieldType.STRING, + placeholder='[{"action": "AC_click_mouse"}]'), + FieldSpec("rubric", FieldType.STRING, + placeholder='{"required_actions": ["AC_type_text"]}'), + ), + description="Score an agent trajectory against a rubric (JSON).", + )) specs.append(CommandSpec( "AC_generate_sop", "Report", "Generate SOP Document", fields=( diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index d4bce220..3c6b7348 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -2991,6 +2991,21 @@ def _pending_artifacts(approvals_dir: str = ".approvals") -> Dict[str, Any]: return {"pending": pending_artifacts(approvals_dir)} +def _evaluate_trajectory(trajectory: Any, rubric: Any) -> Dict[str, Any]: + """Adapter: score an agent trajectory against a declarative rubric. + + ``trajectory`` / ``rubric`` may be JSON strings (from the visual builder) + or already-decoded list/dict (from JSON action files / MCP). 
+ """ + import json + from je_auto_control.utils.trajectory_eval import evaluate_trajectory + if isinstance(trajectory, str): + trajectory = json.loads(trajectory) + if isinstance(rubric, str): + rubric = json.loads(rubric) + return evaluate_trajectory(trajectory, rubric) + + class Executor: """ Executor @@ -3236,6 +3251,7 @@ def __init__(self): "AC_verify_artifact": _verify_artifact, "AC_approve_artifact": _approve_artifact, "AC_pending_artifacts": _pending_artifacts, + "AC_evaluate_trajectory": _evaluate_trajectory, "AC_a11y_record_start": _a11y_record_start, "AC_a11y_record_stop": _a11y_record_stop, "AC_a11y_record_events": _a11y_record_events, diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index 96cad3d4..b0c4393e 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -2729,6 +2729,27 @@ def approval_testing_tools() -> List[MCPTool]: ] +def trajectory_eval_tools() -> List[MCPTool]: + return [ + MCPTool( + name="ac_evaluate_trajectory", + description=("Score an agent's recorded 'trajectory' (a list of " + "{action, args, observation} steps) against a 'rubric' " + "with optional keys required_actions (+ordered), " + "forbidden_actions, max_steps, success_contains. 
" + "Returns {passed, score, steps, checks} for agent " + "regression testing."), + input_schema=schema( + {"trajectory": {"type": "array", + "items": {"type": "object"}}, + "rubric": {"type": "object"}}, + ["trajectory", "rubric"]), + handler=h.evaluate_trajectory, + annotations=READ_ONLY, + ), + ] + + def unattended_tools() -> List[MCPTool]: return [ MCPTool( @@ -3788,6 +3809,7 @@ def media_assert_tools() -> List[MCPTool]: ci_annotation_tools, clipboard_history_tools, audit_analysis_tools, process_doc_tools, tween_drag_tools, plugin_sdk_tools, governance_tools, credential_lease_tools, egress_tools, approval_testing_tools, + trajectory_eval_tools, screen_record_tools, process_and_shell_tools, remote_desktop_tools, gamepad_tools, usb_passthrough_tools, assertion_tools, data_source_tools, diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index e42b44bc..4710a2d7 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -1325,6 +1325,12 @@ def pending_artifacts(approvals_dir: str = ".approvals"): return {"pending": _pending(approvals_dir)} +def evaluate_trajectory(trajectory, rubric): + from je_auto_control.utils.trajectory_eval import ( + evaluate_trajectory as _evaluate) + return _evaluate(trajectory, rubric) + + def vlm_locate(description: str, screen_region: Optional[List[int]] = None, model: Optional[str] = None) -> Optional[List[int]]: diff --git a/je_auto_control/utils/trajectory_eval/__init__.py b/je_auto_control/utils/trajectory_eval/__init__.py new file mode 100644 index 00000000..559a8471 --- /dev/null +++ b/je_auto_control/utils/trajectory_eval/__init__.py @@ -0,0 +1,6 @@ +"""Agent trajectory evaluation: score a recorded run against a rubric.""" +from je_auto_control.utils.trajectory_eval.trajectory_eval import ( + evaluate_trajectory, +) + +__all__ = ["evaluate_trajectory"] diff --git 
a/je_auto_control/utils/trajectory_eval/trajectory_eval.py b/je_auto_control/utils/trajectory_eval/trajectory_eval.py new file mode 100644 index 00000000..7d0f806c --- /dev/null +++ b/je_auto_control/utils/trajectory_eval/trajectory_eval.py @@ -0,0 +1,100 @@ +"""Score an agent's recorded trajectory against a declarative rubric. + +An agent *trajectory* is the ordered list of steps a run took, each a dict with +at least an ``"action"`` name (and optionally ``"args"`` / ``"observation"``). +A *rubric* is plain data describing what a good run looks like, so it can live +in a JSON action file or be passed from the MCP/socket surfaces: + +* ``required_actions`` — actions that must all appear (set ``ordered: true`` to + also require they appear in that relative order); +* ``forbidden_actions`` — actions that must never appear; +* ``max_steps`` — an upper bound on trajectory length; +* ``success_contains`` — a substring that must appear in some observation. + +:func:`evaluate_trajectory` returns ``{passed, score, steps, checks}`` where the +score is the fraction of applicable checks that passed — a deterministic, +dependency-free signal for agent regression testing. Imports no ``PySide6``. 
+""" +from typing import Any, Dict, List, Mapping, Sequence + + +def _actions(trajectory: Sequence[Mapping[str, Any]]) -> List[str]: + return [str(step.get("action", "")) for step in trajectory] + + +def _check(name: str, passed: bool, detail: str) -> Dict[str, Any]: + return {"name": name, "passed": bool(passed), "detail": detail} + + +def _is_subsequence(needles: Sequence[str], haystack: Sequence[str]) -> bool: + iterator = iter(haystack) + return all(needle in iterator for needle in needles) + + +def _check_required(actions: List[str], required: Sequence[str], + ordered: bool) -> Dict[str, Any]: + present = [a for a in required if a in actions] + missing = [a for a in required if a not in actions] + if missing: + return _check("required_actions", False, f"missing: {missing}") + if ordered and not _is_subsequence(list(required), actions): + return _check("required_actions", False, + "all present but not in the required order") + return _check("required_actions", True, f"all present: {present}") + + +def _check_forbidden(actions: List[str], + forbidden: Sequence[str]) -> Dict[str, Any]: + hit = [a for a in forbidden if a in actions] + return _check("forbidden_actions", not hit, + f"used forbidden: {hit}" if hit else "none used") + + +def _check_max_steps(steps: int, max_steps: int) -> Dict[str, Any]: + return _check("max_steps", steps <= max_steps, + f"{steps} step(s), limit {max_steps}") + + +def _check_success(trajectory: Sequence[Mapping[str, Any]], + marker: str) -> Dict[str, Any]: + found = any(marker in str(step.get("observation", "")) + for step in trajectory) + return _check("success_contains", found, + f"marker {marker!r} {'found' if found else 'not found'}") + + +def _collect_checks(trajectory: Sequence[Mapping[str, Any]], + actions: List[str], + rubric: Mapping[str, Any]) -> List[Dict[str, Any]]: + checks: List[Dict[str, Any]] = [] + if "required_actions" in rubric: + checks.append(_check_required(actions, rubric["required_actions"], + 
bool(rubric.get("ordered", False)))) + if "forbidden_actions" in rubric: + checks.append(_check_forbidden(actions, rubric["forbidden_actions"])) + if "max_steps" in rubric: + checks.append(_check_max_steps(len(actions), int(rubric["max_steps"]))) + if "success_contains" in rubric: + checks.append(_check_success(trajectory, + str(rubric["success_contains"]))) + return checks + + +def evaluate_trajectory(trajectory: Sequence[Mapping[str, Any]], + rubric: Mapping[str, Any]) -> Dict[str, Any]: + """Score ``trajectory`` against ``rubric``; return passed/score/checks. + + ``score`` is the fraction of applicable checks that passed; ``passed`` is + true only when every applicable check passed. An empty rubric trivially + passes with a score of ``1.0``. + """ + actions = _actions(trajectory) + checks = _collect_checks(trajectory, actions, rubric) + passed_count = sum(1 for check in checks if check["passed"]) + score = 1.0 if not checks else passed_count / len(checks) + return { + "passed": all(check["passed"] for check in checks), + "score": score, + "steps": len(actions), + "checks": checks, + } diff --git a/test/unit_test/headless/test_trajectory_eval_batch.py b/test/unit_test/headless/test_trajectory_eval_batch.py new file mode 100644 index 00000000..08121506 --- /dev/null +++ b/test/unit_test/headless/test_trajectory_eval_batch.py @@ -0,0 +1,94 @@ +"""Headless tests for agent trajectory evaluation. 
Pure stdlib, no Qt imports.""" +import json + +import je_auto_control as ac +from je_auto_control.utils.trajectory_eval import evaluate_trajectory + +TRAJ = [ + {"action": "AC_focus_window", "observation": "focused"}, + {"action": "AC_type_text", "args": {"text": "hi"}, "observation": "typed"}, + {"action": "AC_click_mouse", "observation": "Saved successfully"}, +] + + +def test_empty_rubric_passes(): + result = evaluate_trajectory(TRAJ, {}) + assert result["passed"] is True + assert result["score"] == 1.0 + assert result["steps"] == 3 + + +def test_required_actions_present(): + result = evaluate_trajectory(TRAJ, {"required_actions": ["AC_type_text", + "AC_click_mouse"]}) + assert result["passed"] is True + + +def test_required_actions_missing_fails(): + result = evaluate_trajectory(TRAJ, {"required_actions": ["AC_hotkey"]}) + assert result["passed"] is False + assert result["score"] == 0.0 + + +def test_ordered_requirement(): + ordered_ok = {"required_actions": ["AC_focus_window", "AC_click_mouse"], + "ordered": True} + assert evaluate_trajectory(TRAJ, ordered_ok)["passed"] is True + ordered_bad = {"required_actions": ["AC_click_mouse", "AC_focus_window"], + "ordered": True} + assert evaluate_trajectory(TRAJ, ordered_bad)["passed"] is False + + +def test_forbidden_actions(): + assert evaluate_trajectory( + TRAJ, {"forbidden_actions": ["AC_kill_process"]})["passed"] is True + assert evaluate_trajectory( + TRAJ, {"forbidden_actions": ["AC_click_mouse"]})["passed"] is False + + +def test_max_steps(): + assert evaluate_trajectory(TRAJ, {"max_steps": 3})["passed"] is True + assert evaluate_trajectory(TRAJ, {"max_steps": 2})["passed"] is False + + +def test_success_contains(): + assert evaluate_trajectory( + TRAJ, {"success_contains": "Saved"})["passed"] is True + assert evaluate_trajectory( + TRAJ, {"success_contains": "Error"})["passed"] is False + + +def test_partial_score(): + rubric = {"required_actions": ["AC_type_text"], # pass + "forbidden_actions": 
["AC_type_text"]} # fail + result = evaluate_trajectory(TRAJ, rubric) + assert result["passed"] is False + assert result["score"] == 0.5 + + +# --- wiring --------------------------------------------------------------- + +def test_executor_round_trip_with_json_strings(): + rec = ac.execute_action([[ + "AC_evaluate_trajectory", + {"trajectory": json.dumps(TRAJ), + "rubric": json.dumps({"required_actions": ["AC_type_text"]})}, + ]]) + assert any(v.get("passed") is True for v in rec.values() + if isinstance(v, dict)) + + +def test_wiring(): + assert "AC_evaluate_trajectory" in ac.executor.known_commands() + from je_auto_control.utils.mcp_server.tools import ( + build_default_tool_registry) + names = {t.name for t in build_default_tool_registry()} + assert "ac_evaluate_trajectory" in names + from je_auto_control.gui.script_builder.command_schema import _build_specs + cmds = {s.command for s in _build_specs()} + assert "AC_evaluate_trajectory" in cmds + + +def test_facade_export(): + assert hasattr(ac, "evaluate_trajectory") + assert "evaluate_trajectory" in ac.__all__ From aad9ddcc0c0dbe935928c1d0041ae11630fc390f Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 19 Jun 2026 23:07:05 +0800 Subject: [PATCH 2/3] Document trajectory evaluation in toctrees and README --- README.md | 7 +++++++ README/README_zh-CN.md | 7 +++++++ README/README_zh-TW.md | 7 +++++++ docs/source/Eng/eng_index.rst | 1 + docs/source/Zh/zh_index.rst | 1 + 5 files changed, 23 insertions(+) diff --git a/README.md b/README.md index 79e5e536..675ab5b4 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ ## Table of Contents +- [What's new (2026-06-19) — Agent Trajectory Evaluation](#whats-new-2026-06-19--agent-trajectory-evaluation) - [What's new (2026-06-19) — Approval Testing (Golden-Master Baselines)](#whats-new-2026-06-19--approval-testing-golden-master-baselines) - [What's new (2026-06-19) — Network Egress Allowlist Guard](#whats-new-2026-06-19--network-egress-allowlist-guard) - [What's new 
(2026-06-19) — Just-In-Time Credential Leases](#whats-new-2026-06-19--just-in-time-credential-leases) @@ -88,6 +89,12 @@ --- +## What's new (2026-06-19) — Agent Trajectory Evaluation + +Score an agent run against a rubric. Full reference: [`docs/source/Eng/doc/new_features/v36_features_doc.rst`](docs/source/Eng/doc/new_features/v36_features_doc.rst). + +- **`evaluate_trajectory`** (`AC_evaluate_trajectory`, `ac_evaluate_trajectory`): scores a recorded trajectory (ordered `{action, args, observation}` steps) against a declarative rubric — `required_actions` (+`ordered`), `forbidden_actions`, `max_steps`, `success_contains`. Returns `{passed, score, steps, checks}` where `score` is the fraction of applicable checks passed and each `check` pinpoints a violated expectation. A deterministic, dependency-free signal for agent regression testing; the rubric is plain data so it lives in JSON action files and travels over MCP. + ## What's new (2026-06-19) — Approval Testing (Golden-Master Baselines) Lock outputs against a human-approved baseline. Full reference: [`docs/source/Eng/doc/new_features/v35_features_doc.rst`](docs/source/Eng/doc/new_features/v35_features_doc.rst). 
diff --git a/README/README_zh-CN.md b/README/README_zh-CN.md index aa380069..63c5f6be 100644 --- a/README/README_zh-CN.md +++ b/README/README_zh-CN.md @@ -12,6 +12,7 @@ ## 目录 +- [本次更新 (2026-06-19) — Agent 轨迹评估](#本次更新-2026-06-19--agent-轨迹评估) - [本次更新 (2026-06-19) — 核准式测试(Golden-Master 基准)](#本次更新-2026-06-19--核准式测试golden-master-基准) - [本次更新 (2026-06-19) — 网络出口允许清单守卫](#本次更新-2026-06-19--网络出口允许清单守卫) - [本次更新 (2026-06-19) — 即时凭证租约](#本次更新-2026-06-19--即时凭证租约) @@ -87,6 +88,12 @@ --- +## 本次更新 (2026-06-19) — Agent 轨迹评估 + +依评分标准为 agent 运行评分。完整参考:[`docs/source/Zh/doc/new_features/v36_features_doc.rst`](../docs/source/Zh/doc/new_features/v36_features_doc.rst)。 + +- **`evaluate_trajectory`**(`AC_evaluate_trajectory`、`ac_evaluate_trajectory`):依声明式评分标准 —— `required_actions`(+`ordered`)、`forbidden_actions`、`max_steps`、`success_contains` —— 为一次记录的轨迹(有序 `{action, args, observation}` 步骤)评分。返回 `{passed, score, steps, checks}`,其中 `score` 为通过的适用检查占比,每个 `check` 精准指出被违反的期望。为 agent 回归测试提供确定性、无依赖的信号;rubric 为纯数据,可存于 JSON action 文件并经 MCP 传递。 + ## 本次更新 (2026-06-19) — 核准式测试(Golden-Master 基准) 将输出锁定到人工核准的基准。完整参考:[`docs/source/Zh/doc/new_features/v35_features_doc.rst`](../docs/source/Zh/doc/new_features/v35_features_doc.rst)。 diff --git a/README/README_zh-TW.md b/README/README_zh-TW.md index a59dc4f3..8c3abbe0 100644 --- a/README/README_zh-TW.md +++ b/README/README_zh-TW.md @@ -12,6 +12,7 @@ ## 目錄 +- [本次更新 (2026-06-19) — Agent 軌跡評估](#本次更新-2026-06-19--agent-軌跡評估) - [本次更新 (2026-06-19) — 核准式測試(Golden-Master 基準)](#本次更新-2026-06-19--核准式測試golden-master-基準) - [本次更新 (2026-06-19) — 網路出口允許清單守衛](#本次更新-2026-06-19--網路出口允許清單守衛) - [本次更新 (2026-06-19) — 即時憑證租約](#本次更新-2026-06-19--即時憑證租約) @@ -87,6 +88,12 @@ --- +## 本次更新 (2026-06-19) — Agent 軌跡評估 + +依評分標準為 agent 執行評分。完整參考:[`docs/source/Zh/doc/new_features/v36_features_doc.rst`](../docs/source/Zh/doc/new_features/v36_features_doc.rst)。 + +- **`evaluate_trajectory`**(`AC_evaluate_trajectory`、`ac_evaluate_trajectory`):依宣告式評分標準 —— 
`required_actions`(+`ordered`)、`forbidden_actions`、`max_steps`、`success_contains` —— 為一次記錄的軌跡(有序 `{action, args, observation}` 步驟)評分。回傳 `{passed, score, steps, checks}`,其中 `score` 為通過的適用檢查佔比,每個 `check` 精準指出被違反的期望。為 agent 回歸測試提供確定性、無相依的訊號;rubric 為純資料,可存於 JSON action 檔並經 MCP 傳遞。 + ## 本次更新 (2026-06-19) — 核准式測試(Golden-Master 基準) 將輸出鎖定到人工核准的基準。完整參考:[`docs/source/Zh/doc/new_features/v35_features_doc.rst`](../docs/source/Zh/doc/new_features/v35_features_doc.rst)。 diff --git a/docs/source/Eng/eng_index.rst b/docs/source/Eng/eng_index.rst index 2e4fe404..ec038412 100644 --- a/docs/source/Eng/eng_index.rst +++ b/docs/source/Eng/eng_index.rst @@ -58,6 +58,7 @@ Comprehensive guides for all AutoControl features. doc/new_features/v33_features_doc doc/new_features/v34_features_doc doc/new_features/v35_features_doc + doc/new_features/v36_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/docs/source/Zh/zh_index.rst b/docs/source/Zh/zh_index.rst index 97015308..88f30c6c 100644 --- a/docs/source/Zh/zh_index.rst +++ b/docs/source/Zh/zh_index.rst @@ -58,6 +58,7 @@ AutoControl 所有功能的完整使用指南。 doc/new_features/v33_features_doc doc/new_features/v34_features_doc doc/new_features/v35_features_doc + doc/new_features/v36_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc From 8f5b63c0cfa1b4b50ea121189519c633c4a6b364 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 19 Jun 2026 23:12:08 +0800 Subject: [PATCH 3/3] Use pytest.approx for trajectory score comparisons (Sonar S1244) --- test/unit_test/headless/test_trajectory_eval_batch.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/unit_test/headless/test_trajectory_eval_batch.py b/test/unit_test/headless/test_trajectory_eval_batch.py index 08121506..d38b2970 100644 --- a/test/unit_test/headless/test_trajectory_eval_batch.py +++ 
b/test/unit_test/headless/test_trajectory_eval_batch.py @@ -1,6 +1,8 @@ """Headless tests for agent trajectory evaluation. Pure stdlib, no Qt imports.""" import json +import pytest + import je_auto_control as ac from je_auto_control.utils.trajectory_eval import evaluate_trajectory @@ -14,7 +16,7 @@ def test_empty_rubric_passes(): result = evaluate_trajectory(TRAJ, {}) assert result["passed"] is True - assert result["score"] == 1.0 + assert result["score"] == pytest.approx(1.0) assert result["steps"] == 3 @@ -27,7 +29,7 @@ def test_required_actions_present(): def test_required_actions_missing_fails(): result = evaluate_trajectory(TRAJ, {"required_actions": ["AC_hotkey"]}) assert result["passed"] is False - assert result["score"] == 0.0 + assert result["score"] == pytest.approx(0.0) def test_ordered_requirement(): @@ -63,7 +65,7 @@ def test_partial_score(): "forbidden_actions": ["AC_type_text"]} # fail result = evaluate_trajectory(TRAJ, rubric) assert result["passed"] is False - assert result["score"] == 0.5 + assert result["score"] == pytest.approx(0.5) # --- wiring ---------------------------------------------------------------