From 8ae6a3bd6db42a306cf1c537d6ca5ec2007eb651 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 19 Jun 2026 23:02:49 +0800
Subject: [PATCH 1/3] Add agent trajectory evaluation against declarative
 rubrics

---
 .../Eng/doc/new_features/v36_features_doc.rst |  56 ++++++++++
 .../Zh/doc/new_features/v36_features_doc.rst  |  52 +++++++++
 je_auto_control/__init__.py                   |   3 +
 .../gui/script_builder/command_schema.py      |  10 ++
 .../utils/executor/action_executor.py         |  16 +++
 .../utils/mcp_server/tools/_factories.py      |  22 ++++
 .../utils/mcp_server/tools/_handlers.py       |   6 ++
 .../utils/trajectory_eval/__init__.py         |   6 ++
 .../utils/trajectory_eval/trajectory_eval.py  | 100 ++++++++++++++++++
 .../headless/test_trajectory_eval_batch.py    |  94 ++++++++++++++++
 10 files changed, 365 insertions(+)
 create mode 100644 docs/source/Eng/doc/new_features/v36_features_doc.rst
 create mode 100644 docs/source/Zh/doc/new_features/v36_features_doc.rst
 create mode 100644 je_auto_control/utils/trajectory_eval/__init__.py
 create mode 100644 je_auto_control/utils/trajectory_eval/trajectory_eval.py
 create mode 100644 test/unit_test/headless/test_trajectory_eval_batch.py

diff --git a/docs/source/Eng/doc/new_features/v36_features_doc.rst b/docs/source/Eng/doc/new_features/v36_features_doc.rst
new file mode 100644
index 00000000..35b41373
--- /dev/null
+++ b/docs/source/Eng/doc/new_features/v36_features_doc.rst
@@ -0,0 +1,56 @@
+Agent Trajectory Evaluation
+===========================
+
+As automations hand control to LLM agents, "did it still work?" becomes "did
+the agent take an acceptable path?". :func:`evaluate_trajectory` scores a
+recorded run against a declarative **rubric**, giving a deterministic,
+dependency-free signal for agent regression testing.
+
+A *trajectory* is the ordered list of steps a run took — each a dict with at
+least an ``"action"`` name and optionally ``"args"`` / ``"observation"``. The
+*rubric* is plain data (so it lives happily in a JSON action file or arrives
+over MCP):
+
+================================ ===================================================
+Rubric key                       Meaning
+================================ ===================================================
+``required_actions``             Actions that must all appear.
+``ordered``                      With the above, also require that relative order.
+``forbidden_actions``            Actions that must never appear.
+``max_steps``                    Upper bound on trajectory length.
+``success_contains``             Substring that must appear in some observation.
+================================ ===================================================
+
+Headless API
+------------
+
+.. code-block:: python
+
+    from je_auto_control import evaluate_trajectory
+
+    trajectory = [
+        {"action": "AC_focus_window", "observation": "focused"},
+        {"action": "AC_type_text", "observation": "typed"},
+        {"action": "AC_click_mouse", "observation": "Saved successfully"},
+    ]
+    result = evaluate_trajectory(trajectory, {
+        "required_actions": ["AC_type_text", "AC_click_mouse"],
+        "forbidden_actions": ["AC_kill_process"],
+        "max_steps": 10,
+        "success_contains": "Saved",
+    })
+    assert result["passed"]            # every applicable check passed
+    print(result["score"], result["checks"])
+
+``score`` is the fraction of applicable checks that passed; ``passed`` is true
+only when all pass; an empty rubric trivially passes. Each entry in ``checks``
+is ``{name, passed, detail}`` so a failure pinpoints the violated expectation.
+
+Executor command
+----------------
+
+``AC_evaluate_trajectory`` takes ``trajectory`` and ``rubric`` (each a JSON
+string from the visual builder, or already-decoded data from a JSON action file
+/ MCP) and returns ``{passed, score, steps, checks}``. The same operation is
+exposed as the MCP tool ``ac_evaluate_trajectory`` and as a Script Builder
+command under **Agent**.
diff --git a/docs/source/Zh/doc/new_features/v36_features_doc.rst b/docs/source/Zh/doc/new_features/v36_features_doc.rst
new file mode 100644
index 00000000..0fdf7ffb
--- /dev/null
+++ b/docs/source/Zh/doc/new_features/v36_features_doc.rst
@@ -0,0 +1,52 @@
+Agent 軌跡評估
+==============
+
+當自動化把控制權交給 LLM agent,「它還能運作嗎?」就變成「agent 是否走了可接受的路
+徑?」。:func:`evaluate_trajectory` 依宣告式\ **評分標準(rubric)**\ 為一次記錄的執行評
+分,為 agent 回歸測試提供確定性、無相依的訊號。
+
+*軌跡(trajectory)*\ 是該次執行所採取步驟的有序清單 —— 每步是一個至少含 ``"action"``
+名稱、可選含 ``"args"`` / ``"observation"`` 的 dict。*評分標準*\ 為純資料(因此可自在地
+存於 JSON action 檔或經 MCP 傳入):
+
+================================ ===================================================
+Rubric 鍵                        意義
+================================ ===================================================
+``required_actions``             必須全部出現的動作。
+``ordered``                      搭配上者,還要求其相對順序。
+``forbidden_actions``            絕不可出現的動作。
+``max_steps``                    軌跡長度上限。
+``success_contains``             必須出現在某個 observation 中的子字串。
+================================ ===================================================
+
+無頭 API
+--------
+
+.. code-block:: python
+
+    from je_auto_control import evaluate_trajectory
+
+    trajectory = [
+        {"action": "AC_focus_window", "observation": "focused"},
+        {"action": "AC_type_text", "observation": "typed"},
+        {"action": "AC_click_mouse", "observation": "Saved successfully"},
+    ]
+    result = evaluate_trajectory(trajectory, {
+        "required_actions": ["AC_type_text", "AC_click_mouse"],
+        "forbidden_actions": ["AC_kill_process"],
+        "max_steps": 10,
+        "success_contains": "Saved",
+    })
+    assert result["passed"]            # 所有適用的檢查都通過
+    print(result["score"], result["checks"])
+
+``score`` 為通過的適用檢查佔比;``passed`` 僅在全部通過時為真;空 rubric 直接通過。
+``checks`` 中每個項目為 ``{name, passed, detail}``,因此失敗時可精準指出被違反的期望。
+
+執行器指令
+----------
+
+``AC_evaluate_trajectory`` 接受 ``trajectory`` 與 ``rubric``(從視覺化建構器傳入時為
+JSON 字串,從 JSON action 檔 / MCP 傳入時為已解碼資料),回傳
+``{passed, score, steps, checks}``。相同操作亦提供為 MCP 工具
+``ac_evaluate_trajectory``,以及 Script Builder 中 **Agent** 分類下的指令。
diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py
index 6db8b058..a96d6a1c 100644
--- a/je_auto_control/__init__.py
+++ b/je_auto_control/__init__.py
@@ -216,6 +216,8 @@
 from je_auto_control.utils.approval import (
     ApprovalResult, approve_artifact, pending_artifacts, verify_artifact,
 )
+# Agent trajectory evaluation: score a recorded run against a rubric
+from je_auto_control.utils.trajectory_eval import evaluate_trajectory
 # Background popup/interrupt watchdog (unattended automation)
 from je_auto_control.utils.watchdog import (
     PopupWatchdog, WatchdogRule, default_popup_watchdog,
@@ -656,6 +658,7 @@ def start_autocontrol_gui(*args, **kwargs):
     "EgressBlocked", "EgressPolicy", "get_egress_policy", "set_egress_policy",
     "ApprovalResult", "approve_artifact", "pending_artifacts",
     "verify_artifact",
+    "evaluate_trajectory",
     # MCP server
     "AuditLogger", "HttpMCPServer", "MCPContent", "MCPPrompt",
     "MCPPromptArgument", "MCPResource", "MCPServer", "MCPTool",
diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py
index 60ba6b0f..6cfb0b08 100644
--- a/je_auto_control/gui/script_builder/command_schema.py
+++ b/je_auto_control/gui/script_builder/command_schema.py
@@ -801,6 +801,16 @@ def _add_misc_specs(specs: List[CommandSpec]) -> None:
                           default=".approvals"),),
         description="List artifacts awaiting approval.",
     ))
+    specs.append(CommandSpec(
+        "AC_evaluate_trajectory", "Agent", "Evaluate Trajectory",
+        fields=(
+            FieldSpec("trajectory", FieldType.STRING,
+                      placeholder='[{"action": "AC_click_mouse"}]'),
+            FieldSpec("rubric", FieldType.STRING,
+                      placeholder='{"required_actions": ["AC_type_text"]}'),
+        ),
+        description="Score an agent trajectory against a rubric (JSON).",
+    ))
     specs.append(CommandSpec(
         "AC_generate_sop", "Report", "Generate SOP Document",
         fields=(
diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py
index d4bce220..3c6b7348 100644
--- a/je_auto_control/utils/executor/action_executor.py
+++ b/je_auto_control/utils/executor/action_executor.py
@@ -2991,6 +2991,21 @@ def _pending_artifacts(approvals_dir: str = ".approvals") -> Dict[str, Any]:
     return {"pending": pending_artifacts(approvals_dir)}
 
 
+def _evaluate_trajectory(trajectory: Any, rubric: Any) -> Dict[str, Any]:
+    """Adapter: score an agent trajectory against a declarative rubric.
+
+    ``trajectory`` / ``rubric`` may be JSON text (visual builder; decoded here)
+    or already-decoded list/dict (JSON action files / MCP), passed through.
+    """
+    import json
+    from je_auto_control.utils.trajectory_eval import evaluate_trajectory
+    if isinstance(trajectory, str):  # JSON text -> decoded steps
+        trajectory = json.loads(trajectory)
+    if isinstance(rubric, str):  # JSON text -> decoded rubric
+        rubric = json.loads(rubric)
+    return evaluate_trajectory(trajectory, rubric)
+
+
 class Executor:
     """
     Executor
@@ -3236,6 +3251,7 @@ def __init__(self):
             "AC_verify_artifact": _verify_artifact,
             "AC_approve_artifact": _approve_artifact,
             "AC_pending_artifacts": _pending_artifacts,
+            "AC_evaluate_trajectory": _evaluate_trajectory,
             "AC_a11y_record_start": _a11y_record_start,
             "AC_a11y_record_stop": _a11y_record_stop,
             "AC_a11y_record_events": _a11y_record_events,
diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py
index 96cad3d4..b0c4393e 100644
--- a/je_auto_control/utils/mcp_server/tools/_factories.py
+++ b/je_auto_control/utils/mcp_server/tools/_factories.py
@@ -2729,6 +2729,27 @@ def approval_testing_tools() -> List[MCPTool]:
     ]
 
 
+def trajectory_eval_tools() -> List[MCPTool]:
+    return [  # a single tool wrapping h.evaluate_trajectory
+        MCPTool(
+            name="ac_evaluate_trajectory",
+            description=("Score an agent's recorded 'trajectory' (a list of "
+                         "{action, args, observation} steps) against a 'rubric' "
+                         "with optional keys required_actions (+ordered), "
+                         "forbidden_actions, max_steps, success_contains. "
+                         "Returns {passed, score, steps, checks} for agent "
+                         "regression testing."),
+            input_schema=schema(
+                {"trajectory": {"type": "array",
+                                "items": {"type": "object"}},  # step dicts
+                 "rubric": {"type": "object"}},
+                ["trajectory", "rubric"]),
+            handler=h.evaluate_trajectory,
+            annotations=READ_ONLY,
+        ),
+    ]
+
+
 def unattended_tools() -> List[MCPTool]:
     return [
         MCPTool(
@@ -3788,6 +3809,7 @@ def media_assert_tools() -> List[MCPTool]:
     ci_annotation_tools, clipboard_history_tools, audit_analysis_tools,
     process_doc_tools, tween_drag_tools, plugin_sdk_tools, governance_tools,
     credential_lease_tools, egress_tools, approval_testing_tools,
+    trajectory_eval_tools,
     screen_record_tools,
     process_and_shell_tools, remote_desktop_tools, gamepad_tools,
     usb_passthrough_tools, assertion_tools, data_source_tools,
diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py
index e42b44bc..4710a2d7 100644
--- a/je_auto_control/utils/mcp_server/tools/_handlers.py
+++ b/je_auto_control/utils/mcp_server/tools/_handlers.py
@@ -1325,6 +1325,12 @@ def pending_artifacts(approvals_dir: str = ".approvals"):
     return {"pending": _pending(approvals_dir)}
 
 
+def evaluate_trajectory(trajectory, rubric):
+    from je_auto_control.utils.trajectory_eval import (
+        evaluate_trajectory as _evaluate)  # aliased; handler reuses the name
+    return _evaluate(trajectory, rubric)
+
+
 def vlm_locate(description: str,
                screen_region: Optional[List[int]] = None,
                model: Optional[str] = None) -> Optional[List[int]]:
diff --git a/je_auto_control/utils/trajectory_eval/__init__.py b/je_auto_control/utils/trajectory_eval/__init__.py
new file mode 100644
index 00000000..559a8471
--- /dev/null
+++ b/je_auto_control/utils/trajectory_eval/__init__.py
@@ -0,0 +1,6 @@
+"""Agent trajectory evaluation: score a recorded run against a rubric."""
+from je_auto_control.utils.trajectory_eval.trajectory_eval import (
+    evaluate_trajectory,
+)
+
+__all__ = ["evaluate_trajectory"]
diff --git a/je_auto_control/utils/trajectory_eval/trajectory_eval.py b/je_auto_control/utils/trajectory_eval/trajectory_eval.py
new file mode 100644
index 00000000..7d0f806c
--- /dev/null
+++ b/je_auto_control/utils/trajectory_eval/trajectory_eval.py
@@ -0,0 +1,141 @@
+"""Score an agent's recorded trajectory against a declarative rubric.
+
+An agent *trajectory* is the ordered list of steps a run took, each a dict with
+at least an ``"action"`` name (and optionally ``"args"`` / ``"observation"``).
+A *rubric* is plain data describing what a good run looks like, so it can live
+in a JSON action file or be passed from the MCP/socket surfaces:
+
+* ``required_actions``  — actions that must all appear (set ``ordered: true`` to
+  also require they appear in that relative order);
+* ``forbidden_actions`` — actions that must never appear;
+* ``max_steps``         — an upper bound on trajectory length;
+* ``success_contains``  — a substring that must appear in some observation.
+
+Both action keys take a list of names; a bare string counts as a single name.
+
+:func:`evaluate_trajectory` returns ``{passed, score, steps, checks}`` where the
+score is the fraction of applicable checks that passed — a deterministic,
+dependency-free signal for agent regression testing. Imports no ``PySide6``.
+"""
+from typing import Any, Dict, List, Mapping, Sequence, Union
+
+
+def _actions(trajectory: Sequence[Mapping[str, Any]]) -> List[str]:
+    """Return each step's ``"action"`` as a string, in trajectory order.
+
+    A step without an ``"action"`` key yields ``""``, so it still counts as a
+    step.
+    """
+    return [str(step.get("action", "")) for step in trajectory]
+
+
+def _as_names(value: Union[str, Sequence[str]]) -> List[str]:
+    """Normalise a rubric action list, treating a bare string as one name.
+
+    A ``str`` is itself iterable, so without this a rubric value such as
+    ``"AC_kill_process"`` would be compared one character at a time: nothing
+    would match and a ``forbidden_actions`` check would silently pass.
+    """
+    return [value] if isinstance(value, str) else list(value)
+
+
+def _check(name: str, passed: bool, detail: str) -> Dict[str, Any]:
+    """Build one ``{name, passed, detail}`` entry for the ``checks`` list."""
+    return {"name": name, "passed": bool(passed), "detail": detail}
+
+
+def _is_subsequence(needles: Sequence[str], haystack: Sequence[str]) -> bool:
+    """Return whether ``needles`` occur in ``haystack`` in the same order.
+
+    Gaps are allowed. Every ``in`` test shares one iterator, so a hit consumes
+    the haystack up to and including the match and the next needle must be
+    found after it.
+    """
+    iterator = iter(haystack)
+    return all(needle in iterator for needle in needles)
+
+
+def _check_required(actions: List[str], required: Union[str, Sequence[str]],
+                    ordered: bool) -> Dict[str, Any]:
+    """Check that every required action appears, optionally in rubric order.
+
+    Missing actions are reported first; the order is only examined (and only
+    when ``ordered`` is true) once all of them are present.
+    """
+    wanted = _as_names(required)
+    missing = [a for a in wanted if a not in actions]
+    if missing:
+        return _check("required_actions", False, f"missing: {missing}")
+    if ordered and not _is_subsequence(wanted, actions):
+        return _check("required_actions", False,
+                      "all present but not in the required order")
+    return _check("required_actions", True, f"all present: {wanted}")
+
+
+def _check_forbidden(actions: List[str],
+                     forbidden: Union[str, Sequence[str]]) -> Dict[str, Any]:
+    """Check that none of the forbidden actions appears in the trajectory."""
+    hit = [a for a in _as_names(forbidden) if a in actions]
+    return _check("forbidden_actions", not hit,
+                  f"used forbidden: {hit}" if hit else "none used")
+
+
+def _check_max_steps(steps: int, max_steps: int) -> Dict[str, Any]:
+    """Check that the trajectory took at most ``max_steps`` steps."""
+    return _check("max_steps", steps <= max_steps,
+                  f"{steps} step(s), limit {max_steps}")
+
+
+def _check_success(trajectory: Sequence[Mapping[str, Any]],
+                   marker: str) -> Dict[str, Any]:
+    """Check that ``marker`` is a substring of at least one observation.
+
+    Observations are stringified first; a step without one counts as ``""``.
+    """
+    found = any(marker in str(step.get("observation", ""))
+               for step in trajectory)
+    return _check("success_contains", found,
+                  f"marker {marker!r} {'found' if found else 'not found'}")
+
+
+def _collect_checks(trajectory: Sequence[Mapping[str, Any]],
+                    actions: List[str],
+                    rubric: Mapping[str, Any]) -> List[Dict[str, Any]]:
+    """Run one check per rubric key that is present, in a fixed order.
+
+    Keys absent from ``rubric`` contribute no check, so they affect neither
+    ``passed`` nor ``score``.
+    """
+    checks: List[Dict[str, Any]] = []
+    if "required_actions" in rubric:
+        checks.append(_check_required(actions, rubric["required_actions"],
+                                      bool(rubric.get("ordered", False))))
+    if "forbidden_actions" in rubric:
+        checks.append(_check_forbidden(actions, rubric["forbidden_actions"]))
+    if "max_steps" in rubric:
+        checks.append(_check_max_steps(len(actions), int(rubric["max_steps"])))
+    if "success_contains" in rubric:
+        checks.append(_check_success(trajectory,
+                                     str(rubric["success_contains"])))
+    return checks
+
+
+def evaluate_trajectory(trajectory: Sequence[Mapping[str, Any]],
+                        rubric: Mapping[str, Any]) -> Dict[str, Any]:
+    """Score ``trajectory`` against ``rubric``; return passed/score/checks.
+
+    ``score`` is the fraction of applicable checks that passed; ``passed`` is
+    true only when every applicable check passed. An empty rubric trivially
+    passes with a score of ``1.0``. ``steps`` is the trajectory length and each
+    entry of ``checks`` is ``{name, passed, detail}``.
+    """
+    actions = _actions(trajectory)
+    checks = _collect_checks(trajectory, actions, rubric)
+    passed_count = sum(1 for check in checks if check["passed"])
+    score = 1.0 if not checks else passed_count / len(checks)
+    return {
+        "passed": all(check["passed"] for check in checks),
+        "score": score,
+        "steps": len(actions),
+        "checks": checks,
+    }
diff --git a/test/unit_test/headless/test_trajectory_eval_batch.py b/test/unit_test/headless/test_trajectory_eval_batch.py
new file mode 100644
index 00000000..08121506
--- /dev/null
+++ b/test/unit_test/headless/test_trajectory_eval_batch.py
@@ -0,0 +1,94 @@
+"""Headless tests for agent trajectory evaluation. Pure stdlib, no Qt imports."""
+import json
+
+import je_auto_control as ac
+from je_auto_control.utils.trajectory_eval import evaluate_trajectory
+
+TRAJ = [
+    {"action": "AC_focus_window", "observation": "focused"},
+    {"action": "AC_type_text", "args": {"text": "hi"}, "observation": "typed"},
+    {"action": "AC_click_mouse", "observation": "Saved successfully"},
+]
+
+
+def test_empty_rubric_passes():
+    result = evaluate_trajectory(TRAJ, {})
+    assert result["passed"] is True
+    assert result["score"] == 1.0
+    assert result["steps"] == 3
+
+
+def test_required_actions_present():
+    result = evaluate_trajectory(TRAJ, {"required_actions": ["AC_type_text",
+                                                             "AC_click_mouse"]})
+    assert result["passed"] is True
+
+
+def test_required_actions_missing_fails():
+    result = evaluate_trajectory(TRAJ, {"required_actions": ["AC_hotkey"]})
+    assert result["passed"] is False
+    assert result["score"] == 0.0
+
+
+def test_ordered_requirement():
+    ordered_ok = {"required_actions": ["AC_focus_window", "AC_click_mouse"],
+                  "ordered": True}
+    assert evaluate_trajectory(TRAJ, ordered_ok)["passed"] is True
+    ordered_bad = {"required_actions": ["AC_click_mouse", "AC_focus_window"],
+                   "ordered": True}
+    assert evaluate_trajectory(TRAJ, ordered_bad)["passed"] is False
+
+
+def test_forbidden_actions():
+    assert evaluate_trajectory(
+        TRAJ, {"forbidden_actions": ["AC_kill_process"]})["passed"] is True
+    assert evaluate_trajectory(
+        TRAJ, {"forbidden_actions": ["AC_click_mouse"]})["passed"] is False
+
+
+def test_max_steps():
+    assert evaluate_trajectory(TRAJ, {"max_steps": 3})["passed"] is True
+    assert evaluate_trajectory(TRAJ, {"max_steps": 2})["passed"] is False
+
+
+def test_success_contains():
+    assert evaluate_trajectory(
+        TRAJ, {"success_contains": "Saved"})["passed"] is True
+    assert evaluate_trajectory(
+        TRAJ, {"success_contains": "Error"})["passed"] is False
+
+
+def test_partial_score():
+    rubric = {"required_actions": ["AC_type_text"],   # pass
+              "forbidden_actions": ["AC_type_text"]}   # fail
+    result = evaluate_trajectory(TRAJ, rubric)
+    assert result["passed"] is False
+    assert result["score"] == 0.5
+
+
+# --- wiring ---------------------------------------------------------------
+
+def test_executor_round_trip_with_json_strings():
+    rec = ac.execute_action([[
+        "AC_evaluate_trajectory",
+        {"trajectory": json.dumps(TRAJ),
+         "rubric": json.dumps({"required_actions": ["AC_type_text"]})},
+    ]])
+    assert any(v.get("passed") is True for v in rec.values()
+               if isinstance(v, dict))
+
+
+def test_wiring():
+    assert "AC_evaluate_trajectory" in ac.executor.known_commands()
+    from je_auto_control.utils.mcp_server.tools import (
+        build_default_tool_registry)
+    names = {t.name for t in build_default_tool_registry()}
+    assert "ac_evaluate_trajectory" in names
+    from je_auto_control.gui.script_builder.command_schema import _build_specs
+    cmds = {s.command for s in _build_specs()}
+    assert "AC_evaluate_trajectory" in cmds
+
+
+def test_facade_export():
+    assert hasattr(ac, "evaluate_trajectory")
+    assert "evaluate_trajectory" in ac.__all__

From aad9ddcc0c0dbe935928c1d0041ae11630fc390f Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 19 Jun 2026 23:07:05 +0800
Subject: [PATCH 2/3] Document trajectory evaluation in toctrees and README

---
 README.md                     | 7 +++++++
 README/README_zh-CN.md        | 7 +++++++
 README/README_zh-TW.md        | 7 +++++++
 docs/source/Eng/eng_index.rst | 1 +
 docs/source/Zh/zh_index.rst   | 1 +
 5 files changed, 23 insertions(+)

diff --git a/README.md b/README.md
index 79e5e536..675ab5b4 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@
 
 ## Table of Contents
 
+- [What's new (2026-06-19) — Agent Trajectory Evaluation](#whats-new-2026-06-19--agent-trajectory-evaluation)
 - [What's new (2026-06-19) — Approval Testing (Golden-Master Baselines)](#whats-new-2026-06-19--approval-testing-golden-master-baselines)
 - [What's new (2026-06-19) — Network Egress Allowlist Guard](#whats-new-2026-06-19--network-egress-allowlist-guard)
 - [What's new (2026-06-19) — Just-In-Time Credential Leases](#whats-new-2026-06-19--just-in-time-credential-leases)
@@ -88,6 +89,12 @@
 
 ---
 
+## What's new (2026-06-19) — Agent Trajectory Evaluation
+
+Score an agent run against a rubric. Full reference: [`docs/source/Eng/doc/new_features/v36_features_doc.rst`](docs/source/Eng/doc/new_features/v36_features_doc.rst).
+
+- **`evaluate_trajectory`** (`AC_evaluate_trajectory`, `ac_evaluate_trajectory`): scores a recorded trajectory (ordered `{action, args, observation}` steps) against a declarative rubric — `required_actions` (+`ordered`), `forbidden_actions`, `max_steps`, `success_contains`. Returns `{passed, score, steps, checks}` where `score` is the fraction of applicable checks passed and each `check` pinpoints a violated expectation. A deterministic, dependency-free signal for agent regression testing; the rubric is plain data so it lives in JSON action files and travels over MCP.
+
 ## What's new (2026-06-19) — Approval Testing (Golden-Master Baselines)
 
 Lock outputs against a human-approved baseline. Full reference: [`docs/source/Eng/doc/new_features/v35_features_doc.rst`](docs/source/Eng/doc/new_features/v35_features_doc.rst).
diff --git a/README/README_zh-CN.md b/README/README_zh-CN.md
index aa380069..63c5f6be 100644
--- a/README/README_zh-CN.md
+++ b/README/README_zh-CN.md
@@ -12,6 +12,7 @@
 
 ## 目录
 
+- [本次更新 (2026-06-19) — Agent 轨迹评估](#本次更新-2026-06-19--agent-轨迹评估)
 - [本次更新 (2026-06-19) — 核准式测试(Golden-Master 基准)](#本次更新-2026-06-19--核准式测试golden-master-基准)
 - [本次更新 (2026-06-19) — 网络出口允许清单守卫](#本次更新-2026-06-19--网络出口允许清单守卫)
 - [本次更新 (2026-06-19) — 即时凭证租约](#本次更新-2026-06-19--即时凭证租约)
@@ -87,6 +88,12 @@
 
 ---
 
+## 本次更新 (2026-06-19) — Agent 轨迹评估
+
+依评分标准为 agent 运行评分。完整参考:[`docs/source/Zh/doc/new_features/v36_features_doc.rst`](../docs/source/Zh/doc/new_features/v36_features_doc.rst)。
+
+- **`evaluate_trajectory`**(`AC_evaluate_trajectory`、`ac_evaluate_trajectory`):依声明式评分标准 —— `required_actions`(+`ordered`)、`forbidden_actions`、`max_steps`、`success_contains` —— 为一次记录的轨迹(有序 `{action, args, observation}` 步骤)评分。返回 `{passed, score, steps, checks}`,其中 `score` 为通过的适用检查占比,每个 `check` 精准指出被违反的期望。为 agent 回归测试提供确定性、无依赖的信号;rubric 为纯数据,可存于 JSON action 文件并经 MCP 传递。
+
 ## 本次更新 (2026-06-19) — 核准式测试(Golden-Master 基准)
 
 将输出锁定到人工核准的基准。完整参考:[`docs/source/Zh/doc/new_features/v35_features_doc.rst`](../docs/source/Zh/doc/new_features/v35_features_doc.rst)。
diff --git a/README/README_zh-TW.md b/README/README_zh-TW.md
index a59dc4f3..8c3abbe0 100644
--- a/README/README_zh-TW.md
+++ b/README/README_zh-TW.md
@@ -12,6 +12,7 @@
 
 ## 目錄
 
+- [本次更新 (2026-06-19) — Agent 軌跡評估](#本次更新-2026-06-19--agent-軌跡評估)
 - [本次更新 (2026-06-19) — 核准式測試(Golden-Master 基準)](#本次更新-2026-06-19--核准式測試golden-master-基準)
 - [本次更新 (2026-06-19) — 網路出口允許清單守衛](#本次更新-2026-06-19--網路出口允許清單守衛)
 - [本次更新 (2026-06-19) — 即時憑證租約](#本次更新-2026-06-19--即時憑證租約)
@@ -87,6 +88,12 @@
 
 ---
 
+## 本次更新 (2026-06-19) — Agent 軌跡評估
+
+依評分標準為 agent 執行評分。完整參考:[`docs/source/Zh/doc/new_features/v36_features_doc.rst`](../docs/source/Zh/doc/new_features/v36_features_doc.rst)。
+
+- **`evaluate_trajectory`**(`AC_evaluate_trajectory`、`ac_evaluate_trajectory`):依宣告式評分標準 —— `required_actions`(+`ordered`)、`forbidden_actions`、`max_steps`、`success_contains` —— 為一次記錄的軌跡(有序 `{action, args, observation}` 步驟)評分。回傳 `{passed, score, steps, checks}`,其中 `score` 為通過的適用檢查佔比,每個 `check` 精準指出被違反的期望。為 agent 回歸測試提供確定性、無相依的訊號;rubric 為純資料,可存於 JSON action 檔並經 MCP 傳遞。
+
 ## 本次更新 (2026-06-19) — 核准式測試(Golden-Master 基準)
 
 將輸出鎖定到人工核准的基準。完整參考:[`docs/source/Zh/doc/new_features/v35_features_doc.rst`](../docs/source/Zh/doc/new_features/v35_features_doc.rst)。
diff --git a/docs/source/Eng/eng_index.rst b/docs/source/Eng/eng_index.rst
index 2e4fe404..ec038412 100644
--- a/docs/source/Eng/eng_index.rst
+++ b/docs/source/Eng/eng_index.rst
@@ -58,6 +58,7 @@ Comprehensive guides for all AutoControl features.
    doc/new_features/v33_features_doc
    doc/new_features/v34_features_doc
    doc/new_features/v35_features_doc
+   doc/new_features/v36_features_doc
    doc/ocr_backends/ocr_backends_doc
    doc/observability/observability_doc
    doc/operations_layer/operations_layer_doc
diff --git a/docs/source/Zh/zh_index.rst b/docs/source/Zh/zh_index.rst
index 97015308..88f30c6c 100644
--- a/docs/source/Zh/zh_index.rst
+++ b/docs/source/Zh/zh_index.rst
@@ -58,6 +58,7 @@ AutoControl 所有功能的完整使用指南。
    doc/new_features/v33_features_doc
    doc/new_features/v34_features_doc
    doc/new_features/v35_features_doc
+   doc/new_features/v36_features_doc
    doc/ocr_backends/ocr_backends_doc
    doc/observability/observability_doc
    doc/operations_layer/operations_layer_doc

From 8f5b63c0cfa1b4b50ea121189519c633c4a6b364 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 19 Jun 2026 23:12:08 +0800
Subject: [PATCH 3/3] Use pytest.approx for trajectory score comparisons (Sonar
 S1244)

---
 test/unit_test/headless/test_trajectory_eval_batch.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/test/unit_test/headless/test_trajectory_eval_batch.py b/test/unit_test/headless/test_trajectory_eval_batch.py
index 08121506..d38b2970 100644
--- a/test/unit_test/headless/test_trajectory_eval_batch.py
+++ b/test/unit_test/headless/test_trajectory_eval_batch.py
@@ -1,6 +1,8 @@
 """Headless tests for agent trajectory evaluation. Pure stdlib, no Qt imports."""
 import json
 
+import pytest
+
 import je_auto_control as ac
 from je_auto_control.utils.trajectory_eval import evaluate_trajectory
 
@@ -14,7 +16,7 @@
 def test_empty_rubric_passes():
     result = evaluate_trajectory(TRAJ, {})
     assert result["passed"] is True
-    assert result["score"] == 1.0
+    assert result["score"] == pytest.approx(1.0)
     assert result["steps"] == 3
 
 
@@ -27,7 +29,7 @@ def test_required_actions_present():
 def test_required_actions_missing_fails():
     result = evaluate_trajectory(TRAJ, {"required_actions": ["AC_hotkey"]})
     assert result["passed"] is False
-    assert result["score"] == 0.0
+    assert result["score"] == pytest.approx(0.0)
 
 
 def test_ordered_requirement():
@@ -63,7 +65,7 @@ def test_partial_score():
               "forbidden_actions": ["AC_type_text"]}   # fail
     result = evaluate_trajectory(TRAJ, rubric)
     assert result["passed"] is False
-    assert result["score"] == 0.5
+    assert result["score"] == pytest.approx(0.5)
 
 
 # --- wiring ---------------------------------------------------------------