Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@ name: Config Tests
on:
pull_request:
paths:
- 'packages/dataset_config/**'
- '.github/workflows/config_tests.yml'
- 'packages/dataset_config_dart/**'
- '.github/workflows/config_dart_tests.yml'
push:
branches:
- main
paths:
- 'packages/dataset_config/**'
- '.github/workflows/config_tests.yml'
- 'packages/dataset_config_dart/**'
- '.github/workflows/config_dart_tests.yml'

jobs:
config-tests:
Expand All @@ -31,9 +31,9 @@ jobs:
run: flutter pub get

- name: Analyze
working-directory: packages/dataset_config
working-directory: packages/dataset_config_dart
run: dart analyze --fatal-infos

- name: Run tests
working-directory: packages/dataset_config
working-directory: packages/dataset_config_dart
run: dart test
46 changes: 46 additions & 0 deletions .github/workflows/config_parity.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Verifies that the Dart and Python dataset-config packages stay in sync
# by running the parity checker tool against both implementations.
name: Config Parity

on:
  pull_request:
    paths:
      - 'packages/dataset_config_dart/**'
      - 'packages/dataset_config_python/**'
      - 'tool/config_parity/**'
      - '.github/workflows/config_parity.yml'
  push:
    branches:
      - main
    paths:
      - 'packages/dataset_config_dart/**'
      - 'packages/dataset_config_python/**'
      - 'tool/config_parity/**'
      - '.github/workflows/config_parity.yml'

jobs:
  config-parity:
    runs-on: ubuntu-latest
    timeout-minutes: 10

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      # Shallow clone of the stable channel; both flutter and pub-cache
      # bin dirs go on PATH for subsequent steps.
      - name: Install Flutter
        run: |
          git clone https://github.com/flutter/flutter.git --depth 1 -b stable $HOME/flutter
          echo "$HOME/flutter/bin" >> $GITHUB_PATH
          echo "$HOME/.pub-cache/bin" >> $GITHUB_PATH

      - name: Install Dart dependencies
        run: flutter pub get

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.13'

      - name: Install Python config package
        run: pip install -e packages/dataset_config_python

      # Fix: script was spelled 'config_partiy.dart' (transposed letters),
      # while the directory and every trigger path use 'config_parity'.
      # NOTE(review): confirm the actual filename under
      # tool/config_parity/bin/ — if the committed file carries the typo,
      # rename the file rather than reverting this path.
      - name: Verify config parity
        run: dart run tool/config_parity/bin/config_parity.dart
44 changes: 44 additions & 0 deletions .github/workflows/dash_evals_module_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# CI for the dash_evals Python package: installs it (with dev extras)
# into a fresh virtualenv and runs its pytest suite.
name: dash_evals module - Python tests

on:
  pull_request:
    paths:
      - 'packages/dash_evals/**'
      - '.github/workflows/dash_evals_module_tests.yml'
  push:
    branches:
      - main
    paths:
      - 'packages/dash_evals/**'
      - '.github/workflows/dash_evals_module_tests.yml'

jobs:
  runner-tests:
    runs-on: ubuntu-latest
    timeout-minutes: 15

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.13'

      # Isolated venv inside the package directory so dev dependencies
      # don't leak into the runner's global site-packages.
      - name: Create virtual environment
        working-directory: packages/dash_evals
        run: python -m venv .venv

      - name: Install dependencies
        working-directory: packages/dash_evals
        run: |
          source .venv/bin/activate
          pip install --upgrade pip
          pip install -e ".[dev]"

      # Each step runs in a fresh shell, so the venv is re-activated here.
      - name: Run tests
        working-directory: packages/dash_evals
        run: |
          source .venv/bin/activate
          pytest -v
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ coverage
/docs/_build
/docs/dart_docs
logs/
**/pyrefly.toml


##
Expand Down
3 changes: 3 additions & 0 deletions packages/dash_evals/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# dash_evals

Python package for running LLM evaluations on Dart and Flutter tasks using [Inspect AI](https://inspect.aisi.org.uk/).
72 changes: 72 additions & 0 deletions packages/dash_evals/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Packaging metadata for the dash_evals package (PEP 621 / setuptools).
[project]
name = "dash-evals"
version = "0.1.0"
# NOTE(review): description is empty — consider a one-line summary.
description = ""
authors = [{ name = "Eric Windmill", email = "eric@ericwindmill.com" }]
readme = "README.md"
requires-python = ">=3.13,<4.0.0"
dependencies = [
    "inspect-ai>=0.3.142,<0.4.0",
    "pyyaml>=6.0.3,<7.0.0",
    "google-genai>=1.47.0,<2.0.0",
    "mcp>=1.20.0,<2.0.0",
    "python-dotenv>=1.2.1,<2.0.0",
    "anthropic>=0.75.0,<0.81.0",
    "openai>=2.8.1,<3.0.0",
    "firebase-admin>=6.0.0,<8.0.0",
    "pydantic>=2.0.0,<3.0.0",
]

# Extras installed in CI via `pip install -e ".[dev]"`.
[project.optional-dependencies]
dev = [
    "pytest>=8.0.0",
    "pytest-mock>=3.12.0",
    "pytest-cov>=4.1.0",
    "pylint>=3.0.0",
]

# Console script: entry point lives in src/dash_evals/main.py.
[project.scripts]
run-evals = "dash_evals.main:main"

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

# Register podman sandbox with inspect_ai
[project.entry-points.inspect_ai]
dash_evals = "dash_evals.runner.sandboxes"

# src-layout: packages are discovered under src/.
[tool.setuptools.packages.find]
where = ["src"]

# Ship bundled YAML datasets with the wheel.
[tool.setuptools.package-data]
dash_evals = ["data/*.yaml"]

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]

# Exclude entry points and uploaders (I/O-heavy, exercised manually)
# from coverage accounting.
[tool.coverage.run]
omit = [
    "src/dash_evals/main.py",
    "src/dash_evals/uploader.py",
    "src/dash_evals/uploader_aggregates.py",
    "src/dash_evals/tasks/*",
]

[tool.pylint.messages_control]
disable = [
    "logging-fstring-interpolation", # Allow f-strings in logging (modern Python standard)
]

[tool.pylint.format]
max-line-length = 100

[tool.ruff]
line-length = 100

[tool.ruff.lint]
select = ["E", "F", "W", "I"]
ignore = ["E501"] # Line too long (handled by formatter)
4 changes: 4 additions & 0 deletions packages/dash_evals/pyrefly.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Pyrefly configuration
# Tell Pyrefly to use the repo-root venv Python interpreter

python-interpreter = "../../.venv/bin/python"
12 changes: 12 additions & 0 deletions packages/dash_evals/src/dash_evals/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""dash_evals - Evaluation framework for Dart and Flutter AI assistants.

This package provides tools for running evaluations using Inspect AI
to measure model performance on Dart/Flutter tasks.

Configuration is resolved by the Dart CLI (devals) and emitted as JSONL
datasets + a run manifest. The Python package reads the manifest and
calls eval_set() directly.

Main entry point:
run-evals --json <path-to-eval-set.json>
"""
118 changes: 118 additions & 0 deletions packages/dash_evals/src/dash_evals/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Copyright 2025 The Flutter Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""CLI entry point for running evaluations.

Usage:
run-evals --json ./eval_set.json
run-evals --task my_task --model openai/gpt-4o --dataset samples.jsonl
"""

import argparse
import logging
import sys
from pathlib import Path

from dotenv import load_dotenv

# Import sandbox environments to register them with InspectAI
# The @sandboxenv decorator registers the sandbox type when the module is imported
import dash_evals.runner.sandboxes.podman.podman # noqa: F401 # Registers 'podman'
from dash_evals.runner.args_runner import _run_from_args
from dash_evals.runner.json_runner import run_from_json

# Basic console logger for early startup messages
logging.basicConfig(level=logging.INFO, format="%(message)s")
_startup_logger = logging.getLogger("startup")


def main():
    """Parse command-line arguments and run evaluations.

    Two mutually exclusive modes:
      * JSON mode: ``--json <path>`` runs from an eval_set.json manifest
        emitted by the Dart CLI.
      * Direct mode: ``--task`` and ``--model`` (both required), plus
        optional ``--dataset``, ``--log-dir``, ``--sandbox``, etc.

    Exits with status 0 on success, 1 if any evaluation failed or could
    not be started.
    """
    # Load .env from the repo root (walks up from cwd).
    # This populates os.environ with API keys, credentials, etc.
    # System env vars take precedence over .env values (python-dotenv default).
    load_dotenv(override=False)

    parser = argparse.ArgumentParser(
        description="Run Inspect AI evaluations for the Dart/Flutter plugin.",
        epilog="Example: run-evals --json ./eval_set.json",
    )

    # ---------- JSON mode (mutually exclusive with direct args) ----------
    parser.add_argument(
        "--json",
        type=Path,
        help="Path to eval_set.json (emitted by Dart CLI).",
    )

    # ---------- Direct-args mode ----------
    parser.add_argument(
        "--task",
        type=str,
        help="Task function name (e.g. 'flutter_code_gen' or dotted path).",
    )
    parser.add_argument(
        "--model",
        type=str,
        action="append",
        help="Model to evaluate (can be repeated). Example: openai/gpt-4o",
    )
    parser.add_argument(
        "--dataset",
        type=Path,
        help="Path to a dataset file (JSON/JSONL/CSV).",
    )
    parser.add_argument(
        "--log-dir",
        type=Path,
        help="Directory to write evaluation logs.",
    )
    parser.add_argument(
        "--sandbox",
        type=str,
        nargs=2,
        metavar=("TYPE", "CONFIG"),
        help="Sandbox type and config path. Example: podman compose.yaml",
    )
    parser.add_argument(
        "--max-connections",
        type=int,
        help="Maximum concurrent model connections.",
    )
    parser.add_argument(
        "--max-samples",
        type=int,
        help="Maximum concurrent samples per task.",
    )
    parser.add_argument(
        "--fail-on-error",
        type=float,
        help="Proportion of sample errors to tolerate (0.0-1.0).",
    )

    args = parser.parse_args()

    # Ensure either --json or direct args are provided, but not both.
    direct_args_provided = any([args.task, args.model, args.dataset])
    if args.json and direct_args_provided:
        parser.error(
            "Cannot combine --json with --task/--model/--dataset. Use one mode or the other."
        )
    if not args.json and not direct_args_provided:
        parser.error("Provide either --json or at least --task and --model.")
    # Fix: direct mode previously accepted e.g. --dataset alone (the any()
    # check above passed) and only failed later inside the runner. Enforce
    # the documented contract — both --task and --model — up front.
    if not args.json and not (args.task and args.model):
        parser.error("Direct mode requires both --task and --model.")

    try:
        if args.json:
            has_failures = run_from_json(args.json)
        else:
            has_failures = _run_from_args(args)
    except Exception as e:
        # exc_info=True preserves the traceback in the log; the bare
        # message alone made startup failures hard to diagnose.
        _startup_logger.error(f"Failed to run evaluation: {e}", exc_info=True)
        sys.exit(1)

    sys.exit(1 if has_failures else 0)


if __name__ == "__main__":
main()
7 changes: 7 additions & 0 deletions packages/dash_evals/src/dash_evals/runner/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Runner module for executing evaluations.

This module contains the core evaluation logic including:
- Task definitions and registry
- Solvers for setting up workspaces
- Scorers for evaluating model outputs
"""
Loading
Loading