Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@ name: Config Tests
on:
pull_request:
paths:
- 'packages/dataset_config/**'
- '.github/workflows/config_tests.yml'
- 'packages/dataset_config_dart/**'
- '.github/workflows/config_dart_tests.yml'
push:
branches:
- main
paths:
- 'packages/dataset_config/**'
- '.github/workflows/config_tests.yml'
- 'packages/dataset_config_dart/**'
- '.github/workflows/config_dart_tests.yml'

jobs:
config-tests:
Expand All @@ -31,9 +31,9 @@ jobs:
run: flutter pub get

- name: Analyze
working-directory: packages/dataset_config
working-directory: packages/dataset_config_dart
run: dart analyze --fatal-infos

- name: Run tests
working-directory: packages/dataset_config
working-directory: packages/dataset_config_dart
run: dart test
46 changes: 46 additions & 0 deletions .github/workflows/config_parity.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Verifies that the Dart and Python dataset-config packages stay in sync
# by running the parity checker tool against both implementations.
name: Config Parity

on:
  pull_request:
    paths:
      - 'packages/dataset_config_dart/**'
      - 'packages/dataset_config_python/**'
      - 'tool/config_parity/**'
      - '.github/workflows/config_parity.yml'
  push:
    branches:
      - main
    paths:
      - 'packages/dataset_config_dart/**'
      - 'packages/dataset_config_python/**'
      - 'tool/config_parity/**'
      - '.github/workflows/config_parity.yml'

jobs:
  config-parity:
    runs-on: ubuntu-latest
    timeout-minutes: 10

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      # Shallow clone of the stable channel; both flutter and pub-cache
      # bin dirs go on PATH for subsequent steps.
      - name: Install Flutter
        run: |
          git clone https://github.com/flutter/flutter.git --depth 1 -b stable $HOME/flutter
          echo "$HOME/flutter/bin" >> $GITHUB_PATH
          echo "$HOME/.pub-cache/bin" >> $GITHUB_PATH

      - name: Install Dart dependencies
        run: flutter pub get

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.13'

      - name: Install Python config package
        run: pip install -e packages/dataset_config_python

      # Fix: script was spelled 'config_partiy.dart' (transposed letters),
      # while the directory and every trigger path use 'config_parity'.
      # NOTE(review): confirm the actual filename under
      # tool/config_parity/bin/ — if the committed file carries the typo,
      # rename the file rather than reverting this path.
      - name: Verify config parity
        run: dart run tool/config_parity/bin/config_parity.dart
44 changes: 44 additions & 0 deletions .github/workflows/dash_evals_module_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# CI for the dash_evals Python package: installs it (with dev extras)
# into a fresh virtualenv and runs its pytest suite.
name: dash_evals module - Python tests

on:
  pull_request:
    paths:
      - 'packages/dash_evals/**'
      - '.github/workflows/dash_evals_module_tests.yml'
  push:
    branches:
      - main
    paths:
      - 'packages/dash_evals/**'
      - '.github/workflows/dash_evals_module_tests.yml'

jobs:
  runner-tests:
    runs-on: ubuntu-latest
    timeout-minutes: 15

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.13'

      # Isolated venv inside the package directory so dev dependencies
      # don't leak into the runner's global site-packages.
      - name: Create virtual environment
        working-directory: packages/dash_evals
        run: python -m venv .venv

      - name: Install dependencies
        working-directory: packages/dash_evals
        run: |
          source .venv/bin/activate
          pip install --upgrade pip
          pip install -e ".[dev]"

      # Each step runs in a fresh shell, so the venv is re-activated here.
      - name: Run tests
        working-directory: packages/dash_evals
        run: |
          source .venv/bin/activate
          pytest -v
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ coverage
/docs/_build
/docs/dart_docs
logs/
**/pyrefly.toml


##
Expand Down
3 changes: 3 additions & 0 deletions packages/dash_evals/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# dash_evals

Python package for running LLM evaluations on Dart and Flutter tasks using [Inspect AI](https://inspect.aisi.org.uk/).
72 changes: 72 additions & 0 deletions packages/dash_evals/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Packaging metadata for the dash_evals package (PEP 621 / setuptools).
[project]
name = "dash-evals"
version = "0.1.0"
# NOTE(review): description is empty — consider a one-line summary.
description = ""
authors = [{ name = "Eric Windmill", email = "eric@ericwindmill.com" }]
readme = "README.md"
requires-python = ">=3.13,<4.0.0"
dependencies = [
    "inspect-ai>=0.3.142,<0.4.0",
    "pyyaml>=6.0.3,<7.0.0",
    "google-genai>=1.47.0,<2.0.0",
    "mcp>=1.20.0,<2.0.0",
    "python-dotenv>=1.2.1,<2.0.0",
    "anthropic>=0.75.0,<0.81.0",
    "openai>=2.8.1,<3.0.0",
    "firebase-admin>=6.0.0,<8.0.0",
    "pydantic>=2.0.0,<3.0.0",
]

# Extras installed in CI via `pip install -e ".[dev]"`.
[project.optional-dependencies]
dev = [
    "pytest>=8.0.0",
    "pytest-mock>=3.12.0",
    "pytest-cov>=4.1.0",
    "pylint>=3.0.0",
]

# Console script: entry point lives in src/dash_evals/main.py.
[project.scripts]
run-evals = "dash_evals.main:main"

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

# Register podman sandbox with inspect_ai
[project.entry-points.inspect_ai]
dash_evals = "dash_evals.runner.sandboxes"

# src-layout: packages are discovered under src/.
[tool.setuptools.packages.find]
where = ["src"]

# Ship bundled YAML datasets with the wheel.
[tool.setuptools.package-data]
dash_evals = ["data/*.yaml"]

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]

# Exclude entry points and uploaders (I/O-heavy, exercised manually)
# from coverage accounting.
[tool.coverage.run]
omit = [
    "src/dash_evals/main.py",
    "src/dash_evals/uploader.py",
    "src/dash_evals/uploader_aggregates.py",
    "src/dash_evals/tasks/*",
]

[tool.pylint.messages_control]
disable = [
    "logging-fstring-interpolation", # Allow f-strings in logging (modern Python standard)
]

[tool.pylint.format]
max-line-length = 100

[tool.ruff]
line-length = 100

[tool.ruff.lint]
select = ["E", "F", "W", "I"]
ignore = ["E501"] # Line too long (handled by formatter)
4 changes: 4 additions & 0 deletions packages/dash_evals/pyrefly.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Pyrefly configuration
# Tell Pyrefly to use the repo-root venv Python interpreter

python-interpreter = "../../.venv/bin/python"
12 changes: 12 additions & 0 deletions packages/dash_evals/src/dash_evals/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""dash_evals - Evaluation framework for Dart and Flutter AI assistants.

This package provides tools for running evaluations using Inspect AI
to measure model performance on Dart/Flutter tasks.

Configuration is resolved by the Dart CLI (devals) and emitted as JSONL
datasets + a run manifest. The Python package reads the manifest and
calls eval_set() directly.

Main entry point:
run-evals --json <path-to-eval-set.json>
"""
118 changes: 118 additions & 0 deletions packages/dash_evals/src/dash_evals/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Copyright 2025 The Flutter Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""CLI entry point for running evaluations.

Usage:
run-evals --json ./eval_set.json
run-evals --task my_task --model openai/gpt-4o --dataset samples.jsonl
"""

import argparse
import logging
import sys
from pathlib import Path

from dotenv import load_dotenv

# Import sandbox environments to register them with InspectAI
# The @sandboxenv decorator registers the sandbox type when the module is imported
import dash_evals.runner.sandboxes.podman.podman # noqa: F401 # Registers 'podman'
from dash_evals.runner.args_runner import _run_from_args
from dash_evals.runner.json_runner import run_from_json

# Basic console logger for early startup messages
logging.basicConfig(level=logging.INFO, format="%(message)s")
_startup_logger = logging.getLogger("startup")


def main():
    """Parse command-line arguments and run evaluations.

    Two mutually exclusive modes:
      * JSON mode: ``--json <path>`` runs from an eval_set.json manifest
        emitted by the Dart CLI.
      * Direct mode: ``--task`` and ``--model`` (both required), plus
        optional ``--dataset``, ``--log-dir``, ``--sandbox``, etc.

    Exits with status 0 on success, 1 if any evaluation failed or could
    not be started.
    """
    # Load .env from the repo root (walks up from cwd).
    # This populates os.environ with API keys, credentials, etc.
    # System env vars take precedence over .env values (python-dotenv default).
    load_dotenv(override=False)

    parser = argparse.ArgumentParser(
        description="Run Inspect AI evaluations for the Dart/Flutter plugin.",
        epilog="Example: run-evals --json ./eval_set.json",
    )

    # ---------- JSON mode (mutually exclusive with direct args) ----------
    parser.add_argument(
        "--json",
        type=Path,
        help="Path to eval_set.json (emitted by Dart CLI).",
    )

    # ---------- Direct-args mode ----------
    parser.add_argument(
        "--task",
        type=str,
        help="Task function name (e.g. 'flutter_code_gen' or dotted path).",
    )
    parser.add_argument(
        "--model",
        type=str,
        action="append",
        help="Model to evaluate (can be repeated). Example: openai/gpt-4o",
    )
    parser.add_argument(
        "--dataset",
        type=Path,
        help="Path to a dataset file (JSON/JSONL/CSV).",
    )
    parser.add_argument(
        "--log-dir",
        type=Path,
        help="Directory to write evaluation logs.",
    )
    parser.add_argument(
        "--sandbox",
        type=str,
        nargs=2,
        metavar=("TYPE", "CONFIG"),
        help="Sandbox type and config path. Example: podman compose.yaml",
    )
    parser.add_argument(
        "--max-connections",
        type=int,
        help="Maximum concurrent model connections.",
    )
    parser.add_argument(
        "--max-samples",
        type=int,
        help="Maximum concurrent samples per task.",
    )
    parser.add_argument(
        "--fail-on-error",
        type=float,
        help="Proportion of sample errors to tolerate (0.0-1.0).",
    )

    args = parser.parse_args()

    # Ensure either --json or direct args are provided, but not both.
    direct_args_provided = any([args.task, args.model, args.dataset])
    if args.json and direct_args_provided:
        parser.error(
            "Cannot combine --json with --task/--model/--dataset. Use one mode or the other."
        )
    if not args.json and not direct_args_provided:
        parser.error("Provide either --json or at least --task and --model.")
    # Fix: direct mode previously accepted e.g. --dataset alone (the any()
    # check above passed) and only failed later inside the runner. Enforce
    # the documented contract — both --task and --model — up front.
    if not args.json and not (args.task and args.model):
        parser.error("Direct mode requires both --task and --model.")

    try:
        if args.json:
            has_failures = run_from_json(args.json)
        else:
            has_failures = _run_from_args(args)
    except Exception as e:
        # exc_info=True preserves the traceback in the log; the bare
        # message alone made startup failures hard to diagnose.
        _startup_logger.error(f"Failed to run evaluation: {e}", exc_info=True)
        sys.exit(1)

    sys.exit(1 if has_failures else 0)


if __name__ == "__main__":
main()
7 changes: 7 additions & 0 deletions packages/dash_evals/src/dash_evals/runner/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Runner module for executing evaluations.

This module contains the core evaluation logic including:
- Task definitions and registry
- Solvers for setting up workspaces
- Scorers for evaluating model outputs
"""
Loading
Loading