#!/usr/bin/env python3
"""Analyze swtbench image build workflow runs to determine real build counts vs skipped.
Downloads CI logs for each run via GitHub API, parses them to distinguish
truly-built images from skipped ones (already in GHCR), and computes
per-image build time and throughput.
Handles two log formats:
- New (mid-March+): summary line has separate Built/Skipped counts
- Old (Feb-early March): summary lumps built+skipped; we count skip log messages
"""
import json
import re
import subprocess
import sys
REPO = "OpenHands/benchmarks"
TOTAL_IMAGES = 433
def run_cmd(cmd, timeout=120):
    """Execute *cmd* through the shell and return its stdout, stripped.

    stderr and the exit status are deliberately ignored — callers treat an
    empty string as "no data".
    """
    proc = subprocess.run(
        cmd, shell=True, capture_output=True, text=True, timeout=timeout,
    )
    return proc.stdout.strip()
def get_all_runs():
    """List recent build-swtbench-images workflow runs via the gh CLI.

    Returns only runs that are completed and were not skipped (a skipped or
    empty conclusion means the workflow never really ran).
    """
    cmd = (
        f"gh run list --repo {REPO} --workflow build-swtbench-images.yml "
        "--limit 100 "
        "--json databaseId,conclusion,createdAt,updatedAt,headSha,displayTitle,status"
    )
    runs = json.loads(run_cmd(cmd))
    wanted = []
    for run in runs:
        if run['status'] != 'completed':
            continue
        if run['conclusion'] in ('skipped', ''):
            continue
        wanted.append(run)
    return wanted
def get_job_id(run_id):
    """Locate the build job of a run and return (job_id, started_at, completed_at).

    Prefers the first job whose name contains "build" (case-insensitive),
    falling back to the run's first job. Returns (None, None, None) when the
    gh call yields nothing or the run has no jobs.
    """
    output = run_cmd(f'gh run view {run_id} --repo {REPO} --json jobs')
    if not output:
        return None, None, None
    jobs = json.loads(output).get('jobs', [])
    chosen = next(
        (job for job in jobs if 'build' in job['name'].lower()), None)
    if chosen is None and jobs:
        chosen = jobs[0]
    if chosen is None:
        return None, None, None
    return chosen['databaseId'], chosen.get('startedAt'), chosen.get('completedAt')
def get_job_logs(job_id):
    """Download the raw log text of a single job via the GitHub REST API."""
    return run_cmd(f'gh api repos/{REPO}/actions/jobs/{job_id}/logs', timeout=60)
def extract_sdk_sha(title):
    """Pull the SDK commit sha out of a run title like "... SDK: abc123 ...".

    Returns "unknown" when no sha is present.
    """
    match = re.search(r'SDK:\s*([a-f0-9]+)', title)
    if match is None:
        return "unknown"
    return match.group(1)
def parse_duration_seconds(start_str, end_str):
    """Seconds elapsed between two ISO-8601 timestamps (a trailing 'Z' is
    normalized to '+00:00'). Returns None for missing or unparseable input,
    or when the two stamps cannot be subtracted (naive vs aware mix).
    """
    if not start_str or not end_str:
        return None
    from datetime import datetime
    try:
        stamps = [
            datetime.fromisoformat(s.replace('Z', '+00:00'))
            for s in (start_str, end_str)
        ]
        delta = stamps[1] - stamps[0]
    except Exception:
        return None
    return delta.total_seconds()
def parse_tqdm_elapsed(elapsed_str):
    """Convert a tqdm elapsed stamp ('H:MM:SS' like '2:55:21', or 'MM:SS'
    like '44:16') to total seconds. Any other field count yields None.
    """
    fields = elapsed_str.split(':')
    if len(fields) not in (2, 3):
        return None
    # Horner-style accumulation works for both 2- and 3-field stamps.
    total = 0
    for field in fields:
        total = total * 60 + int(field)
    return total
def analyze_logs(logs):
    """Derive real built / skipped / failed counts and build timing from logs.

    Four extraction strategies, tried in order of reliability:
    1. New summary line: "Done in Xs. Built=N Skipped=M Failed=K"
    2. Old summary line "Done. Built=N Failed=M", with skip-message count
       subtracted (old format lumped built+skipped together)
    3. New progress bar (cancelled runs): 🛠 N ⏭ M ❌ K 🏃 R
    4. Old progress bar (cancelled runs): ✅ N ❌ K 🏃 R minus skip count
    """
    result = {
        'real_built': None, 'skipped': None, 'failed': None,
        'build_seconds': None, 'tqdm_elapsed': None, 'method': 'unknown',
    }
    if not logs:
        return result

    # Each image skipped (already in GHCR) leaves exactly one such message,
    # so this count lets us back skips out of old-format lumped totals.
    skip_count = len(re.findall(r'already exists\. Skipping build\.', logs))

    # Elapsed time from the final tqdm bar, e.g. "[2:55:21<00:00, ...]".
    bar_times = re.findall(r'\[(\d+:\d+(?::\d+)?)<', logs)
    if bar_times:
        result['tqdm_elapsed'] = parse_tqdm_elapsed(bar_times[-1])

    def finish(built, skipped, failed, method, seconds=None):
        # Fill in the counts; prefer an explicit duration, else the tqdm bar.
        result.update(
            real_built=built, skipped=skipped, failed=failed, method=method)
        if seconds is not None:
            result['build_seconds'] = seconds
        elif result['tqdm_elapsed']:
            result['build_seconds'] = result['tqdm_elapsed']
        return result

    # METHOD 1: new-format summary with separate counts and a duration.
    m = re.search(
        r'Done in ([\d.]+)s\.\s+Built=(\d+)\s+Skipped=(\d+)\s+Failed=(\d+)', logs)
    if m:
        return finish(int(m.group(2)), int(m.group(3)), int(m.group(4)),
                      'new_summary', seconds=float(m.group(1)))

    # METHOD 2: old-format summary; "Built" includes skips, so subtract them.
    m = re.search(r'Done\.\s+Built=(\d+)\s+Failed=(\d+)', logs)
    if m:
        return finish(int(m.group(1)) - skip_count, skip_count, int(m.group(2)),
                      'old_summary_minus_skips')

    # METHOD 3: new-format progress bar (covers cancelled runs).
    bars = re.findall(r'🛠\s*(\d+)\s+⏭\s*(\d+)\s+❌\s*(\d+)\s+🏃\s*(\d+)', logs)
    if bars:
        built, skipped, failed, _running = bars[-1]
        return finish(int(built), int(skipped), int(failed), 'new_progress_bar')

    # METHOD 4: old-format progress bar; ✅ includes skips, so subtract them.
    bars = re.findall(r'✅\s*(\d+)\s+❌\s*(\d+)\s+🏃\s*(\d+)', logs)
    if bars:
        done, failed, _running = bars[-1]
        return finish(int(done) - skip_count, skip_count, int(failed),
                      'old_progress_bar_minus_skips')

    return result
def format_duration(seconds):
    """Render a duration as "XhMMm" (an hour or more) or "XmSSs" (less).

    None renders as "N/A".
    """
    if seconds is None:
        return "N/A"
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    if hours > 0:
        return f"{hours}h{minutes:02d}m"
    return f"{minutes}m{secs:02d}s"
def main() -> None:
    """Analyze every qualifying workflow run and dump per-run build stats.

    Progress goes to stderr; the final result is a JSON array on stdout,
    sorted chronologically. Runs are kept only if they completed (not
    skipped), lasted >= 2 minutes, produced readable logs, and truly built
    at least 50 images.
    """
    print("Fetching workflow runs...", file=sys.stderr)
    runs = get_all_runs()
    print(f"Found {len(runs)} non-skipped completed runs", file=sys.stderr)
    results = []
    for i, run in enumerate(runs):
        run_id = run['databaseId']
        title = run['displayTitle']
        conclusion = run['conclusion']
        created = run['createdAt']
        # Skip instant failures (< 2 min): they never got past job setup.
        duration_total = parse_duration_seconds(run['createdAt'], run['updatedAt'])
        if duration_total is not None and duration_total < 120:
            continue
        print(f" [{i+1}/{len(runs)}] Analyzing run {run_id} ({conclusion})...",
              file=sys.stderr)
        sdk_sha = extract_sdk_sha(title)
        benchmarks_sha = run['headSha'][:7]  # short sha for display
        job_id, job_start, job_end = get_job_id(run_id)
        if not job_id:
            continue
        logs = get_job_logs(job_id)
        if not logs:
            continue
        analysis = analyze_logs(logs)
        # Small runs give noisy throughput numbers; keep only >= 50 real builds.
        if analysis['real_built'] is None or analysis['real_built'] < 50:
            print(f" Built {analysis['real_built']} images (< 50), skipping",
                  file=sys.stderr)
            continue
        build_secs = analysis.get('build_seconds')
        if build_secs is None:
            # Fall back to job wall-clock time (includes ~1-2 min of setup).
            build_secs = parse_duration_seconds(job_start, job_end)
        real_built = analysis['real_built']
        throughput = None
        if real_built > 0 and build_secs:
            throughput = real_built / (build_secs / 3600)  # images per hour
        results.append({
            'run_id': run_id,
            'date': created[:10],  # YYYY-MM-DD
            'conclusion': conclusion,
            'sdk_sha': sdk_sha[:7],
            'benchmarks_sha': benchmarks_sha,
            'real_built': real_built,
            'skipped': analysis['skipped'],
            'failed': analysis.get('failed', 0),
            'build_seconds': build_secs,
            'build_duration_str': format_duration(build_secs),
            'throughput_img_per_hour': round(throughput, 1) if throughput else None,
            'method': analysis['method'],
        })
        print(f" Built={real_built} Skipped={analysis['skipped']} "
              f"Duration={format_duration(build_secs)} "
              f"Throughput={round(throughput, 1) if throughput else 'N/A'} img/h "
              f"Method={analysis['method']}", file=sys.stderr)
    # Chronological order; run_id breaks ties within the same day.
    results.sort(key=lambda x: x['date'] + str(x['run_id']))
    json.dump(results, sys.stdout, indent=2)
# Script entry point: analyze runs and emit JSON on stdout.
if __name__ == '__main__':
    main()
This issue serves as the historical source of truth for SWT-bench image build throughput. It tracks every workflow run that built more than 50 images, with careful accounting of truly built vs. skipped images.
Build throughput over time
[per-run commit SHA table — content garbled during extraction; see the JSON output of the script below for the authoritative per-run data]

Reading the table
Throughput is computed as images_built / build_duration_hours — the measured build rate during this run.

Caveats on throughput comparisons across runs:
Key observations
Late-February runs (SDK cefaebf) measured 66–76 img/h on 364–382 images. Mid-March runs measured 31–42 img/h on 314–409 images. The April run (SDK 447aa91) measured only 23.3 img/h on 68 images.

Methodology
Why counting "real builds" is non-trivial
The build workflow checks GHCR for each image before building it (
remote_image_exists() check). If the image already exists, it's skipped. A run can report "Built: 433/433" while only truly building a fraction — the rest were skipped because they were already pushed by a prior or concurrent run. If we don't distinguish, a mostly-skipping run looks blazingly fast when it isn't.

How real build counts were determined
The log format changed over time, so two different extraction strategies are needed:
Newer runs (mid-March+): The build summary line directly reports separate counts:
The progress bar also distinguishes built from skipped:
🛠 392 ⏭ 41 ❌ 0 🏃 0

Older runs (Feb–early March): The summary lumps everything together:
The progress bar shows
✅ 433 ❌ 0 🏃 0, where ✅ includes both built AND skipped. However, each skipped image produces a distinct log line. So for old-format runs:
real_built = reported_total - count("already exists. Skipping build."). This is verified: the count of skip messages equals the count of unique images skipped (no double-counting).

Cancelled runs: The last tqdm progress bar line gives the final built/skipped counts at the time of cancellation.
How build duration was measured
For new-format runs, duration comes from the summary line (Done in Xs.). Otherwise it comes from the final tqdm progress bar's elapsed time (e.g. [5:29:27<00:00, 45.65s/it]), which excludes job setup overhead (checkout, docker login, uv install — typically ~1–2 minutes).

Script
Prerequisites:
gh CLI authenticated with access to OpenHands/benchmarks. Run with:

analyze_swtbench_builds.py (click to expand)
Verifying skip counts
To independently verify that skip counts are not double-counted for any run: