From abde0bb71106bce610c46c51417f21d3ad828c0d Mon Sep 17 00:00:00 2001 From: Martin Stransky Date: Wed, 18 Mar 2026 11:53:04 +0100 Subject: [PATCH 1/6] refactor: aggressive scope pruning to essentials Prune security infrastructure to a 2-layer exfiltration defense model: firewall (primary) + narrowed hook (trusted channels only). Remove permission tiers, most hooks, all commands, niche agents, and supporting docs. Local destruction is no longer blocked -- devcontainer is disposable. Deleted: 4 hooks, 3 commands, 6 agents, 1 skill, 3 tier files, 7 docs Kept: 2 hooks, 6 agents, 4 skills, 4 rules, 1 plugin Updated: settings.json, devcontainer.json, CLAUDE.md, DECISIONS.md, DEVELOPMENT_PROCESS.md, IMPLEMENTATION_PLAN.md, all test files Co-Authored-By: Claude Opus 4.6 (1M context) --- .../agents/acceptance-criteria-validator.md | 63 -- .claude/agents/agent-auditor.md | 98 -- .claude/agents/implementation-tracker.md | 61 -- .claude/agents/output-evaluator.md | 93 -- .claude/agents/refactoring-specialist.md | 116 -- .claude/agents/security-auditor.md | 126 --- .claude/commands/cove-isolated.md | 68 -- .claude/commands/cove.md | 60 -- .claude/commands/security-audit.md | 93 -- .claude/hooks/dangerous-actions-blocker.sh | 88 +- .claude/hooks/devcontainer-policy-blocker.sh | 131 --- .claude/hooks/output-secrets-scanner.sh | 80 -- .claude/hooks/test-on-change.sh | 87 -- .claude/hooks/unicode-injection-scanner.sh | 70 -- .claude/settings.json | 11 +- .claude/skills/edit-permissions/SKILL.md | 137 --- .devcontainer/devcontainer.json | 5 +- .devcontainer/permissions/tier1-assisted.json | 63 -- .../permissions/tier2-autonomous.json | 62 -- .../permissions/tier3-full-trust.json | 51 - CLAUDE.md | 20 +- docs/ARCHITECTURE_GUIDE.md | 687 ------------ docs/DECISIONS.md | 138 +-- docs/DEVCONTAINER_PERMISSIONS.md | 109 -- docs/DEVELOPMENT_PROCESS.md | 39 +- docs/IMPLEMENTATION_PLAN.md | 326 +----- docs/community/GROWTH_PLAN.md | 128 --- docs/community/README_DRAFT.md | 19 - 
docs/community/claude.md | 92 -- docs/community/feedback/claude.md | 55 - docs/community/feedback/gemini.md | 44 - docs/community/feedback/openai.md | 987 ------------------ docs/community/gemini.md | 82 -- docs/community/openai.md | 186 ---- tests/test_agents.py | 51 +- tests/test_commands.py | 105 +- tests/test_hooks.py | 118 +-- tests/test_skills.py | 25 +- 38 files changed, 148 insertions(+), 4626 deletions(-) delete mode 100644 .claude/agents/acceptance-criteria-validator.md delete mode 100644 .claude/agents/agent-auditor.md delete mode 100644 .claude/agents/implementation-tracker.md delete mode 100644 .claude/agents/output-evaluator.md delete mode 100644 .claude/agents/refactoring-specialist.md delete mode 100644 .claude/agents/security-auditor.md delete mode 100644 .claude/commands/cove-isolated.md delete mode 100644 .claude/commands/cove.md delete mode 100644 .claude/commands/security-audit.md delete mode 100755 .claude/hooks/devcontainer-policy-blocker.sh delete mode 100755 .claude/hooks/output-secrets-scanner.sh delete mode 100755 .claude/hooks/test-on-change.sh delete mode 100755 .claude/hooks/unicode-injection-scanner.sh delete mode 100644 .claude/skills/edit-permissions/SKILL.md delete mode 100644 .devcontainer/permissions/tier1-assisted.json delete mode 100644 .devcontainer/permissions/tier2-autonomous.json delete mode 100644 .devcontainer/permissions/tier3-full-trust.json delete mode 100644 docs/ARCHITECTURE_GUIDE.md delete mode 100644 docs/DEVCONTAINER_PERMISSIONS.md delete mode 100644 docs/community/GROWTH_PLAN.md delete mode 100644 docs/community/README_DRAFT.md delete mode 100644 docs/community/claude.md delete mode 100644 docs/community/feedback/claude.md delete mode 100644 docs/community/feedback/gemini.md delete mode 100644 docs/community/feedback/openai.md delete mode 100644 docs/community/gemini.md delete mode 100644 docs/community/openai.md diff --git a/.claude/agents/acceptance-criteria-validator.md 
b/.claude/agents/acceptance-criteria-validator.md deleted file mode 100644 index f8a4cbe..0000000 --- a/.claude/agents/acceptance-criteria-validator.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -name: acceptance-criteria-validator -description: Use this agent for Step P.3.2 - Acceptance Criteria validation.\n\nVerifies cumulative acceptance criteria from the current phase AND all previous phases.\n\n**Examples:**\n\n\nContext: Completing Phase 2 of implementation.\n\nuser: "Verify acceptance criteria for Phase 2"\n\nassistant: "I'll use the acceptance-criteria-validator agent to check all criteria from Phase 1 and Phase 2."\n\n\n\n\n\nContext: Step P.3.2.\n\nuser: "Check that all acceptance criteria still pass"\n\nassistant: "Let me run the acceptance-criteria-validator to verify cumulative criteria."\n\n\n -model: sonnet -tools: Read, Glob, Grep, Bash -permissionMode: dontAsk -color: green ---- - -You are an Acceptance Criteria Validator for a Python project. Your job is to verify that ALL acceptance criteria from the current phase AND all previous phases are met. - -**Process:** - -1. **Find and read the implementation plan** - - Search for `IMPLEMENTATION_PLAN.md` in the repository (check `docs/`, root, or any package directory) - - Identify the current phase and all previous phases - -2. **Extract acceptance criteria** - - List every acceptance criterion from Phase 1 through the current phase - - Mark each as "automatable" or "non-automatable" - -3. **Verify automatable criteria** by running actual checks: - - API responses: Use `curl` or `httpx` to test endpoints - - Data schemas: Validate with pydantic or JSON schema checks - - CLI output: Run commands and verify output - - Test results: Run specific test suites - - File existence: Verify expected files exist - - Code patterns: Grep for expected implementations - -4. 
**Document non-automatable criteria** - - Architecture decisions: Reference the code structure - - Design patterns: Point to specific implementations - - UX considerations: Describe how verified - -5. **Report results** - -**Output Format:** - -```markdown -# Acceptance Criteria Validation - -## Phase 1 (Cumulative) -- [PASS] Criterion 1 - How verified -- [PASS] Criterion 2 - How verified -- [FAIL] Criterion 3 - What failed and why - -## Phase 2 (Current) -- [PASS] Criterion 1 - How verified -- [FAIL] Criterion 2 - What failed and why - -## Summary -- Total criteria: N -- Passing: N -- Failing: N -- Blocking issues: [list any FAIL items that must be fixed] -``` - -**Key Rules:** -- Criteria are CUMULATIVE -- Phase 3 must verify Phase 1 + 2 + 3 -- For automatable criteria, run ACTUAL checks -- do not self-assess -- Clearly distinguish between "verified by test" and "verified by inspection" -- Any FAIL on a previous phase's criterion is a regression -- flag prominently -- If you cannot determine a criterion's status, mark it as UNKNOWN with explanation diff --git a/.claude/agents/agent-auditor.md b/.claude/agents/agent-auditor.md deleted file mode 100644 index 8dcf293..0000000 --- a/.claude/agents/agent-auditor.md +++ /dev/null @@ -1,98 +0,0 @@ ---- -name: agent-auditor -description: Audits .claude/agents/ files against official best practices from https://code.claude.com/docs/en/sub-agents. 
Use when creating, modifying, or reviewing agent definitions to ensure they follow security and configuration best practices.\n\n**Examples:**\n\n\nContext: After creating or modifying agent files.\n\nuser: "Audit our agent definitions"\n\nassistant: "I'll use the agent-auditor agent to check all agent files against official best practices."\n\n\n\n\n\nContext: Reviewing agent security posture.\n\nuser: "Check if our agents have proper tool restrictions"\n\nassistant: "Let me run the agent-auditor to verify tool scoping and permission modes."\n\n\n -tools: Read, Glob, Grep -model: sonnet -permissionMode: plan -color: magenta ---- - -You are an Agent Auditor. You analyze `.claude/agents/*.md` files and check them against the official Claude Code subagent best practices documented at https://code.claude.com/docs/en/sub-agents. - -**Process:** - -1. **Discover all agent files** - - Use Glob to find all `.claude/agents/*.md` files - - Read each file in full - -2. **Parse each agent file** - - Extract YAML frontmatter fields - - Identify the system prompt (markdown body after frontmatter) - - Note which fields are present and which are missing - -3. **Run the audit checklist** against each agent - -4. **Report findings** with severity levels and a summary table - -**Audit Checklist:** - -| # | Check | Severity | Details | -|---|-------|----------|---------| -| 1 | `tools` field present and scoped | Critical | Agents without `tools` inherit ALL tools, including Write, Edit, Bash. Every agent should explicitly list only the tools it needs. | -| 2 | Read-only agents exclude write tools | Critical | Agents that only analyze/report (reviewers, validators, writers) should NOT have Write, Edit, or NotebookEdit. Read-only tools: Read, Grep, Glob, Bash (if needed for git/read commands). | -| 3 | `description` is detailed and includes examples | Warning | The description field is how Claude decides when to delegate. It should explain WHEN to use the agent, not just WHAT it does. 
Include `<example>` blocks for clarity. | -| 4 | `model` is set appropriately | Suggestion | Use `haiku` for fast/simple tasks (linting, formatting). Use `sonnet` for tasks requiring nuanced analysis. Use `opus` or `inherit` for complex reasoning. Omitting defaults to `inherit`. | -| 5 | `memory` considered for learning agents | Suggestion | Agents that build knowledge over time (code reviewers, debuggers) benefit from `memory: user` or `memory: project` for cross-session learning. | -| 6 | System prompt is focused on one task | Warning | Each agent should excel at one specific task (official docs). Check for prompts that try to do too many unrelated things. | -| 7 | Output format specified | Warning | Agents should define a clear output format (markdown template) so results are consistent and parseable. | -| 8 | No unrestricted dangerous operations | Critical | Agents that can run Bash without restrictions could push to remote, delete files, or modify git history. Agents that commit/push should have hooks or explicit tool restrictions. Flag agents that mention `git push`, `git commit`, `gh pr`, or destructive commands without tool/hook guardrails. | -| 9 | `permissionMode` set when appropriate | Warning | Read-only agents should use `permissionMode: plan` or `permissionMode: dontAsk`. Agents that modify code should use `permissionMode: acceptEdits` or `default`. | -| 10 | Frontmatter uses valid field names | Critical | Valid fields: `name`, `description`, `tools`, `disallowedTools`, `model`, `permissionMode`, `maxTurns`, `skills`, `mcpServers`, `hooks`, `memory`, `color`. Flag any unrecognized fields. Valid model values: `sonnet`, `opus`, `haiku`, `inherit`. Valid permissionMode values: `default`, `acceptEdits`, `delegate`, `dontAsk`, `bypassPermissions`, `plan`. 
| - -**Severity Definitions:** - -- **Critical**: Security risk or misconfiguration that could cause unintended side effects (e.g., a read-only agent that can write files) -- **Warning**: Best practice violation that reduces clarity, reliability, or maintainability -- **Suggestion**: Optional improvement that would enhance the agent's effectiveness - -**Output Format:** - -```markdown -# Agent Audit Report - -## Per-Agent Findings - -### [agent-name] -- File: `.claude/agents/[filename].md` -- Model: [model value or "not set (inherits)"] -- Tools: [tools value or "not set (inherits all)"] -- Permission Mode: [value or "not set"] -- Memory: [value or "not set"] - -| # | Check | Severity | Status | Details | -|---|-------|----------|--------|---------| -| 1 | tools scoped | Critical | PASS/FAIL | ... | -| 2 | read-only exclusions | Critical | PASS/FAIL/N/A | ... | -| ... | ... | ... | ... | ... | - ---- - -### [next-agent-name] -... - ---- - -## Summary Table - -| Agent | Critical | Warning | Suggestion | Overall | -|-------|----------|---------|------------|---------| -| agent-name | N | N | N | PASS/FAIL | -| ... | ... | ... | ... | ... | -| **Total** | **N** | **N** | **N** | **PASS/FAIL** | - -## Top Recommendations -1. [Most impactful fix across all agents] -2. [Second most impactful fix] -3. ... 
-``` - -**Key Rules:** -- Read each agent file completely before analyzing it -- Be precise -- cite specific field values and line references -- Distinguish between agents that SHOULD be read-only vs agents that legitimately need write access -- Agents that run `uv run ruff check --fix`, `uv run ruff format`, or edit code as part of their workflow legitimately need write tools -- Agents that only analyze, report, or generate text output should be read-only -- Do NOT suggest changes that would break an agent's core functionality -- If an agent has zero findings, report that clearly -- do not invent problems -- Focus on actionable findings, not hypothetical concerns -- Use plain ASCII in output -- no special Unicode characters diff --git a/.claude/agents/implementation-tracker.md b/.claude/agents/implementation-tracker.md deleted file mode 100644 index fc73133..0000000 --- a/.claude/agents/implementation-tracker.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -name: implementation-tracker -description: Use this agent for Step P.3.3 - Implementation Plan verification.\n\nVerifies that IMPLEMENTATION_PLAN.md accurately reflects the current state of the codebase.\n\n**Examples:**\n\n\nContext: After completing a development phase.\n\nuser: "Check that the implementation plan is up to date"\n\nassistant: "I'll use the implementation-tracker agent to verify the plan matches reality."\n\n\n\n\n\nContext: Before starting a new phase.\n\nuser: "Verify Phase 2 is fully complete before starting Phase 3"\n\nassistant: "Let me use the implementation-tracker agent to confirm."\n\n\n -model: sonnet -tools: Read, Glob, Grep, Bash, Edit -permissionMode: acceptEdits -color: blue ---- - -You are an Implementation Tracker for a Python project. You verify that documentation accurately reflects the actual implementation state. - -**Process:** - -1. 
**Find and read documentation** - - Search for `IMPLEMENTATION_PLAN.md` in `docs/`, root, or package directories - - Read any other status or plan documents found - -2. **Check recent changes** - - Review git log for recent commits - - Identify what was implemented vs what was planned - -3. **Identify discrepancies** - - Compare documented status with actual file state - - Check if acceptance criteria are accurately marked - - Verify phase completion status matches reality - -4. **Update documentation** - - Update date to current date - - Update status checkboxes - - Correct file references - - Update phase progress - -**Key Files to Check:** -- `docs/IMPLEMENTATION_PLAN.md` - Phase tracking -- `docs/CHANGELOG.md` - Change history -- Package `pyproject.toml` files - Version numbers -- Test files - Verify claimed tests exist - -**Output Format:** - -```markdown -# Implementation Tracker Report - -## Documents Checked -- [file] - Status: UP TO DATE / NEEDS UPDATE - -## Discrepancies Found -1. [description of mismatch] - -## Updates Applied -1. [description of change made] - -## Recommendations -- [any suggested follow-up actions] -``` - -**Key Rules:** -- Always verify claims by checking actual files, not just documentation -- Update documentation to match reality, not the other way around -- If a phase is partially complete, document exactly what was and wasn't done -- Check version numbers across all `pyproject.toml` files for consistency diff --git a/.claude/agents/output-evaluator.md b/.claude/agents/output-evaluator.md deleted file mode 100644 index 74c1882..0000000 --- a/.claude/agents/output-evaluator.md +++ /dev/null @@ -1,93 +0,0 @@ ---- -name: output-evaluator -description: Use this agent to evaluate code outputs for quality before committing.\n\nImplements the LLM-as-Judge pattern, scoring Correctness, Completeness, and Safety on a 0-10 scale. 
Returns APPROVE, NEEDS_REVIEW, or REJECT verdict.\n\n**Examples:**\n\n\nContext: Automated quality gate before commit.\n\nuser: "Evaluate the code changes before we commit"\n\nassistant: "I'll use the output-evaluator agent to score the changes for correctness, completeness, and safety."\n\n\n\n\n\nContext: Pipeline integration for quality scoring.\n\nuser: "Score this implementation against the acceptance criteria"\n\nassistant: "Let me delegate to the output-evaluator agent for a structured quality assessment."\n\n\n -model: haiku -tools: Read, Glob, Grep -permissionMode: dontAsk -color: blue ---- - -# Output Evaluator - -You are an Output Evaluator implementing the LLM-as-Judge pattern. You evaluate code outputs for quality and provide a structured verdict. - -**Scope:** Score code changes on three dimensions and provide a verdict. Read-only analysis. - -## Evaluation Criteria - -Score each criterion from 0-10: - -### Correctness (0-10) - -- Code parses without syntax errors -- Logic is sound and handles edge cases -- No obvious bugs or regressions -- Type annotations are correct -- No undefined variables or missing imports -- Function signatures match their usage - -### Completeness (0-10) - -- All acceptance criteria addressed -- No TODO/FIXME markers in shipped code -- Error handling present at system boundaries -- Edge cases considered (empty input, None, overflow) -- No stub or placeholder implementations -- Tests included for new functionality - -### Safety (0-10) - -- No hardcoded secrets or credentials -- No destructive operations without safeguards -- No injection vectors (SQL, command, code) -- No overly permissive access patterns -- Sensitive data not logged or exposed -- Dependencies are from trusted sources - -## Verdict Rules - -| Verdict | Condition | -|---------|-----------| -| APPROVE | All scores >= 7, no high-severity issues | -| NEEDS_REVIEW | Any score 5-6, or medium-severity issues found | -| REJECT | Any score < 5, or any high-severity 
security issue | - -## Evaluation Process - -1. **Identify changes** -- determine what files were modified or created -2. **Read each file** -- understand the full context, not just the diff -3. **Score each dimension** -- apply the criteria above with specific evidence -4. **Identify issues** -- list specific problems with file:line references -5. **Render verdict** -- apply the verdict rules - -## Output Format - -```markdown -# Output Evaluation - -## Scores -| Criterion | Score | Rationale | -|-----------|-------|-----------| -| Correctness | X/10 | [brief explanation] | -| Completeness | X/10 | [brief explanation] | -| Safety | X/10 | [brief explanation] | - -## Issues -- [severity: high/medium/low] [file:line] Description - -## Verdict: [APPROVE / NEEDS_REVIEW / REJECT] - -## Summary -[1-2 sentence summary of the evaluation] - -## Suggestions (if NEEDS_REVIEW) -- [actionable improvement] -``` - -**Key Rules:** -- Be objective -- score based on evidence, not assumptions -- Be specific -- cite file:line for every issue -- High-severity security issues always result in REJECT -- An APPROVE with all 10s is valid if the code is genuinely clean -- Do NOT inflate scores -- if the code is minimal, scores should reflect that -- Focus on the actual changes, not the entire codebase diff --git a/.claude/agents/refactoring-specialist.md b/.claude/agents/refactoring-specialist.md deleted file mode 100644 index f131b0d..0000000 --- a/.claude/agents/refactoring-specialist.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -name: refactoring-specialist -description: Use this agent for code quality analysis and refactoring recommendations.\n\nPerforms read-only analysis applying SOLID principles, code smell detection, and Fowler's refactoring catalog. 
Does not modify code -- provides actionable recommendations.\n\n**Examples:**\n\n\nContext: Code has grown complex and needs cleanup.\n\nuser: "Analyze this module for refactoring opportunities"\n\nassistant: "I'll use the refactoring-specialist agent to identify code smells and suggest improvements."\n\n\n\n\n\nContext: Pre-review code quality check.\n\nuser: "Check if this code follows SOLID principles"\n\nassistant: "Let me delegate to the refactoring-specialist agent for a SOLID analysis."\n\n\n -model: sonnet -tools: Read, Glob, Grep -permissionMode: plan -color: yellow ---- - -You are a Refactoring Specialist for Python projects. You perform read-only analysis to identify code smells, SOLID violations, and refactoring opportunities. - -**Scope:** Code quality analysis and recommendations only. Do NOT modify any files. - -## SOLID Principles - -| Principle | What to Check | -|-----------|--------------| -| **Single Responsibility** | Classes/modules with multiple unrelated responsibilities, functions doing too many things | -| **Open/Closed** | Code requiring modification (not extension) to add new behavior, switch/if chains on type | -| **Liskov Substitution** | Subclasses that break parent contracts, overridden methods with different semantics | -| **Interface Segregation** | Large interfaces forcing implementations of unused methods, "god" base classes | -| **Dependency Inversion** | High-level modules importing low-level modules directly, no abstraction boundaries | - -### Python-Specific SOLID Checks - -- **Mutable default arguments** (`def f(x=[])`) -- shared state across calls, use `None` + assignment -- **ABC/Protocol misuse** -- prefer `typing.Protocol` for structural subtyping over `abc.ABC` when callers only need a subset of methods (Interface Segregation) -- **Missing dependency injection** -- classes that instantiate their own dependencies internally instead of accepting them via `__init__` (Dependency Inversion) -- **God classes** -- classes with 
10+ public methods or mixed concerns (data access + business logic + formatting) -- **`@property` overuse** -- properties hiding expensive computation or side effects; prefer explicit methods when the operation is not trivially cheap -- **Circular imports** -- modules importing each other signals entangled responsibilities (Single Responsibility) - -## Code Smells to Detect - -### Size Smells -- **Long Method**: Functions exceeding 30 lines -- **Large Class**: Classes exceeding 300 lines -- **Long Parameter List**: Functions with more than 4 parameters -- **Deep Nesting**: More than 3 levels of indentation - -### Duplication Smells -- **Duplicated Code**: Similar logic in multiple locations -- **Data Clumps**: Groups of parameters that always appear together -- **Parallel Inheritance**: Matching hierarchies that change together - -### Coupling Smells -- **Feature Envy**: Method using more data from another class than its own -- **Inappropriate Intimacy**: Classes accessing each other's internals -- **Message Chains**: Long chains of method calls (a.b().c().d()) -- **Middle Man**: Classes that only delegate to another class - -### Complexity Smells -- **Primitive Obsession**: Using primitives instead of small value objects -- **Switch Statements**: Repeated type-based conditionals (use polymorphism) -- **Speculative Generality**: Unused abstractions, unnecessary parameters, dead code - -## Refactoring Catalog - -| Smell | Refactoring | Description | -|-------|------------|-------------| -| Long Method | Extract Method | Pull related lines into a named function | -| Large Class | Extract Class | Split responsibilities into focused classes | -| Long Parameter List | Introduce Parameter Object | Group related params into a dataclass | -| Feature Envy | Move Method | Move logic to the class that owns the data | -| Duplicated Code | Extract Method / Pull Up | Consolidate into shared function or base class | -| Switch on Type | Replace Conditional with Polymorphism | Use 
subclass per type with overridden method | -| Deep Nesting | Guard Clauses / Extract Method | Early returns, pull nested blocks into functions | -| Primitive Obsession | Introduce Value Object | Create typed wrappers (e.g., Email, Money) | - -## Analysis Process - -1. **Identify scope** -- determine which files/modules to analyze -2. **Measure complexity** -- count lines, nesting depth, parameter counts -3. **Detect patterns** -- scan for code smells listed above -4. **Assess SOLID** -- evaluate each principle against the codebase -5. **Prioritize findings** -- rank by impact and effort -6. **Recommend refactorings** -- specific, actionable suggestions with before/after sketches - -## Output Format - -```markdown -# Refactoring Analysis - -## Summary -- Files analyzed: N -- Code smells found: N -- SOLID violations: N - -## High Impact (significant improvement, moderate effort) -- [file:line] Smell: [name] -- [description] - - Suggested refactoring: [technique] - - Before: [brief code sketch] - - After: [brief code sketch] - -## Medium Impact (moderate improvement, low effort) -- [file:line] Smell: [name] -- [description] - - Suggested refactoring: [technique] - -## Low Impact (minor improvement) -- [file:line] Smell: [name] -- [description] - -## SOLID Assessment -- Single Responsibility: [PASS/WARN] -- [notes] -- Open/Closed: [PASS/WARN] -- [notes] -- Liskov Substitution: [PASS/WARN] -- [notes] -- Interface Segregation: [PASS/WARN] -- [notes] -- Dependency Inversion: [PASS/WARN] -- [notes] -``` - -**Key Rules:** -- Read-only analysis -- never modify files -- Be specific -- always cite file:line -- Prioritize actionable findings over theoretical concerns -- Include concrete before/after sketches for high-impact items -- Do NOT flag issues that ruff/pyright already catch -- A clean analysis with zero findings is a valid result diff --git a/.claude/agents/security-auditor.md b/.claude/agents/security-auditor.md deleted file mode 100644 index 5d9622a..0000000 --- 
a/.claude/agents/security-auditor.md +++ /dev/null @@ -1,126 +0,0 @@ ---- -name: security-auditor -description: Use this agent for security vulnerability detection and OWASP Top 10 compliance checks.\n\nPerforms read-only security analysis adapted for Python projects. Distinct from the /security-audit command -- this agent is delegated to by other agents, while the command is user-invoked.\n\n**Examples:**\n\n\nContext: Another agent needs a security review of specific code.\n\nuser: "Check this module for security issues"\n\nassistant: "I'll delegate to the security-auditor agent for a focused security review."\n\n\n\n\n\nContext: Pre-merge security validation.\n\nuser: "Run a security check on the authentication module"\n\nassistant: "Let me use the security-auditor agent to analyze the auth code for vulnerabilities."\n\n\n -model: sonnet -tools: Read, Glob, Grep -permissionMode: plan -color: red ---- - -You are a Security Auditor for Python projects. You perform read-only security analysis focused on vulnerability detection and secure coding practices. - -**Scope:** Security analysis only. Report findings without implementing fixes. 
- -## OWASP Top 10 Adapted for Python - -### A01: Broken Access Control -- Authorization checks on all API endpoints -- No IDOR (Insecure Direct Object References) -- Path traversal prevention (`../` in file paths) -- Proper CORS configuration - -### A02: Cryptographic Failures -- No MD5/SHA1 for password hashing (use bcrypt/argon2) -- Secrets not hardcoded in source -- TLS for data in transit -- Proper key management (environment variables, not code) - -### A03: Injection -- SQL injection: parameterized queries, no string concatenation -- Command injection: no `subprocess(shell=True)`, no `os.system()` -- Code injection: no `eval()`, `exec()`, `__import__()` with user input -- Deserialization: no `pickle.loads()` on untrusted data -- YAML: `yaml.safe_load()` not `yaml.load()` - -### A04: Insecure Design -- Input validation at system boundaries -- Principle of least privilege -- Defense in depth (multiple security layers) - -### A05: Security Misconfiguration -- Debug mode disabled in production -- Error messages don't expose internals -- Default credentials changed -- Unnecessary features disabled - -### A06: Vulnerable Components -- Dependencies checked for known CVEs -- Packages pinned to specific versions -- Only necessary packages included - -### A07: Authentication Failures -- Rate limiting on auth endpoints -- Session management secure -- Password requirements enforced - -### A08: Data Integrity Failures -- Input validation present -- Deserialization safety verified -- CI/CD pipeline secured - -### A09: Logging Failures -- Security events logged -- No sensitive data in logs -- Log injection prevention - -### A10: SSRF -- URL validation on user-provided URLs -- Allowlist for permitted destinations -- Private IP ranges blocked - -## Audit Process - -1. 
**Scan for dangerous patterns** using Grep: - - `eval(`, `exec(`, `pickle.loads(`, `os.system(` - - `subprocess.*shell=True` - - `yaml.load(` without SafeLoader - - SQL string formatting: `f"SELECT`, `"SELECT.*%s"` - - `assert` used for input validation - -2. **Check for leaked secrets** using Grep: - - API key patterns: `sk-`, `AKIA`, `ghp_`, `gho_` - - Private keys: `-----BEGIN.*PRIVATE KEY-----` - - Database URLs with credentials - - Hardcoded passwords - -3. **Review configuration files**: - - `.claude/settings.json` for overly permissive patterns - - `.gitignore` for secret file exclusions - - CI/CD workflows for secret handling - -4. **Assess input boundaries**: - - API endpoint parameter validation - - File upload restrictions - - URL input sanitization - -## Output Format - -```markdown -# Security Audit Report - -## Summary -- Files scanned: N -- Critical issues: N -- High priority: N -- Medium priority: N -- Grade: [A-F] - -## Critical Issues (fix immediately) -- [file:line] Description and remediation - -## High Priority (fix before release) -- [file:line] Description and remediation - -## Medium Priority (plan to fix) -- [file:line] Description and remediation - -## Compliant Areas -- [areas that passed checks] -``` - -**Key Rules:** -- Read-only analysis -- never modify files -- Be specific -- always cite file:line -- Only report high-confidence findings -- Prioritize exploitable vulnerabilities over theoretical concerns -- If no issues found, say so clearly diff --git a/.claude/commands/cove-isolated.md b/.claude/commands/cove-isolated.md deleted file mode 100644 index 9502c55..0000000 --- a/.claude/commands/cove-isolated.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -allowed-tools: Read, Glob, Grep, Bash, Agent -description: Isolated Chain-of-Verification (CoVe) -- verification step runs in a separate agent to prevent confirmation bias. 
---- - - - -# Isolated Chain-of-Verification (CoVe) - -Apply the 4-step CoVe process with **isolated verification** -- Step 3 runs in a separate agent that cannot see the baseline response, preventing confirmation bias. - -## Step 1: Generate Baseline Response - -Answer the user's question fully, as you normally would. Write out your complete response under a heading: - -```markdown -## Baseline Response -[your full answer here] -``` - -## Step 2: Plan Verification Questions - -Review your baseline response and generate a numbered list of fact-check questions. Focus on claims that could be wrong -- file paths, function signatures, API behavior, version numbers, configuration syntax, behavioral assertions. - -```markdown -## Verification Questions -1. [Is the file path X correct?] -2. [Does function Y actually accept parameter Z?] -3. [Is it true that library A supports feature B?] -... -``` - -Generate 3-8 questions depending on response complexity. - -## Step 3: Isolated Verification (Agent) - -Launch a general-purpose Agent to answer the verification questions **independently**. The agent must NOT see your baseline response -- only the verification questions. This prevents confirmation bias. - -Provide the agent with: -- The numbered list of verification questions from Step 2 -- Instructions to use Read, Grep, Glob, and Bash to find evidence -- Instructions to answer each question with CONFIRMED or INCORRECT plus evidence - -Example agent prompt: -``` -Answer each of these fact-check questions by investigating the codebase. For each question, respond with CONFIRMED or INCORRECT and cite your evidence. - -Questions: -1. [question 1] -2. [question 2] -... -``` - -## Step 4: Generate Final Verified Response - -Review the agent's verification results and revise your baseline response, incorporating all corrections. If no errors were found, state that the baseline was verified and present it as final. 
- -```markdown -## Verified Response -[corrected answer, incorporating all verification results] -``` - -If any corrections were made, add a brief summary: - -```markdown -## Corrections Made -- [what changed and why] -``` diff --git a/.claude/commands/cove.md b/.claude/commands/cove.md deleted file mode 100644 index 41c16be..0000000 --- a/.claude/commands/cove.md +++ /dev/null @@ -1,60 +0,0 @@ ---- -allowed-tools: Read, Glob, Grep, Bash -description: Chain-of-Verification (CoVe) prompting for high-stakes accuracy. Generates a response, self-verifies with fact-check questions, then revises. ---- - - - -# Chain-of-Verification (CoVe) - -Apply the 4-step CoVe process to reduce hallucinations and factual errors in your response to the user's question. - -## Step 1: Generate Baseline Response - -Answer the user's question fully, as you normally would. Write out your complete response under a heading: - -```markdown -## Baseline Response -[your full answer here] -``` - -## Step 2: Plan Verification Questions - -Review your baseline response and generate a numbered list of fact-check questions. Focus on claims that could be wrong -- file paths, function signatures, API behavior, version numbers, configuration syntax, behavioral assertions. - -```markdown -## Verification Questions -1. [Is the file path X correct?] -2. [Does function Y actually accept parameter Z?] -3. [Is it true that library A supports feature B?] -... -``` - -Generate 3-8 questions depending on response complexity. - -## Step 3: Answer Verifications - -Answer each verification question independently. Use tools (Read, Grep, Glob, Bash) to check facts against the actual codebase, documentation, or runtime behavior. Do not rely on your baseline response -- verify from source. - -```markdown -## Verification Results -1. [CONFIRMED / INCORRECT] -- [evidence] -2. [CONFIRMED / INCORRECT] -- [evidence] -... 
-``` - -## Step 4: Generate Final Verified Response - -Revise your baseline response, incorporating all corrections from Step 3. If no errors were found, state that the baseline was verified and present it as final. - -```markdown -## Verified Response -[corrected answer, incorporating all verification results] -``` - -If any corrections were made, add a brief summary: - -```markdown -## Corrections Made -- [what changed and why] -``` diff --git a/.claude/commands/security-audit.md b/.claude/commands/security-audit.md deleted file mode 100644 index ba634d6..0000000 --- a/.claude/commands/security-audit.md +++ /dev/null @@ -1,93 +0,0 @@ ---- -allowed-tools: Read, Glob, Grep, Bash -description: Run a 6-phase Python-specific security posture scan with A-F grading. ---- - -# Security Audit - -Perform a comprehensive security audit of this Python project. - -## Phase 1: Dependency Security - -- Run `uv run pip-audit` if available, otherwise check `pyproject.toml` for known-vulnerable packages -- Check for pinned vs unpinned dependencies -- Flag any dependencies with known CVEs - -## Phase 2: Secrets Scan - -Search the entire codebase for leaked secrets: -- API keys: patterns like `sk-`, `AKIA`, `ghp_`, `gho_` -- Private keys: `-----BEGIN.*PRIVATE KEY-----` -- Database URLs with credentials: `(postgres|mysql|mongodb)://.*:.*@` -- Hardcoded passwords: `password\s*=\s*['"][^'"]+` -- `.env` files that should be gitignored -- Check `.gitignore` includes `.env`, `.env.*` - -## Phase 3: Code Pattern Analysis - -Scan Python source files for unsafe patterns: -- `eval()` and `exec()` usage -- `pickle.loads()` (arbitrary code execution) -- `subprocess(shell=True)` (command injection) -- `yaml.load()` without `SafeLoader` -- `os.system()` calls -- SQL string concatenation (injection risk) -- `assert` used for validation (stripped in `-O` mode) -- `__import__()` with user input - -## Phase 4: Input Validation - -Check API boundaries and user input handling: -- Are function 
parameters validated at entry points? -- Are file paths sanitized against traversal (`../`)? -- Are URL inputs validated? -- Is there proper error handling at system boundaries? - -## Phase 5: Configuration Security - -Review project configuration: -- `.claude/settings.json` -- check for overly permissive patterns -- GitHub Actions workflows -- check for secret handling -- Docker configuration (if present) -- check for root user, exposed ports -- CORS configuration (if present) - -## Phase 6: Score and Report - -Calculate a security posture score (0-100) and assign a grade: - -| Grade | Score | Meaning | -|-------|-------|---------| -| A | 90-100 | Excellent -- no critical issues, minor improvements only | -| B | 80-89 | Good -- no critical issues, some medium-priority findings | -| C | 70-79 | Adequate -- no critical issues but notable gaps | -| D | 60-69 | Needs work -- critical issues present | -| F | 0-59 | Failing -- multiple critical issues, immediate action needed | - -**Scoring deductions:** -- Critical (leaked secret, eval with user input): -20 each -- High (unpinned deps, shell=True): -10 each -- Medium (missing input validation): -5 each -- Low (style issues, missing .gitignore entries): -2 each - -Present the full report: - -```markdown -# Security Audit Report - -## Score: [X]/100 (Grade: [A-F]) - -## Critical Issues -- [issues requiring immediate fix] - -## High Priority -- [issues to fix soon] - -## Medium Priority -- [issues to plan for] - -## Low Priority -- [nice-to-have improvements] - -## Remediation Plan -1. [ordered steps to fix issues] -``` diff --git a/.claude/hooks/dangerous-actions-blocker.sh b/.claude/hooks/dangerous-actions-blocker.sh index 79497b7..bf34709 100755 --- a/.claude/hooks/dangerous-actions-blocker.sh +++ b/.claude/hooks/dangerous-actions-blocker.sh @@ -1,7 +1,9 @@ #!/bin/bash -# PreToolUse hook: Blocks dangerous shell commands before execution. -# Exit 2 = block the action, Exit 0 = allow. 
-# Requires jq for JSON parsing; degrades gracefully if missing. +# Exfiltration guard for autonomous mode. +# Primary defense: iptables firewall (network whitelist). +# This hook catches exfiltration via TRUSTED channels (GitHub API, package registries) +# and secrets leaked as command arguments. +# Local destruction (rm -rf, sudo, etc.) is not blocked -- devcontainer is disposable. if ! command -v jq &>/dev/null; then echo "WARNING: jq not found, dangerous-actions-blocker hook disabled" >&2 @@ -21,82 +23,60 @@ if [ -z "$COMMAND" ]; then exit 0 fi -# Literal patterns that should be blocked outright (matched with grep -F for safety) -BLOCKED_LITERAL_PATTERNS=( - 'rm -rf /' - 'rm -rf /*' - 'rm -rf ~' - 'rm -rf $HOME' - 'sudo rm -rf' - 'dd if=/dev/zero' - ':(){:|:&};:' - 'chmod -R 777 /' - 'chmod -R 777 /*' - '> /dev/sda' - 'git push --force' - 'git push -f ' - 'git push origin +master' - 'git push origin +main' - 'DROP DATABASE' - 'DROP TABLE' - 'TRUNCATE TABLE' - 'shutdown' - 'reboot' - 'init 0' - 'init 6' - 'halt' - 'poweroff' -) +# --- Exfiltration via trusted channels (exit 2 = hard block) --- -# Regex patterns that require extended matching -BLOCKED_REGEX_PATTERNS=( - 'mkfs\.' 
- 'DELETE FROM .* WHERE 1' +EXFIL_LITERAL_PATTERNS=( + 'gh gist create' + 'twine upload' + 'npm publish' + 'pip upload' + 'uv publish' ) -for pattern in "${BLOCKED_LITERAL_PATTERNS[@]}"; do +for pattern in "${EXFIL_LITERAL_PATTERNS[@]}"; do if echo "$COMMAND" | grep -qiF "$pattern"; then - jq -n --arg reason "Blocked by dangerous-actions-blocker hook: command matches dangerous pattern '$pattern'" \ + jq -n --arg reason "Blocked by dangerous-actions-blocker hook: exfiltration via trusted channel '$pattern'" \ '{"decision":"block","reason":$reason}' exit 2 fi done -for pattern in "${BLOCKED_REGEX_PATTERNS[@]}"; do - if echo "$COMMAND" | grep -qiE "$pattern"; then - jq -n --arg reason "Blocked by dangerous-actions-blocker hook: command matches dangerous pattern '$pattern'" \ - '{"decision":"block","reason":$reason}' - exit 2 - fi -done +# gh issue create with --body or --body-file (data exfil via issue body) +if echo "$COMMAND" | grep -qiF "gh issue create" && echo "$COMMAND" | grep -qiE '\-\-body(-file)?'; then + jq -n '{"decision":"block","reason":"Blocked by dangerous-actions-blocker hook: exfiltration via gh issue create --body/--body-file"}' + exit 2 +fi + +# --- Secrets as literal command arguments --- + +SECRET_REGEX_PATTERNS=( + 'AKIA[0-9A-Z]{16}' + 'sk-[a-zA-Z0-9_-]{20,}' + 'ghp_[a-zA-Z0-9]{36}' + 'gho_[a-zA-Z0-9]{36}' + 'github_pat_[a-zA-Z0-9_]{22,}' + 'Bearer [a-zA-Z0-9_./-]+' + 'token=[a-zA-Z0-9_./-]+' +) -# Block commands containing secrets/tokens passed as arguments -# Literal key names use -F; patterns with wildcards use -E SECRET_LITERAL_PATTERNS=( 'ANTHROPIC_API_KEY=' 'OPENAI_API_KEY=' 'AWS_SECRET_ACCESS_KEY=' 'GITHUB_TOKEN=' 'GH_TOKEN=' - 'password=' - 'passwd=' - 'secret=' - 'token=' -) - -SECRET_REGEX_PATTERNS=( - 'DATABASE_URL=.*://.*:.*@' + 'DATABASE_URL=' ) for pattern in "${SECRET_LITERAL_PATTERNS[@]}"; do - if echo "$COMMAND" | grep -qiF "$pattern"; then + if echo "$COMMAND" | grep -qF "$pattern"; then jq -n 
'{"decision":"block","reason":"Blocked by dangerous-actions-blocker hook: command appears to contain secrets or credentials. Use environment variables instead."}' exit 2 fi done for pattern in "${SECRET_REGEX_PATTERNS[@]}"; do - if echo "$COMMAND" | grep -qiE "$pattern"; then + if echo "$COMMAND" | grep -qE "$pattern"; then jq -n '{"decision":"block","reason":"Blocked by dangerous-actions-blocker hook: command appears to contain secrets or credentials. Use environment variables instead."}' exit 2 fi diff --git a/.claude/hooks/devcontainer-policy-blocker.sh b/.claude/hooks/devcontainer-policy-blocker.sh deleted file mode 100755 index 30add04..0000000 --- a/.claude/hooks/devcontainer-policy-blocker.sh +++ /dev/null @@ -1,131 +0,0 @@ -#!/bin/bash -# PreToolUse hook: Blocks devcontainer-denied commands via grep on full command string. -# Catches patterns inside chained commands (&&, ;, |) that glob-based deny rules miss. -# Uses $PERMISSION_TIER env var: 1/2 = all categories blocked, 3 = omit GH/infra/tool-install blocks. -# Fail-closed: if PERMISSION_TIER is unset, all categories are blocked (strictest). -# Exit 2 = block the action, Exit 0 = allow. -# Requires jq for JSON parsing; fails closed if missing (blocks all bash). - -if ! command -v jq &>/dev/null; then - printf '%s\n' '{"decision":"block","reason":"devcontainer-policy-blocker requires jq for policy enforcement. 
Install jq in the Dockerfile."}' - exit 2 -fi - -INPUT=$(cat) -TOOL_NAME=$(echo "$INPUT" | jq -r '.tool_name // empty') - -if [ "$TOOL_NAME" != "Bash" ]; then - exit 0 -fi - -COMMAND=$(echo "$INPUT" | jq -r '.tool_input.command // empty') - -if [ -z "$COMMAND" ]; then - exit 0 -fi - -TIER="${PERMISSION_TIER:-0}" - -block() { - jq -n --arg reason "$1" '{"decision":"block","reason":$reason}' - exit 2 -} - -# --- Tool installation (tiers 0/1/2 only, allowed in tier 3) --- -if [ "$TIER" != "3" ]; then - BLOCKED_TOOL_INSTALL=( - 'pip install' - 'pip3 install' - 'pipx install' - 'python -m pip install' - 'npm install -g' - 'npm i -g' - 'cargo install' - 'go install' - 'gem install' - 'uv tool install' - 'apt install' - 'apt-get install' - 'dpkg -i' - 'snap install' - 'brew install' - ) - - for pattern in "${BLOCKED_TOOL_INSTALL[@]}"; do - if echo "$COMMAND" | grep -qiF "$pattern"; then - block "Blocked by devcontainer-policy-blocker: '$pattern' is not allowed in tier $TIER. Use 'uv add ' for project dependencies or add to .devcontainer/Dockerfile for tools." - fi - done -fi - -# --- Package publishing (all tiers) --- -BLOCKED_PUBLISH=( - 'npm publish' - 'npx npm publish' - 'uv publish' - 'twine upload' -) - -for pattern in "${BLOCKED_PUBLISH[@]}"; do - if echo "$COMMAND" | grep -qiF "$pattern"; then - block "Blocked by devcontainer-policy-blocker: '$pattern' is not allowed. Ask the user to publish manually." - fi -done - -# --- Supply-chain vectors (all tiers) --- -BLOCKED_SUPPLY_CHAIN=( - '| bash' - '|bash' - '| sh' - '|sh' - '| zsh' - '|zsh' -) - -for pattern in "${BLOCKED_SUPPLY_CHAIN[@]}"; do - if echo "$COMMAND" | grep -qiF "$pattern"; then - block "Blocked by devcontainer-policy-blocker: piping to a shell ('$pattern') is not allowed. Add tools to .devcontainer/Dockerfile instead." 
- fi -done - -# --- Docker escape vectors (all tiers) --- -BLOCKED_DOCKER_ESCAPE=( - 'docker run --privileged' - 'docker run --cap-add=ALL' - 'docker run --pid=host' - 'docker run --network=host' -) - -for pattern in "${BLOCKED_DOCKER_ESCAPE[@]}"; do - if echo "$COMMAND" | grep -qiF "$pattern"; then - block "Blocked by devcontainer-policy-blocker: '$pattern' is not allowed. Use 'docker run' without escape flags." - fi -done - -# --- GitHub shared state mutations (tiers 0/1/2 only, allowed in tier 3) --- -if [ "$TIER" != "3" ]; then - BLOCKED_GH_MUTATIONS=( - 'gh pr merge' - 'gh workflow run' - 'gh workflow enable' - 'gh workflow disable' - 'gh issue create' - 'gh issue close' - 'gh issue edit' - ) - - for pattern in "${BLOCKED_GH_MUTATIONS[@]}"; do - if echo "$COMMAND" | grep -qiF "$pattern"; then - block "Blocked by devcontainer-policy-blocker: '$pattern' is not allowed in tier $TIER. Ask the user to perform this action." - fi - done -fi - -# --- Infrastructure (tiers 0/1/2 only, allowed in tier 3) --- -if [ "$TIER" != "3" ]; then - if echo "$COMMAND" | grep -qiF 'terraform '; then - block "Blocked by devcontainer-policy-blocker: 'terraform' is not allowed in tier $TIER. Ask the user to run terraform." - fi -fi - -exit 0 diff --git a/.claude/hooks/output-secrets-scanner.sh b/.claude/hooks/output-secrets-scanner.sh deleted file mode 100755 index 14adb7e..0000000 --- a/.claude/hooks/output-secrets-scanner.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash -# PostToolUse hook: Scans command output for leaked secrets. -# Always exits 0 (PostToolUse cannot block). Emits systemMessage warnings. -# Requires jq for JSON parsing; degrades gracefully if missing. - -if ! 
command -v jq &>/dev/null; then - echo "WARNING: jq not found, output-secrets-scanner hook disabled" >&2 - exit 0 -fi - -INPUT=$(cat) -TOOL_NAME=$(echo "$INPUT" | jq -r '.tool_name // empty') - -if [ "$TOOL_NAME" != "Bash" ]; then - exit 0 -fi - -STDOUT=$(echo "$INPUT" | jq -r '.tool_output.stdout // empty') -STDERR=$(echo "$INPUT" | jq -r '.tool_output.stderr // empty') -OUTPUT="${STDOUT}${STDERR}" - -if [ -z "$OUTPUT" ]; then - exit 0 -fi - -WARNINGS="" - -# AWS Access Key ID (starts with AKIA) -if echo "$OUTPUT" | grep -qE 'AKIA[0-9A-Z]{16}'; then - WARNINGS="${WARNINGS}[!] Possible AWS Access Key ID detected in output.\n" -fi - -# AWS Secret Access Key (40-char base64) -if echo "$OUTPUT" | grep -qE '[^A-Za-z0-9/+=][A-Za-z0-9/+=]{40}[^A-Za-z0-9/+=]'; then - if echo "$OUTPUT" | grep -qiE 'aws_secret|secret.?access'; then - WARNINGS="${WARNINGS}[!] Possible AWS Secret Access Key detected in output.\n" - fi -fi - -# Anthropic API Key -if echo "$OUTPUT" | grep -qE 'sk-ant-[a-zA-Z0-9_-]{20,}'; then - WARNINGS="${WARNINGS}[!] Possible Anthropic API key detected in output.\n" -fi - -# OpenAI API Key -if echo "$OUTPUT" | grep -qE 'sk-[a-zA-Z0-9]{20,}'; then - WARNINGS="${WARNINGS}[!] Possible OpenAI API key detected in output.\n" -fi - -# GitHub Personal Access Token -if echo "$OUTPUT" | grep -qE 'ghp_[a-zA-Z0-9]{36}'; then - WARNINGS="${WARNINGS}[!] Possible GitHub personal access token detected in output.\n" -fi - -# GitHub OAuth Token -if echo "$OUTPUT" | grep -qE 'gho_[a-zA-Z0-9]{36}'; then - WARNINGS="${WARNINGS}[!] Possible GitHub OAuth token detected in output.\n" -fi - -# JWT Token -if echo "$OUTPUT" | grep -qE 'eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*\.[a-zA-Z0-9_-]*'; then - WARNINGS="${WARNINGS}[!] Possible JWT token detected in output.\n" -fi - -# Private Key markers -if echo "$OUTPUT" | grep -qE -- '-----BEGIN (RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----'; then - WARNINGS="${WARNINGS}[!] 
Private key material detected in output.\n" -fi - -# Database connection strings with credentials -if echo "$OUTPUT" | grep -qiE '(postgres|mysql|mongodb|redis)://[^:]+:[^@]+@'; then - WARNINGS="${WARNINGS}[!] Possible database connection string with credentials detected in output.\n" -fi - -if [ -n "$WARNINGS" ]; then - MSG=$(echo -e "SECURITY WARNING: ${WARNINGS}Avoid sharing or committing this output. Rotate any exposed credentials immediately.") - jq -n --arg msg "$MSG" '{systemMessage: $msg}' -fi - -exit 0 diff --git a/.claude/hooks/test-on-change.sh b/.claude/hooks/test-on-change.sh deleted file mode 100755 index 704928b..0000000 --- a/.claude/hooks/test-on-change.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash -# PostToolUse hook: Discovers and runs associated test files after code changes. -# Uses monorepo conventions: libs/pkg/tests/test_X.py, apps/pkg/tests/test_X.py, tests/test_X.py. -# Informational only -- returns systemMessage with pass/fail, never blocks. -# Requires jq for JSON parsing; degrades gracefully if missing. - -if ! 
command -v jq &>/dev/null; then - echo "WARNING: jq not found, test-on-change hook disabled" >&2 - exit 0 -fi - -INPUT=$(cat) -TOOL_NAME=$(echo "$INPUT" | jq -r '.tool_name // empty') - -if [ "$TOOL_NAME" != "Edit" ] && [ "$TOOL_NAME" != "Write" ]; then - exit 0 -fi - -FILE_PATH=$(echo "$INPUT" | jq -r '.tool_input.file_path // empty') - -if [ -z "$FILE_PATH" ] || [ "$FILE_PATH" = "null" ]; then - exit 0 -fi - -# Only run for Python source files (skip test files themselves) -if [[ "$FILE_PATH" != *.py ]]; then - exit 0 -fi - -BASENAME=$(basename "$FILE_PATH" .py) - -# Skip if already a test file -if [[ "$BASENAME" == test_* ]]; then - exit 0 -fi - -# Skip __init__.py and conftest.py -if [ "$BASENAME" = "__init__" ] || [ "$BASENAME" = "conftest" ]; then - exit 0 -fi - -DIRNAME=$(dirname "$FILE_PATH") -TEST_FILE="" - -# Strategy 1: Adjacent tests/ directory (monorepo pattern) -# If file is in libs/pkg/namespace/pkg/module.py, look in libs/pkg/tests/test_module.py -# Walk up to find a tests/ directory at the package level -CHECK_DIR="$DIRNAME" -while [ "$CHECK_DIR" != "/" ] && [ "$CHECK_DIR" != "." ]; do - if [ -d "$CHECK_DIR/tests" ] && [ -f "$CHECK_DIR/tests/test_${BASENAME}.py" ]; then - TEST_FILE="$CHECK_DIR/tests/test_${BASENAME}.py" - break - fi - # Also check sibling tests/ directory - PARENT=$(dirname "$CHECK_DIR") - if [ -d "$PARENT/tests" ] && [ -f "$PARENT/tests/test_${BASENAME}.py" ]; then - TEST_FILE="$PARENT/tests/test_${BASENAME}.py" - break - fi - CHECK_DIR="$PARENT" -done - -# Strategy 2: Root tests/ directory -if [ -z "$TEST_FILE" ]; then - PROJECT_DIR="${CLAUDE_PROJECT_DIR:-$(pwd)}" - if [ -f "$PROJECT_DIR/tests/test_${BASENAME}.py" ]; then - TEST_FILE="$PROJECT_DIR/tests/test_${BASENAME}.py" - fi -fi - -# If no test file found, exit silently -if [ -z "$TEST_FILE" ]; then - exit 0 -fi - -# Run the test -if command -v uv &>/dev/null; then - TEST_OUTPUT=$(uv run pytest "$TEST_FILE" -x -q 2>&1) - EXIT_CODE=$? 
- - if [ $EXIT_CODE -ne 0 ]; then - TAIL=$(echo "$TEST_OUTPUT" | tail -20) - jq -n --arg msg "Tests failed in $TEST_FILE: $TAIL" '{systemMessage: $msg}' - fi -fi - -exit 0 diff --git a/.claude/hooks/unicode-injection-scanner.sh b/.claude/hooks/unicode-injection-scanner.sh deleted file mode 100755 index 048a4bc..0000000 --- a/.claude/hooks/unicode-injection-scanner.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# PreToolUse hook: Blocks Unicode injection attacks in file edits. -# Detects zero-width characters, RTL overrides, ANSI escapes, null bytes, -# and Unicode tag characters that can hide malicious code. -# References: CVE-2025-53109, CVE-2025-53110 -# Exit 2 = block the action, Exit 0 = allow. -# Requires jq for JSON parsing; degrades gracefully if missing. - -if ! command -v jq &>/dev/null; then - echo "WARNING: jq not found, unicode-injection-scanner hook disabled" >&2 - exit 0 -fi - -INPUT=$(cat) -TOOL_NAME=$(echo "$INPUT" | jq -r '.tool_name // empty') - -if [ "$TOOL_NAME" != "Edit" ] && [ "$TOOL_NAME" != "Write" ]; then - exit 0 -fi - -# Extract the content being written/edited -if [ "$TOOL_NAME" = "Write" ]; then - CONTENT=$(echo "$INPUT" | jq -r '.tool_input.content // empty') -elif [ "$TOOL_NAME" = "Edit" ]; then - CONTENT=$(echo "$INPUT" | jq -r '.tool_input.new_string // empty') -fi - -if [ -z "$CONTENT" ]; then - exit 0 -fi - -# Check for PCRE support (grep -P requires GNU grep with PCRE; not available on macOS/Alpine by default) -if ! echo "" | grep -qP '' 2>/dev/null; then - echo "WARNING: grep -P (PCRE) not supported on this system, unicode-injection-scanner hook disabled" >&2 - exit 0 -fi - -# Zero-width characters (invisible text manipulation) -# U+200B Zero Width Space, U+200C ZWNJ, U+200D ZWJ, U+FEFF BOM -if echo "$CONTENT" | grep -qP '[\x{200B}\x{200C}\x{200D}\x{FEFF}]' 2>/dev/null; then - echo '{"decision":"block","reason":"Blocked by unicode-injection-scanner: zero-width Unicode characters detected. These can hide malicious code. 
See CVE-2025-53109."}' - exit 2 -fi - -# RTL/LTR override characters (can reorder displayed code) -# U+202A-U+202E, U+2066-U+2069 -if echo "$CONTENT" | grep -qP '[\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}\x{2066}\x{2067}\x{2068}\x{2069}]' 2>/dev/null; then - echo '{"decision":"block","reason":"Blocked by unicode-injection-scanner: bidirectional text override characters detected. These can make code appear different than it executes. See CVE-2025-53110."}' - exit 2 -fi - -# ANSI escape sequences (terminal injection) -if echo "$CONTENT" | grep -qP '\x1B\[' 2>/dev/null; then - echo '{"decision":"block","reason":"Blocked by unicode-injection-scanner: ANSI escape sequences detected. These can manipulate terminal display."}' - exit 2 -fi - -# Null bytes (can truncate strings in C-based parsers) -if echo "$CONTENT" | grep -qP '\x00' 2>/dev/null; then - echo '{"decision":"block","reason":"Blocked by unicode-injection-scanner: null bytes detected in content."}' - exit 2 -fi - -# Unicode tag characters (U+E0001-U+E007F, used for invisible tagging) -if echo "$CONTENT" | grep -qP '[\x{E0001}-\x{E007F}]' 2>/dev/null; then - echo '{"decision":"block","reason":"Blocked by unicode-injection-scanner: Unicode tag characters detected. 
These can embed invisible instructions."}' - exit 2 -fi - -exit 0 diff --git a/.claude/settings.json b/.claude/settings.json index a1b8cbc..cb76b83 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -55,22 +55,13 @@ { "matcher": "Bash", "hooks": [{"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/dangerous-actions-blocker.sh"}] - }, - { - "matcher": "Edit|Write", - "hooks": [{"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/unicode-injection-scanner.sh"}] } ], "PostToolUse": [ - { - "matcher": "Bash", - "hooks": [{"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/output-secrets-scanner.sh"}] - }, { "matcher": "Edit|Write", "hooks": [ - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/auto-format.sh"}, - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/test-on-change.sh"} + {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/auto-format.sh"} ] } ] diff --git a/.claude/skills/edit-permissions/SKILL.md b/.claude/skills/edit-permissions/SKILL.md deleted file mode 100644 index a0680a2..0000000 --- a/.claude/skills/edit-permissions/SKILL.md +++ /dev/null @@ -1,137 +0,0 @@ ---- -name: edit-permissions -description: Manage Claude Code permission rules in .claude/settings.json. Use when adding, removing, or reviewing Bash/Read/Edit/WebFetch permission patterns. -argument-hint: "[add|remove|list|migrate] [pattern]" -allowed-tools: Read, Edit, Grep, Glob ---- - -# Edit Permissions - -Manage Claude Code permission rules in `.claude/settings.json`. This skill helps you safely add, remove, list, and migrate permission patterns. 
- -## Quick Reference -- Permission Rule Syntax - -| Tool | Pattern Format | Example | -|------|---------------|---------| -| Bash | `Bash(command *)` | `Bash(npm run *)` | -| Read | `Read(path)` | `Read(/project/src/**)` | -| Edit | `Edit(path)` | `Edit(/project/src/**)` | -| WebFetch | `WebFetch(domain:...)` | `WebFetch(domain:docs.python.org)` | - -## Shell Operator Protection - -`Bash(cmd *)` does **NOT** match chained commands: - -| Pattern | Matches | Does NOT Match | -|---------|---------|----------------| -| `Bash(ls *)` | `ls /foo` | `cd /foo && ls` | -| `Bash(pytest *)` | `pytest tests/` | `cd tests && pytest` | -| `Bash(grep *)` | `grep pattern /file` | `cat file \| grep pattern` | - -Chained operators (`&&`, `\|`, `;`, `\|\|`) cause the **entire command** to be evaluated as one unit. Individual patterns will not match partial segments. - -**Solution:** Use absolute paths instead of `cd && command` chains. - -## Pattern Syntax - -- **Wildcards:** `*` matches any arguments. `Bash(ls *)` matches `ls`, `ls -la`, `ls /foo/bar`. -- **Word boundaries:** `Bash(ls *)` matches `ls` but NOT `lsof`. The space before `*` acts as a word boundary. -- **Exact match:** `Bash(ls)` matches only `ls` with no arguments. -- **Deprecated syntax:** `Bash(cmd:*)` is equivalent to `Bash(cmd *)` but is **deprecated**. Always use the space form. - -## Evaluation Order - -Permission rules are evaluated in this order: - -1. **deny** -- checked first. If any deny rule matches, the action is blocked. -2. **ask** -- checked second. If any ask rule matches, the user is prompted. -3. **allow** -- checked last. If any allow rule matches, the action proceeds silently. - -First match within each category wins. 
- -## Settings File Locations - -| Scope | Path | Purpose | -|-------|------|---------| -| Project | `.claude/settings.json` | Shared with team via git | -| User | `~/.claude/settings.json` | Personal preferences | -| Managed | System-dependent managed settings path | Enterprise/admin policies | - -Project settings are checked first, then user settings. Managed settings override both. - -## Common Patterns - -### Git operations -```json -"Bash(git add *)", "Bash(git commit *)", "Bash(git push *)", -"Bash(git fetch *)", "Bash(git pull *)", "Bash(git rebase *)" -``` - -### Testing -```json -"Bash(pytest *)", "Bash(uv run pytest *)", "Bash(python -m pytest *)" -``` - -### Package management (uv) -```json -"Bash(uv sync *)", "Bash(uv add *)", "Bash(uv pip *)", "Bash(uv venv *)" -``` - -### Package management (npm) -```json -"Bash(npm install *)", "Bash(npm run *)", "Bash(npm test *)" -``` - -### Read-only utilities -```json -"Bash(ls *)", "Bash(cat *)", "Bash(find *)", "Bash(grep *)", -"Bash(head *)", "Bash(tail *)", "Bash(wc *)", "Bash(tree *)" -``` - -### GitHub CLI -```json -"Bash(gh pr create *)", "Bash(gh pr view *)", "Bash(gh pr list *)", -"Bash(gh pr checks *)", "Bash(gh api *)" -``` - -## Safety Rules - -1. **Never add** `Bash` or `Bash(*)` to the allow list -- this permits arbitrary command execution. -2. **Prefer narrow patterns** over broad ones. `Bash(npm run build)` is safer than `Bash(npm *)`. -3. **Use `deny`** for destructive commands (e.g., `Bash(rm -rf *)`, `Bash(gh secret *)`). -4. **Use `ask`** for commands that are sometimes needed but carry risk (e.g., `Bash(docker *)`, `Bash(gh pr merge *)`). -5. **Keep allow list minimal** -- only add patterns for commands Claude needs to run frequently without prompting. - -## Workflow - -When the user asks to add, remove, or modify permission patterns: - -1. **Read** `.claude/settings.json` first to see current rules. -2. **Show** the user the proposed change before editing (display the before/after diff). -3. 
**Edit** the file using the Edit tool. -4. **Validate** the JSON is still valid after editing (check for trailing commas, missing brackets). -5. **Warn** about any deprecated `:*` patterns found and offer to migrate them to ` *`. - -### Adding a pattern - -When adding a new Bash pattern: -- Determine whether it belongs in `allow`, `ask`, or `deny` -- Check if a broader or narrower pattern already covers it -- Place it in the appropriate section, grouped with similar commands - -### Removing a pattern - -When removing a pattern: -- Confirm with the user which specific pattern to remove -- Warn if removing from `deny` (this loosens security) - -### Listing patterns - -When listing, display patterns grouped by category (allow/ask/deny) with a brief explanation of each. - -### Migrating deprecated syntax - -When migrating: -- Find all `:*` patterns across allow, ask, and deny arrays -- Replace `:*)` with ` *)` in each entry -- Show the count of patterns migrated diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index f762dcb..bb8250c 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -46,12 +46,11 @@ ], "containerEnv": { "CLAUDE_CONFIG_DIR": "/home/vscode/.claude", - "POWERLEVEL9K_DISABLE_GITSTATUS": "true", - "PERMISSION_TIER": "${localEnv:PERMISSION_TIER:2}" + "POWERLEVEL9K_DISABLE_GITSTATUS": "true" }, "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=delegated", "workspaceFolder": "/workspace", - "onCreateCommand": "bash -c 'case \"${PERMISSION_TIER:-2}\" in 1) f=tier1-assisted.json;; 2) f=tier2-autonomous.json;; 3) f=tier3-full-trust.json;; *) echo \"Invalid PERMISSION_TIER: ${PERMISSION_TIER}\" >&2; exit 1;; esac; cp \".devcontainer/permissions/$f\" .claude/settings.local.json && if ! grep -q \"{{project_name}}\" pyproject.toml 2>/dev/null; then uv sync --all-packages --group dev; fi'", + "onCreateCommand": "bash -c 'if ! 
grep -q \"{{project_name}}\" pyproject.toml 2>/dev/null; then uv sync --all-packages --group dev; fi'", "postStartCommand": "sudo /usr/local/bin/init-firewall.sh", "waitFor": "postStartCommand" } diff --git a/.devcontainer/permissions/tier1-assisted.json b/.devcontainer/permissions/tier1-assisted.json deleted file mode 100644 index 623f9cc..0000000 --- a/.devcontainer/permissions/tier1-assisted.json +++ /dev/null @@ -1,63 +0,0 @@ -{ - "__comment": "Tier 1: Assisted -- minimal expansion. File ops and WebFetch auto-allowed, bash still asks per-command.", - "permissions": { - "allow": [ - "Edit", "Write", "Read", - "WebFetch", - "Bash(git reset *)", "Bash(git restore *)", - "Bash(git rm *)", "Bash(git mv *)", "Bash(git worktree *)" - ], - "deny": [ - "Bash(*pip install *)", "Bash(*pip3 install *)", - "Bash(*pipx install *)", "Bash(*python -m pip install *)", - "Bash(*npm install -g *)", "Bash(*npm i -g *)", - "Bash(*cargo install *)", "Bash(*go install *)", "Bash(*gem install *)", - "Bash(*uv tool install *)", "Bash(*uv tool *)", - "Bash(*apt install *)", "Bash(*apt-get install *)", "Bash(*dpkg -i *)", - "Bash(*snap install *)", "Bash(*brew install *)", - "Bash(*npm publish*)", "Bash(*npx npm publish*)", - "Bash(*uv publish *)", "Bash(*twine upload *)", - "Bash(*docker run --privileged *)", - "Bash(*docker run --cap-add=ALL *)", - "Bash(*docker run --pid=host *)", - "Bash(*docker run --network=host *)", - "Bash(*gh pr merge *)", - "Bash(*gh workflow run *)", "Bash(*gh workflow enable *)", "Bash(*gh workflow disable *)", - "Bash(*gh issue create *)", "Bash(*gh issue close *)", "Bash(*gh issue edit *)", - "Bash(*terraform *)", - "Bash(*git remote add *)", "Bash(*git remote set-url *)", "Bash(*git remote remove *)", - "Bash(*git remote rename *)", "Bash(*git remote set-head *)" - ] - }, - "enabledPlugins": { - "security-guidance@claude-code-plugins": true - }, - "hooks": { - "PreToolUse": [ - { - "matcher": "Bash", - "hooks": [ - {"type": "command", "command": 
"\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/dangerous-actions-blocker.sh"}, - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/devcontainer-policy-blocker.sh"} - ] - }, - { - "matcher": "Edit|Write", - "hooks": [{"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/unicode-injection-scanner.sh"}] - } - ], - "PostToolUse": [ - { - "matcher": "Bash", - "hooks": [{"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/output-secrets-scanner.sh"}] - }, - { - "matcher": "Edit|Write", - "hooks": [ - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/auto-format.sh"}, - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/test-on-change.sh"} - ] - } - ] - } -} diff --git a/.devcontainer/permissions/tier2-autonomous.json b/.devcontainer/permissions/tier2-autonomous.json deleted file mode 100644 index 8563990..0000000 --- a/.devcontainer/permissions/tier2-autonomous.json +++ /dev/null @@ -1,62 +0,0 @@ -{ - "__comment": "Tier 2: Autonomous (recommended) -- Bash(*) with curated deny list. 
Zero prompts for bash.", - "permissions": { - "allow": [ - "Edit", "Write", "Read", - "WebFetch", - "Bash(*)" - ], - "deny": [ - "Bash(*gh pr merge *)", - "Bash(*gh workflow run *)", "Bash(*gh workflow enable *)", "Bash(*gh workflow disable *)", - "Bash(*gh issue create *)", "Bash(*gh issue close *)", "Bash(*gh issue edit *)", - "Bash(*npm publish*)", "Bash(*npx npm publish*)", - "Bash(*uv publish *)", "Bash(*twine upload *)", - "Bash(*docker run --privileged *)", - "Bash(*docker run --cap-add=ALL *)", - "Bash(*docker run --pid=host *)", - "Bash(*docker run --network=host *)", - "Bash(*terraform *)", - "Bash(*pip install *)", "Bash(*pip3 install *)", - "Bash(*pipx install *)", "Bash(*python -m pip install *)", - "Bash(*npm install -g *)", "Bash(*npm i -g *)", - "Bash(*cargo install *)", "Bash(*go install *)", "Bash(*gem install *)", - "Bash(*uv tool install *)", "Bash(*uv tool *)", - "Bash(*apt install *)", "Bash(*apt-get install *)", "Bash(*dpkg -i *)", - "Bash(*snap install *)", "Bash(*brew install *)", - "Bash(*git remote add *)", "Bash(*git remote set-url *)", "Bash(*git remote remove *)", - "Bash(*git remote rename *)", "Bash(*git remote set-head *)" - ] - }, - "enabledPlugins": { - "security-guidance@claude-code-plugins": true - }, - "hooks": { - "PreToolUse": [ - { - "matcher": "Bash", - "hooks": [ - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/dangerous-actions-blocker.sh"}, - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/devcontainer-policy-blocker.sh"} - ] - }, - { - "matcher": "Edit|Write", - "hooks": [{"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/unicode-injection-scanner.sh"}] - } - ], - "PostToolUse": [ - { - "matcher": "Bash", - "hooks": [{"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/output-secrets-scanner.sh"}] - }, - { - "matcher": "Edit|Write", - "hooks": [ - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/auto-format.sh"}, - 
{"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/test-on-change.sh"} - ] - } - ] - } -} diff --git a/.devcontainer/permissions/tier3-full-trust.json b/.devcontainer/permissions/tier3-full-trust.json deleted file mode 100644 index fa4f76c..0000000 --- a/.devcontainer/permissions/tier3-full-trust.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "__comment": "Tier 3: Full Trust -- minimal deny list. Requires branch protection + scoped PAT. See docs/DEVCONTAINER_PERMISSIONS.md.", - "permissions": { - "allow": [ - "Edit", "Write", "Read", - "WebFetch", - "Bash(*)" - ], - "deny": [ - "Bash(*npm publish*)", "Bash(*npx npm publish*)", - "Bash(*uv publish *)", "Bash(*twine upload *)", - "Bash(*docker run --privileged *)", - "Bash(*docker run --cap-add=ALL *)", - "Bash(*docker run --pid=host *)", - "Bash(*docker run --network=host *)", - "Bash(*git remote add *)", "Bash(*git remote set-url *)", "Bash(*git remote remove *)", - "Bash(*git remote rename *)", "Bash(*git remote set-head *)" - ] - }, - "enabledPlugins": { - "security-guidance@claude-code-plugins": true - }, - "hooks": { - "PreToolUse": [ - { - "matcher": "Bash", - "hooks": [ - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/dangerous-actions-blocker.sh"}, - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/devcontainer-policy-blocker.sh"} - ] - }, - { - "matcher": "Edit|Write", - "hooks": [{"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/unicode-injection-scanner.sh"}] - } - ], - "PostToolUse": [ - { - "matcher": "Bash", - "hooks": [{"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/output-secrets-scanner.sh"}] - }, - { - "matcher": "Edit|Write", - "hooks": [ - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/auto-format.sh"}, - {"type": "command", "command": "\"$CLAUDE_PROJECT_DIR\"/.claude/hooks/test-on-change.sh"} - ] - } - ] - } -} diff --git a/CLAUDE.md b/CLAUDE.md index 195add4..ed96944 100644 
--- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,14 +6,15 @@ Use `/sync` before starting work, `/design` to formalize a plan, `/done` when fi ## Security -- **Real-time scanning**: The `security-guidance` plugin runs automatically during code editing, warning about command injection, eval/exec, pickle deserialization, XSS, and os.system() usage -- **Runtime hooks**: 3 base security hooks run automatically via `.claude/hooks/` (+ 1 devcontainer-only policy hook): - - `dangerous-actions-blocker.sh` (PreToolUse/Bash): blocks `rm -rf`, `sudo`, `DROP DATABASE`, `git push --force`, secrets in args - - `output-secrets-scanner.sh` (PostToolUse/Bash): warns if command output contains API keys, tokens, private keys, or DB URLs - - `unicode-injection-scanner.sh` (PreToolUse/Edit|Write): blocks zero-width chars, RTL overrides, ANSI escapes, null bytes - - `devcontainer-policy-blocker.sh` (PreToolUse/Bash, devcontainer only): blocks tool installation, publishing, supply-chain piping, and tier-dependent GH/infra commands +Two-layer defense against data exfiltration: + +1. **Firewall** (primary): iptables whitelist in devcontainer blocks all non-approved network domains +2. **Exfiltration guard** (hook): `dangerous-actions-blocker.sh` (PreToolUse/Bash) blocks exfiltration via trusted channels -- `gh gist create`, `gh issue create --body`, package publishing (`twine`/`npm`/`uv publish`), and secrets as literal command arguments + +Additional: +- **Real-time scanning**: The `security-guidance` plugin runs automatically during code editing, warning about command injection, eval/exec, deserialization, XSS, and unsafe system calls - **Secrets handling**: Never commit API keys, tokens, passwords, or private keys -- use environment variables or `.env` files (which are gitignored) -- **Unsafe operations**: Avoid `eval`, `exec`, `pickle.loads`, `subprocess(shell=True)`, and `yaml.load` without SafeLoader in production code. 
If required, document the justification in a code comment +- **Unsafe operations**: Avoid `eval`, `exec`, unsafe deserialization, `subprocess(shell=True)`, and `yaml.load` without SafeLoader in production code. If required, document the justification in a code comment - **Code review**: The code-reviewer agent checks for logic-level security issues (authorization bypass, TOCTOU, data exposure) that static pattern matching cannot catch ## Development Commands @@ -34,13 +35,10 @@ uv run pyright # Type check Do not use unnecessary cd like `cd /path/to/cwd && git log`. -## Devcontainer Rules - -When running in a devcontainer, some operations are denied by policy. Before attempting a command that might be blocked, check `docs/DEVCONTAINER_PERMISSIONS.md` for the approved alternative. Key rules: +## Devcontainer - **Dependencies**: Use `uv add `, never `pip install` - **System tools**: Add to `.devcontainer/Dockerfile`, do not install at runtime -- **No chained cd**: Use absolute paths. `cd /path && command` bypasses permission checks. ## Code Style diff --git a/docs/ARCHITECTURE_GUIDE.md b/docs/ARCHITECTURE_GUIDE.md deleted file mode 100644 index 8157ce5..0000000 --- a/docs/ARCHITECTURE_GUIDE.md +++ /dev/null @@ -1,687 +0,0 @@ -# Architecture Deep Dive - -This guide explains **why** each component exists, **what** it does under the hood, and **what happens if** you remove or modify it. It is for developers who want to customize the template, strip parts out, or understand the design decisions behind it. - -For setup instructions, see [Getting Started](GETTING_STARTED.md). For the workflow reference, see [Development Process](DEVELOPMENT_PROCESS.md). For permission tier details, see [Devcontainer Permissions](DEVCONTAINER_PERMISSIONS.md). - ---- - -## System Architecture - -The template uses defense-in-depth: multiple independent layers that each catch different problems. No single layer is the whole story. 
- -```text -+-----------------------------------------------------------------------+ -| Devcontainer (optional, recommended) | -| +------------------------------------------------------------------+ | -| | Network firewall (init-firewall.sh) | | -| | Blocks all egress except ~10 whitelisted domains | | -| +------------------------------------------------------------------+ | -| | Non-root user (vscode, uid 1000) | | -| | Cannot install system packages or modify system files | | -| +------------------------------------------------------------------+ | -| | Policy blocker (devcontainer-policy-blocker.sh) | | -| | Tier-aware: blocks pip install, publishing, supply-chain piping | | -| +------------------------------------------------------------------+ | -+-----------------------------------------------------------------------+ -| Claude Code Runtime (always active) | -| +------------------------------------------------------------------+ | -| | settings.json permissions (allow / deny / ask) | | -| | First gate: controls which tools and commands Claude can invoke | | -| +------------------------------------------------------------------+ | -| | PreToolUse hooks | | -| | dangerous-actions-blocker | unicode-injection-scanner | | -| +------------------------------------------------------------------+ | -| | PostToolUse hooks | | -| | output-secrets-scanner | auto-format | test-on-change | | -| +------------------------------------------------------------------+ | -| | security-guidance plugin (real-time warnings during editing) | | -| +------------------------------------------------------------------+ | -| | Review rules (architecture, code-quality, performance, test) | | -| | Loaded as context -- influence Claude's decisions, not enforced | | -| +------------------------------------------------------------------+ | -+-----------------------------------------------------------------------+ -``` - -### What runs during a typical workflow - -```text -/sync /design write 
code /done /landed - | | | | | - git fetch read DECISIONS [your work] lint+format verify merge CI - branch state scope classify tests+coverage deploy check - dirty files plan output code review branch cleanup - recent commits auto-format.sh PR creation next phase - test-on-change.sh changelog update - (after each edit) -``` - ---- - -## Security Layers - -### Permissions: settings.json - -
-Why / What / Remove / Connects - -**Why it exists.** Controls which shell commands Claude can run without asking. Without this, Claude either prompts for every command (slow) or runs everything unchecked (risky). - -**What it does.** Three lists evaluated in order: **deny** (always blocked, no prompt), **ask** (user must approve), **allow** (runs without prompt). Each entry is a glob pattern matched against the command. The deny list blocks secret management commands (`gh secret`, `gh auth`, `git config`). The ask list covers destructive or stateful operations (`docker`, `terraform`, `gh pr merge`, `git reset`). The allow list covers safe read/build operations (`pytest`, `ruff`, `git status`, `ls`). - -**What happens if you remove it.** Claude falls back to its default behavior: prompting for most commands. The workflow still works but requires constant approval clicks. No security risk -- just friction. - -**How it connects.** Devcontainer tiers override these permissions by copying a tier-specific file to `settings.local.json`, which takes precedence over `settings.json`. The hooks (below) provide a second layer that catches commands the permission patterns miss. - -
- -### PreToolUse Hooks - -These run **before** Claude executes a command or file edit. They can block the action (exit code 2) or allow it (exit code 0). - -#### dangerous-actions-blocker.sh - -
-Why / What / Remove / Connects - -**Why it exists.** Permissions use glob patterns, which cannot catch every dangerous command variant. This hook uses substring and regex matching against the full command string to block destructive patterns that slip through. - -**What it does.** Triggers on every `Bash` tool invocation. Checks the command against three categories: - -1. **Destructive commands** (23 literal patterns): `rm -rf /`, `sudo rm -rf`, `dd if=/dev/zero`, fork bombs, `chmod -R 777 /`, `shutdown`, `reboot`, `halt`, `poweroff`, `git push --force`, `git push -f`, `git push origin +master`, `git push origin +main`, `DROP DATABASE`, `DROP TABLE`, `TRUNCATE TABLE` -2. **Filesystem format** (regex): `mkfs.*`, `DELETE FROM .* WHERE 1` -3. **Secrets in commands** (9 literal + 1 regex): any command containing `ANTHROPIC_API_KEY=`, `OPENAI_API_KEY=`, `AWS_SECRET_ACCESS_KEY=`, `GITHUB_TOKEN=`, `GH_TOKEN=`, `password=`, `passwd=`, `secret=`, `token=`, or database URLs with embedded credentials - -Uses `grep -qiF` (case-insensitive, literal) for exact patterns and `grep -qiE` for regex. Returns a JSON decision object to stdout with exit code 2 to block. - -**What happens if you remove it.** No pre-execution safety net. A command like `git push --force origin main` would execute if it passes the settings.json permissions. The devcontainer policy blocker (if present) covers some patterns but not all. - -**How it connects.** Complements the settings.json permissions (which use glob patterns, not substring matching). In devcontainers, the `devcontainer-policy-blocker.sh` adds tier-aware blocking on top. - -**Dependencies:** `jq` (degrades gracefully -- exits 0 with warning if missing). - -
- -#### unicode-injection-scanner.sh - -
-Why / What / Remove / Connects - -**Why it exists.** Invisible Unicode characters can make code appear different from what it actually does. A zero-width character in a variable name creates a visually identical but semantically different identifier. RTL override characters can make `if (isAdmin)` display as `if (isUser)` in code review. These are real attack vectors (CVE-2025-53109, CVE-2025-53110). - -**What it does.** Triggers on every `Edit` and `Write` tool invocation. Scans the content being written for: - -1. **Zero-width characters**: U+200B (zero-width space), U+200C (zero-width non-joiner), U+200D (zero-width joiner), U+FEFF (byte-order mark) -2. **Bidirectional overrides**: U+202A-E, U+2066-2069 (left-to-right/right-to-left embedding, override, isolate) -3. **ANSI escape sequences**: `\x1B[` (terminal injection) -4. **Null bytes**: U+0000 (can truncate strings in C-based parsers) -5. **Unicode tag characters**: U+E0001-E007F (invisible instruction embedding) - -Uses `grep -P` (PCRE) for Unicode-aware matching. - -**What happens if you remove it.** Code files could contain invisible characters that alter behavior without being visible in code review. This is a supply-chain attack vector. - -**How it connects.** Only scans Edit/Write (not Bash output). The `output-secrets-scanner.sh` covers Bash output separately. Requires GNU grep with PCRE support -- on macOS or Alpine (no PCRE by default), the hook disables itself with a warning. - -**Dependencies:** `jq`, `grep -P` (PCRE). Fails open on macOS/Alpine where PCRE is unavailable. - -
- -### PostToolUse Hooks - -These run **after** a command or edit completes. They cannot block -- they can only warn via `systemMessage`. - -#### output-secrets-scanner.sh - -
-Why / What / Remove / Connects - -**Why it exists.** A developer might run `cat .env` or `env | grep API` and accidentally expose secrets in the conversation log. This hook warns when that happens. - -**What it does.** Triggers after every `Bash` command. Scans the command output for: - -1. AWS Access Key IDs (`AKIA[0-9A-Z]{16}`) -2. AWS Secret Access Keys (40-char base64 near `aws_secret` context) -3. Anthropic API keys (`sk-ant-*`) -4. OpenAI API keys (`sk-*`) -5. GitHub PATs (`ghp_*`) and OAuth tokens (`gho_*`) -6. JWT tokens (`eyJ*.eyJ*.*`) -7. Private key material (`-----BEGIN ... PRIVATE KEY-----`) -8. Database URLs with embedded credentials (`postgres://user:pass@host`) - -Emits a `systemMessage` warning to stderr if any match is found. Always exits 0 -- PostToolUse hooks cannot block. - -**What happens if you remove it.** Secrets can appear in conversation output without any warning. They may end up in logs or screenshots. - -**How it connects.** Complements the `dangerous-actions-blocker.sh` (which blocks secrets in *commands*). This hook catches secrets in *output*. - -**Dependencies:** `jq`, `grep -E`. - -
- -### Productivity Hooks - -#### auto-format.sh - -
-Why / What / Remove / Connects - -**Why it exists.** Without it, Claude writes code, then you (or CI) run the formatter, then Claude sees the diff and gets confused about what changed. Auto-formatting after each edit means Claude always sees correctly formatted code. - -**What it does.** Triggers after every `Edit` or `Write` on `*.py` files. Runs `uv run ruff format <file>` followed by `uv run ruff check --fix <file>`. Runs synchronously so Claude's next read of the file sees formatted code. - -**What happens if you remove it.** Code style drifts during a session. The CI lint step (`ruff check`) still catches formatting issues, but later -- after commit and push. Claude may also produce inconsistent style if it reads its own unformatted output. - -**How it connects.** Runs before `test-on-change.sh` (both are registered on the same PostToolUse/Edit|Write event, in array order). This means tests run against formatted code. - -**Dependencies:** `jq`, `uv`, `ruff`. Degrades gracefully if uv is not found. - -
- -#### test-on-change.sh - -
-Why / What / Remove / Connects - -**Why it exists.** Immediate feedback. Without it, Claude writes code, moves on, and only discovers test failures when `/done` runs validation -- potentially after many edits. With it, Claude sees failures right after the edit that caused them. - -**What it does.** Triggers after every `Edit` or `Write` on `*.py` files. Discovers the associated test file using two strategies: - -1. Walk up from the edited file looking for `tests/test_<module>.py` adjacent to the package -2. Check the project root `tests/test_<module>.py` - -Skips test files themselves, `__init__.py`, and `conftest.py`. Runs `uv run pytest <test_file> -x -q` (stop on first failure, quiet output). On failure, emits a `systemMessage` with the last 20 lines of test output. Always exits 0. - -**What happens if you remove it.** No instant test feedback during editing. Tests still run during `/done` validation (via the `test-coverage-validator` agent), so nothing is missed -- just delayed. - -**How it connects.** Runs after `auto-format.sh`, so tests execute against formatted code. Uses `$CLAUDE_PROJECT_DIR` for root-relative test discovery. - -**Dependencies:** `jq`, `uv`, `pytest`. Degrades gracefully if uv is not found. - -
- -### security-guidance Plugin - -
-Why / What / Remove / Connects - -**Why it exists.** The hooks catch dangerous commands and output. This plugin catches dangerous *code patterns* while Claude is writing them -- before they ever execute. - -**What it does.** A Claude Code built-in plugin (`security-guidance@claude-code-plugins`) that emits real-time warnings when Claude writes code containing: -- Command injection patterns (`os.system()`, `subprocess(shell=True)`) -- Code injection (`eval()`, `exec()`, `__import__()` with user input) -- Unsafe deserialization (`pickle.loads()`, `yaml.load()` without SafeLoader) -- XSS patterns - -Enabled via `enabledPlugins` in `settings.json`. - -**What it does not do.** It does not block code from being written. It only warns. The developer or Claude must act on the warning. - -**What happens if you remove it.** No inline security warnings during code editing. The `security-auditor` agent and `/security-audit` command can still catch these patterns, but only when explicitly invoked. - -**How it connects.** Works alongside hooks (which cover commands and output) to cover the third attack surface: the code itself. - -
- ---- - -## Devcontainer Security Layers - -These layers only apply when running inside the devcontainer. They are optional but recommended for autonomous usage. - -### Network Firewall (init-firewall.sh) - -
-Why / What / Remove / Connects - -**Why it exists.** A containerized Claude Code session could download and execute arbitrary code from the internet. The firewall restricts network access to known-good services only. - -**What it does.** Runs via `postStartCommand` (with sudo). Sets the default iptables OUTPUT policy to DROP, then whitelists: - -- DNS (port 53, TCP/UDP) -- SSH (port 22) -- Localhost (loopback) -- GitHub IPs (fetched dynamically from GitHub's API, aggregated into CIDR ranges via `ipset`) -- Specific domains: PyPI (`pypi.org`, `files.pythonhosted.org`), Astral (`astral.sh`), Claude/Anthropic (`claude.ai`, `api.anthropic.com`, `sentry.io`, `statsig.com`), VS Code marketplace - -GitHub CIDR ranges are aggregated using the `aggregate` tool, then stored in `ipset` with `hash:net` for efficient filtering instead of per-IP iptables rules. IPv6 is completely blocked. Self-tests at the end by verifying `example.com` is unreachable and `api.github.com` is reachable. - -**What happens if you remove it.** The container has unrestricted network access. Claude Code could download arbitrary packages from any host, which is a supply-chain attack vector. - -**How it connects.** Complements the `devcontainer-policy-blocker.sh` (which blocks `pip install` commands) by also blocking the network path. Even if someone bypasses the command-level block, the firewall prevents reaching unauthorized hosts. - -**Dependencies:** `iptables-legacy`, `ipset`, `curl`. Degrades gracefully -- logs warning and exits 0 if iptables is unavailable. - -
- -### Policy Blocker (devcontainer-policy-blocker.sh) - -
-Why / What / Remove / Connects - -**Why it exists.** The three permission tiers (Assisted, Autonomous, Full Trust) need enforcement beyond settings.json patterns. This hook reads `$PERMISSION_TIER` and blocks commands that the current tier does not allow. - -**What it does.** Triggers on every `Bash` tool invocation inside the devcontainer. Checks the command against blocked categories: - -| Category | Tiers 0/1/2 | Tier 3 | -|----------|-------------|--------| -| Tool installation (pip, npm -g, apt, uv tool) | BLOCKED | Allowed | -| Package publishing (npm publish, uv publish, twine) | BLOCKED | BLOCKED | -| Supply-chain piping (curl url \| bash) | BLOCKED | BLOCKED | -| Docker escapes (--privileged, --cap-add=ALL) | BLOCKED | BLOCKED | -| GitHub mutations (gh pr merge, gh workflow run) | BLOCKED | Allowed | -| Infrastructure (terraform) | BLOCKED | Allowed | - -**Critical behavior:** This hook is **fail-closed**. If `jq` is missing or `$PERMISSION_TIER` is unset, it blocks ALL bash commands (exit 2). This is intentional -- in a security-critical devcontainer, failing open would defeat the purpose. - -**What happens if you remove it.** Tiers 1 and 2 lose their command-level enforcement. The settings.json permission patterns still apply, but the tier-specific blocking (e.g., blocking `pip install` in tier 2) disappears. - -**How it connects.** Works with the tier files in `.devcontainer/permissions/` and the `$PERMISSION_TIER` env var set in `devcontainer.json`. See [How Tiers Work](#how-tiers-work-under-the-hood) below. - -**Dependencies:** `jq` (REQUIRED -- no graceful degradation; fails closed). - -
- -### Non-root User - -
-Why / What / Remove / Connects - -**Why it exists.** Principle of least privilege. A root user inside the container could modify system files, install packages at the OS level, and potentially escape the container in some Docker configurations. - -**What it does.** The Dockerfile creates a `vscode` user (uid 1000) and runs all commands as that user. Sudo is restricted to a single script: `/usr/local/bin/init-firewall.sh`. The user cannot run `sudo iptables` directly. - -**What happens if you remove it.** Claude Code runs as root inside the container. Combined with removed firewall/policy hooks, this means unrestricted system access within the container. - -**How it connects.** The firewall script uses sudo (allowed for firewall commands). The rest of the stack runs unprivileged. - -
- ---- - -## Agents - -### How Agents Work - -Agents are Claude Code sub-processes defined in `.claude/agents/`. Each agent file is a Markdown document with YAML frontmatter that specifies: - -- **model**: Which Claude model to use. `haiku` is cheap and fast (good for lint/format). `sonnet` is more capable (good for review/analysis). Model choice directly affects token costs. -- **tools**: Which tools the agent can use. Read-only agents (`Read, Glob, Grep`) cannot modify files. Write-capable agents add `Edit` or `Bash`. -- **permissionMode**: Controls user prompting: - - `dontAsk` -- runs silently, no prompts (used for read-only tasks) - - `plan` -- generates recommendations without executing (used for auditors) - - `acceptEdits` -- can create and modify files during execution (used for validators that auto-fix) - -Agents are invoked by the workflow (typically via `/done`) using `subagent_type: "general-purpose"`. You do not invoke them directly -- the skill orchestrates them. - -For the full agent reference table, see [Development Process > Agent Reference](DEVELOPMENT_PROCESS.md#agent-reference). - -### Core Workflow Agents - -These run automatically as part of `/done` for Standard and Project scope tasks. - -
-code-quality-validator (model: haiku) - -**Why.** Catches lint, format, and type errors before they reach CI. Uses Haiku for speed since the task is mechanical. - -**What.** Runs `ruff check`, `ruff format --check`, and `pyright`. Can auto-fix issues via `Edit` tool (`permissionMode: acceptEdits`). - -**Remove.** Lint/format/type issues reach CI instead of being caught locally. `/done` still works but skips this validation step. - -
- -
-test-coverage-validator (model: sonnet) - -**Why.** Verifies tests pass and checks coverage. Uses Sonnet because it needs to reason about test adequacy, not just run commands. - -**What.** Runs `pytest` with coverage flags. Analyzes results and reports findings. Read-only (`permissionMode: dontAsk` -- no Edit tool). - -**Remove.** Tests are not validated before PR creation. CI still runs them, but you lose the pre-commit coverage check. - -
- -
-pr-writer (model: sonnet) - -**Why.** Consistent, informative PR descriptions. Humans tend to write "fixed the thing" -- this agent reads the diff and produces a structured summary with test plan. - -**What.** Reads `git diff`, `git log`, and changed files. Generates a PR description following the template in `.github/PULL_REQUEST_TEMPLATE.md`. Read-only (`permissionMode: dontAsk`). - -**Remove.** You write PR descriptions manually. The PR template still applies but is not auto-filled. - -
- -
-code-reviewer (model: sonnet) - -**Why.** Independent code review before the PR is created. Catches logic errors, security issues, and design problems that linters miss. - -**What.** Reads the diff and applies the review rules (architecture, code-quality, performance, test). Reports findings with severity levels. Read-only (`permissionMode: dontAsk`, `memory: project` for CLAUDE.md context). - -**Remove.** No pre-PR code review. Linters and type checkers still catch mechanical issues, but logic errors, security concerns, and design problems would only be caught during manual review. - -
- -
-docs-updater (model: sonnet) - -**Why.** Keeps changelog and decision log in sync with code changes. Developers forget to update docs -- this agent does it automatically. - -**What.** Reads the diff and recent commits. Updates `docs/CHANGELOG.md` and `docs/DECISIONS.md` if changes warrant it (`permissionMode: acceptEdits`). - -**Remove.** Docs must be updated manually. The changelog and decision log drift out of sync with code. - -
- -### Project-Scope Agents - -These run only for P-scoped (multi-phase) tasks. - -
-acceptance-criteria-validator and implementation-tracker - -**acceptance-criteria-validator.** Verifies that the implementation meets the acceptance criteria defined in `docs/IMPLEMENTATION_PLAN.md`. Runs at step P.3.2. - -**implementation-tracker.** Keeps `IMPLEMENTATION_PLAN.md` in sync with actual progress. Can mark phases complete and update status (`permissionMode: acceptEdits`). - -**Remove either.** Multi-phase projects lose automated progress tracking. You manage the plan manually. - -
- -### Optional Specialists - -These are invoked manually, not by `/done`. They have no workflow dependencies. - -
-Full list - -| Agent | Purpose | Mode | -|-------|---------|------| -| security-auditor | OWASP-based vulnerability scan | plan (read-only) | -| refactoring-specialist | SOLID/code smell analysis | plan (read-only) | -| review-responder | Triage and fix automated PR review comments | acceptEdits | -| output-evaluator | LLM-as-Judge quality scoring | dontAsk | -| agent-auditor | Audit agent definitions against best practices | plan (read-only) | - -**Remove any.** No workflow impact. These are convenience tools you invoke when needed. - -
- ---- - -## Skills and Commands - -### Workflow Skills - -The four workflow skills chain together to form the development lifecycle. Each reads and writes specific files. - -| Skill | Reads | Writes | Depends on | -|-------|-------|--------|------------| -| `/sync` | git state | nothing | (standalone) | -| `/design` | DECISIONS.md, IMPLEMENTATION_PLAN.md | IMPLEMENTATION_PLAN.md (P-scope only) | (standalone) | -| `/done` | git diff, tests, IMPLEMENTATION_PLAN.md | commits, PR, CHANGELOG.md, DECISIONS.md | agents, hooks | -| `/landed` | git reflog, gh pr, gh run, deploy.json | branch deletion, plan status check | (standalone) | - -
-What breaks if you remove one - -**Remove /sync.** No pre-flight check. You might start work on a stale branch or with uncommitted files. Low risk -- it is a convenience. - -**Remove /design.** No structured planning step. You can still plan manually and write to IMPLEMENTATION_PLAN.md yourself. The `/done` skill does not require `/design` to have run first. - -**Remove /done.** This is the orchestrator. Without it, you must manually run lint, tests, code review, commit, push, create PR, and update docs. The agents and hooks still work individually, but nothing chains them together. - -**Remove /landed.** No post-merge verification. You must manually check CI, delete branches, and identify the next phase for P-scope work. - -
- -### Utility Skill - -**`/edit-permissions`** -- Interactive tool for managing `settings.json` permission rules. Provides pattern syntax reference and safety guardrails. Safe to remove -- you can edit `settings.json` manually. - -### Commands - -| Command | Purpose | Remove impact | -|---------|---------|---------------| -| `/cove` | Chain-of-Verification for high-stakes accuracy | No workflow impact. Use when you need verified facts. | -| `/cove-isolated` | CoVe with isolated verification agent | Same as `/cove` but prevents confirmation bias. | -| `/security-audit` | 6-phase Python security scan with A-F grading | No workflow impact. Use for periodic security review. | - ---- - -## Rules - -Rules are Markdown files in `.claude/rules/` that Claude loads as context in every conversation. They influence Claude's decisions but are not enforced programmatically -- they are instructions, not constraints. - -| Rule | What it guides | -|------|---------------| -| architecture-review | Component boundaries, dependency direction, data flow, security boundaries | -| code-quality-review | DRY violations, error handling, type annotations, complexity | -| performance-review | N+1 queries, memory leaks, caching, algorithmic complexity | -| test-review | Coverage gaps, test quality, edge cases, assertion quality | - -
-Adding, modifying, or removing rules - -**Add a rule.** Create a new `.md` file in `.claude/rules/`. It is automatically loaded -- no registration needed. - -**Modify a rule.** Edit the file directly. Changes take effect in the next conversation. - -**Remove a rule.** Delete the file. Claude loses that review dimension. For example, removing `performance-review.md` means Claude stops checking for N+1 queries and memory leaks during code review. - -Rules do not affect hooks, agents, or CI. They only affect Claude's reasoning during interactive sessions and when the `code-reviewer` agent runs (which has `memory: project` and reads CLAUDE.md context). - -
- ---- - -## Configuration Files - -### settings.json - -Location: `.claude/settings.json` - -This is the central configuration file. It controls three things: - -1. **Permissions** (allow/deny/ask lists with glob patterns) -2. **Hooks** (shell scripts triggered by tool events) -3. **Plugins** (built-in Claude Code extensions) - -
-Permission evaluation order - -For each command Claude wants to run: -1. Check **deny** list first -- if any pattern matches, block immediately (no prompt) -2. Check **ask** list -- if any pattern matches, prompt the user for approval -3. Check **allow** list -- if any pattern matches, execute without prompting -4. If nothing matches, prompt the user (default behavior) - -Patterns use glob syntax: `Bash(git status)` matches the exact command, `Bash(git *)` matches any git subcommand. - -
- -
-Hook registration format - -Hooks are registered in `settings.json` under the `hooks` key. Each hook specifies a trigger event (`PreToolUse` or `PostToolUse`), a matcher (which tools it applies to), and one or more command scripts. Multiple hooks on the same event/matcher run in array order. The `$CLAUDE_PROJECT_DIR` variable resolves to the project root at runtime. - -
- -### settings.local.json - -Location: `.claude/settings.local.json` (gitignored) - -Machine-specific overrides. If this file exists, it **replaces** (not merges with) `settings.json` for the keys it defines. This is how devcontainer tiers work: the `onCreateCommand` copies a tier file to `settings.local.json`, overriding the default permissions. - -An example file is provided at `.claude/settings.local.json.example`. - -### CLAUDE.md - -Location: project root - -The main agent directives file. Deliberately compact (~55 lines). Contains: -- Development process reference (which skills to use) -- Security rules (what to avoid) -- Development commands (how to run tests, lint, format) -- Code style conventions - -**What belongs in CLAUDE.md vs elsewhere:** -- CLAUDE.md: short, always-loaded directives -- Rules (`.claude/rules/`): detailed review checklists (loaded as context but not as prominent) -- Agents (`.claude/agents/`): task-specific sub-process definitions -- Skills (`.claude/skills/`): workflow step definitions with their own prompts - -Keep CLAUDE.md small. Everything Claude reads costs tokens on every message. - ---- - -## Devcontainer Deep Dive - -### How Tiers Work Under the Hood - -```text -devcontainer.json - containerEnv: - PERMISSION_TIER: "${localEnv:PERMISSION_TIER:2}" # default: 2 - | - v - onCreateCommand: - case $PERMISSION_TIER in - 1) cp permissions/tier1-assisted.json -> .claude/settings.local.json - 2) cp permissions/tier2-autonomous.json -> .claude/settings.local.json - 3) cp permissions/tier3-full-trust.json -> .claude/settings.local.json - esac - | - v - Claude Code starts, loads: - 1. .claude/settings.json (base config) - 2. 
.claude/settings.local.json (tier override -- replaces matching keys) - | - v - devcontainer-policy-blocker.sh reads $PERMISSION_TIER at runtime - for additional command-level enforcement -``` - -Key details: -- `$PERMISSION_TIER` is set via `containerEnv` (not `build.args` -- there is no matching `ARG` in the Dockerfile) -- Tier files are **fully self-contained** (permissions + hooks + enabledPlugins). This is required because `settings.local.json` replaces keys, not merges. -- The `onCreateCommand` uses a `case` statement, not simple interpolation, because tier filenames include descriptive suffixes (`tier1-assisted`, not just `tier1`) -- If `$PERMISSION_TIER` is unset, the policy blocker defaults to tier 0 (strictest -- blocks everything) - -For the full tier comparison table, see [Devcontainer Permissions](DEVCONTAINER_PERMISSIONS.md). - -### Firewall Mechanics - -The firewall uses `ipset` + `iptables` instead of DNS-based rules because: -1. DNS resolution can be spoofed -2. Services like GitHub use many IP addresses that change frequently -3. `ipset` handles CIDR ranges efficiently (one rule vs hundreds of per-IP rules) - -Whitelisted domains and why each is needed: - -| Domain | Why | -|--------|-----| -| GitHub IPs (dynamic) | Git operations, GitHub API, Actions | -| pypi.org, files.pythonhosted.org | Python package installation | -| astral.sh | uv package manager updates | -| claude.ai, api.anthropic.com | Claude Code API calls | -| sentry.io | Error reporting (Claude Code telemetry) | -| statsig.com | Feature flags (Claude Code) | -| VS Code marketplace | Extension updates | - -The script self-tests by checking that `example.com` is unreachable and `api.github.com` is reachable. If DNS resolution fails during setup, the affected domain is skipped with a warning (the firewall still activates for everything else). - ---- - -## CI/CD Workflows - -All workflows are in `.github/workflows/`. 
They use `{{base_branch}}` placeholder (replaced by `setup_project.py`). - -
-tests.yml -- Lint + Test + Typecheck - -**Why.** Catches quality issues before merge. This is the primary quality gate. - -**What.** Triggers on push to base branch or PR. Runs four jobs: -1. **Lint** (ruff format --check + ruff check) -- runs first -2. **test-core** (pytest libs/core/ tests/) -- runs after lint -3. **test-server** (pytest apps/server/) -- runs after lint -4. **typecheck** (pyright) -- runs after lint - -**Remove.** No automated quality checks. Code review and local validation are the only safety nets. - -
- -
-template-integration.yml -- Template Validation - -**Why.** Ensures `setup_project.py` works across all configuration variants. This is template-specific -- downstream projects can remove it. - -**What.** Tests five configurations: mono-default, mono-renamed, mono-extra-pkgs, single-package, mono-postgres. Runs `scripts/test_template_integration.sh` for each. - -**Remove.** Safe to remove in downstream projects. Only needed if you are modifying the template itself. - -
- -
-publish.yml -- PyPI Publishing - -**Why.** Automated package publishing using OIDC (no stored API tokens). - -**What.** Triggers on GitHub release or manual dispatch. Determines package from release tag (e.g., `core-v0.1.0` maps to `libs/core`). Builds with `uv build` and publishes via `pypa/gh-action-pypi-publish`. Requires Trusted Publisher configuration on PyPI. - -**Remove.** Manual publishing only. You would run `uv build` and `twine upload` yourself. - -
- -
-template-sync.yml -- Upstream Template Sync - -**Why.** Keeps downstream projects updated with template improvements (new hooks, agents, CI fixes). - -**What.** Runs on schedule (weekly). Syncs template-managed paths while preserving project-specific content. Creates a PR with the updates for review. - -**Remove.** No automatic updates from the template. You would manually cherry-pick improvements. - -
- ---- - -## Customization Guide - -### Safe to Remove - -These components have no dependencies. Removing them does not break any workflow. - -- **Optional agents**: security-auditor, refactoring-specialist, review-responder, output-evaluator, agent-auditor -- **Commands**: /cove, /cove-isolated, /security-audit -- **Productivity hooks**: auto-format.sh, test-on-change.sh (CI still catches issues) -- **CI workflows**: template-integration.yml (in downstream projects), template-sync.yml, publish.yml -- **Rules**: any individual rule file (Claude loses that review dimension but everything else works) - -### Safe to Modify - -- **Hook patterns**: Add or remove blocked patterns in any hook script. The format is documented in the script comments. -- **Permission lists**: Edit `settings.json` allow/deny/ask patterns. Use `/edit-permissions` for guided editing. -- **Agent models**: Change `model:` in agent frontmatter. Use `haiku` for speed/cost, `sonnet` for capability. -- **Firewall whitelist**: Add domains to `init-firewall.sh`. Follow the existing pattern of DNS resolution + ipset addition. -- **CI workflows**: Add jobs, change triggers, modify test commands. -- **CLAUDE.md**: Add project-specific directives. Keep it concise. - -### Risky to Remove - -These components protect against real attack vectors or are load-bearing for the workflow. 
- -| Component | Risk if removed | -|-----------|-----------------| -| dangerous-actions-blocker.sh | Destructive commands can execute unchecked | -| unicode-injection-scanner.sh | Vulnerable to trojan source attacks (CVE-2025-53109/53110) | -| output-secrets-scanner.sh | Secrets can leak into conversation logs without warning | -| security-guidance plugin | No real-time security warnings during code editing | -| devcontainer-policy-blocker.sh | Permission tiers lose command-level enforcement | -| init-firewall.sh | Container has unrestricted network access | -| /done skill | No workflow orchestration; must manually run lint, test, review, PR, docs | -| settings.json | Falls back to prompting for every command (functional but painful) | -| tests.yml | No automated quality gate before merge | diff --git a/docs/DECISIONS.md b/docs/DECISIONS.md index 86617c0..2500a10 100644 --- a/docs/DECISIONS.md +++ b/docs/DECISIONS.md @@ -13,15 +13,6 @@ When a decision is superseded or obsolete, delete it (git history preserves the **Decisions**: - Use `-name` pattern for package name replacements instead of bare name replacement (avoids false substring matches like "core" in "pyproject") -## 2026-02-16: Permissions Modernization - -**Request**: Eliminate unnecessary permission prompts, migrate deprecated syntax, add permissions management skill. - -**Decisions**: -- Absolute paths in CLAUDE.md instead of adding `Bash(cd *)` allow rule -- shell operator protection blocks chained commands regardless -- `TaskOutput` tool instead of Read allow rule for temp dirs -- temp path is OS-specific and non-portable for a template -- Custom `.claude/skills/` skill instead of community `@otrebu/claude-permissions` -- community skill is for claude.ai web (ZIP upload), not Claude Code CLI - ## 2026-02-24: CLAUDE.md Three-Path Restructuring **Request**: Replace monolithic Phase Completion Checklist with complexity-aware development process. 
@@ -34,16 +25,12 @@ When a decision is superseded or obsolete, delete it (git history preserves the ## 2026-02-24: Devcontainer Setup -**Request**: Add `.devcontainer/` with Claude Code CLI, network firewall, and docker-compose profiles for common service stacks (inspired by official Claude Code devcontainer and okruh project adaptation). +**Request**: Add `.devcontainer/` with Claude Code CLI and network firewall. **Decisions**: -- Python base image (`python:{{python_version}}-bookworm`) with Node.js 20 added for Claude Code CLI -- not Node base image, since this is a Python project -- `vscode` user (UID 1000) with restricted sudoers (firewall-only) instead of `NOPASSWD:ALL` -- follows principle of least privilege from the official reference +- Python base image (`python:{{python_version}}-bookworm`) with Claude Code native binary installer -- not Node base image, since this is a Python project +- `vscode` user (UID 1000) with restricted sudoers (firewall-only) instead of `NOPASSWD:ALL` -- follows principle of least privilege - No docker-compose.yml by default (simple build) -- compose only generated when user selects a services profile during setup -- Three compose profiles embedded as Python string constants in `setup_project.py` rather than separate template files -- keeps the repo clean, compose files only appear when needed -- `zsh-in-docker` script instead of manual oh-my-zsh installation -- cleaner single-step setup, matches official Claude Code reference -- Firewall uses `aggregate` tool for GitHub CIDR consolidation -- more robust than resolving individual domains, matches official reference -- Non-critical domain resolution failures log WARNING and continue instead of exit 1 -- DNS blips should not prevent container startup ## 2026-02-26: Trim CLAUDE.md Based on "Evaluating AGENTS.md" Paper @@ -55,29 +42,16 @@ When a decision is superseded or obsolete, delete it (git history preserves the - Remove repository structure and testing sections entirely -- 
proven unhelpful by the paper, fully discoverable from project files - CLAUDE.md must contain a mandatory directive to classify every task as Q/S/P before starting work -## 2026-02-24: Decision Log - -**Request**: Create persistent decision log tracking every feature request and user decision. - -**Decisions**: -- Dedicated `docs/DECISIONS.md` rather than only per-phase tables in IMPLEMENTATION_PLAN.md -- long-lived, cross-phase visibility -- No status field; prune by deletion (git preserves history) -- simpler than Active/Superseded/Obsolete tracking -- Integrated into S.2 (log), S.7 (update), P.1 (consistency check + prune) -- Quick path exempt - -## 2026-03-01: Hooks, Commands, Agents, Rules, CI +## 2026-03-01: Hooks, Agents, Rules -**Request**: Add hooks (5), slash commands (3), agents (3), review rules (4), and AI-powered CI to bring the template to a comprehensive state. +**Request**: Add hooks, agents, and review rules to bring the template to a comprehensive state. **Decisions**: - Hook scripts in `.claude/hooks/` using `$CLAUDE_PROJECT_DIR` for path resolution -- official Claude Code convention - jq required for JSON parsing in hooks with graceful degradation (exit 0 + stderr warning) if missing -- avoids blocking dev work -- auto-format hook is synchronous (no systemMessage) so Claude sees formatted code; test-on-change is informational only -- Commands in `.claude/commands/` not `.claude/skills/` -- simpler single-file format for instruction-set content -- security-auditor and refactoring-specialist agents are read-only (permissionMode: plan) -- analyze, not modify -- output-evaluator uses haiku + dontAsk -- designed for automated pipeline integration, scoring is formulaic +- auto-format hook is synchronous (no systemMessage) so Claude sees formatted code - Review rules have no `paths:` frontmatter (apply globally) and stay under 80 lines -- loaded into every context window -- CLAUDE.md kept compact per ETH Zurich paper decision; detailed
hooks/commands/rules tables added to DEVELOPMENT_PROCESS.md instead -- ~~CI review workflow~~ removed -- see 2026-03-13 decision below +- CLAUDE.md kept compact per ETH Zurich paper decision; detailed tables in DEVELOPMENT_PROCESS.md ## 2026-03-02: QSP Enforcement and Pre-flight Sync @@ -93,14 +67,11 @@ When a decision is superseded or obsolete, delete it (git history preserves the **Request**: Port devcontainer fixes from Vizier repository -- migrate Claude Code CLI from npm to native binary installer, enforce LF line endings, and harden the iptables firewall script. **Decisions**: -- Native binary installer for Claude Code CLI instead of npm + Node.js 20 -- Node.js added no value to a Python project; the official native installer is the recommended path and removes a large runtime dependency -- .gitattributes enforcing LF line endings for shell scripts (`*.sh`), Dockerfiles, and `.devcontainer` files -- CRLF-corrupted shell scripts fail silently on Linux, and Windows developers check out these files by default with CRLF -- iptables-legacy backend instead of the default nftables -- nftables is unreliable inside Docker due to missing kernel module support; iptables-legacy is stable and widely supported across host kernels -- iptables pre-check with graceful degradation (log warning, skip firewall) instead of hard exit 1 -- a missing iptables binary should degrade to an open network rather than block container startup entirely -- Suppress stderr on iptables -X (chain flush) -- Docker pre-populates chains (DOCKER, DOCKER-USER, etc.) 
that cannot be deleted; the errors are expected and misleading -- Skip lines starting with `#` during DNS rule restoration -- comment lines in the allowlist were being passed to iptables as hostnames, causing spurious errors -- DROP policies added after all ACCEPT rules -- reordering prevents a partial-failure scenario where DROP is installed before the ACCEPT rules complete, locking out the container -- Replaced registry.npmjs.org with claude.ai in the firewall allowlist -- npmjs.org is no longer contacted now that the native installer is used; claude.ai is required for Claude Code authentication +- Native binary installer for Claude Code CLI instead of npm + Node.js 20 -- Node.js added no value to a Python project +- .gitattributes enforcing LF line endings for shell scripts (`*.sh`), Dockerfiles, and `.devcontainer` files -- CRLF-corrupted shell scripts fail silently on Linux +- iptables-legacy backend instead of the default nftables -- nftables is unreliable inside Docker due to missing kernel module support +- iptables pre-check with graceful degradation (log warning, skip firewall) instead of hard exit 1 +- DROP policies added after all ACCEPT rules -- prevents partial-failure lockout ## 2026-03-09: Workflow Skills (/sync, /design, /done) @@ -123,74 +94,6 @@ When a decision is superseded or obsolete, delete it (git history preserves the - Optional deployment verification via `.claude/deploy.json` (gitignored) -- not all projects have deployments, so it's opt-in with an example file - Phase detection uses "Quick Status Summary" table in IMPLEMENTATION_PLAN.md, not `- [ ]` checkboxes -- matches actual file structure -## 2026-03-10: Architecture Deep Dive Guide - -**Request**: Add an onboarding guide for users who want to understand why each template component exists, what it does, and what happens if they modify or remove it. 
- -**Decisions**: -- Single document (`docs/ARCHITECTURE_GUIDE.md`) rather than splitting into per-topic files (hooks-guide.md, agents-guide.md, etc.) -- one file supports both linear reading and ctrl-F searching without docs/ sprawl -- Consistent 4-question structure per component: Why / What / What if removed / Connections -- makes the guide scannable and predictable -- Collapsible `
` sections keep the document navigable at heading level despite ~700 lines -- Cross-references existing docs (DEVELOPMENT_PROCESS.md for reference tables, DEVCONTAINER_PERMISSIONS.md for tier details) instead of duplicating content -- Ends with a Customization Guide (Safe to Remove / Safe to Modify / Risky to Remove) as the practical payoff - -## 2026-03-10: Template Integration CI Pipeline - -**Request**: Create a CI pipeline that applies the template in various settings to catch template bugs before merge. - -**Decisions**: -- New workflow `template-integration.yml` (not extending `tests.yml`) -- `tests.yml` has `{{base_branch}}` in its trigger and never fires on the raw template repo -- GitHub Actions matrix (5 configs) + reusable shell script (`scripts/test_template_integration.sh`) -- matrix defines WHAT to test, script defines HOW to verify; script also runnable locally -- Copy template to temp dir before applying -- `setup_project.py` modifies in-place, would destroy the checkout -- 5 matrix configs cover all major code paths: default monorepo, package renaming, additional packages, single-package conversion, Docker Compose services -- Unit tests gate job runs first -- fail fast if setup_project.py functions are broken before spending matrix resources -- Placeholder check uses named pattern matching (`{{project_name}}` etc.) not generic `{{` -- avoids false positives from GitHub Actions `${{ }}` expressions -- `test_setup_project.py` excluded from integration pytest runs -- tests setup script internals (already covered by unit-tests job), fails on single-package layout - -## 2026-03-04: Devcontainer Permission Tiers - -**Request**: Expand Claude Code permissions for devcontainer usage, taking advantage of container isolation (firewall, non-root user, hooks) to reduce unnecessary permission prompts. 
- -**Decisions**: -- Three graduated tiers (Assisted, Autonomous, Full Trust) stored as JSON in `.devcontainer/permissions/` -- copied to `.claude/settings.local.json` at container creation via build arg -- `settings.local.json` (gitignored) for devcontainer-specific expansions, NOT modifying shared `settings.json` -- base settings remain the universal bare-metal baseline -- Tier 2 (Autonomous, recommended default) uses `Bash(*)` allow with curated deny list -- zero prompts for bash, denied commands fail immediately instead of prompting -- Deny list targets three categories: shared external state (gh pr merge, workflow triggers, issue mutations), irreversible actions (package publishing to npm/PyPI), and container escape vectors (docker --privileged) -- Tool installation comprehensively denied in Tier 2 (pip, npm -g, cargo, go, gem, uv tool, apt, snap, brew) -- toolchain defined by Dockerfile, project deps via `uv add` -- Separate `devcontainer-policy-blocker.sh` hook (not modifying existing hooks) catches denied patterns in chained commands (`cd && pip install`) that bypass glob-based deny rules -- Single hook script with `$PERMISSION_TIER` env var for tier awareness -- fail-closed (if unset, blocks everything) -- Each tier file is fully self-contained (permissions + ALL hooks) to survive settings.local.json replace-not-merge semantics -- Template guard in onCreateCommand: skips `uv sync` if pyproject.toml still has `{{project_name}}` placeholders -- `docs/DEVCONTAINER_PERMISSIONS.md` maps every denied command to its approved alternative -- CLAUDE.md references this doc so Claude checks alternatives before attempting blocked commands -- Full implementation recorded in `docs/IMPLEMENTATION_PLAN.md` Phase 9 - -**Accepted Risks**: - -| Risk | Why accepted | Mitigation | -|------|-------------|------------| -| Grep-based hook bypass via obfuscation (`p\ip install`, `alias p=pip; p install`) | Grep hooks are a UX layer to prevent Claude from wasting turns on naive 
mistakes. They cannot stop deliberate bash obfuscation from prompt injection. | Actual security boundaries are non-root user (installs fail) + firewall (limits exfiltration). The hook catches the 99% case. | -| GitHub API via curl (`curl -H "Authorization: ..." https://api.github.com/.../merge`) | Blocking curl to github.com is fragile and breaks legitimate web fetching. The hook already blocks commands containing `GH_TOKEN=` as a literal argument. | Use fine-grained PATs with minimal scopes. CLAUDE.md instructs Claude to use `gh` CLI, not raw API calls. Token scoping is the real control. | -| Docker not present but deny rules exist | Docker is not installed in the current template container. Deny rules exist as defense-in-depth for users who add Docker-in-Docker later. | If Docker-in-Docker is added, the deny list should be revisited (add `-v` and `--mount` volume escape patterns). | -| Whitelisted domains as exfil channels | `github.com` is whitelisted for git/gh operations. A compromised agent could theoretically exfiltrate via gist creation or issue comments. | Token scoping (no gist/issue create permission) + GH mutation deny rules in Tier 2. Tier 3 accepts this risk explicitly. | - -## 2026-03-13: Remove CI-Based Claude Code Review - -**Request**: Remove the `claude-code-review.yml` GitHub Actions workflow. The local `code-reviewer` agent (run by `/done` at step S.6.5) already provides equivalent pre-PR review coverage, making the CI workflow redundant. 
- -**Decisions**: -- Delete `claude-code-review.yml` entirely -- the local code-reviewer agent provides the same review before PR creation, and the CI workflow required managing an `ANTHROPIC_API_KEY` secret in GitHub -- Keep `dangerous-actions-blocker.sh` `ANTHROPIC_API_KEY=` pattern unchanged -- it blocks secrets in commands generally, not CI-specific -- Keep `docs/IMPLEMENTATION_PLAN.md` unchanged -- historical record of completed work - -## 2026-03-16: Git Remote Mutation Deny Rules - -**Request**: Prevent code exfiltration by blocking `git remote add evil https://... && git push evil` attack pattern. - -**Decisions**: -- Deny `git remote add`, `set-url`, `remove`, `rename`, `set-head` in settings.json and all tier files -- read-only `git remote -v` remains allowed via the existing `Bash(git remote *)` allow rule -- Deny rules are absolute in Claude Code (cannot be overridden by allow), making this the correct control layer vs hooks -- Tier files use wildcard prefix `Bash(*git remote add *)` to catch chained command variants - ## 2026-03-16: WebFetch Firewall Integration **Request**: Connect the devcontainer iptables firewall to Claude Code's WebFetch permission settings so users don't need to manually edit the firewall script when working with external services. 
@@ -200,5 +103,18 @@ When a decision is superseded or obsolete, delete it (git history preserves the - Only `allow` and `ask` lists are scanned (not `deny`) -- denied domains should never be whitelisted - Bare `WebFetch` (no domain qualifier) is ignored -- it grants tool permission but has no domain to resolve - Wildcard domains (e.g., `*.example.com`) are skipped with a warning -- DNS cannot resolve wildcard patterns to IPs -- Empty domain values filtered by `sed '/^$/d'` instead of `grep -v '^$'` -- grep exits non-zero on empty input under `set -euo pipefail` -- WebFetch settings changes take effect on container restart (`init-firewall.sh` runs from `postStartCommand`); permission tier changes require rebuild (`onCreateCommand` copies tier to `settings.local.json`) +- WebFetch settings changes take effect on container restart (`init-firewall.sh` runs from `postStartCommand`) + +## 2026-03-18: Security Model Simplification + +**Request**: Prune security infrastructure to essentials. Remove permission tiers, +most hooks, commands, and niche agents. Refocus on exfiltration prevention. + +**Decisions**: +- Two exfiltration channels: network (firewall) and trusted-channel abuse (hook) +- Firewall is primary defense -- iptables whitelist blocks all non-approved domains +- dangerous-actions-blocker.sh narrowed to: GitHub API exfil, publishing, secrets in args +- Local destruction (rm -rf, sudo, etc.) 
not blocked -- devcontainer is disposable +- output-secrets-scanner removed -- conversation leaks to Anthropic are accepted +- Permission tiers removed -- single settings.json baseline for all environments +- unicode-injection-scanner removed -- exotic threat, low practical risk diff --git a/docs/DEVCONTAINER_PERMISSIONS.md b/docs/DEVCONTAINER_PERMISSIONS.md deleted file mode 100644 index 134b4ae..0000000 --- a/docs/DEVCONTAINER_PERMISSIONS.md +++ /dev/null @@ -1,109 +0,0 @@ -# Devcontainer Permissions - -## Tier System - -The devcontainer uses graduated permission tiers stored in `.devcontainer/permissions/`. The chosen tier is copied to `.claude/settings.local.json` at container creation. - -| Tier | Name | Who | Bash behavior | -|------|------|-----|---------------| -| 1 | Assisted | New users, compliance teams | Per-command approval | -| 2 | Autonomous (default) | Most developers | `Bash(*)` with curated deny list | -| 3 | Full Trust | Solo devs, strong CI/branch protection | `Bash(*)` with minimal deny list | - -Set the tier via `PERMISSION_TIER` environment variable before building the devcontainer (default: `2`). 
- -## Safety Layers (Always Active) - -Regardless of tier, these layers provide defense-in-depth: - -- **Firewall (iptables)**: All egress blocked except whitelisted domains (built-in + WebFetch settings) -- **Non-root user**: Cannot install system packages or modify system files -- **dangerous-actions-blocker.sh**: Blocks rm -rf, sudo, force push, DROP DATABASE, secrets in args -- **output-secrets-scanner.sh**: Warns on leaked credentials in command output -- **unicode-injection-scanner.sh**: Blocks zero-width chars, RTL overrides in file content -- **devcontainer-policy-blocker.sh**: Catches denied patterns in chained commands -- **Base deny rules (settings.json)**: gh secret/auth/ssh-key/gpg-key, git clean/config, uv self - -## Denied Commands and Approved Alternatives - -| Denied Command | Approved Alternative | Rationale | -|----------------|---------------------|-----------| -| `pip install X` / `pip3 install X` | `uv add X` (project dep) or add to Dockerfile (tool) | Bypasses uv venv management, pollutes environment | -| `pipx install X` / `python -m pip install X` | Same as above | Alternate pip invocations | -| `npm install -g X` / `npm i -g X` | Add to Dockerfile `RUN npm install -g X` | Installs arbitrary executables | -| `cargo install X` | Add to Dockerfile `RUN cargo install X` | Installs arbitrary executables | -| `go install X` | Add to Dockerfile `RUN go install X` | Installs arbitrary executables | -| `gem install X` | Add to Dockerfile `RUN gem install X` | Installs arbitrary executables | -| `uv tool install X` | Add to Dockerfile | CLI tools belong in container image | -| `apt install X` / `apt-get install X` | Add to Dockerfile `RUN apt-get install -y X` | System packages belong in container image | -| `snap install X` / `brew install X` | Add to Dockerfile | System packages belong in container image | -| `npm publish` / `uv publish` / `twine upload` | Ask the user to publish manually | Irreversible; publishes to public registries | -| `gh pr 
merge` | Ask the user to merge (or use branch protection auto-merge) | Merges code without human final review | -| `gh workflow run` | Ask the user to trigger the workflow | Unexpected CI runs, costs, or deployments | -| `gh issue create/close/edit` | Ask the user to perform the action | Modifies shared issue tracker | -| `terraform *` | Ask the user to run terraform | Could modify cloud infrastructure | -| `docker run --privileged` | Use `docker run` without `--privileged` | Container escape vector | -| `curl ... \| bash` / `wget ... \| sh` | Do not pipe remote scripts. Add to Dockerfile instead. | Supply-chain attack vector | -| `cd path && command` | Use absolute paths: `command /absolute/path` | Chained commands bypass glob-based permission checks | -| `git remote add/set-url/remove/rename/set-head` | Ask the user to manage remotes | Prevents code exfiltration to unauthorized remotes | - -## Firewall Configuration - -The devcontainer firewall (`init-firewall.sh`) restricts all outbound traffic to a built-in allowlist plus domains from Claude Code permission settings. - -**Built-in domains** (always allowed): PyPI, GitHub (via API CIDR ranges), Anthropic/Claude, VS Code Marketplace, uv/Astral, plus telemetry endpoints (`sentry.io`, `statsig.anthropic.com`, `statsig.com`). - -**WebFetch domain auto-whitelisting**: The firewall scans `.claude/settings.json` and `.claude/settings.local.json` for `WebFetch(domain:...)` patterns in `allow` and `ask` lists. Matched domains are resolved via DNS and added to the ipset allowlist. - -| Pattern | Firewall behavior | -|---------|-------------------| -| `WebFetch(domain:algoenergy.cz)` | Resolved and whitelisted | -| `WebFetch(domain:*.example.com)` | Skipped (wildcards cannot be resolved) | -| `WebFetch` (bare) | Ignored (no domain to resolve) | -| `WebFetch(domain:)` (empty) | Filtered out | - -Changes to WebFetch settings in `.claude/settings.json` or `.claude/settings.local.json` take effect on container restart. 
Changes to `.devcontainer/permissions/*.json` require a full rebuild (`devcontainer rebuild`). - -## Tier Comparison - -| Capability | Tier 1 | Tier 2 | Tier 3 | -|-----------|--------|--------|--------| -| Edit/Write/Read | auto | auto | auto | -| WebFetch | auto | auto | auto | -| Bash (general) | per-command | auto | auto | -| Python execution | ask | auto | auto | -| Git destructive (reset/rm) | auto | auto | auto | -| GitHub PR (non-merge) | ask | auto | auto | -| GitHub PR merge | deny | deny | auto | -| GitHub issue mutations | deny | deny | auto | -| GitHub workflow triggers | deny | deny | auto | -| Package publishing | deny | deny | deny | -| Tool installation | deny | deny | auto | -| Docker (non-escape) | ask | auto | auto | -| Docker escape flags | deny | deny | deny | -| Terraform | deny | deny | auto | - -**Note**: With `Bash(*)` (Tier 2/3), there are no "ask" prompts for bash. Commands either execute (allow) or are hard-blocked (deny). - -## Tier 3 Prerequisites - -Do not use Tier 3 without these external guardrails: - -- Branch protection requiring 1+ approving reviews before merge -- Required CI status checks on protected branches -- GitHub token scoped with fine-grained PAT (minimal permissions, no admin) -- If terraform is used: plan-only permissions in the container - -## Changing Tiers - -Set `PERMISSION_TIER` before building the devcontainer: - -```bash -# In your shell before opening in devcontainer -export PERMISSION_TIER=1 # or 2, 3 - -# Or in VS Code settings (devcontainer.json override) -# "containerEnv": { "PERMISSION_TIER": "3" } -``` - -The tier is applied during `onCreateCommand` by copying the corresponding file to `.claude/settings.local.json`. diff --git a/docs/DEVELOPMENT_PROCESS.md b/docs/DEVELOPMENT_PROCESS.md index c76ea10..5b7770b 100644 --- a/docs/DEVELOPMENT_PROCESS.md +++ b/docs/DEVELOPMENT_PROCESS.md @@ -10,8 +10,7 @@ Detailed development workflow for this repository. Referenced from `CLAUDE.md`. 1. 
Read `docs/IMPLEMENTATION_PLAN.md` for current progress 2. Read `docs/CHANGELOG.md` for recent changes -3. Read `docs/DECISIONS.md` for prior feature requests and user decisions -4. Read any package-specific documentation relevant to the task +3. Read any package-specific documentation relevant to the task This ensures continuity and prevents duplicated or missed work. @@ -96,7 +95,6 @@ All agents use `subagent_type: "general-purpose"`. Do NOT use `feature-dev:code- | CI (S.6.3) fails on current code | Fix, push, re-run from S.6.3 | | CI fails on pre-existing issue | Document separately, do not block current work | | Code review flags architectural concern | Pause. Evaluate rework (back to S.4) vs. follow-up issue | -| Acceptance criteria (P.3.2) reveals previous phase regression | File as separate issue. Fix in current phase only if it's a direct regression | | Multiple steps fail repeatedly | Stop. Reassess scope -- may need to split into smaller increments | --- @@ -122,9 +120,8 @@ next phase for P-scope work. **P.3 Execute** (repeat per phase) 1. Run Standard Path (S.1 through S.7) for the phase -2. Verify acceptance criteria (use `.claude/agents/acceptance-criteria-validator.md`) -3. Update `docs/IMPLEMENTATION_PLAN.md` (use `.claude/agents/implementation-tracker.md` or built-in `Plan` agent) -4. Write phase handoff note (2-5 sentences: what completed, deviations, risks, dependencies, intentional debt) +2. Update `docs/IMPLEMENTATION_PLAN.md` (use built-in `Plan` agent) +3. Write phase handoff note (2-5 sentences: what completed, deviations, risks, dependencies, intentional debt) **P.4 Finalize** -- Merge. Version bump and changelog consolidation if applicable. 
@@ -142,54 +139,32 @@ All custom agents are in `.claude/agents/` and use `subagent_type: "general-purp | S.6.4 | `review-responder.md` | Handle automated reviewer comments | | S.6.5 | `code-reviewer.md` | Independent code review | | S.7 | `docs-updater.md` | Verify and update documentation | -| P.3.2 | `acceptance-criteria-validator.md` | Verify acceptance criteria | -| P.3.3 | `implementation-tracker.md` | Verify plan matches reality | -| -- | `agent-auditor.md` | Audit agent definitions against best practices | -| -- | `security-auditor.md` | OWASP-based security analysis (read-only) | -| -- | `refactoring-specialist.md` | SOLID/code smell analysis (read-only) | -| -- | `output-evaluator.md` | LLM-as-Judge quality scoring | --- ## Hooks -5 hook scripts in `.claude/hooks/` run automatically via settings.json: +2 hook scripts in `.claude/hooks/` run automatically via settings.json: | Hook | Event | Matcher | Behavior | |------|-------|---------|----------| -| `dangerous-actions-blocker.sh` | PreToolUse | Bash | Blocks `rm -rf`, `sudo`, `DROP DATABASE`, `git push --force`, secrets in args. Exit 2 = block. | -| `unicode-injection-scanner.sh` | PreToolUse | Edit\|Write | Blocks zero-width chars, RTL overrides, ANSI escapes, null bytes, tag chars. Exit 2 = block. | -| `output-secrets-scanner.sh` | PostToolUse | Bash | Scans output for AWS/Anthropic/OpenAI/GitHub keys, JWTs, private keys, DB URLs. Warns via systemMessage. | +| `dangerous-actions-blocker.sh` | PreToolUse | Bash | Blocks exfiltration via trusted channels (gh gist, gh issue --body, publishing) and secrets in args. Exit 2 = block. | | `auto-format.sh` | PostToolUse | Edit\|Write | Runs `uv run ruff format` and `uv run ruff check --fix` on edited .py files. Synchronous. | -| `test-on-change.sh` | PostToolUse | Edit\|Write | Discovers and runs associated test file. Informational (systemMessage on failure). | All hooks require `jq` for JSON parsing and degrade gracefully if jq is missing. 
--- -## Commands - -3 slash commands in `.claude/commands/`: - -| Command | Purpose | -|---------|---------| -| `/cove` | Chain-of-Verification (CoVe) for high-stakes accuracy. 4-step process: generate baseline, plan verifications, verify from source, produce corrected response. | -| `/cove-isolated` | Isolated CoVe variant. Verification step runs in a separate agent that cannot see the baseline response, preventing confirmation bias. | -| `/security-audit` | 6-phase Python security scan (deps, secrets, code patterns, input validation, config, scoring). Outputs A-F grade. | - ---- - ## Skills -5 skills in `.claude/skills/`: +4 skills in `.claude/skills/`: | Skill | Purpose | |-------|---------| | `/sync` | Pre-flight workspace sync. Fetches remote, reports branch state, dirty files, ahead/behind, recent commits. | | `/design` | Crystallize brainstorming into a structured plan. Reads DECISIONS.md for conflicts, auto-classifies scope, outputs actionable plan. | -| `/done` | Universal completion. Auto-detects scope (Q/S/P), validates (3-tier checklist), ships/lands/delivers, updates docs. Absorbs former `/ship`. | +| `/done` | Universal completion. Auto-detects scope (Q/S/P), validates (3-tier checklist), ships/lands/delivers, updates docs. | | `/landed` | Post-merge lifecycle. Verifies merge CI, optional deployment checks, cleans up branches, prepares next phase. | -| `/edit-permissions` | Manage Claude Code permission rules in settings.json. Pattern syntax reference and safety guardrails. 
| --- diff --git a/docs/IMPLEMENTATION_PLAN.md b/docs/IMPLEMENTATION_PLAN.md index 238ab36..f251f82 100644 --- a/docs/IMPLEMENTATION_PLAN.md +++ b/docs/IMPLEMENTATION_PLAN.md @@ -1,327 +1,3 @@ # Implementation Plan -> **Status**: Phase 10 - Complete -> **Last Updated**: 2026-03-09 - -## Quick Status Summary - -| Phase | Name | Status | -|-------|------|--------| -| 1 | Fix 5 Template Bugs | Complete | -| 2 | Permissions Modernization | Complete | -| 3 | Security Hooks | Complete | -| 4 | Productivity Hooks | Complete | -| 5 | Slash Commands | Complete | -| 6 | New Agents | Complete | -| 7 | Review Rules | Complete | -| 8 | GitHub Actions + Config + Documentation | Complete | -| 9 | Devcontainer Permission Tiers | Complete | -| 10 | Workflow Skills (/sync, /design, /done) | Complete | - ---- - -## Phase 1: Fix 5 Template Bugs - -**Goal:** Fix critical setup script bugs preventing correct package customization and workspace builds. - -**Acceptance Criteria:** -- [x] Bug 1: rename_packages() updates pyproject.toml package names after directory renames -- [x] Bug 2: rename_packages() updates __init__.py docstrings after directory renames -- [x] Bug 3: Additional packages use -name pattern to avoid false matches (e.g., "core" substring) -- [x] Bug 4: Root pyproject.toml does not include [build-system] section in workspace mode -- [x] Bug 5: Template packages include tests/ directories so pytest discovers package tests -- [x] All documentation uses uv sync --all-packages flag for workspace installs -- [x] PCC includes Step -1 for mandatory feature branch creation -- [x] settings.json allows uv run execution - -**Tasks:** -- [x] Add _update_package_contents() helper to update renamed package internals -- [x] Call _update_package_contents() in rename_packages() after directory moves -- [x] Change additional package creation to use -name pattern replacements -- [x] Remove [build-system] from root pyproject.toml template -- [x] Add tests/ directory creation to 
rename_packages() for new packages -- [x] Update CLAUDE.md, README.md, and tests.yml to use --all-packages flag -- [x] Add PCC Step -1 documentation for feature branch requirement -- [x] Add settings.json with Claude Code server uv run permissions - -**Decisions & Trade-offs:** - -| Decision | Alternatives Considered | Why This Option | -|----------|------------------------|-----------------| -| Use -name pattern for package name replacements | Bare name replacement (e.g., "core" -> "engine") | Avoids false matches on substrings like "core" appearing in "pyproject" or other unrelated strings; more robust and predictable | - -**Phase Completion Steps:** -> After this phase, follow the development process in `docs/DEVELOPMENT_PROCESS.md`: classify as Q/S/P, then execute the corresponding path (validate with agents, ship, document). - ---- - -## Phase 2: Permissions Modernization - -**Goal:** Eliminate unnecessary permission prompts, migrate deprecated syntax, and add a permissions management skill. 
- -**Acceptance Criteria:** -- [x] CLAUDE.md instructs Claude to use absolute paths instead of `cd /path && command` chains -- [x] CLAUDE.md instructs Claude to use `TaskOutput` tool instead of `tail`/`cat` on task output files -- [x] All `:*` patterns in settings.json replaced with ` *` (deprecated syntax migration) -- [x] settings.json remains valid JSON -- [x] `.claude/skills/edit-permissions/SKILL.md` created with permission management instructions -- [x] `/edit-permissions` skill is user-invocable and includes pattern syntax, safety rules, and shell operator warnings -- [x] IMPLEMENTATION_PLAN.md updated with Phase 2 -- [x] CHANGELOG.md updated with user-facing changes - -**Tasks:** -- [x] Add shell command style rules to CLAUDE.md (absolute paths + TaskOutput guidance) -- [x] Modernize settings.json -- replace all `:*` patterns with ` *` -- [x] Create `.claude/skills/edit-permissions/SKILL.md` with permission management instructions -- [x] Update documentation (IMPLEMENTATION_PLAN.md and CHANGELOG.md) -- [x] Add automated permission tests (tests/test_permissions.py) validating structure, syntax, matching, conflicts, security, and evaluation order - -**Decisions & Trade-offs:** - -| Decision | Alternatives Considered | Why This Option | -|----------|------------------------|-----------------| -| Instruct absolute paths in CLAUDE.md instead of adding `Bash(cd *)` | Adding `Bash(cd *)` to allow list; using PreToolUse hooks to parse chained commands | `Bash(cd *)` doesn't solve the problem (shell operator protection still blocks `cd && cmd`). Hooks add complexity. Absolute paths are simpler and align with existing permission patterns | -| Instruct `TaskOutput` tool usage instead of adding `Read` allow rule for temp dir | Adding `Read(//c/Users/*/AppData/Local/Temp/claude/**)` to settings.json | The temp path is OS-specific and user-specific, making it non-portable for a template. 
`TaskOutput` is the dedicated built-in tool and needs no file permissions | -| Create custom `.claude/skills/` skill instead of installing `@otrebu/claude-permissions` | Installing community skill ZIP; using built-in `/permissions` command only | Community skill is for claude.ai web (ZIP upload), not Claude Code CLI. Custom skill lives in `.claude/skills/` and ships with the template automatically | - -**Phase Completion Steps:** -> After this phase, follow the development process in `docs/DEVELOPMENT_PROCESS.md`: classify as Q/S/P, then execute the corresponding path (validate with agents, ship, document). - ---- - -## Phase 3: Security Hooks - -**Goal:** Add 3 security hooks -- the template previously had zero hooks. - -**Acceptance Criteria:** -- [x] All 3 hook scripts exist, are executable, have `#!/bin/bash` shebang -- [x] dangerous-actions-blocker exits 2 for blocked patterns, 0 otherwise -- [x] unicode-injection-scanner exits 2 for blocked Unicode, 0 otherwise -- [x] output-secrets-scanner exits 0 always (PostToolUse), emits systemMessage JSON for warnings -- [x] settings.json has hooks section with correct nested structure -- [x] setup_project.py processes .sh files for placeholders -- [x] tests/test_hooks.py passes - -**Tasks:** -- [x] Create `.claude/hooks/dangerous-actions-blocker.sh` -- PreToolUse(Bash) blocker -- [x] Create `.claude/hooks/output-secrets-scanner.sh` -- PostToolUse(Bash) scanner -- [x] Create `.claude/hooks/unicode-injection-scanner.sh` -- PreToolUse(Edit|Write) blocker -- [x] Add hooks section to `.claude/settings.json` -- [x] Add `.sh` to `TEXT_EXTENSIONS` in setup_project.py -- [x] Create tests/test_hooks.py - -**Decisions & Trade-offs:** - -| Decision | Alternatives Considered | Why This Option | -|----------|------------------------|-----------------| -| Hook directory `.claude/hooks/` | Root-level `hooks/` dir; inline scripts in settings.json | Official Claude Code convention for project-scoped hooks | -| Path resolution via 
`$CLAUDE_PROJECT_DIR` | Hardcoded paths; relative paths | Official env var, works across different checkout locations | -| jq for JSON parsing with graceful degradation | Python JSON parsing; no JSON validation | jq is lightweight and standard for shell JSON; graceful exit 0 if missing avoids blocking developer work | - -**Phase Completion Steps:** -> After this phase, follow the development process in `docs/DEVELOPMENT_PROCESS.md`: classify as Q/S/P, then execute the corresponding path (validate with agents, ship, document). - ---- - -## Phase 4: Productivity Hooks - -**Goal:** Add auto-format and test-on-change hooks. - -**Acceptance Criteria:** -- [x] auto-format.sh runs ruff format and ruff check --fix on edited .py files -- [x] test-on-change.sh discovers and runs associated test file -- [x] Both hooks registered in settings.json PostToolUse for Edit|Write -- [x] tests/test_hooks.py updated with new hook tests - -**Tasks:** -- [x] Create `.claude/hooks/auto-format.sh` -- PostToolUse(Edit|Write) formatter -- [x] Create `.claude/hooks/test-on-change.sh` -- PostToolUse(Edit|Write) test runner -- [x] Add PostToolUse hooks to settings.json -- [x] Add tests for new hooks - -**Decisions & Trade-offs:** - -| Decision | Alternatives Considered | Why This Option | -|----------|------------------------|-----------------| -| auto-format is synchronous (no systemMessage) | Async with systemMessage reporting | Claude must see formatted code before next edit to avoid fighting the formatter | -| test-on-change is informational (systemMessage) | Blocking (exit 2 on test failure) | Failing tests should inform, not block -- developer may be mid-refactor | - -**Phase Completion Steps:** -> After this phase, follow the development process in `docs/DEVELOPMENT_PROCESS.md`: classify as Q/S/P, then execute the corresponding path (validate with agents, ship, document). 
- ---- - -## Phase 5: Slash Commands - -**Goal:** Add 3 slash commands for context restoration, security auditing, and pre-deployment checks. - -**Acceptance Criteria:** -- [x] ~~/catchup command restores context after /clear~~ (removed: overlaps with /sync) -- [x] `/security-audit` command runs 6-phase security scan with A-F grading -- [x] `/ship` command runs 3-tier pre-deployment checklist -- [x] All commands have frontmatter with `allowed-tools` and `description` -- [x] tests/test_commands.py passes - -**Tasks:** -- [x] ~~Create `.claude/commands/catchup.md`~~ (removed: replaced by /landed skill) -- [x] Create `.claude/commands/security-audit.md` -- [x] Create `.claude/commands/ship.md` -- [x] Create tests/test_commands.py - -**Decisions & Trade-offs:** - -| Decision | Alternatives Considered | Why This Option | -|----------|------------------------|-----------------| -| Use `.claude/commands/` not `.claude/skills/` | Skills directory | Commands are simpler (single file) and correct for instruction-set-style content; skills are for more complex multi-file capabilities | -| All commands set `allowed-tools` | No tool restrictions | Scoping tools prevents commands from accidentally modifying code when they should only read | - -**Phase Completion Steps:** -> After this phase, follow the development process in `docs/DEVELOPMENT_PROCESS.md`: classify as Q/S/P, then execute the corresponding path (validate with agents, ship, document). - ---- - -## Phase 6: New Agents - -**Goal:** Add 3 specialized agents (security-auditor, refactoring-specialist, output-evaluator). 
- -**Acceptance Criteria:** -- [x] security-auditor is read-only (permissionMode: plan, no Bash/Edit/Write) -- [x] refactoring-specialist is read-only (permissionMode: plan, no Bash) -- [x] output-evaluator uses haiku model and dontAsk permissionMode -- [x] All 3 agents follow existing frontmatter pattern -- [x] tests/test_agents.py validates all 12 agents - -**Tasks:** -- [x] Create `.claude/agents/security-auditor.md` -- [x] Create `.claude/agents/refactoring-specialist.md` -- [x] Create `.claude/agents/output-evaluator.md` -- [x] Create tests/test_agents.py - -**Decisions & Trade-offs:** - -| Decision | Alternatives Considered | Why This Option | -|----------|------------------------|-----------------| -| security-auditor and refactoring-specialist are read-only (permissionMode: plan) | acceptEdits or dontAsk | These agents analyze, not modify -- principle of least privilege | -| output-evaluator uses dontAsk | plan or acceptEdits | Designed for automated pipeline integration where prompting would block | -| output-evaluator uses haiku model | sonnet | Scoring is structured and formulaic, doesn't need sonnet's reasoning depth | - -**Phase Completion Steps:** -> After this phase, follow the development process in `docs/DEVELOPMENT_PROCESS.md`: classify as Q/S/P, then execute the corresponding path (validate with agents, ship, document). - ---- - -## Phase 7: Review Rules - -**Goal:** Add 4 review rules auto-loaded as project context. 
- -**Acceptance Criteria:** -- [x] All 4 rule files exist in `.claude/rules/` -- [x] Rules have frontmatter with `description` field, no `paths:` field -- [x] Each rule is under 80 lines, checklist format -- [x] Rules cover architecture, code quality, performance, test quality -- [x] tests/test_rules.py passes - -**Tasks:** -- [x] Create `.claude/rules/architecture-review.md` -- [x] Create `.claude/rules/code-quality-review.md` -- [x] Create `.claude/rules/performance-review.md` -- [x] Create `.claude/rules/test-review.md` -- [x] Create tests/test_rules.py - -**Decisions & Trade-offs:** - -| Decision | Alternatives Considered | Why This Option | -|----------|------------------------|-----------------| -| No `paths:` frontmatter -- rules apply globally | Path-scoped rules (e.g., only `libs/`) | Rules cover universal concerns (architecture, quality, performance, tests) that apply to all code | -| Concise checklist format (<80 lines each) | Detailed prose explanations | Rules are loaded into every context window; brevity matters per ETH Zurich paper findings | -| No overlap with ruff/pyright | Including style/type rules | Rules should cover what linters cannot: architecture, design, and logic-level concerns | - -**Phase Completion Steps:** -> After this phase, follow the development process in `docs/DEVELOPMENT_PROCESS.md`: classify as Q/S/P, then execute the corresponding path (validate with agents, ship, document). - ---- - -## Phase 8: GitHub Actions + Config + Documentation - -**Goal:** AI-powered PR review, local config template, and comprehensive doc updates. 
- -**Acceptance Criteria:** -- [x] claude-code-review.yml workflow created with `anthropics/claude-code-action@v1` -- [x] settings.local.json.example created as local override template -- [x] .gitignore updated with `.claude/settings.local.json`, `.claude/hooks/*.log`, `CLAUDE.local.md` -- [x] setup_project.py adds `chmod +x` for hook scripts after placeholder substitution -- [x] CLAUDE.md updated with security hooks reference (kept compact per ETH Zurich paper decision) -- [x] README.md updated with full feature list (12 agents, 5 hooks, 3 commands, 4 rules) -- [x] DEVELOPMENT_PROCESS.md updated with hooks, commands, rules, and new agent references - -**Tasks:** -- [x] Create `.github/workflows/claude-code-review.yml` -- [x] Create `.claude/settings.local.json.example` -- [x] Update `.gitignore` -- [x] Update `setup_project.py` with chmod +x for hooks and .sh in TEXT_EXTENSIONS -- [x] Update `CLAUDE.md` (compact -- security hooks only) -- [x] Update `README.md` (full feature list, project structure, agent table) -- [x] Update `docs/DEVELOPMENT_PROCESS.md` (hooks, commands, rules tables + new agents) - -**Decisions & Trade-offs:** - -| Decision | Alternatives Considered | Why This Option | -|----------|------------------------|-----------------| -| Keep CLAUDE.md compact, put detailed tables in DEVELOPMENT_PROCESS.md | Add Hooks/Commands/Rules sections to CLAUDE.md | Respects the ETH Zurich paper decision (D48) to keep CLAUDE.md under ~40 lines | -| claude-code-review.yml uses `claude-sonnet-4-6` model | opus or haiku | Best balance of review quality and cost for automated PR review | -| Read-only tools (Read, Glob, Grep) for CI review action | Including Edit/Bash | CI review should analyze, not modify -- matches security principle of least privilege | - -**Phase Completion Steps:** -> After this phase, follow the development process in `docs/DEVELOPMENT_PROCESS.md`: classify as Q/S/P, then execute the corresponding path (validate with agents, ship, document). 
- ---- - -## Phase 9: Devcontainer Permission Tiers - -**Goal:** Expand Claude Code permissions inside devcontainers using container isolation (firewall, non-root, hooks) to safely reduce prompts. - -**Acceptance Criteria:** -- [x] Three tier files in `.devcontainer/permissions/` (Assisted, Autonomous, Full Trust) -- [x] `devcontainer-policy-blocker.sh` hook using `$PERMISSION_TIER` env var (fail-closed) -- [x] Tier files are fully self-contained (permissions + hooks) to survive settings.local.json replace semantics -- [x] `docs/DEVCONTAINER_PERMISSIONS.md` maps every denied command to its approved alternative -- [x] Tests validate tier structure and policy hook behavior - -**Decisions & Trade-offs:** - -| Decision | Alternatives Considered | Why This Option | -|----------|------------------------|-----------------| -| Three graduated tiers stored as JSON in `.devcontainer/permissions/` | Single expanded settings file | Graduated trust levels let teams choose their risk tolerance | -| Tier 2 uses `Bash(*)` allow with curated deny list | Per-command allow patterns | Zero prompts for bash; denied commands fail immediately instead of prompting | -| Separate `devcontainer-policy-blocker.sh` hook | Modifying existing hooks | Catches denied patterns in chained commands that bypass glob-based deny rules | -| Each tier file fully self-contained | Shared base + tier-specific overlays | `settings.local.json` replaces (not merges) `settings.json`; self-contained files prevent silently losing base hooks | - ---- - -## Phase 10: Workflow Skills (/sync, /design, /done) - -**Goal:** Replace rigid upfront QSP classification with three entry-point skills that auto-detect scope at completion time. 
- -**Acceptance Criteria:** -- [x] `/sync` skill checks workspace readiness (git fetch, status, branch info, recent commits) -- [x] `/design` skill crystallizes brainstorming into structured plans with DECISIONS.md conflict detection -- [x] `/done` skill auto-detects scope (Q/S/P), validates (3-tier checklist), ships/lands/delivers, documents -- [x] `/ship` command absorbed into `/done` Phase 2 (3-tier checklist preserved) -- [x] CLAUDE.md updated: `/design` estimates scope, `/done` auto-detects actual scope -- [x] `tests/test_skills.py` validates all 4 skills (54 tests) - -**Decisions & Trade-offs:** - -| Decision | Alternatives Considered | Why This Option | -|----------|------------------------|-----------------| -| `/plan` renamed to `/design` | Keep `/plan` | `/plan` is a built-in Claude Code command (enters read-only plan mode) | -| `/sync` and `/done` have `disable-model-invocation: true` | Allow model invocation | Both have side effects (git fetch, git commit/push, PR creation) | -| `/design` intentionally model-invocable | Disable model invocation | Claude should suggest `/design` when brainstorming seems ready to formalize | -| `/ship` absorbed into `/done` Phase 2 | Keep `/ship` as separate command | Single completion command is simpler; `/done` orchestrates the full pipeline | - ---- - -## Implementation Notes - - + diff --git a/docs/community/GROWTH_PLAN.md b/docs/community/GROWTH_PLAN.md deleted file mode 100644 index 49e562b..0000000 --- a/docs/community/GROWTH_PLAN.md +++ /dev/null @@ -1,128 +0,0 @@ -# Community Growth Plan: Make It Usable, Then Known - -Synthesized from assessments by Claude, Gemini, and OpenAI (March 2025). - ---- - -## Phase 1: Lower the Barrier (Coding) - -All three sources agree: **usability before promotion**. Nobody shares a tool that's hard to start with. 
- -### 1.1 Restructure README for instant comprehension -- Add a **"Why this template?"** pain-point section at the very top (currently jumps straight into features) -- Add a **60-second quickstart** block: 3 commands to a working project -- Lead with **single-package mode** as the default, monorepo as "advanced" (currently `--type single` is buried) -- Group agents as **"5 core + 7 optional specialists"** instead of "12 agents" which sounds overwhelming -- Add badges: CI status, Python version, license - -### 1.2 Create a "Hello World" example repo -- Generate a real project from the template and actually build something small with it (e.g., a CLI tool or a tiny FastAPI app) -- Use the template's own workflow to build it (`/sync`, `/design`, `/done`) -- this proves the workflow works end-to-end -- Publish as `stranma/example-from-template` -- Include screenshots + `tree` output showing the end result -- Link from README: "See what a generated project looks like" -- This is the single most convincing proof point -- it shows the output, not just the scaffolding - -### 1.3 Harden setup_project.py -- Graceful failure when `uv` isn't installed -- detect and print install instructions -- Add a `--minimal` flag that installs only core agents/hooks for people wanting to ease in - -### 1.4 Add community health files -- `CONTRIBUTING.md` -- explain how to add agents, hooks, improve prompts -- `SECURITY.md` -- describe the trust model: what hooks/agents execute and why you shouldn't run on untrusted repos -- `CODE_OF_CONDUCT.md` - -### 1.5 Fix inconsistencies -- Repo description says "7 custom agents" but README says 12 -- align -- Add a brief section describing the trust model for hooks - ---- - -## Phase 2: Build Trust Signals (Coding + GitHub) - -### 2.1 Release strategy -- Tag `v1.0.0` and create a GitHub Release with notes (currently 0 tags) -- Future changes get proper releases with migration notes - -### 2.2 GitHub discoverability -- Set Topics: `claude-code`, 
`python-template`, `uv`, `tdd`, `monorepo`, `ruff`, `pyright`, `ai-agents`, `pre-commit` -- Tighten the one-line repo description for search - -### 2.3 "Where did you find this?" discussion -- Enable GitHub Discussions, add a "How did you find us?" pinned thread (lightweight adoption telemetry) - -### 2.4 Comparison table -- Add honest comparison vs. `iepathos/python-claude-code`, `discus0434/python-template-for-claude-code`, `scotthavird/claude-code-template`, `serpro69/claude-starter-kit` -- People respect honesty about trade-offs - ---- - -## Phase 3: Create Content (Real-World) - -### 3.1 Record a demo -- 60-second asciinema/Loom showing: template -> setup -> Claude Code catches a linting error -> forced to write tests -- Embed in README - -### 3.2 Write the "methodology" article -- Publish on dev.to/Medium: "How to structure Python projects for AI coding assistants" -- Focus on the Q/S/P classification and *why* -- the methodology is the interesting content, the template is the reference implementation -- Educational first, link to repo as implementation -- not an ad - -### 3.3 Short YouTube/Loom walkthrough -- 5 minutes max showing the full workflow - ---- - -## Phase 4: Distribute (Real-World) - -### 4.1 Targeted communities (do first) -- Submit PR to **awesome-claude-code** list under "Project Templates" -- Post to **Claude Developers Discord** as "here's what I built, feedback welcome" -- Post to **r/ClaudeAI**, **r/Python** -- text post explaining the problem, not just a link drop - -### 4.2 Show HN -- Title: `Show HN: A Python template that forces Claude Code to use TDD` -- Must have the demo GIF and polished README ready first - -### 4.3 Social media -- X/LinkedIn thread with before/after narrative -- Tag **@AnthropicAI** and **@astaborsky** (Astral/uv/ruff creators) - -### 4.4 Newsletters (medium-term) -- Pitch to PyCoder's Weekly, TLDR AI, Python Weekly - ---- - -## Phase 5: Sustain (Ongoing) - -### 5.1 Find 3-5 early adopters -- More important than 
broadcasting widely -- one genuine testimonial beats 100 impressions -- Their issues/PRs make the repo look alive (10 closed issues > 0 issues) - -### 5.2 Cross-reference from own projects -- Use the template in your public projects, link back -- Real usage is the best advertisement - -### 5.3 Manage support expectations -- Set clear expectations in README/CONTRIBUTING about response times -- Public repos create support obligations - ---- - -## Completed (PR #21) - -- [x] README restructure (1.1) -- personas, workflow, devcontainer section, honest tone -- [x] Getting Started guide (partial 1.4) -- docs/GETTING_STARTED.md -- [x] Fix "battle-tested" / "production-quality" claims (1.5) - -## Priority Order (remaining) - -| Priority | Step | Type | Impact | -|----------|------|------|--------| -| 1 | Example project (1.2) | Code | "Show don't tell" | -| 2 | Community files -- CONTRIBUTING.md, SECURITY.md, CODE_OF_CONDUCT.md (1.4) | Code | Trust signals | -| 3 | v1.0.0 release + GitHub topics (2.1, 2.2) | GitHub | Discoverability | -| 4 | Demo recording (3.1) | Real-world | Conversion driver | -| 5 | awesome-claude-code + Discord + Reddit (4.1) | Real-world | First users | -| 6 | Blog post (3.2) | Real-world | Long-tail discovery | -| 7 | Show HN + social (4.2, 4.3) | Real-world | Broader reach | diff --git a/docs/community/README_DRAFT.md b/docs/community/README_DRAFT.md deleted file mode 100644 index 15db8f2..0000000 --- a/docs/community/README_DRAFT.md +++ /dev/null @@ -1,19 +0,0 @@ -# README Draft Notes - -This file tracked the README rewrite process. The canonical README is now at [/README.md](/README.md). 
- -## Research inputs - -- `docs/community/claude.md` -- Claude assessment transcript -- `docs/community/gemini.md` -- Gemini assessment transcript -- `docs/community/openai.md` -- OpenAI assessment transcript -- `docs/community/feedback/` -- Round 2 feedback from all three models on the rewritten README - -## Key decisions made during rewrite - -- Lead with personas (solo dev first, beginners last), not features -- `/sync` -> `/design` -> `/done` workflow is the primary selling point -- "Opinionated" is stated upfront with no apology, but no unsubstantiated claims -- Devcontainer section included for security angle but kept after Quick Start -- Token costs described by relative cost drivers, not fake precision -- CLAUDE.md shown inline in collapsible block so visitors can judge quality diff --git a/docs/community/claude.md b/docs/community/claude.md deleted file mode 100644 index 49db2e1..0000000 --- a/docs/community/claude.md +++ /dev/null @@ -1,92 +0,0 @@ -# Archived Transcript: Claude Assessment - -Captured: March 2025 | Model: Claude 3.5 Sonnet | Subject: stranma/claude-code-python-template - ---- - -Please estimate how useful is stranma/claude-code-python-template - -Mar 1 - -Investigated repository availability and resolved to communicate findings honestly -Investigated repository availability and resolved to communicate findings honestly -Martin, I wasn't able to find a repository called stranma/claude-code-python-template on GitHub. My searches returned no results for that exact repo — it either doesn't exist (yet), is private, has been deleted, or the name might be slightly different. - -Could you double-check the exact repo name or share the URL? 
A couple of possibilities: - -iepathos/python-claude-code — a Python starter template optimized for Claude Code with CLAUDE.md, pytest, pre-commit hooks, Poetry support -discus0434/python-template-for-claude-code — a Japanese-documented Python template with type checking, slash commands, and comprehensive test scaffolding -scotthavird/claude-code-template — a barebones template with devcontainer support and hook logging -If it's one of these (or something else), I'm happy to dig into it and give you an assessment of how useful it'd be for your workflow. - -https://github.com/stranma/claude-code-python-template - -Mar 1 - -Evaluated repository utility for Martin's needs -Evaluated repository utility for Martin's needs -Got it, now I can give you a proper assessment. Here's my take: - -stranma/claude-code-python-template — Usefulness Estimate -Overall: Moderately to quite useful, especially given your workflow with Claude Code. But with caveats. - -What's genuinely valuable: - -The template is well-thought-out in its core design — the Q/S/P (Quick/Standard/Project) task classification is a smart way to avoid over-engineering trivial fixes while maintaining rigor for complex work. The 12 agents are nicely scoped with clear responsibilities (code quality, test coverage, PR writing, security auditing, etc.), and the monorepo layout using uv workspaces with apps/ + libs/ separation is a solid modern Python pattern (credit to carderne/postmodern-mono). The security hooks (dangerous-actions-blocker, unicode-injection-scanner, output-secrets-scanner) are practical additions you rarely see in templates. - -The tool stack (uv, ruff, pyright, pytest, hatchling) is the current "best practice" Python toolchain, so you'd be starting from a good foundation. - -Concerns: - -The repo is brand new with 0 stars, 0 forks, and a single contributor with 55 commits. There's no community validation or battle-testing beyond the author's own projects. 
The README claims it's "proven across real production projects" but there's no evidence of that beyond the claim itself. - -The CLAUDE.md is described as ~40 lines, which is admirably concise, but the overall template is quite opinionated and heavyweight — 12 agents, 5 hooks, 3 slash commands, 4 review rules. For your energy market optimization projects where you're often doing mathematical/scientific Python work, a lot of this CI/CD and PR ceremony may be overkill, especially when you're working solo as a contractor. - -The monorepo structure might also be more structure than you need — your site-calc and similar projects tend to be focused packages rather than multi-app monorepos. - -For your specific use case, Martin: - -I'd rate it 6/10. The individual pieces (uv workspaces, the agent definitions, the hook scripts) are worth cherry-picking from, but adopting the whole template wholesale might introduce more process than your typical energy optimization project needs. You'd probably get more value by grabbing the security hooks, the CLAUDE.md approach, and maybe the Q/S/P classification concept, then integrating them into your existing devcontainer-based workflow rather than restructuring around this template. - -Good news. It is my repo. How to improve it for community? Would it make sense to make it more known? How to do it? - -Mar 1 - -Recognized authorship and strategized community visibility tactics -Recognized authorship and strategized community visibility tactics -Nice, congrats on putting it together! The foundation is solid. Here's my honest take on both fronts: - -Improving for Community Adoption -The biggest barrier right now is the "wall of text" problem. The README is comprehensive but overwhelming for someone landing on the repo for the first time. A few concrete improvements: - -Lower the entry barrier. 
Add a 30-second GIF or screenshot showing the template in action — someone running setup_project.py, Claude Code picking up the agents, a test passing. People decide in seconds whether to engage. - -Add a "Why this template?" section right at the top that addresses the pain points it solves. Something like: "You start a Python project with Claude Code, and within an hour you're fighting with linting setup, writing PR templates from scratch, and forgetting to run tests. This template eliminates that." Right now it jumps straight into features without selling the problem. - -Provide a minimal path. The --type single mode is buried. Many people don't want a monorepo — lead with the simple case and make monorepo the "advanced" option. Consider even having a --minimal flag that skips most agents/hooks for people who want to grow into the full setup gradually. - -Show real-world usage. Create a small example project built with the template (even a toy API or CLI tool) so people can see the end result, not just the scaffolding. A link to a repo that was actually built using this template would be incredibly compelling. - -Trim the agent count in marketing. 12 agents sounds like a lot to manage. Consider grouping them — "5 core agents + 7 optional specialists" — so it feels approachable. The README currently reads as "you must use all of this" rather than "take what you need." - -Other practical improvements: - -Add a CONTRIBUTING.md — if you want community contributions, make the path clear -The description says "7 custom agents" but the README lists 12 — fix this inconsistency -Add badges (CI status, Python version, license) for credibility signals -Include a comparison table vs. other templates (iepathos/python-claude-code, discus0434's template) — be honest about trade-offs, people respect that -Getting It Known -Yes, it makes sense to promote it, but timing and channel matter. 
The Claude Code ecosystem is exploding right now (8,000+ repos tagged on GitHub) and people are actively looking for good templates. Here's a practical playbook: - -Immediate high-impact moves: - -Submit to awesome-claude-code — this is the main curated list. Your template fits squarely in their "Project Templates" category. Open a PR with a concise description focusing on what's unique (the Q/S/P workflow, the security hooks). -Post on the Claude Developers Discord — Anthropic's official community. Share it as "here's what I built, feedback welcome" rather than pure promotion. Devs there are hungry for good Claude Code setups. -Write a short blog post or dev.to article explaining the methodology behind the Q/S/P classification and why you structured the agents this way. The template is the artifact; the thinking behind it is the interesting content that gets shared. Post it to Hacker News, Reddit r/ClaudeAI, and r/Python. -Tweet/post on X and LinkedIn with a concrete before/after: "Before this template, setting up a Python project for Claude Code took me X hours of configuration. Now it's one command." Tag relevant people in the Claude Code community. -Medium-term: - -Get one or two early adopters who'll actually use it and give you feedback. Their issues and PRs make the repo look alive. A repo with 0 issues looks untested; a repo with 10 closed issues looks battle-hardened. -Cross-reference from your other work. If you use this template in your own public projects, link back to it. Real usage is the best advertisement. -Consider a short YouTube/Loom walkthrough (5 minutes max). Visual content converts way better than READMEs for developer tools. -The honest reality: most GitHub templates live or die based on whether 3-5 early adopters find them useful enough to star and share. Focus your energy on finding those first few users rather than broadcasting widely. One genuine testimonial beats a hundred impressions. 
\ No newline at end of file diff --git a/docs/community/feedback/claude.md b/docs/community/feedback/claude.md deleted file mode 100644 index a8e3452..0000000 --- a/docs/community/feedback/claude.md +++ /dev/null @@ -1,55 +0,0 @@ -Alright, here's my honest take as a mid-experience dev who's used Claude Code a bit but isn't deep into it yet. - -## Would I try it? - -**Probably yes, but with hesitation.** The pitch is compelling — I've definitely felt the pain of Claude Code sessions where tests get skipped, context drifts, and a "quick fix" turns into a rewrite. The three-command workflow (`/sync`, `/design`, `/done`) is immediately appealing because it's simple to remember and the auto-detection of scope is a nice touch. - -But I'd hesitate because: - -1. **It feels like a lot.** 12 agents, 5 hooks, 4 review rules, 3 permission tiers, devcontainer setup... For someone who just wants to write better Python with Claude Code, this is intimidating. I'd be wondering "do I need all this to fix my FastAPI app?" - -2. **I can't see the actual CLAUDE.md or agents before committing.** The README tells me *about* them but doesn't show me what they actually do. I'd want to peek at the CLAUDE.md (~40 lines, you say — just show it!) and maybe one agent definition to judge quality before cloning. - -3. **No demo or screenshot.** I have no idea what a `/done` run actually looks like in practice. A 30-second terminal recording (asciinema) or even a screenshot of a PR it generated would massively increase my confidence. - -## What else I'd need to decide - -- **Does this play nice with my existing project?** Can I adopt pieces incrementally, or is it all-or-nothing? The README implies you start fresh from the template, but most devs have existing repos. -- **How opinionated is *too* opinionated?** You say ruff, pyright, pytest, hatchling. What if I use mypy? Or poetry? How painful is it to swap? -- **What's the actual token cost?** "A few extra cents per PR" is vague. 
Give me a ballpark: "~5k-20k tokens per `/done` on a standard PR" or something concrete. -- **How does it handle failure?** What happens when an agent disagrees with me, or the code review agent flags something I want to keep? Is there an override? -- **Has anyone else actually used this?** No testimonials, no "used in production at X" — it reads like a solo project, which is fine, but I'd want to know. - -## How to improve the README - -**Structural issues:** - -- **Lead with a concrete example, not abstractions.** The "How It Works" section should show a real terminal session — actual commands, actual output. Right now it's a table of what `/done` *theoretically* does. Show me it working. -- **The "Who Is This For?" section tries to appeal to everyone.** Four personas is too many and dilutes the message. Pick your primary audience (I'd guess "solo dev who knows Python") and lead with that. Move the rest to a collapsible section or cut them. -- **Show the CLAUDE.md inline.** It's 40 lines — just put it in the README under a collapsible block. It's the core of the template and the thing I'd judge quality by. -- **The devcontainer section is too prominent.** It's important but it's not what sells me on the template. Move it after "What's Included" or make it collapsible. -- **Missing: "Adopting in an existing project" section.** Even a short note saying "this is designed for new projects, but here's how to cherry-pick the CLAUDE.md and agents into an existing repo" would significantly widen the audience. - -**Tone/copy issues:** - -- "An opinionated Python project template" — every template on GitHub says this. Lead with the outcome: "Stop babysitting Claude Code. This template makes it produce tested, linted, properly-documented Python on every commit." -- The "A Note on Token Costs" section is defensive. Reframe it positively: "Token costs: ~X tokens per PR for automated review + docs. We think that's a good trade for your time." 
-- "If that's what you want, you're in the right place. If not, no hard feelings." — this is charming but wastes space in the intro. Save the personality for later. - -## How to improve the repo itself - -1. **Add a `--adopt` mode to the setup script** that copies just the `.claude/` directory, hooks, and a starter CLAUDE.md into an existing project. This is the biggest adoption barrier. - -2. **Make agents composable/optional from day one.** Let me start with just CLAUDE.md + the 3 commands + maybe 2 core agents (code-quality, test-coverage). Add a `/setup-agents` command or config flag to enable the rest. Right now it's 12 agents whether I want them or not. - -3. **Add an example PR** — a real GitHub PR generated by the template, showing the PR description, the CI run, the code review comments. Link it from the README. Nothing sells a developer tool like seeing real output. - -4. **Provide a "minimal" template variant.** Something like `--type minimal` that gives you CLAUDE.md, the 3 commands, ruff/pyright/pytest config, and nothing else. Let people grow into the full setup. - -5. **Document escape hatches.** What if I want mypy instead of pyright? What if I don't want TDD enforced? Show me which lines to change. Opinionated is good, but "opinionated with documented overrides" is better. - -6. **Add metrics or before/after comparisons.** "Projects using this template average X% test coverage" or "Here's a PR without the template vs. with it." Quantify the value. - ---- - -**Bottom line:** The core idea is strong and solves a real problem. The README is well-written technically but tries to do too much at once. It reads like documentation for a mature tool, but what I need at the decision point is a quick "wow, that's cool" moment — a demo, a real PR, a before/after. Trim the personas, show don't tell, and add an adoption path for existing projects, and I think this becomes a genuinely compelling template. 
\ No newline at end of file diff --git a/docs/community/feedback/gemini.md b/docs/community/feedback/gemini.md deleted file mode 100644 index 5421c17..0000000 --- a/docs/community/feedback/gemini.md +++ /dev/null @@ -1,44 +0,0 @@ -Looking at this from the perspective of a mid-level developer who knows their way around Python but is still figuring out the best ways to integrate AI agents like Claude Code, this looks like a highly compelling, well-thought-out project. - -Here is my honest breakdown of the README, the repo's potential, and where it could be improved. - -### 1. Is this repo helpful? - -**Yes, exceptionally helpful—for the right audience.** The biggest problem with AI coding assistants right now is that they tend to drift. Without strict guardrails, they skip tests, break architectures, and turn small refactors into massive, un-mergeable rewrites. - -This template directly attacks that problem by forcing the AI into a structured, Test-Driven Development (TDD) loop and sandboxing it inside a devcontainer. The permission tiers (Assisted vs. Autonomous) are a fantastic inclusion, as giving an AI unrestricted terminal access on a host machine is a massive security risk. It provides a mature toolchain (`uv`, `ruff`, `pyright`) out of the box, which saves hours of configuration. - -### 2. Would I try it based on this README? - -**Yes, absolutely—but I'd test it on a weekend project first.** -The "Three commands" workflow (`/sync`, `/design`, `/done`) is a great hook. It abstracts away the tedious parts of git and CI/CD, allowing a solo developer to punch above their weight and maintain team-level code quality. - -However, because the template is heavily opinionated (if you prefer `poetry` over `uv`, or `flake8` over `ruff`, you're out of luck), I wouldn't immediately drop this into a production environment. 
I would want to spin it up in a greenfield project to see if the AI actually respects the boundaries set by the `.claude` config before trusting it with real work. - -### 3. What else would I need to decide before adopting? - -If I were evaluating this for real-world use, the README leaves a few lingering questions I'd need answers to: - -* **Actual Execution Time:** How long does `/done` actually take? If it's running linting, tests, AI code review, and writing a PR description, am I sitting at my terminal waiting 3-5 minutes for a single command to finish? -* **Real Token Costs:** The README waves away token costs by saying "A few extra cents per PR." I would need to know what a standard session actually costs. If I run `/done` 10 times a day, am I spending $1 or $15? -* **Vendor Lock-in / The Eject Button:** What happens if I decide Claude Code isn't working for me? It looks like standard Python, but I'd need to confirm that deleting the `.claude` folder leaves me with a perfectly normal, functional repository without breaking the CI/CD pipeline. - -### 4. How to improve the README - -The README is technically excellent, but it could use a bit more "show, don't tell" and some clarity on the AI specifics: - -* **Add a Demo GIF:** A terminal recording (like asciinema) showing the `/done` command in action. Seeing the AI automatically run tests, fix a linting error, and generate a PR description would be an instant sell. -* **Quantify the Costs:** Replace the vague note on costs with a realistic estimate. (e.g., *"A typical standard feature workflow consumes roughly 50k tokens, costing ~$0.15 in API credits."*) -* **Explain the Eject Path:** Add a tiny section reassuring developers that this is just a standard Python project underneath. Let them know they aren't permanently locked into Claude Code if they use this template. - -### 5. 
How to improve the repo (based on the provided structure) - -Based on the file tree and descriptions, here are a few technical improvements: - -* **Standard `pre-commit` Hooks:** I see GitHub Actions and Claude hooks, but standard local `pre-commit` (the Python package) seems to be missing. It's good practice to have local, non-AI git hooks as a fallback for when a developer commits manually without using `/done`. -* **Agent Prompt Testing:** If Anthropic updates the Claude model, the prompts in the 12 agents might behave differently. A method or script to "test the agents" (evals) to ensure they still output the expected formats would make this repo robust for the long term. -* **Clean Up Script:** The setup script (`setup_project.py`) is great, but a teardown/eject script (`eject_claude.py`) that strips out the `.claude` folder, devcontainer, and AI-specific GitHub actions would be a huge comfort feature for developers worried about lock-in. - ---- - -Would you like me to draft a more specific "Eject Path" section to add to this README, or perhaps mock up what the concrete Token Cost breakdown could look like? \ No newline at end of file diff --git a/docs/community/feedback/openai.md b/docs/community/feedback/openai.md deleted file mode 100644 index b3ca1ed..0000000 --- a/docs/community/feedback/openai.md +++ /dev/null @@ -1,987 +0,0 @@ -You are a mid-experienced developer and mildly familiar with Claude Code. - -Please read this readme and tell me your honest opinion on whether this repo would be helpful and whether you would try it based on this readme. What else would you need to decide? How to improve the readme? How to improve the repo? 
- -# Claude Code Python Template - -[![CI](https://github.com/stranma/claude-code-python-template/actions/workflows/tests.yml/badge.svg)](https://github.com/stranma/claude-code-python-template/actions/workflows/tests.yml) -[![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://python.org) -[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE) -[![uv](https://img.shields.io/badge/uv-workspace-blueviolet)](https://docs.astral.sh/uv/) - -**An opinionated Python project template that makes Claude Code produce consistent, tested, production-quality code.** - -Without structure, Claude Code results vary -- tests get skipped, context gets lost between sessions, small fixes turn into rewrites. This template gives you a complete starting point: TDD workflow, quality gates, security hooks, and a 3-command workflow that scales from typo fixes to multi-phase projects. - -This is opinionated by design. It picks uv, ruff, pyright, pytest, and hatchling. It enforces TDD. It runs agents on every PR. If that's what you want, you're in the right place. If not, no hard feelings. - -## Who Is This For? - -**Solo developer who knows Python?** You move fast, but you still want tests, type checking, linted code, and proper PRs -- and why not, when it all now comes almost for free! The template's agents handle the discipline so you can focus on the problem. The devcontainer setup means you can let Claude Code run more autonomously inside a sandbox -- no worrying about it running rm -rf on your host machine. - -**Leading a team adopting Claude Code?** Without a shared baseline, every developer has their own CLAUDE.md (or none). This template standardizes how your team uses Claude Code -- same workflow, same quality gates, same security hooks across all projects. 
The devcontainer with [permission tiers](docs/DEVCONTAINER_PERMISSIONS.md) lets you control how much autonomy Claude Code gets: from per-command approval (Tier 1) to full trust with minimal guardrails (Tier 3). - -**Data scientist or ML engineer?** You know Python and pandas, but software engineering practices (CI/CD, type annotations, code review) feel like overhead. This template adds those practices without you having to learn how to set them up. Claude Code handles the ceremony; you focus on the models. - -**New to Claude Code and still learning Python?** This template is a good way to learn professional practices by doing. It enforces TDD, type checking, linting, and proper git workflow -- things that are hard to pick up from tutorials alone. Claude Code walks you through it, and the agents catch mistakes before they stick. You'll need basic comfort with the terminal and git. If that's new to you, see [Getting Started Guide](docs/GETTING_STARTED.md) for the prerequisites. - -## How It Works - -Three commands. That's the whole workflow: - - -/sync Preflight check. Fetches remote, reports branch state, dirty files. -/design Turns brainstorming into a structured plan. Reads decision log, - auto-classifies scope, outputs actionable steps. -/done Ships your work. Auto-detects scope, validates (lint + test + review), - commits, creates PR, updates docs. One command. - - -Real workflows: - - -Quick fix: /sync -> fix the bug -> /done -New feature: /sync -> brainstorm with Claude -> /design -> "implement this" -> /done -Multi-phase: /sync -> brainstorm -> /design -> "implement phase 1" -> /done -> ... -> /done -Exploration: just talk to Claude -- no commands needed - - -You never classify tasks upfront. 
/done auto-detects scope from your branch, diff size, and whether an implementation plan exists -- then picks the right level of ceremony: - -| Detected scope | What /done does | -|----------------|-------------------| -| **Quick** (on main, small diff) | Validate, commit, push, verify CI | -| **Standard** (feature branch) | Validate, commit, PR, CI, code review, update changelog | -| **Project** (has plan phases) | All of Standard + acceptance criteria + plan update + handoff note | - -## Quick Start - -**Prerequisites:** Python 3.11+, [uv](https://docs.astral.sh/uv/getting-started/installation/), [Claude Code](https://docs.anthropic.com/en/docs/claude-code/overview). New to these tools? See [Getting Started Guide](docs/GETTING_STARTED.md). - -**1. Create your project** - - -bash -# From GitHub: click "Use this template", then clone your new repo -# Or directly: -git clone https://github.com/stranma/claude-code-python-template my-project -cd my-project - - -**2. Run setup** - - -bash -# Simple project (recommended for first use): -python setup_project.py --name my-tool --namespace my_tool --type single - -# Monorepo with multiple packages: -python setup_project.py --name my-project --namespace my_project --type mono --packages "core,api" - - -The setup script replaces {{project_name}} placeholders across all files, renames directories to match your namespace, and optionally initializes git. It only modifies files inside the project directory. - -**3. Install and verify** - - -bash -uv sync --all-packages --group dev -uv run pytest && uv run ruff check . && uv run pyright - - -That's it. Claude Code picks up the agents, hooks, and rules automatically. - -## Devcontainer Setup (Recommended) - -The template includes a full VS Code devcontainer configuration. This is the recommended way to work because it sandboxes Claude Code -- firewall, non-root user, and policy hooks limit what it can do, so you can give it more autonomy without risk to your host machine. 
- -**What the devcontainer provides:** - -- **Network firewall** -- all egress blocked except ~10 whitelisted domains (GitHub, PyPI, etc.) -- **Non-root user** -- Claude Code cannot install system packages or modify system files -- **Permission tiers** -- control how much autonomy Claude Code gets: - -| Tier | Name | Who | Claude Code behavior | -|------|------|-----|----------------------| -| 1 | Assisted | New users, compliance teams | Per-command approval | -| 2 | Autonomous (default) | Most developers | Free to run commands, curated deny list | -| 3 | Full Trust | Solo devs with strong CI | Minimal restrictions | - -- **Policy hooks** -- block dangerous patterns even in chained commands (cd /tmp && rm -rf *) -- **Pre-installed tools** -- Python, uv, ruff, git, Claude Code VS Code extension - -Set the tier before building: PERMISSION_TIER=1 (or 2, 3) in your environment. Default is 2. - -See [Devcontainer Permissions](docs/DEVCONTAINER_PERMISSIONS.md) for the full denied commands list and approved alternatives. - -## What's Included - -### Core (always active) - -- **CLAUDE.md** -- compact agent directives (~40 lines) with /sync, /design, /done workflow -- **5 workflow agents** -- code quality, test coverage, PR writing, code review, docs updates -- **3 security hooks** -- block destructive commands, scan for leaked secrets, catch Unicode injection -- **CI/CD** -- GitHub Actions for lint + test + typecheck + publish -- **Tool stack** -- [uv](https://docs.astral.sh/uv/) workspaces, [ruff](https://docs.astral.sh/ruff/), [pyright](https://github.com/microsoft/pyright), [pytest](https://pytest.org/), [hatchling](https://hatch.pypa.io/) - -### Optional specialists - -
-7 additional agents for larger projects - -| Agent | Purpose | -|-------|---------| -| acceptance-criteria-validator | Verify acceptance criteria across phases | -| implementation-tracker | Keep plan and reality in sync | -| review-responder | Automated review triage | -| agent-auditor | Audit agent definitions for best practices | -| security-auditor | OWASP-based vulnerability detection (read-only) | -| refactoring-specialist | SOLID/code smell analysis (read-only) | -| output-evaluator | LLM-as-Judge quality scoring | - -
- -
-2 productivity hooks - -- **auto-format** -- auto-formats Python files after edits -- **test-on-change** -- auto-runs associated tests after edits - -
- -
-Commands and skills - -- /sync -- preflight workspace check before starting work -- /design -- crystallize brainstorming into a structured plan -- /done -- validate, ship, and document in one command -- /landed -- post-merge lifecycle: verify CI, check deploys, clean branches -- /cove -- Chain-of-Verification for high-stakes accuracy -- /cove-isolated -- CoVe with isolated verification agent -- /security-audit -- 6-phase security posture scan with A-F grading -- /edit-permissions -- manage Claude Code permission rules - -
- -
-4 review rules - -Architecture, code quality, performance, and test quality -- applied automatically during code review. - -
- -## Project Structure - -### Monorepo (default) - - -my-project/ -├── CLAUDE.md # Agent directives (~40 lines) -├── apps/ # Executable applications -│ └── api/ -│ ├── pyproject.toml -│ └── my_project/api/ -├── libs/ # Reusable libraries -│ └── core/ -│ ├── pyproject.toml -│ └── my_project/core/ -├── tests/ -├── docs/ -│ ├── CHANGELOG.md -│ ├── DECISIONS.md -│ ├── DEVELOPMENT_PROCESS.md -│ └── IMPLEMENTATION_PLAN.md -├── .claude/ # Claude Code config -│ ├── settings.json -│ ├── agents/ # 12 agents -│ ├── skills/ # /sync, /design, /done, /landed, /edit-permissions -│ ├── commands/ # /cove, /cove-isolated, /security-audit -│ ├── hooks/ # 5 hook scripts -│ └── rules/ # 4 review rules -├── .devcontainer/ # VS Code devcontainer -│ ├── Dockerfile -│ ├── devcontainer.json -│ ├── init-firewall.sh -│ └── permissions/ # Tier 1/2/3 configs -├── .github/ -│ ├── workflows/ # CI/CD -│ ├── PULL_REQUEST_TEMPLATE.md -│ └── ISSUE_TEMPLATE/ -└── pyproject.toml # Root workspace config - - -### Single Package - - -my-tool/ -├── CLAUDE.md -├── src/my_tool/ -├── tests/ -├── docs/ -├── .claude/ -├── .devcontainer/ -├── .github/ -└── pyproject.toml - - -## Setup Script Options - -| Flag | Default | Description | -|------|---------|-------------| -| --name | (required) | Project name (e.g., my-project) | -| --namespace | from name | Python namespace (e.g., my_project) | -| --description | "A Python project" | Short description | -| --author | "" | Author name | -| --email | "" | Author email | -| --python-version | "3.11" | Python version requirement | -| --base-branch | "master" | Git base branch | -| --type | "mono" | mono or single | -| --packages | "core,server" | Comma-separated package names (mono only) | -| --git-init | false | Init git + initial commit | - -Package naming: by default, the first package is a library (in libs/), the rest are applications (in apps/). Use prefixes to control placement: --packages "lib:models,lib:utils,app:api,app:worker". 
- -## A Note on Token Costs - -The agents in this template use Claude's sub-agents (mostly Sonnet, some Haiku) to validate code quality, run reviews, and write PR descriptions. This costs tokens beyond what you'd spend on a bare Claude Code session. - -This is intentional. A few extra cents per PR is trivially cheap compared to the cost of your time debugging a bug that a code review agent would have caught, or manually writing PR descriptions, or re-running tests you forgot to run. The agents exist because developer time is the expensive resource, not tokens. - -If cost is a concern, the workflow agents (code-quality-validator, test-coverage-validator) are the ones that run most frequently. The others (pr-writer, code-reviewer, docs-updater) run once per PR. The optional specialist agents only run when you explicitly invoke them. - -## Credits - -Monorepo structure inspired by [carderne/postmodern-mono](https://github.com/carderne/postmodern-mono), which demonstrates excellent uv workspace patterns. Key differences: - -- Direct uv run commands instead of Poe the Poet -- Standard pyright instead of basedpyright -- Claude Code methodology layer (CLAUDE.md, agents, skills, hooks) -- Setup script for template initialization - -## License - -MIT - -Thought for a couple of seconds - -Assumptions: I am judging only from the README text you pasted, not from the actual repo contents, code quality, hook implementations, or maintenance history. - -Answer - -My honest opinion: this looks potentially useful, but not yet obviously compelling enough that I would adopt it just from this README alone. I would likely bookmark it and inspect the repo, but I would not immediately use it for a real project without checking several concrete things first. - -Why: - -The README communicates a clear opinionated workflow. - -It targets a real pain point: Claude Code drift, inconsistent process, skipped tests, weak session continuity. 
- -It has a strong positioning angle: “template for disciplined AI-assisted Python development.” - -But it also raises several concerns: - -It promises a lot. -“Production-quality,” “agents on every PR,” “security hooks,” “3-command workflow,” “permission tiers,” “sandboxed autonomy” — that is a broad surface area. Broad templates often become brittle, noisy, or high-maintenance. - -The core value proposition is still partly abstract. -I can see what it includes, but I cannot yet tell whether it actually improves day-to-day work versus adding ceremony. - -The workflow feels optimized for people who want strong process, not necessarily for people who want speed. -That is fine, but the README slightly oversells “almost for free.” This stack is not free in cognitive overhead. It may save discipline, but it adds framework. - -The biggest missing proof is operational proof. -I need examples of what Claude Code actually does better with this template than without it. - -So: helpful? Possibly yes. Would I try it? Yes, but only in a throwaway repo first. Would I trust it in production based on this README alone? No. - -Rationale -What is strong in this README -1. Clear target problem - -The README identifies real failure modes of AI coding workflows: - -skipped tests - -inconsistent outputs - -lost context - -over-rewrites - -lack of team standardization - -That is credible and relevant. - -2. Opinionated choices are a strength - -Choosing uv, ruff, pyright, pytest, hatchling is sensible. For a Python template, that is a strong baseline. It reduces decision fatigue. - -3. The command-based workflow is memorable - -/sync, /design, /done is a good abstraction. It is easy to remember and easy to explain. - -4. Devcontainer + restricted permissions is a genuinely valuable angle - -This is probably the most differentiated part. “Let Claude operate more freely, but inside a sandbox” is a practical idea. - -5. 
Audience segmentation is good - -The README speaks separately to solo devs, team leads, DS/ML engineers, and beginners. That improves reach. - -What weakens it -1. Too many claims, not enough evidence - -The README says this template makes Claude Code produce: - -consistent code - -tested code - -production-quality code - -safer code - -better PRs - -proper docs - -scalable workflow - -That is a large promise stack. I need evidence like: - -before/after examples - -sample generated PR - -example implementation plan - -example /done output - -what the agents actually check - -failure cases - -Without that, this reads slightly like “process theater with AI branding.” - -2. It is not clear what is essential vs optional - -The README lists: - -agents - -hooks - -commands - -rules - -CI/CD - -devcontainer - -security audits - -productivity hooks - -permission tiers - -But I cannot quickly tell: - -what absolutely runs in normal use - -what runs only if enabled - -what is expensive - -what is intrusive - -what is safe to remove - -A mid-level developer evaluating adoption wants a clearer mental model: -what is the minimum viable template, and what is advanced add-on machinery? - -3. The “TDD enforced” claim may turn people away - -This is a polarizing choice. Many developers like tests but do not want rigid TDD enforcement. If the template truly blocks non-TDD flows, that is a major adoption constraint. - -You should be brutally explicit: - -Does it require red-green-refactor every time? - -Does /done fail if tests were added after implementation? - -Is this a recommendation or a hard workflow constraint? - -Right now it sounds stricter than many teams would tolerate. - -4. “Runs agents on every PR” can be either great or annoying - -This could mean: - -useful automation, or - -token burn, latency, noise, and low-signal comments - -The README defends token costs, but not review quality. Cost is rarely the only concern. Signal-to-noise ratio matters more. - -5. 
The repo may be over-engineered relative to its audience - -For beginners and many solo devs, this may be too much machinery: - -monorepo support - -multiple agents - -security hooks - -permission tiers - -review rules - -implementation plan docs - -changelog updates - -handoff notes - -That can help discipline, but it can also overwhelm the very people who most need simplicity. - -Would I try it? -My actual decision - -Yes, I would try it in a sandbox project for 30–60 minutes. -I would not adopt it directly into an active codebase. - -Why I would try it - -Because it has enough potentially high-value ideas: - -constrained AI workflow - -sandboxing Claude - -explicit ship command - -standard team baseline - -less setup for quality gates - -Why I would hesitate - -Because templates like this often fail in one of four ways: - -Too rigid — slows normal work. - -Too magical — hard to debug or customize. - -Too noisy — too many automated checks/comments. - -Too fragile — breaks when real project structure deviates from template assumptions. - -What else I would need to decide - -This is the main missing section. To decide, I would need: - -1. One realistic demo - -Not a list of features. A concrete walkthrough: - -create repo - -run setup - -ask Claude to implement a feature - -use /design - -use /done - -show resulting commit / PR / CI / docs update - -This would do more than half the persuasion work. - -2. Example outputs of the commands - -Show exactly what /sync, /design, /done produce. - -Questions I want answered: - -Is /design just a prompt wrapper, or does it create durable artifacts? - -Does /done ask follow-up questions? - -What does “auto-detect scope” really look like? - -What if it misclassifies? - -3. Maintenance story - -Who maintains this? How often? How stable is Claude Code compatibility? - -AI-tooling templates rot faster than normal templates because upstream behavior changes. 
I need to know: - -how often this repo is updated - -supported Claude Code versions - -changelog/release policy - -compatibility guarantees, if any - -4. Escape hatches - -Can I disable pieces cleanly? - -I would want to know: - -can I use only CLAUDE.md + commands without devcontainer? - -can I disable PR agents? - -can I switch off TDD enforcement? - -can I use basedpyright, mypy, or no pyright? - -can I keep single-package only? - -5. Proof the security model is real - -The devcontainer section sounds promising, but security claims need specifics: - -what exactly is blocked - -what is only best-effort - -what happens on macOS/Linux/Windows - -can Claude still exfiltrate through allowed domains - -what threat model is covered - -Right now it is directionally good, but still marketing-heavy. - -6. Repo examples or adopters - -I would want: - -one minimal sample app - -one real repo using it - -maybe one “team mode” example - -That would reduce perceived risk a lot. - -How to improve the README -1. Add a “Should you use this?” decision section near the top - -Example structure: - -Use this if: - -you use Claude Code regularly - -you want standardized Python quality gates - -you are comfortable with opinionated tooling - -you want containerized/sandboxed AI workflow - -Do not use this if: - -you want minimal process - -you do not want TDD pressure - -you dislike devcontainers - -you want framework-agnostic freedom - -you already have strong CI + project conventions - -This would build trust immediately. - -2. Add “What this actually changes in your daily workflow” - -A plain-language section: - -before template - -after template - -what you do differently each day - -Right now the README explains features more than behavior. - -3. Add one end-to-end demo - -This is the highest-impact improvement. - -Include: - -prompt given to Claude - -generated plan - -tests created - -/done output - -PR result - -Even screenshots or terminal transcripts would help. - -4. 
Separate “core” from “extended” more aggressively - -Make it obvious: - -Minimal mode - -CLAUDE.md - -uv + ruff + pyright + pytest - -/sync, /design, /done - -Advanced mode - -agents - -hooks - -devcontainer - -review rules - -security audit - -That makes adoption feel incremental instead of all-or-nothing. - -5. Be more precise about enforcement - -For example: - -“enforces TDD” → explain whether hard or soft - -“runs agents on every PR” → specify which agents, default behavior, and expected cost - -“production-quality” → define what that means operationally - -Avoid fuzzy claims. - -6. Add a “Failure modes / limitations” section - -This would massively increase credibility. - -Examples: - -may be overkill for tiny scripts - -agent quality depends on Claude model behavior - -may require tuning for existing repos - -devcontainer setup may be heavy on some machines - -hooks can occasionally block legitimate commands - -PR review agents are advisory, not authoritative - -That kind of honesty would make me trust the project more. - -7. Explain migration path for existing repos - -Many people do not start greenfield. Add: - -how to apply template to existing repo - -what files to copy - -what to keep/remove - -how to phase adoption - -8. Reorder for speed - -Suggested order: - -What problem this solves - -Should you use it? - -2-minute demo - -Quick start - -What is included - -Devcontainer/security - -Deep details - -Current README is decent, but still a bit feature-first. - -How to improve the repo - -Based on the README alone, these are the highest-value repo improvements. - -1. Add an examples/ directory - -Include: - -minimal single-package example - -monorepo example - -example PR generated through workflow - -example docs evolution across phases - -2. Add automated smoke tests for template generation - -Templates often fail in placeholder substitution and project generation. 
- -You want CI that verifies: - -single-package setup works - -monorepo setup works - -generated project installs - -tests/lint/typecheck pass after generation - -Without this, template repos rot quietly. - -3. Add a compatibility matrix - -Document: - -Python versions - -Claude Code versions or assumptions - -VS Code/devcontainer assumptions - -host OS notes - -4. Make modules independently adoptable - -Let people adopt: - -just CLAUDE.md - -just commands - -just devcontainer - -just hooks - -That will broaden adoption and reduce fear. - -5. Add a “disable/override” guide - -This is important for real teams. - -Examples: - -switch TDD policy off - -remove auto docs updates - -disable review agents - -change base branch to main - -replace pyright with mypy/basedpyright - -remove hatchling if not publishing - -6. Provide observability into agent behavior - -A common failure in AI automation repos is hidden logic. - -You want: - -logs - -dry-run mode - -verbose mode - -clear explanation of why /done chose a given scope - -“Auto-detect scope” needs transparency. - -7. Add benchmark-style evidence - -Even lightweight: - -median time to first PR - -number of issues caught by hooks - -common PR review findings - -token cost per typical workflow - -Not fake precision—just honest metrics. - -8. Add repo health signals - -I would look for: - -releases or tags - -changelog - -issue templates - -examples of active maintenance - -test coverage for setup logic and hooks - -9. Reduce magic in setup script - -The setup script should be extremely predictable, idempotent, and test-covered. If it is doing global find/replace across many files, that is a source of subtle breakage. - -10. Add comparison section - -Compare against: - -plain Claude Code + manual workflow - -cookiecutter/copier Python templates - -generic Python project templates - -team-internal CLAUDE.md only - -That helps people understand the delta. 
- -Alternatives with tradeoffs -Option 1 — Try this template as-is in a throwaway repo - -Best when: you want to evaluate the full opinionated experience. -Upside: fastest way to assess real value. -Downside: you may confuse template complexity with Claude Code complexity. - -Option 2 — Borrow only the core ideas - -Take: - -CLAUDE.md - -/sync, /design, /done - -uv + ruff + pyright + pytest - -Skip: - -heavy agents - -devcontainer - -advanced hooks initially - -Best when: you want lower adoption risk. -Upside: much easier to trial. -Downside: you miss the differentiated security and automation story. - -Option 3 — Use it only as a design reference for your own internal template - -Best when: you already know your team preferences. -Upside: keeps the good ideas, avoids lock-in to someone else’s opinions. -Downside: more up-front effort. - -Option 4 — Do nothing and keep a lightweight Claude workflow - -Use: - -your own CLAUDE.md - -normal CI - -pre-commit - -standard PR template - -Best when: your current process is already disciplined. -Upside: minimal overhead. -Downside: less standardization and fewer AI-specific safeguards. - -My weighting: - -0.40 Option 2: borrow the core ideas only - -0.30 Option 1: trial the full template in a sandbox - -0.20 Option 3: use it as inspiration for an internal template - -0.10 Option 4: ignore it entirely - -Uncertainty note: these weights would change significantly after seeing the actual agent definitions, hooks, and one real demo. 
- -Confidence + what I’d need to be more certain - -Confidence: 0.74 - -Why not higher: - -I only saw the README - -the biggest unknowns are implementation quality and maintenance discipline - -this category is highly sensitive to polish and edge cases - -To be more certain, I would inspect: - -CLAUDE.md - -the definitions of /sync, /design, /done - -one or two hook scripts - -CI workflows - -setup script tests - -evidence of active maintenance - -one generated example repo - -That would be enough to move to either: - -“yes, I’d try this seriously,” or - -“no, this is mostly process packaging.” - -Opportunity Radar -1. Turn the README into a conversion funnel - -Impact: high -Effort: medium -Risk: low -Confidence: high - -Right now it is informative, but not maximally persuasive. A short demo, limitations section, and “who should not use this” section would sharply improve trust. - -2. Add a “lite mode” - -Impact: high -Effort: medium -Risk: low -Confidence: high - -A lot of potential users will want 20% of the system for 80% of the value. Make that path explicit. - -3. Productize the proof, not just the features - -Impact: very high -Effort: medium -Risk: medium -Confidence: medium-high - -The repo’s success likely depends less on adding more agents and more on proving that the current ones materially improve outcomes. Real examples beat more capability claims. - -If you want, paste the actual repo tree or key files (CLAUDE.md, the 3 commands, hooks, CI), and I’ll give you a much harsher and more concrete assessment. 
\ No newline at end of file diff --git a/docs/community/gemini.md b/docs/community/gemini.md deleted file mode 100644 index 8445e7a..0000000 --- a/docs/community/gemini.md +++ /dev/null @@ -1,82 +0,0 @@ -# Archived Transcript: Gemini Assessment - -Captured: March 2025 | Model: Gemini 2.0 | Subject: stranma/claude-code-python-template - ---- - -Please estimate how useful is https://github.com/stranma/claude-code-python-template - -Odpověď Gemini -Based on a review of the repository's contents and documentation, https://github.com/stranma/claude-code-python-template has a high potential utility, but its usefulness depends heavily on your specific development workflow. - -Here is an estimation of its usefulness broken down by features, target audience, and modern development standards. - -1. What it is -It is a "production-ready" Python project template specifically engineered to be manipulated by Claude Code (Anthropic’s terminal-based AI coding assistant). Instead of just providing a standard Python boilerplate, it provides an entire "AI-first" methodology aimed at keeping the AI agent on track, secure, and compliant with best practices. - -2. Key Features that Drive Utility -AI Guardrails & Agents: It includes a highly structured CLAUDE.md file and 12 custom Claude Code agents (e.g., code-quality-validator, test-coverage-validator, pr-writer). This is immensely useful because it solves a major problem with AI coding assistants: they tend to lose context or skip testing. This template forces the AI to follow a Test-Driven Development (TDD) workflow. - -Modern, Bleeding-Edge Tooling: It uses the best modern Python tools available right now. It utilizes uv for lightning-fast package management and monorepo workspaces, ruff for fast linting/formatting, pyright for strict type checking, and pytest. - -Structured Methodology: It bakes in a "Quick / Standard / Project" workflow. 
This helps scale the AI's compute and thinking process based on the complexity of the task, preventing the AI from overcomplicating simple fixes or under-planning large features. - -Security & CI/CD: It comes with pre-configured GitHub Actions, PR templates, and 5 security/productivity hooks (like blocking destructive actions or secret leaks), making it ready for production deployment out of the box. - -3. Estimation of Usefulness by Audience -For Claude Code Users (High Utility): If you are actively using Anthropic's Claude Code CLI, this template is highly useful. Writing context files and setting up guardrails for AI agents is tedious. This repository does the heavy lifting for you, effectively giving Claude a "manager" to ensure it writes tests and checks its own work before claiming a task is done. - -For Modern Python Developers (Moderate to High Utility): Even if you strip away the AI features, this is a very well-structured template. Setting up a Python monorepo (apps/libs) using uv workspaces can be tricky, and this repository provides a clean, working example inspired by proven community patterns. - -For Traditional / Non-AI Developers (Low Utility): If you don't use AI coding tools, a significant portion of this repository (the .claude folder, the agents, the specific markdown prompt structures) is dead weight to you. You would be better off using a standard minimal uv or poetry template. - -4. Caveats -It is worth noting that as of right now, the repository is practically brand new (showing 0 stars and 0 forks). While the underlying stack and ideas are excellent, it does not yet have the backing of a large community, meaning you might run into edge cases or bugs in the setup script that haven't been battle-tested by hundreds of users. 
- -Verdict: If you want to experiment with AI-driven software engineering using Claude Code and the fastest new Python tools (uv, ruff), this repository is extremely useful as a starting point or a learning resource on how to structure prompts and workflows for autonomous coding agents. - -Váš pokyn -Good news. It is my repo. How to improve it for community? Would it make sense to make it more known? How to do it? - -Odpověď Gemini -That is fantastic news! Congratulations on shipping this. Building a bridge between bleeding-edge AI tooling (Claude Code) and strict, production-ready Python standards (uv, TDD) solves a massive pain point right now: keeping AI agents from making a mess of codebases. - -To answer your second question first: Yes, it absolutely makes sense to make it more known. Developers are scrambling to figure out how to best utilize autonomous coding agents without sacrificing code quality or security. Your template provides a concrete, opinionated solution to that exact problem. - -Here is a straightforward, practical guide on how to polish it for the community and get it in front of the right eyes. - -1. How to Improve the Repo for the Community -Before driving traffic to the repository, you want to ensure the Developer Experience (DX) is as frictionless as possible. When developers land on your page, they should instantly understand the value and trust the code. - -Add a Video or GIF Demonstration: This is the #1 missing piece. People need to see it working. Record a 60-second Loom or Asciinema screen capture showing Claude Code interacting with your template. Show it catching a linting error or being forced to write a test before implementation. - -Create a "Hello World" Example App: The monorepo structure is great, but starting from scratch can be intimidating. 
Consider adding a tiny, pre-configured dummy app (e.g., a simple FastAPI endpoint in apps/api) so users can immediately run a Claude command and see the agents in action without having to write boilerplate first. - -Flesh out Community Documents: * Add a CONTRIBUTING.md file explaining how others can add new Claude agents or improve the existing prompts. - -Ensure you have a clear SECURITY.md file, especially since you are dealing with AI and code execution. - -Bulletproof the Setup Script: Ensure setup_project.py handles edge cases gracefully (e.g., what if the user doesn't have uv installed yet? Does it fail gracefully or provide installation instructions?). - -GitHub Topics: Make sure your repository has relevant tags in the "About" section on GitHub. Use tags like: python, claude-code, uv, monorepo, ai-agents, tdd, ruff. - -2. How to Make It Known (Distribution) -Open-source success is 30% code and 70% distribution. You need to put this in front of developers where they already hang out. - -High-Impact Platforms -Hacker News ("Show HN"): HN loves developer tools, but they are highly critical. Frame your post around the problem you solved. - -Title idea: Show HN: A Python template that forces Claude Code to use TDD - -Reddit: Do not just drop a link; write a short, text-based post explaining why you built this and the pain points of using vanilla Claude Code. - -Target subreddits: r/Python, r/ClaudeAI, r/LocalLLaMA, r/coding, and r/machinelearning. - -X (Twitter): Share a short thread with a demo video. The Python and AI developer communities are highly active here. Be sure to tag the creators of the tools you are using—specifically Anthropic (makers of Claude) and Astral (makers of uv and ruff). - -Content Marketing -Write an Article: Publish a blog post on Dev.to, Medium, or Hashnode. Don't just make it an ad for your repo; make it an educational piece on "How to structure Python projects for AI coding assistants." 
Walk through your methodology (the Quick/Standard/Project paths) and link to your repo as the reference implementation. - -Newsletters: Reach out to Python and AI newsletter curators (like PyCoder's Weekly, TLDR AI, or Python Weekly) with a brief email pitching your template as a useful new tool for their readers. - -Would you like me to help you draft a "Show HN" post or a Reddit announcement tailored to your project? \ No newline at end of file diff --git a/docs/community/openai.md b/docs/community/openai.md deleted file mode 100644 index 27d02ec..0000000 --- a/docs/community/openai.md +++ /dev/null @@ -1,186 +0,0 @@ -# Archived Transcript: OpenAI (ChatGPT) Assessment - -Captured: March 2025 | Model: ChatGPT (o3) | Subject: stranma/claude-code-python-template - ---- - -Please estimate how useful is stranma/claude-code-python-template -Přemýšlení po dobu 19s -Usefulness estimate: High if you actively use Claude Code; medium-to-low otherwise - -Overall score (subjective): 7.5/10 for Claude Code–centric teams, 4/10 as a generic Python template. - -What it’s strong at - -Claude Code “operating system,” not just a skeleton repo. It includes a defined workflow (TDD + “Quick/Standard/Project” paths), a phase checklist, and a compact CLAUDE.md with constraints meant to steer the agent reliably. - -Quality gates are first-class. Built-in agents for lint/typecheck/tests/coverage, code review, docs updating, PR writing, etc., plus hooks that auto-format and run tests-on-change. This can materially reduce the “LLM wrote it, now we babysit it” tax. - -Modern Python tooling choices. Uses uv workspaces and a monorepo layout (apps/ + libs/) which is a solid fit for multi-package codebases. - -Main risks / downsides - -Complexity overhead. 12 agents + hooks + enforced process is powerful, but it’s also “process debt” if you don’t consistently use Claude Code or if your team won’t follow it. - -Ecosystem/security posture matters. 
Claude Code has had recent security research around configuration-driven execution and related trust boundaries (patched, but still a reminder to treat hooks/plugins/config as attack surface). If you clone untrusted repos or accept external contributions, be extra strict. - -Low external validation (so far). At the time of capture the repo shows 0 stars/0 forks, so you’re relying on the author’s judgment rather than community hardening. - -Rationale (first principles) - -A “template” pays off when it reduces repeated setup + prevents common failure modes. This one targets the main failure mode of agentic coding: drift (unclear standards, missing tests, inconsistent changes). The combination of (1) explicit workflow docs, (2) automated validators, and (3) hooks that create immediate feedback loops is exactly how you stabilize LLM-driven development. -The tradeoff is that you’re adopting a framework for collaborating with Claude Code, not merely a Python starter. - -Alternatives (2–4) with tradeoffs - -discus0434/python-template-for-claude-code – simpler “Claude Code optimized” Python template; may be easier to adopt but typically less opinionated/less automation than stranma’s full methodology. - -scotthavird/claude-code-template – described as barebones + devcontainer + hooks/logging; good if you want Claude Code integration without a heavyweight process. - -serpro69/claude-starter-kit – broader Claude Code starter kit (agents/commands/hooks/MCP); good for general Claude Code enablement, less specifically “Python production template.” - -davila7/claude-code-templates ecosystem – huge collection and installer-style approach; best when you want to compose pieces rather than adopt one repo’s full opinionated workflow. - -Verbalized sampling (what “useful” could mean here) - -0.55: Very useful as a Claude Code “guardrails + process” pack for solo devs/teams doing lots of agentic changes. 
- -0.25: Moderately useful: you’ll keep the uv/CI layout, drop most agents/hooks, and treat it as a starting point. - -0.15: Low usefulness: too heavyweight; you’ll spend more time maintaining the workflow than shipping. - -0.05: Not useful: you don’t use Claude Code (or you can’t/won’t run hooks/plugins), so the main value disappears. - -Confidence - -0.78 (good). I can see the repo structure, feature list, and intended workflow, but I haven’t executed the setup script or validated the hooks/agents in a real Claude Code session. - -What I’d need to be more certain: - -Evidence of actual adoption (stars/forks/issues over time) and/or a quick smoke test: python setup_project.py + uv sync + pytest/ruff/pyright on a clean machine. - -Opportunity Radar - -Adopt selectively: keep uv workspaces + CI + CLAUDE.md; start with 2–3 validators (quality/tests/security) before enabling all agents/hooks. High impact, low risk. - -Threat-model your hooks/plugins: treat .claude/ config like executable code; lock down permissions and avoid running it on untrusted repos. High impact, medium effort. - -Benchmark vs “lighter” templates: if your dev style is already disciplined, a lighter Claude template may deliver 80% of the benefit with 20% of the complexity. Medium impact, low effort. - -Good news. It is my repo. Would it make sense to make it more known? How to do it? -Přemýšlení po dobu 19s - -Yes—it likely makes sense to make it more known if your goal is adoption, feedback, and contributions. Right now the repo looks like it has low discoverability signals (e.g., no stars/forks at the time I checked), which usually means “people who would benefit simply aren’t finding it yet.” - -(1) Answer: should you promote it? - -Promote it if at least one is true: - -You want external contributions (agents/hooks, docs improvements, more templates). - -You want the template to become a “default starting point” for Claude Code + Python. 
- -You want distribution leverage for your other work (consulting, tooling, company brand, etc.). - -Don’t over-promote if you don’t have bandwidth to handle issues/PRs; public repos quickly create “support obligations” unless you set expectations clearly. - -(2) Rationale: what drives “being known” on GitHub - -Discoverability is mostly a function of: - -Search + classification (topics/keywords) - -Friction to first success (template button, quickstart, Codespaces/devcontainer) - -Trust signals (license, contributing, code of conduct, issues templates, releases) - -Distribution channels (communities, lists, content, talks) - -If you improve (1–3), promotion (4) becomes far more effective. - -What to do (practical, high-leverage checklist) -A. Make it instantly “click-to-use” - -Mark the repo as a Template (Settings → “Template repository”). This is the single biggest adoption accelerator for template repos. - -Add a README CTA: “Use this template” + “What you get in 60 seconds” + “First 3 commands”. - -If you can, add Codespaces/devcontainer support so people can try it without local setup. - -B. Make it searchable - -Add GitHub Topics: e.g. claude-code, python-template, uv, tdd, monorepo, fastapi, ruff, pyright, pre-commit, etc. - -Add a tight one-line description (GitHub repo “About” field) with the primary keywords. - -C. Raise “trust signals” to reduce hesitation - -Ensure community health files are complete (GitHub’s community profile checklist): LICENSE, README, CONTRIBUTING, CODE_OF_CONDUCT, etc. - -Add issue/PR templates to guide contributions and reduce maintainer load. - -Add a small release strategy (even for templates): tag versions and publish GitHub Releases with notes for what changed and migration guidance. - -D. Distribute where the users already are (low-cost first) - -Post in Claude Code communities (Discords/Reddit/etc.), and in Python tooling circles (uv/ruff/pyright). 
- -Submit PRs to “awesome” lists or curated template lists (where relevant). - -Write a short “why this template exists” post (dev.to / Medium / your blog) and link back; content is often the best long-tail discovery engine. - -(3) Alternatives with tradeoffs - -Soft-launch only (optimize repo, no marketing) - -Pros: minimal support burden; clean foundation. - -Cons: slower adoption/feedback. - -Targeted promotion (Claude Code + Python niches only) - -Pros: high-signal users, higher-quality issues/PRs. - -Cons: still some support load; smaller reach. - -Broad promotion (HN, general OSS channels) - -Pros: fast visibility. - -Cons: noisy feedback; bigger support burden; more “drive-by” issues. - -Spin into an “ecosystem” (multiple templates / installer / CLI) - -Pros: becomes a platform; higher moat. - -Cons: much more maintenance; harder to keep coherent. - -(4) Confidence + what would make it higher - -Confidence: 0.80. The actions above are standard levers that GitHub itself supports and recommends (topics, template repos, community health files, templates, releases). -To be more certain about ROI: your goal (stars vs contributors vs consulting leads) and current funnel (how people find it today). - -Opportunity Radar - -Add “adoption telemetry” without creepy tracking: a short “Where did you find this?” optional discussion thread + a GitHub Discussions category. Impact high, effort low. - -Security posture note: add a brief section describing what hooks/agents do and the trust model (“don’t run on untrusted repos”). This reduces adoption friction in teams. - -Create a minimal “hello world” example repo generated from the template so people can see the output instantly (screenshots + tree). High impact, low effort. - -If you want one concrete next step: mark it as a template + add topics + add a 60-second quickstart—that alone usually changes adoption dynamics materially. - -I think it is marked as template, is it not? 
-Přemýšlení po dobu 4 - -Yes — it is marked as a template. - -On the GitHub header for the repo it shows “Public template” next to the repository name. - -If someone can’t see the “Use this template” button, the usual causes are: - -They’re on a mobile view or narrow window (button can be in the “Code” dropdown depending on layout). - -They’re not logged in (GitHub UI sometimes hides/relocates actions for signed-out users). - -GitHub’s own guidance on template repositories is here. \ No newline at end of file diff --git a/tests/test_agents.py b/tests/test_agents.py index 3e21dc4..11f3816 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -7,26 +7,15 @@ AGENTS_DIR = Path(__file__).parent.parent / ".claude" / "agents" -EXISTING_AGENTS = [ - "acceptance-criteria-validator.md", - "agent-auditor.md", +ALL_AGENTS = [ "code-quality-validator.md", "code-reviewer.md", "docs-updater.md", - "implementation-tracker.md", "pr-writer.md", "review-responder.md", "test-coverage-validator.md", ] -NEW_AGENTS = [ - "security-auditor.md", - "refactoring-specialist.md", - "output-evaluator.md", -] - -ALL_AGENTS = EXISTING_AGENTS + NEW_AGENTS - VALID_MODELS = {"haiku", "sonnet", "opus"} VALID_PERMISSION_MODES = {"plan", "dontAsk", "acceptEdits"} VALID_TOOLS = {"Read", "Glob", "Grep", "Bash", "Edit", "Write", "NotebookEdit", "WebSearch", "WebFetch"} @@ -69,9 +58,7 @@ def test_agent_file_exists(self, agent_name: str) -> None: def test_total_agent_count(self) -> None: actual_agents = {f.name for f in AGENTS_DIR.iterdir() if f.is_file() and f.suffix == ".md"} - assert len(actual_agents) >= len(ALL_AGENTS), ( - f"Expected at least {len(ALL_AGENTS)} agents, found {len(actual_agents)}" - ) + assert actual_agents == set(ALL_AGENTS), f"Agent mismatch. 
Expected: {set(ALL_AGENTS)}, Got: {actual_agents}" class TestAgentFrontmatter: @@ -115,9 +102,9 @@ def test_agent_has_tools(self, agent_name: str, agent_frontmatter: dict[str, dic def test_agent_has_permission_mode(self, agent_name: str, agent_frontmatter: dict[str, dict[str, str]]) -> None: fm = agent_frontmatter.get(agent_name, {}) assert "permissionMode" in fm, f"{agent_name} missing 'permissionMode' in frontmatter" - assert fm["permissionMode"] in VALID_PERMISSION_MODES, ( - f"{agent_name} has invalid permissionMode: {fm['permissionMode']!r}" - ) + assert ( + fm["permissionMode"] in VALID_PERMISSION_MODES + ), f"{agent_name} has invalid permissionMode: {fm['permissionMode']!r}" class TestAgentBody: @@ -138,31 +125,3 @@ def test_agent_body_has_heading(self, agent_name: str) -> None: parts = content.split("---", 2) body = parts[2] if len(parts) >= 3 else "" assert re.search(r"^#+\s", body, re.MULTILINE), f"{agent_name} body missing markdown heading" - - -class TestNewAgentSpecifics: - """Verify new agents have correct read-only/permission configurations.""" - - def test_security_auditor_is_read_only(self, agent_frontmatter: dict[str, dict[str, str]]) -> None: - fm = agent_frontmatter.get("security-auditor.md", {}) - assert fm.get("permissionMode") == "plan", "security-auditor should be read-only (permissionMode: plan)" - tools = {t.strip() for t in fm.get("tools", "").split(",")} - assert "Bash" not in tools, "security-auditor should not have Bash access" - assert "Edit" not in tools, "security-auditor should not have Edit access" - assert "Write" not in tools, "security-auditor should not have Write access" - - def test_refactoring_specialist_is_read_only(self, agent_frontmatter: dict[str, dict[str, str]]) -> None: - fm = agent_frontmatter.get("refactoring-specialist.md", {}) - assert fm.get("permissionMode") == "plan", "refactoring-specialist should be read-only (permissionMode: plan)" - tools = {t.strip() for t in fm.get("tools", "").split(",")} - assert 
"Bash" not in tools, "refactoring-specialist should not have Bash access" - - def test_output_evaluator_uses_dontask(self, agent_frontmatter: dict[str, dict[str, str]]) -> None: - fm = agent_frontmatter.get("output-evaluator.md", {}) - assert fm.get("permissionMode") == "dontAsk", ( - "output-evaluator should use dontAsk for automated pipeline integration" - ) - - def test_output_evaluator_uses_haiku(self, agent_frontmatter: dict[str, dict[str, str]]) -> None: - fm = agent_frontmatter.get("output-evaluator.md", {}) - assert fm.get("model") == "haiku", "output-evaluator should use haiku model for efficiency" diff --git a/tests/test_commands.py b/tests/test_commands.py index 66e144c..016c058 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -1,105 +1,14 @@ -"""Tests for .claude/commands/ -- validates command files exist and have correct structure.""" +"""Tests for .claude/commands/ -- validates no stale command files exist.""" from pathlib import Path -import pytest - COMMANDS_DIR = Path(__file__).parent.parent / ".claude" / "commands" -ALL_COMMANDS = [ - "cove.md", - "cove-isolated.md", - "security-audit.md", -] - - -class TestCommandExistence: - """Verify all expected command files exist.""" - - def test_commands_directory_exists(self) -> None: - assert COMMANDS_DIR.exists(), f"{COMMANDS_DIR} does not exist" - assert COMMANDS_DIR.is_dir(), f"{COMMANDS_DIR} is not a directory" - - @pytest.mark.parametrize("command_name", ALL_COMMANDS) - def test_command_file_exists(self, command_name: str) -> None: - command_path = COMMANDS_DIR / command_name - assert command_path.exists(), f"Command file missing: {command_name}" - - -class TestCommandStructure: - """Verify command files have correct frontmatter and content.""" - - @pytest.mark.parametrize("command_name", ALL_COMMANDS) - def test_command_has_frontmatter(self, command_name: str) -> None: - command_path = COMMANDS_DIR / command_name - content = command_path.read_text(encoding="utf-8") - assert 
content.startswith("---"), f"{command_name} missing YAML frontmatter" - parts = content.split("---", 2) - assert len(parts) >= 3, f"{command_name} has unclosed frontmatter" - - @pytest.mark.parametrize("command_name", ALL_COMMANDS) - def test_command_has_allowed_tools(self, command_name: str) -> None: - command_path = COMMANDS_DIR / command_name - content = command_path.read_text(encoding="utf-8") - assert "allowed-tools:" in content, f"{command_name} missing allowed-tools in frontmatter" - - @pytest.mark.parametrize("command_name", ALL_COMMANDS) - def test_command_has_description(self, command_name: str) -> None: - command_path = COMMANDS_DIR / command_name - content = command_path.read_text(encoding="utf-8") - assert "description:" in content, f"{command_name} missing description in frontmatter" - - @pytest.mark.parametrize("command_name", ALL_COMMANDS) - def test_command_is_not_empty(self, command_name: str) -> None: - command_path = COMMANDS_DIR / command_name - content = command_path.read_text(encoding="utf-8") - # Strip frontmatter and check body has content - parts = content.split("---", 2) - body = parts[2].strip() if len(parts) >= 3 else "" - assert len(body) > 50, f"{command_name} body is too short ({len(body)} chars)" - - @pytest.mark.parametrize("command_name", ALL_COMMANDS) - def test_command_has_markdown_heading(self, command_name: str) -> None: - command_path = COMMANDS_DIR / command_name - content = command_path.read_text(encoding="utf-8") - parts = content.split("---", 2) - body = parts[2] if len(parts) >= 3 else "" - assert "# " in body, f"{command_name} missing markdown heading in body" - - -class TestCommandContent: - """Verify specific command content.""" - - def test_security_audit_has_scoring(self) -> None: - content = (COMMANDS_DIR / "security-audit.md").read_text(encoding="utf-8") - assert "Grade" in content or "grade" in content, "security-audit should include grading" - for grade in ["A", "B", "C", "D", "F"]: - assert grade in content, 
f"security-audit missing grade: {grade}" - - def test_security_audit_checks_secrets(self) -> None: - content = (COMMANDS_DIR / "security-audit.md").read_text(encoding="utf-8") - assert "secret" in content.lower(), "security-audit should scan for secrets" - - def test_security_audit_checks_code_patterns(self) -> None: - # Checks that security-audit references unsafe code patterns (static strings, not code usage) - content = (COMMANDS_DIR / "security-audit.md").read_text(encoding="utf-8") - for pattern in ["eval", "exec", "pickle", "subprocess"]: - assert pattern in content, f"security-audit missing code pattern: {pattern}" - - def test_cove_has_four_steps(self) -> None: - content = (COMMANDS_DIR / "cove.md").read_text(encoding="utf-8") - for step in ["Step 1", "Step 2", "Step 3", "Step 4"]: - assert step in content, f"cove missing {step}" - - def test_cove_has_verification_questions(self) -> None: - content = (COMMANDS_DIR / "cove.md").read_text(encoding="utf-8") - assert "Verification" in content, "cove should mention verification" - def test_cove_isolated_uses_agent(self) -> None: - content = (COMMANDS_DIR / "cove-isolated.md").read_text(encoding="utf-8") - assert "Agent" in content, "cove-isolated should use Agent tool for isolation" +class TestCommandsRemoved: + """Verify commands directory was removed (all commands pruned).""" - def test_cove_isolated_has_four_steps(self) -> None: - content = (COMMANDS_DIR / "cove-isolated.md").read_text(encoding="utf-8") - for step in ["Step 1", "Step 2", "Step 3", "Step 4"]: - assert step in content, f"cove-isolated missing {step}" + def test_commands_directory_does_not_exist(self) -> None: + assert ( + not COMMANDS_DIR.exists() + ), f"Commands directory should not exist after pruning, but found: {COMMANDS_DIR}" diff --git a/tests/test_hooks.py b/tests/test_hooks.py index 99d09b9..5347d75 100644 --- a/tests/test_hooks.py +++ b/tests/test_hooks.py @@ -8,23 +8,11 @@ HOOKS_DIR = Path(__file__).parent.parent / ".claude" / "hooks" 
-SECURITY_HOOKS = [ +ALL_HOOKS = [ "dangerous-actions-blocker.sh", - "output-secrets-scanner.sh", - "unicode-injection-scanner.sh", -] - -PRODUCTIVITY_HOOKS = [ "auto-format.sh", - "test-on-change.sh", ] -DEVCONTAINER_HOOKS = [ - "devcontainer-policy-blocker.sh", -] - -ALL_HOOKS = SECURITY_HOOKS + PRODUCTIVITY_HOOKS + DEVCONTAINER_HOOKS - class TestHookExistence: """Verify all expected hook scripts exist.""" @@ -60,9 +48,9 @@ def test_hook_is_executable(self, hook_name: str) -> None: cwd=repo_root, ) if result.stdout: - assert result.stdout.startswith("100755"), ( - f"{hook_name} is not tracked as executable by git (expected mode 100755)" - ) + assert result.stdout.startswith( + "100755" + ), f"{hook_name} is not tracked as executable by git (expected mode 100755)" else: # Not in a git repo (e.g. integration test copy) -- fall back to filesystem mode = hook_path.stat().st_mode @@ -117,91 +105,65 @@ def test_hook_is_not_empty(self, hook_name: str) -> None: assert len(content) > 100, f"{hook_name} appears to be too short ({len(content)} bytes)" -class TestSecurityHookBehavior: - """Verify security hooks have the correct blocking/warning patterns.""" +class TestExfiltrationGuardBehavior: + """Verify dangerous-actions-blocker blocks exfiltration patterns.""" - def test_dangerous_actions_blocker_exits_2_for_blocks(self) -> None: + def test_exits_2_for_blocks(self) -> None: content = (HOOKS_DIR / "dangerous-actions-blocker.sh").read_text(encoding="utf-8") assert "exit 2" in content, "dangerous-actions-blocker must exit 2 to block actions" - def test_dangerous_actions_blocker_checks_bash_only(self) -> None: + def test_checks_bash_only(self) -> None: content = (HOOKS_DIR / "dangerous-actions-blocker.sh").read_text(encoding="utf-8") assert '"Bash"' in content, "dangerous-actions-blocker should only check Bash tool" - def test_dangerous_actions_blocker_has_blocked_patterns(self) -> None: + def test_blocks_gh_gist_create(self) -> None: content = (HOOKS_DIR / 
"dangerous-actions-blocker.sh").read_text(encoding="utf-8") - for pattern in ["rm -rf", "DROP DATABASE", "git push --force"]: - assert pattern in content, f"dangerous-actions-blocker missing pattern: {pattern}" + assert "gh gist create" in content, "dangerous-actions-blocker missing gh gist create pattern" - def test_dangerous_actions_blocker_checks_secrets(self) -> None: + def test_blocks_gh_issue_create_with_body(self) -> None: content = (HOOKS_DIR / "dangerous-actions-blocker.sh").read_text(encoding="utf-8") - for pattern in ["ANTHROPIC_API_KEY", "AWS_SECRET_ACCESS_KEY"]: - assert pattern in content, f"dangerous-actions-blocker missing secret pattern: {pattern}" - - def test_unicode_scanner_exits_2_for_blocks(self) -> None: - content = (HOOKS_DIR / "unicode-injection-scanner.sh").read_text(encoding="utf-8") - assert "exit 2" in content, "unicode-injection-scanner must exit 2 to block actions" - - def test_unicode_scanner_checks_edit_and_write(self) -> None: - content = (HOOKS_DIR / "unicode-injection-scanner.sh").read_text(encoding="utf-8") - assert '"Edit"' in content, "unicode-injection-scanner should check Edit tool" - assert '"Write"' in content, "unicode-injection-scanner should check Write tool" - - def test_unicode_scanner_detects_zero_width_chars(self) -> None: - content = (HOOKS_DIR / "unicode-injection-scanner.sh").read_text(encoding="utf-8") - assert "200B" in content, "unicode-injection-scanner should detect zero-width space (U+200B)" - - def test_unicode_scanner_detects_rtl_override(self) -> None: - content = (HOOKS_DIR / "unicode-injection-scanner.sh").read_text(encoding="utf-8") - assert "202A" in content or "202E" in content, "unicode-injection-scanner should detect RTL override" + assert "gh issue create" in content, "dangerous-actions-blocker missing gh issue create pattern" + assert "--body" in content, "dangerous-actions-blocker missing --body check" - def test_unicode_scanner_detects_ansi_escapes(self) -> None: - content = (HOOKS_DIR / 
"unicode-injection-scanner.sh").read_text(encoding="utf-8") - assert "ANSI" in content, "unicode-injection-scanner should detect ANSI escape sequences" - - def test_output_secrets_scanner_never_blocks(self) -> None: - content = (HOOKS_DIR / "output-secrets-scanner.sh").read_text(encoding="utf-8") - assert "exit 2" not in content, "output-secrets-scanner (PostToolUse) must never exit 2" + def test_blocks_publishing_commands(self) -> None: + content = (HOOKS_DIR / "dangerous-actions-blocker.sh").read_text(encoding="utf-8") + for pattern in ["twine upload", "npm publish", "uv publish"]: + assert pattern in content, f"dangerous-actions-blocker missing publishing pattern: {pattern}" - def test_output_secrets_scanner_emits_system_message(self) -> None: - content = (HOOKS_DIR / "output-secrets-scanner.sh").read_text(encoding="utf-8") - assert "systemMessage" in content, "output-secrets-scanner should emit systemMessage for warnings" + def test_checks_secrets(self) -> None: + content = (HOOKS_DIR / "dangerous-actions-blocker.sh").read_text(encoding="utf-8") + for pattern in ["ANTHROPIC_API_KEY", "AWS_SECRET_ACCESS_KEY", "AKIA", "sk-", "ghp_"]: + assert pattern in content, f"dangerous-actions-blocker missing secret pattern: {pattern}" - def test_output_secrets_scanner_detects_key_patterns(self) -> None: - content = (HOOKS_DIR / "output-secrets-scanner.sh").read_text(encoding="utf-8") - for pattern in ["AKIA", "sk-ant-", "ghp_", "PRIVATE KEY"]: - assert pattern in content, f"output-secrets-scanner missing pattern: {pattern}" + def test_does_not_block_local_destruction(self) -> None: + content = (HOOKS_DIR / "dangerous-actions-blocker.sh").read_text(encoding="utf-8") + # Extract only the block-list arrays (non-comment lines containing patterns) + non_comment_lines = [line for line in content.splitlines() if not line.strip().startswith("#")] + code_content = "\n".join(non_comment_lines) + for pattern in ["rm -rf /", "'sudo'", "DROP DATABASE", "git push --force"]: + assert ( + 
pattern not in code_content + ), f"dangerous-actions-blocker should NOT block local destruction pattern: {pattern}" + + def test_has_security_model_comment(self) -> None: + content = (HOOKS_DIR / "dangerous-actions-blocker.sh").read_text(encoding="utf-8") + assert "Exfiltration guard" in content, "dangerous-actions-blocker missing security model comment" + assert "disposable" in content, "dangerous-actions-blocker missing disposable devcontainer note" -class TestProductivityHookBehavior: - """Verify productivity hooks have correct patterns.""" +class TestAutoFormatBehavior: + """Verify auto-format hook has correct patterns.""" - def test_auto_format_targets_python_files(self) -> None: + def test_targets_python_files(self) -> None: content = (HOOKS_DIR / "auto-format.sh").read_text(encoding="utf-8") assert "*.py" in content or ".py" in content, "auto-format should target Python files" - def test_auto_format_uses_ruff(self) -> None: + def test_uses_ruff(self) -> None: content = (HOOKS_DIR / "auto-format.sh").read_text(encoding="utf-8") assert "ruff format" in content, "auto-format should use ruff format" assert "ruff check --fix" in content, "auto-format should use ruff check --fix" - def test_auto_format_checks_edit_and_write(self) -> None: + def test_checks_edit_and_write(self) -> None: content = (HOOKS_DIR / "auto-format.sh").read_text(encoding="utf-8") assert '"Edit"' in content, "auto-format should check Edit tool" assert '"Write"' in content, "auto-format should check Write tool" - - def test_test_on_change_discovers_test_files(self) -> None: - content = (HOOKS_DIR / "test-on-change.sh").read_text(encoding="utf-8") - assert "test_" in content, "test-on-change should discover test_ prefixed files" - - def test_test_on_change_uses_pytest(self) -> None: - content = (HOOKS_DIR / "test-on-change.sh").read_text(encoding="utf-8") - assert "pytest" in content, "test-on-change should use pytest" - - def test_test_on_change_never_blocks(self) -> None: - content = 
(HOOKS_DIR / "test-on-change.sh").read_text(encoding="utf-8") - assert "exit 2" not in content, "test-on-change should never block (informational only)" - - def test_test_on_change_emits_system_message_on_failure(self) -> None: - content = (HOOKS_DIR / "test-on-change.sh").read_text(encoding="utf-8") - assert "systemMessage" in content, "test-on-change should emit systemMessage for test failures" diff --git a/tests/test_skills.py b/tests/test_skills.py index 053c1e5..39bae08 100644 --- a/tests/test_skills.py +++ b/tests/test_skills.py @@ -7,7 +7,6 @@ SKILLS_DIR = Path(__file__).parent.parent / ".claude" / "skills" ALL_SKILLS = [ - "edit-permissions", "sync", "design", "done", @@ -33,6 +32,12 @@ def test_skill_file_exists(self, skill_name: str) -> None: skill_path = SKILLS_DIR / skill_name / "SKILL.md" assert skill_path.exists(), f"SKILL.md missing for: {skill_name}" + def test_no_unexpected_skills(self) -> None: + actual_skills = {d.name for d in SKILLS_DIR.iterdir() if d.is_dir()} + expected_skills = set(ALL_SKILLS) + unexpected = actual_skills - expected_skills + assert not unexpected, f"Unexpected skill directories found: {unexpected}" + class TestSkillFrontmatter: """Verify skill files have correct frontmatter.""" @@ -86,17 +91,17 @@ def test_side_effect_skills_disable_model_invocation(self, skill_name: str) -> N content = (SKILLS_DIR / skill_name / "SKILL.md").read_text(encoding="utf-8") parts = content.split("---", 2) frontmatter = parts[1] if len(parts) >= 3 else "" - assert "disable-model-invocation: true" in frontmatter, ( - f"{skill_name} should have disable-model-invocation: true (has side effects)" - ) + assert ( + "disable-model-invocation: true" in frontmatter + ), f"{skill_name} should have disable-model-invocation: true (has side effects)" def test_design_allows_model_invocation(self) -> None: content = (SKILLS_DIR / "design" / "SKILL.md").read_text(encoding="utf-8") parts = content.split("---", 2) frontmatter = parts[1] if len(parts) >= 3 else "" - 
assert "disable-model-invocation" not in frontmatter, ( - "design should NOT have disable-model-invocation (intentionally model-invocable)" - ) + assert ( + "disable-model-invocation" not in frontmatter + ), "design should NOT have disable-model-invocation (intentionally model-invocable)" class TestSkillContent: @@ -126,9 +131,9 @@ def test_design_reads_implementation_plan(self) -> None: def test_design_classifies_scope(self) -> None: content = (SKILLS_DIR / "design" / "SKILL.md").read_text(encoding="utf-8") - assert "**Q** (Quick)" in content and "**S** (Standard)" in content and "**P** (Project)" in content, ( - "design should classify scope as Q/S/P with descriptive labels" - ) + assert ( + "**Q** (Quick)" in content and "**S** (Standard)" in content and "**P** (Project)" in content + ), "design should classify scope as Q/S/P with descriptive labels" def test_design_has_argument_hint(self) -> None: content = (SKILLS_DIR / "design" / "SKILL.md").read_text(encoding="utf-8") From d201cf00451318d7492fdf7b181633d73b30ac33 Mon Sep 17 00:00:00 2001 From: Martin Stransky Date: Wed, 18 Mar 2026 11:56:17 +0100 Subject: [PATCH 2/6] docs: add changelog entries and subagent limitation note Document the scope pruning changes in CHANGELOG.md (Changed, Removed sections) and add a DECISIONS.md entry noting that subagents don't read CLAUDE.md directives -- a known template limitation. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/CHANGELOG.md | 17 +++++++++++++++++ docs/DECISIONS.md | 10 ++++++++++ 2 files changed, 27 insertions(+) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 80c8eed..962fa42 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,6 +7,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed +- Security model simplified to 2-layer exfiltration defense: iptables firewall (primary) blocks non-approved network domains; `dangerous-actions-blocker.sh` (narrowed) blocks exfiltration via trusted channels (gh gist, gh issue --body, package publishing, secrets in args) -- local destruction (rm -rf, sudo, etc.) is no longer blocked since devcontainer is disposable +- CLAUDE.md Security section rewritten to describe the 2-layer defense model instead of listing individual hooks +- Devcontainer simplified: permission tiers removed, single settings.json baseline for all environments + +### Removed +- Permission tier system (`.devcontainer/permissions/tier1-assisted.json`, `tier2-autonomous.json`, `tier3-full-trust.json`) and `PERMISSION_TIER` env var -- single settings.json baseline replaces graduated tiers +- `devcontainer-policy-blocker.sh` hook -- tier-dependent policy enforcement no longer needed +- `output-secrets-scanner.sh` hook -- conversation leaks to Anthropic are accepted risk +- `unicode-injection-scanner.sh` hook -- exotic threat with low practical risk +- `test-on-change.sh` hook -- informational-only hook that added latency without preventing issues +- All slash commands (`/cove`, `/cove-isolated`, `/security-audit`) -- niche utilities that added complexity without proportional value +- 6 agents: `agent-auditor`, `security-auditor`, `output-evaluator`, `acceptance-criteria-validator`, `implementation-tracker`, `refactoring-specialist` -- pruned to the 6 agents directly used by the QSP workflow +- `/edit-permissions` skill -- permission tier 
system removed +- `docs/ARCHITECTURE_GUIDE.md`, `docs/DEVCONTAINER_PERMISSIONS.md`, `docs/community/` -- supporting docs for removed features +- Local destruction patterns from `dangerous-actions-blocker.sh` (`rm -rf`, `sudo`, `DROP DATABASE`, `git push --force`, etc.) -- devcontainer is disposable, these blocks added friction without security value + ### Added - Architecture Deep Dive guide (`docs/ARCHITECTURE_GUIDE.md`) explains why each component exists, what it does under the hood, and what happens if you remove or modify it -- covers all hooks, agents, skills, rules, configuration files, devcontainer layers, and CI/CD workflows with a defense-in-depth diagram and customization guide - `/landed` skill for post-merge lifecycle -- verifies merge CI, optionally checks deployments (via `.claude/deploy.json`), cleans up feature branches, and identifies the next phase for P-scope work diff --git a/docs/DECISIONS.md b/docs/DECISIONS.md index 2500a10..702b4cb 100644 --- a/docs/DECISIONS.md +++ b/docs/DECISIONS.md @@ -105,6 +105,16 @@ When a decision is superseded or obsolete, delete it (git history preserves the - Wildcard domains (e.g., `*.example.com`) are skipped with a warning -- DNS cannot resolve wildcard patterns to IPs - WebFetch settings changes take effect on container restart (`init-firewall.sh` runs from `postStartCommand`) +## 2026-03-18: Subagent CLAUDE.md Limitation + +**Observation**: Spawned subagents (via the Agent tool) do not read CLAUDE.md or project instructions. They only follow what the parent agent includes in the prompt. This means directives like "use `uv run` for all commands" are silently ignored by subagents unless explicitly passed through. 
+ +**Decisions**: +- Known template limitation -- subagents must receive key directives in their spawn prompt +- Agent `.md` files could include critical directives (e.g., "use `uv run`") but this duplicates CLAUDE.md and creates drift risk +- For this template repo specifically, `uv run` fails due to `{{project_name}}` placeholders, so `python -m pytest` is the correct fallback +- No code change for now; document as a known limitation + ## 2026-03-18: Security Model Simplification **Request**: Prune security infrastructure to essentials. Remove permission tiers, From f266946f3b9936b382dc93f8ea4d266041902d1d Mon Sep 17 00:00:00 2001 From: Martin Stransky Date: Wed, 18 Mar 2026 12:04:47 +0100 Subject: [PATCH 3/6] style: reformat test files for ruff 0.15.6 compatibility Local ruff 0.8.6 formatted differently than CI's ruff 0.15.6. Reformatted with 0.15.6 to match CI expectations. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_agents.py | 6 +++--- tests/test_commands.py | 6 +++--- tests/test_hooks.py | 12 ++++++------ tests/test_skills.py | 18 +++++++++--------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/test_agents.py b/tests/test_agents.py index 11f3816..a740f12 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -102,9 +102,9 @@ def test_agent_has_tools(self, agent_name: str, agent_frontmatter: dict[str, dic def test_agent_has_permission_mode(self, agent_name: str, agent_frontmatter: dict[str, dict[str, str]]) -> None: fm = agent_frontmatter.get(agent_name, {}) assert "permissionMode" in fm, f"{agent_name} missing 'permissionMode' in frontmatter" - assert ( - fm["permissionMode"] in VALID_PERMISSION_MODES - ), f"{agent_name} has invalid permissionMode: {fm['permissionMode']!r}" + assert fm["permissionMode"] in VALID_PERMISSION_MODES, ( + f"{agent_name} has invalid permissionMode: {fm['permissionMode']!r}" + ) class TestAgentBody: diff --git a/tests/test_commands.py b/tests/test_commands.py index 016c058..60de11e 
100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -9,6 +9,6 @@ class TestCommandsRemoved: """Verify commands directory was removed (all commands pruned).""" def test_commands_directory_does_not_exist(self) -> None: - assert ( - not COMMANDS_DIR.exists() - ), f"Commands directory should not exist after pruning, but found: {COMMANDS_DIR}" + assert not COMMANDS_DIR.exists(), ( + f"Commands directory should not exist after pruning, but found: {COMMANDS_DIR}" + ) diff --git a/tests/test_hooks.py b/tests/test_hooks.py index 5347d75..f3ce3ce 100644 --- a/tests/test_hooks.py +++ b/tests/test_hooks.py @@ -48,9 +48,9 @@ def test_hook_is_executable(self, hook_name: str) -> None: cwd=repo_root, ) if result.stdout: - assert result.stdout.startswith( - "100755" - ), f"{hook_name} is not tracked as executable by git (expected mode 100755)" + assert result.stdout.startswith("100755"), ( + f"{hook_name} is not tracked as executable by git (expected mode 100755)" + ) else: # Not in a git repo (e.g. 
integration test copy) -- fall back to filesystem mode = hook_path.stat().st_mode @@ -141,9 +141,9 @@ def test_does_not_block_local_destruction(self) -> None: non_comment_lines = [line for line in content.splitlines() if not line.strip().startswith("#")] code_content = "\n".join(non_comment_lines) for pattern in ["rm -rf /", "'sudo'", "DROP DATABASE", "git push --force"]: - assert ( - pattern not in code_content - ), f"dangerous-actions-blocker should NOT block local destruction pattern: {pattern}" + assert pattern not in code_content, ( + f"dangerous-actions-blocker should NOT block local destruction pattern: {pattern}" + ) def test_has_security_model_comment(self) -> None: content = (HOOKS_DIR / "dangerous-actions-blocker.sh").read_text(encoding="utf-8") diff --git a/tests/test_skills.py b/tests/test_skills.py index 39bae08..83e4570 100644 --- a/tests/test_skills.py +++ b/tests/test_skills.py @@ -91,17 +91,17 @@ def test_side_effect_skills_disable_model_invocation(self, skill_name: str) -> N content = (SKILLS_DIR / skill_name / "SKILL.md").read_text(encoding="utf-8") parts = content.split("---", 2) frontmatter = parts[1] if len(parts) >= 3 else "" - assert ( - "disable-model-invocation: true" in frontmatter - ), f"{skill_name} should have disable-model-invocation: true (has side effects)" + assert "disable-model-invocation: true" in frontmatter, ( + f"{skill_name} should have disable-model-invocation: true (has side effects)" + ) def test_design_allows_model_invocation(self) -> None: content = (SKILLS_DIR / "design" / "SKILL.md").read_text(encoding="utf-8") parts = content.split("---", 2) frontmatter = parts[1] if len(parts) >= 3 else "" - assert ( - "disable-model-invocation" not in frontmatter - ), "design should NOT have disable-model-invocation (intentionally model-invocable)" + assert "disable-model-invocation" not in frontmatter, ( + "design should NOT have disable-model-invocation (intentionally model-invocable)" + ) class TestSkillContent: @@ -131,9 
+131,9 @@ def test_design_reads_implementation_plan(self) -> None: def test_design_classifies_scope(self) -> None: content = (SKILLS_DIR / "design" / "SKILL.md").read_text(encoding="utf-8") - assert ( - "**Q** (Quick)" in content and "**S** (Standard)" in content and "**P** (Project)" in content - ), "design should classify scope as Q/S/P with descriptive labels" + assert "**Q** (Quick)" in content and "**S** (Standard)" in content and "**P** (Project)" in content, ( + "design should classify scope as Q/S/P with descriptive labels" + ) def test_design_has_argument_hint(self) -> None: content = (SKILLS_DIR / "design" / "SKILL.md").read_text(encoding="utf-8") From c3a84e55e07975605a5f15db10646591cf25f6ea Mon Sep 17 00:00:00 2001 From: Martin Stransky Date: Wed, 18 Mar 2026 12:24:47 +0100 Subject: [PATCH 4/6] fix: address CodeRabbit review feedback - Move gh api from allow to ask (exfiltration risk via raw API calls) - Add git push --force/-f to deny list (remote repo is not disposable) - Remove stale tier references from settings.local.json.example - Fix P.3.2 reference to deleted agent in DEVELOPMENT_PROCESS.md - Update test expectations for gh api and force push changes Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/settings.json | 4 +++- .claude/settings.local.json.example | 9 +-------- docs/DEVELOPMENT_PROCESS.md | 2 +- tests/test_permissions.py | 10 ++++++++-- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/.claude/settings.json b/.claude/settings.json index cb76b83..035d5e6 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -19,7 +19,7 @@ "Bash(git describe *)", "Bash(git shortlog *)", "Bash(git rev-list *)", "Bash(gh pr create *)", "Bash(gh pr view *)", "Bash(gh pr list *)", "Bash(gh pr checks *)", "Bash(gh pr diff *)", "Bash(gh pr edit *)", - "Bash(gh pr close *)", "Bash(gh api *)", + "Bash(gh pr close *)", "Bash(gh run list *)", "Bash(gh run view *)", "Bash(gh run watch *)", "Bash(gh issue list *)", "Bash(gh issue
view *)", "Bash(gh repo view *)", "Bash(gh release list *)", "Bash(gh release view *)", @@ -32,6 +32,7 @@ ], "deny": [ "Bash(gh secret *)", "Bash(gh auth *)", "Bash(gh ssh-key *)", "Bash(gh gpg-key *)", + "Bash(git push --force *)", "Bash(git push -f *)", "Bash(git clean *)", "Bash(git config *)", "Bash(*git remote add *)", "Bash(*git remote set-url *)", "Bash(*git remote remove *)", "Bash(*git remote rename *)", "Bash(*git remote set-head *)", @@ -43,6 +44,7 @@ "Bash(gh pr merge *)", "Bash(gh pr reopen *)", "Bash(gh pr comment *)", "Bash(gh pr review *)", "Bash(gh pr ready *)", "Bash(gh workflow run *)", "Bash(gh workflow enable *)", "Bash(gh workflow disable *)", + "Bash(gh api *)", "Bash(gh issue create *)", "Bash(gh issue comment *)", "Bash(gh issue close *)", "Bash(gh issue edit *)", "Bash(git init *)", "Bash(git clone *)", diff --git a/.claude/settings.local.json.example b/.claude/settings.local.json.example index 97dc6ad..8df1fed 100644 --- a/.claude/settings.local.json.example +++ b/.claude/settings.local.json.example @@ -5,14 +5,7 @@ "Precedence: global < project settings.json < settings.local.json", "Use this for: local hook overrides, extra permissions, MCP server configs" ], - "__devcontainer_tiers": [ - "In devcontainers, this file is auto-generated from .devcontainer/permissions/tier{1,2,3}.json.", - "Set PERMISSION_TIER env var before building (default: 2).", - " Tier 1 (Assisted): File ops auto-allowed, bash asks per-command", - " Tier 2 (Autonomous): Bash(*) with curated deny list -- zero prompts", - " Tier 3 (Full Trust): Bash(*) with minimal deny -- requires branch protection", - "See docs/DEVCONTAINER_PERMISSIONS.md for full details." 
- ], + "__note": "In devcontainers, settings.json is the single baseline for all environments.", "hooks": { "PreToolUse": [], diff --git a/docs/DEVELOPMENT_PROCESS.md b/docs/DEVELOPMENT_PROCESS.md index 5b7770b..4d210c5 100644 --- a/docs/DEVELOPMENT_PROCESS.md +++ b/docs/DEVELOPMENT_PROCESS.md @@ -120,7 +120,7 @@ next phase for P-scope work. **P.3 Execute** (repeat per phase) 1. Run Standard Path (S.1 through S.7) for the phase -2. Update `docs/IMPLEMENTATION_PLAN.md` (use built-in `Plan` agent) +2. Update `docs/IMPLEMENTATION_PLAN.md` 3. Write phase handoff note (2-5 sentences: what completed, deviations, risks, dependencies, intentional debt) **P.4 Finalize** -- Merge. Version bump and changelog consolidation if applicable. diff --git a/tests/test_permissions.py b/tests/test_permissions.py index 00ed40e..cad56e3 100644 --- a/tests/test_permissions.py +++ b/tests/test_permissions.py @@ -302,6 +302,11 @@ def test_git_config_is_denied(self, settings: dict[str, Any]) -> None: def test_uv_self_is_denied(self, settings: dict[str, Any]) -> None: assert evaluate("Bash(uv self update)", settings) == "deny" + def test_git_push_force_is_denied(self, settings: dict[str, Any]) -> None: + """Force push affects the remote repo (not disposable) -- must be denied.""" + assert evaluate("Bash(git push --force origin main)", settings) == "deny" + assert evaluate("Bash(git push -f origin main)", settings) == "deny" + def test_rm_rf_is_not_allowed(self, settings: dict[str, Any]) -> None: assert evaluate("Bash(rm -rf /)", settings) != "allow" @@ -440,8 +445,9 @@ def test_web_fetch_requires_confirmation(self, settings: dict[str, Any]) -> None def test_chained_commands_fall_through(self, settings: dict[str, Any]) -> None: assert evaluate("Bash(cd /foo && ls)", settings) == "none" - def test_gh_api_is_allowed(self, settings: dict[str, Any]) -> None: - assert evaluate("Bash(gh api repos/owner/repo/pulls)", settings) == "allow" + def test_gh_api_requires_confirmation(self, settings: 
dict[str, Any]) -> None: + """gh api can create gists/issues -- requires confirmation to prevent exfiltration.""" + assert evaluate("Bash(gh api repos/owner/repo/pulls)", settings) == "ask" def test_gh_pr_review_operations_require_confirmation(self, settings: dict[str, Any]) -> None: """PR comment/review/ready are state-changing and have data exfiltration risk.""" From c5757aabfc160d9b1a47d81b14991769361706f1 Mon Sep 17 00:00:00 2001 From: Martin Stransky Date: Wed, 18 Mar 2026 18:04:38 +0100 Subject: [PATCH 5/6] fix: address CodeRabbit round 2 feedback - Move gh pr close from allow to ask (remote state change) - Move git push from allow to ask (remote state change); force variants stay in deny as a hard block - This prevents force-push bypass via flag ordering variants like git push --force-with-lease or git push -u -f Co-Authored-By: Claude Opus 4.6 (1M context) --- .claude/settings.json | 6 +++--- tests/test_permissions.py | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.claude/settings.json b/.claude/settings.json index 035d5e6..91eda18 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -6,7 +6,7 @@ "Bash(uv run pyright *)", "Bash(uv add *)", "Bash(uv pip *)", "Bash(uv venv *)", "Bash(uv lock *)", "Bash(uv tree *)", "Bash(uv export *)", - "Bash(git add *)", "Bash(git commit *)", "Bash(git push *)", + "Bash(git add *)", "Bash(git commit *)", "Bash(git fetch *)", "Bash(git pull *)", "Bash(git rebase *)", "Bash(git branch *)", "Bash(git checkout *)", "Bash(git status *)", "Bash(git diff *)", "Bash(git log *)", "Bash(git show *)", @@ -19,7 +19,6 @@ "Bash(git describe *)", "Bash(git shortlog *)", "Bash(git rev-list *)", "Bash(gh pr create *)", "Bash(gh pr view *)", "Bash(gh pr list *)", "Bash(gh pr checks *)", "Bash(gh pr diff *)", "Bash(gh pr edit *)", - "Bash(gh pr close *)", "Bash(gh run list *)", "Bash(gh run view *)", "Bash(gh run watch *)", "Bash(gh issue list *)", "Bash(gh issue view *)", "Bash(gh repo view *)", 
"Bash(gh release list *)", "Bash(gh release view *)", @@ -41,12 +40,13 @@ "ask": [ "Bash(python *)", "Bash(uv run python *)", "Bash(docker *)", "Bash(docker-compose *)", "Bash(terraform *)", - "Bash(gh pr merge *)", "Bash(gh pr reopen *)", "Bash(gh pr comment *)", + "Bash(gh pr merge *)", "Bash(gh pr reopen *)", "Bash(gh pr close *)", "Bash(gh pr comment *)", "Bash(gh pr review *)", "Bash(gh pr ready *)", "Bash(gh workflow run *)", "Bash(gh workflow enable *)", "Bash(gh workflow disable *)", "Bash(gh api *)", "Bash(gh issue create *)", "Bash(gh issue comment *)", "Bash(gh issue close *)", "Bash(gh issue edit *)", + "Bash(git push *)", "Bash(git init *)", "Bash(git clone *)", "Bash(uv remove *)", "Bash(uv cache *)", "Bash(uv init *)", "WebFetch" diff --git a/tests/test_permissions.py b/tests/test_permissions.py index cad56e3..25bf46c 100644 --- a/tests/test_permissions.py +++ b/tests/test_permissions.py @@ -416,9 +416,13 @@ def test_git_read_operations_are_allowed(self, settings: dict[str, Any]) -> None assert evaluate(f"Bash({cmd})", settings) == "allow", f"{cmd} should be allowed" def test_git_write_operations_are_allowed(self, settings: dict[str, Any]) -> None: - for cmd in ["git add .", 'git commit -m "msg"', "git push origin main"]: + for cmd in ["git add .", 'git commit -m "msg"']: assert evaluate(f"Bash({cmd})", settings) == "allow", f"{cmd} should be allowed" + def test_git_push_requires_confirmation(self, settings: dict[str, Any]) -> None: + """git push affects remote state -- requires confirmation.""" + assert evaluate("Bash(git push origin main)", settings) == "ask" + def test_testing_commands_are_allowed(self, settings: dict[str, Any]) -> None: for cmd in ["pytest tests/", "uv run pytest -v"]: assert evaluate(f"Bash({cmd})", settings) == "allow", f"{cmd} should be allowed" From 18947911609a8d800c35f614a8bb74348fb34539 Mon Sep 17 00:00:00 2001 From: Martin Stransky Date: Wed, 18 Mar 2026 18:22:41 +0100 Subject: [PATCH 6/6] test: document force-push 
variant behavior Add tests asserting that --force-with-lease and -u -f variants hit the ask rule (not deny), confirming they still require user confirmation even though they bypass the deny prefix patterns. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_permissions.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_permissions.py b/tests/test_permissions.py index 25bf46c..9a23466 100644 --- a/tests/test_permissions.py +++ b/tests/test_permissions.py @@ -307,6 +307,15 @@ def test_git_push_force_is_denied(self, settings: dict[str, Any]) -> None: assert evaluate("Bash(git push --force origin main)", settings) == "deny" assert evaluate("Bash(git push -f origin main)", settings) == "deny" + def test_git_push_force_variants_require_confirmation(self, settings: dict[str, Any]) -> None: + """Force-push variants with different flag ordering hit ask (not deny). + + --force-with-lease and -u -f don't match the deny prefix patterns, + but git push * is in ask so they still require user confirmation. + """ + assert evaluate("Bash(git push --force-with-lease origin main)", settings) == "ask" + assert evaluate("Bash(git push -u -f origin main)", settings) == "ask" + def test_rm_rf_is_not_allowed(self, settings: dict[str, Any]) -> None: assert evaluate("Bash(rm -rf /)", settings) != "allow"