From 3f69c664fac7a716e6634f4234394dc40f3a110f Mon Sep 17 00:00:00 2001
From: "Keith A. Taylor" <120050018+fractal360@users.noreply.github.com>
Date: Wed, 20 May 2026 17:25:07 +0100
Subject: [PATCH] Move ECS tasks to private subnets with VPC endpoints
---
README.md | 36 +-
docs/TRACKER.md | 39 +-
docs/aws_deployment_target.md | 58 ++-
docs/aws_operator_runbook.md | 454 +++++++++++++++++++
docs/aws_terraform_deployment_sequence.md | 90 +++-
docs/production_secret_and_state_security.md | 4 +-
infra/terraform/ecs_service.tf | 13 +-
infra/terraform/network.tf | 18 +
infra/terraform/security_groups.tf | 42 +-
infra/terraform/vpc_endpoints.tf | 95 ++++
10 files changed, 784 insertions(+), 65 deletions(-)
create mode 100644 docs/aws_operator_runbook.md
create mode 100644 infra/terraform/vpc_endpoints.tf
diff --git a/README.md b/README.md
index 05d4112..20bdf45 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ This repository is intended to demonstrate backend/platform engineering depth, n
- PostgreSQL-backed business data and audit persistence
- structured runtime and PDP audit logging
- Docker Compose local development workflow
-- AWS deployment with ECR, ECS/Fargate, ALB, RDS PostgreSQL, Secrets Manager, IAM, and CloudWatch
+- AWS deployment with ECR, ECS/Fargate, ALB, private ECS networking, VPC endpoints, RDS PostgreSQL, Secrets Manager, IAM, and CloudWatch
- rerunnable SQL migrations
- one-off ECS operational tasks for RDS migrations and dev credential seeding
- local and AWS MCP smoke-test helpers
@@ -278,7 +278,11 @@ Implemented AWS infrastructure:
- HTTP listener
- target group registration for ECS tasks
- VPC, public subnets, private app subnets, and private DB subnets
-- security groups for ALB, app tasks, and RDS
+- private app route table for private ECS task subnets
+- VPC endpoints for private AWS service access:
+ - interface endpoints for ECR API, ECR Docker registry, CloudWatch Logs, and Secrets Manager
+ - S3 gateway endpoint associated with the private app route table
+- security groups for ALB, app tasks, AWS service interface endpoints, and RDS
- private RDS PostgreSQL instance
- RDS-managed database password secret
- manually-created Secrets Manager secret for `AGENT_CREDENTIAL_HASH_SECRET`
@@ -309,16 +313,21 @@ Verified AWS smoke tests:
- deployed `docs_tool` denies `doc2` with `DEFAULT_DENY`
- the deployed tool path resolves a DB-backed registered-agent credential through `X-Agent-Api-Key`
-Current AWS development limitation:
+Current AWS networking posture:
-- ECS app tasks currently run in public subnets with `assignPublicIp=ENABLED`.
-- This avoids NAT Gateway or VPC endpoints during the first runnable AWS slice.
-- Inbound access remains restricted through security groups:
- - Internet -> ALB on port `80`
- - ALB -> ECS app task on port `8000`
- - ECS app task -> RDS on port `5432`
+- ALB nodes remain in public subnets and provide the public HTTP entry point.
+- ECS/Fargate app tasks run in private app subnets with `assignPublicIp=DISABLED`.
+- Running app tasks have private IPs only; they are registered with the ALB target group by private task IP.
+- RDS PostgreSQL remains in private DB subnets.
+- Private app task access to required AWS services uses VPC endpoints rather than a NAT Gateway:
+ - interface endpoints for ECR API, ECR Docker registry, CloudWatch Logs, and Secrets Manager
+ - S3 gateway endpoint associated with the private app route table
+- App task security group egress is limited to required paths:
+ - RDS PostgreSQL on port `5432`
+ - AWS service interface endpoint security group on port `443`
+ - S3 endpoint prefix list on port `443`
-This is a deliberate development-stage trade-off, not the intended production networking posture.
+No NAT Gateway is currently deployed. That is intentional for this slice because the app does not yet need general outbound internet access to third-party APIs or arbitrary external services.
## Operational helper scripts
@@ -512,7 +521,7 @@ It is not currently trying to be:
- IdP integration
- database-backed policy authoring/storage
- production credential registry UI
-- production-grade AWS networking hardening
+- full production-grade AWS hardening beyond the current portfolio/dev deployment
The emphasis is on doing a smaller set of backend/platform concerns properly:
@@ -538,8 +547,7 @@ Credible next improvements include:
- extend immutable image tagging consistently across manual and Terraform-driven deployment paths
- HTTPS listener with ACM certificate
- optional HTTP-to-HTTPS redirect
-- private ECS task networking without public task IPs
-- NAT Gateway or VPC endpoints for outbound AWS service access
+- optional NAT Gateway or controlled egress path only if future app behaviour requires general external access
- Terraform remote state backend
- migration version tracking
- production-grade registered-agent credential registration and rotation workflow
@@ -564,6 +572,8 @@ Current status:
- local Docker/PostgreSQL path works
- local and CI tests pass
- AWS ECS/RDS/ALB deployment path works
+- ECS app tasks run in private app subnets with no public IP
+- VPC endpoints provide private AWS service access for ECR, CloudWatch Logs, Secrets Manager, and S3
- AWS RDS migrations run through one-off ECS tasks
- AWS dev registered-agent credential seeding/rotation works
- deployed MCP allow and deny paths have been smoke-tested
diff --git a/docs/TRACKER.md b/docs/TRACKER.md
index 8117793..6f0e0b8 100644
--- a/docs/TRACKER.md
+++ b/docs/TRACKER.md
@@ -25,6 +25,8 @@ The supporting engineering story is backend/platform implementation depth:
- Docker-based local development
- SQL migrations and seed data
- AWS ECS/Fargate deployment
+- private ECS/Fargate task networking
+- VPC endpoints for private AWS service access
- RDS PostgreSQL runtime configuration
- Secrets Manager runtime secret injection
- CloudWatch log collection
@@ -215,10 +217,18 @@ Implemented AWS infrastructure includes:
- private DB subnets
- internet gateway
- public route table
+- private app route table
- DB subnet group
+- VPC endpoints for:
+ - ECR API
+ - ECR Docker registry
+ - CloudWatch Logs
+ - Secrets Manager
+ - S3
- security groups for:
- ALB
- ECS app task
+ - AWS service interface endpoints
- RDS PostgreSQL
- private RDS PostgreSQL instance
- RDS-managed database password secret
@@ -257,26 +267,27 @@ Verified AWS checks:
- The deployed ECS task definition uses the Git commit SHA image tag, not `latest`.
- The CD workflow waits for ECS service stability and checks `/health` after deployment.
-Current intentional AWS development limitation:
+Current AWS networking posture:
-- ECS app tasks currently run in public subnets with `assignPublicIp=ENABLED`.
-- This avoids adding NAT Gateway or VPC endpoints during the first runnable AWS vertical slice.
-- Inbound access is still controlled by security groups:
- - Internet -> ALB on port `80`
- - ALB -> ECS app task on port `8000`
- - ECS app task -> RDS on port `5432`
+- ALB nodes run in public subnets and provide the public HTTP entry point.
+- ECS/Fargate app tasks run in private app subnets with `assignPublicIp=DISABLED`.
+- Running app tasks have no public IP.
+- RDS PostgreSQL runs in private DB subnets.
+- Private app task access to required AWS services is provided by VPC endpoints:
+ - interface endpoints for ECR API, ECR Docker registry, CloudWatch Logs, and Secrets Manager
+ - S3 gateway endpoint associated with the private app route table
+- App task egress is restricted to RDS, the AWS service interface endpoint security group, and the S3 endpoint prefix list.
+- No NAT Gateway is currently deployed; add one later only if the app needs general outbound access to external/non-AWS services.
Deferred AWS hardening:
- HTTPS listener with ACM certificate
- optional HTTP-to-HTTPS redirect
-- private ECS task networking without public task IPs
-- NAT Gateway or VPC endpoints for outbound AWS service access
-- immutable image tags instead of deploying `latest`
- Terraform remote state backend
- migration version tracking
- production-grade credential registration/rotation workflow
-- CI/CD deployment workflow
+- CI-before-deploy safety clarification and deployment guardrails
+- Terraform image tag handling alignment with SHA-based CD
---
@@ -366,7 +377,7 @@ The project is not currently trying to implement:
- SQLAlchemy/Alembic unless direct SQL becomes a real limitation
- broad AI governance platform features
- production credential registry UI
-- production-grade AWS networking hardening
+- full production-grade AWS hardening beyond the current portfolio/dev deployment
These are deliberate scope boundaries, not forgotten requirements.
@@ -384,7 +395,7 @@ Good next candidates:
- keep README, tracker, and AWS deployment docs aligned with the implemented runtime
- extend immutable image tagging consistently across manual and Terraform-driven deployment paths
- add HTTPS/ACM support for the ALB
-- add private ECS task networking using NAT Gateway or VPC endpoints
+- add NAT Gateway or another explicit egress path only if future external API access requires it
- add Terraform remote state
- add migration version tracking if migration reruns become harder to reason about
- formalize production-style registered-agent credential registration and rotation
@@ -410,6 +421,8 @@ The current stable implementation demonstrates:
- structured runtime/audit logging
- local Docker/PostgreSQL runtime
- AWS ECS/Fargate/RDS/ALB runtime
+- private ECS task networking with no public task IP
+- VPC endpoint-based AWS service access without NAT Gateway
- RDS-backed registered-agent identity resolution
- HMAC-hashed API-key identity adapter
- one-off ECS operational tasks
diff --git a/docs/aws_deployment_target.md b/docs/aws_deployment_target.md
index b0a1149..f5fd3ae 100644
--- a/docs/aws_deployment_target.md
+++ b/docs/aws_deployment_target.md
@@ -2,17 +2,17 @@
## Purpose
-Define the AWS runtime shape for `aws-python-service-platform` before implementing Terraform.
+Describe the implemented AWS runtime shape for `aws-python-service-platform`.
## Target runtime path
```text
Client / MCP caller
-> Application Load Balancer
- -> ECS Fargate service
+ -> ECS Fargate service in private app subnets
-> FastAPI + FastMCP app
- -> RDS PostgreSQL
- -> CloudWatch logs
+ -> RDS PostgreSQL in private DB subnets
+ -> CloudWatch logs via VPC endpoint
```
## AWS services
@@ -24,6 +24,7 @@ Client / MCP caller
| Database | RDS PostgreSQL |
| Secrets | Secrets Manager or SSM Parameter Store |
| Logs | CloudWatch Logs |
+| Private AWS service access | VPC endpoints for ECR, CloudWatch Logs, Secrets Manager, and S3 |
| Runtime permissions | ECS task role |
| Infrastructure | Terraform |
@@ -43,9 +44,9 @@ The AWS deployment should keep the same application configuration contract used
The application code should continue reading configuration through the existing settings module. Terraform and ECS are responsible for supplying the correct runtime values.
-## Initial deployment scope
+## Implemented deployment scope
-The first AWS deployment will run the existing service using RDS-backed configuration and CloudWatch logging.
+The current AWS deployment runs the existing service using RDS-backed configuration, CloudWatch logging, private ECS task networking, and VPC endpoints for required AWS-service access.
## Deferred scope
@@ -74,15 +75,18 @@ flowchart TB
ECSService["ECS Service
desired task count"]
TaskDef["Task Definition
image + env + CPU/memory"]
ECR["ECR Repository
container image"]
+ Logs["CloudWatch Logs
container logs"]
+ Secrets["Secrets Manager
runtime secrets"]
+ S3["S3
ECR image layers"]
end
subgraph VPC["VPC: private network boundary"]
ALB["Logical Application Load Balancer"]
Listener["ALB Listener
HTTP/HTTPS"]
- TG["Target Group
registered task IPs + health state"]
+ TG["Target Group
registered private task IPs + health state"]
- subgraph PublicA["Public Subnet A"]
+ subgraph PublicA["Public subnet A"]
ALBNodeA["ALB node / network interface
AZ: eu-west-2a
public-facing IP"]
end
@@ -91,11 +95,19 @@ flowchart TB
end
subgraph PrivateAppA["Private app subnet A"]
- TaskA["Fargate task
FastAPI container
AZ: eu-west-2a
private IP: 10.0.11.x:8000"]
+ TaskA["Fargate task
FastAPI container
AZ: eu-west-2a
private IP only
no public IP"]
end
subgraph PrivateAppB["Private app subnet B"]
- TaskB["Fargate task
FastAPI container
AZ: eu-west-2b
private IP: 10.0.12.x:8000"]
+ TaskB["Fargate task
FastAPI container
AZ: eu-west-2b
private IP only
no public IP"]
+ end
+
+ subgraph VPCEndpoints["VPC endpoints for AWS service access"]
+ EcrApiVpce["Interface endpoint
ECR API"]
+ EcrDkrVpce["Interface endpoint
ECR Docker registry"]
+ LogsVpce["Interface endpoint
CloudWatch Logs"]
+ SecretsVpce["Interface endpoint
Secrets Manager"]
+ S3GatewayVpce["Gateway endpoint
S3 via private app route table"]
end
subgraph PrivateDB["Private DB subnets"]
@@ -111,15 +123,33 @@ flowchart TB
ALBNodeA --> Listener
ALBNodeB --> Listener
Listener --> TG
- TG -->|"healthy target"| TaskA
- TG -->|"healthy target"| TaskB
+ TG -->|"healthy private target"| TaskA
+ TG -->|"healthy private target"| TaskB
ECSCluster --> ECSService
ECSService --> TaskDef
TaskDef --> ECR
- ECSService -->|"starts/registers tasks"| TaskA
- ECSService -->|"starts/registers tasks"| TaskB
+ ECSService -->|"starts/registers private tasks"| TaskA
+ ECSService -->|"starts/registers private tasks"| TaskB
TaskA --> RDS
TaskB --> RDS
+
+ TaskA -->|"HTTPS 443"| EcrApiVpce
+ TaskA -->|"HTTPS 443"| EcrDkrVpce
+ TaskA -->|"HTTPS 443"| LogsVpce
+ TaskA -->|"HTTPS 443"| SecretsVpce
+ TaskA -->|"S3 route"| S3GatewayVpce
+
+ TaskB -->|"HTTPS 443"| EcrApiVpce
+ TaskB -->|"HTTPS 443"| EcrDkrVpce
+ TaskB -->|"HTTPS 443"| LogsVpce
+ TaskB -->|"HTTPS 443"| SecretsVpce
+ TaskB -->|"S3 route"| S3GatewayVpce
+
+ EcrApiVpce --> ECR
+ EcrDkrVpce --> ECR
+ LogsVpce --> Logs
+ SecretsVpce --> Secrets
+ S3GatewayVpce --> S3
```
\ No newline at end of file
diff --git a/docs/aws_operator_runbook.md b/docs/aws_operator_runbook.md
new file mode 100644
index 0000000..9747ec3
--- /dev/null
+++ b/docs/aws_operator_runbook.md
@@ -0,0 +1,454 @@
+# AWS Operator Runbook
+
+## Purpose
+
+This runbook records the operational commands and checks for the AWS deployment of `aws-python-service-platform`.
+
+It is intended for development/portfolio operation, not production operations.
+
+Current AWS shape:
+
+```text
+Internet client
+-> public ALB
+-> ECS/Fargate app task in private app subnet
+-> FastAPI + FastMCP app
+-> RDS PostgreSQL in private DB subnet
+
+Private ECS task AWS-service access:
+ECS task
+-> VPC endpoints
+-> ECR / CloudWatch Logs / Secrets Manager / S3
+```
+
+---
+
+## Current operating model
+
+Terraform manages the AWS infrastructure.
+
+GitHub Actions manual CD deploys application image updates into the existing ECS service.
+
+Operational scripts handle controlled tasks such as:
+
+- running RDS migrations
+- registering/rotating the AWS dev agent credential
+- smoke-testing deployed MCP tool calls
+
+---
+
+## Important safety rules
+
+Do not commit:
+
+```text
+.env
+.env.docker
+terraform.tfvars
+terraform.tfstate
+terraform.tfstate.*
+.terraform/
+raw API keys
+AWS access keys
+secret values
+```
+
+Do not paste raw dev API keys into documentation, GitHub variables, GitHub secrets, or commits.
+
+The AWS dev agent API key is smoke-test-only material.
+
+---
+
+## Pause AWS runtime
+
+Use this when the deployed runtime is not currently needed.
+
+### 1. Scale ECS app service to zero
+
+```powershell
+aws ecs update-service `
+ --cluster aws-python-service-platform-dev-cluster `
+ --service aspsp-dev-app-service `
+ --desired-count 0
+```
+
+Verify:
+
+```powershell
+aws ecs describe-services `
+ --cluster aws-python-service-platform-dev-cluster `
+ --services aspsp-dev-app-service `
+ --query "services[0].{Desired:desiredCount,Running:runningCount,Pending:pendingCount}" `
+ --output table
+```
+
+Expected:
+
+```text
+Desired = 0
+Running = 0
+Pending = 0
+```
+
+### 2. Stop RDS
+
+```powershell
+aws rds stop-db-instance `
+ --db-instance-identifier aspsp-dev-postgres
+```
+
+Verify:
+
+```powershell
+aws rds describe-db-instances `
+ --db-instance-identifier aspsp-dev-postgres `
+ --query "DBInstances[0].DBInstanceStatus" `
+ --output text
+```
+
+Expected final state:
+
+```text
+stopped
+```
+
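+Note: stopping RDS can take several minutes. AWS automatically starts a stopped RDS instance again after seven days, so re-check the instance status (and stop it again if needed) during a longer pause.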
+
+---
+
+## Restart AWS runtime
+
+Use this when the deployed app needs to be tested again.
+
+### 1. Start RDS
+
+```powershell
+aws rds start-db-instance `
+ --db-instance-identifier aspsp-dev-postgres
+```
+
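+Wait until available (re-run this status check until it changes, or block with `aws rds wait db-instance-available --db-instance-identifier aspsp-dev-postgres`):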
+
+```powershell
+aws rds describe-db-instances `
+ --db-instance-identifier aspsp-dev-postgres `
+ --query "DBInstances[0].DBInstanceStatus" `
+ --output text
+```
+
+Expected:
+
+```text
+available
+```
+
+### 2. Scale ECS service back to one task
+
+```powershell
+aws ecs update-service `
+ --cluster aws-python-service-platform-dev-cluster `
+ --service aspsp-dev-app-service `
+ --desired-count 1
+```
+
+Verify:
+
+```powershell
+aws ecs describe-services `
+ --cluster aws-python-service-platform-dev-cluster `
+ --services aspsp-dev-app-service `
+ --query "services[0].{Desired:desiredCount,Running:runningCount,Pending:pendingCount}" `
+ --output table
+```
+
+Expected:
+
+```text
+Desired = 1
+Running = 1
+Pending = 0
+```
+
+### 3. Check health endpoint
+
+```powershell
+curl.exe http://aspsp-dev-alb-1213226492.eu-west-2.elb.amazonaws.com/health
+```
+
+Expected:
+
+```json
+{"status":"ok"}
+```
+
+### 4. Verify ECS private networking
+
+Check the ECS service network configuration:
+
+```powershell
+aws ecs describe-services `
+ --cluster aws-python-service-platform-dev-cluster `
+ --services aspsp-dev-app-service `
+ --query "services[0].networkConfiguration.awsvpcConfiguration"
+```
+
+Expected:
+
+```text
+assignPublicIp = DISABLED
+subnets = private app subnet IDs
+```
+
+Check the running task network interface:
+
+```powershell
+$taskArn = aws ecs list-tasks `
+ --cluster aws-python-service-platform-dev-cluster `
+ --service-name aspsp-dev-app-service `
+ --query "taskArns[0]" `
+ --output text
+
+$networkInterfaceId = aws ecs describe-tasks `
+ --cluster aws-python-service-platform-dev-cluster `
+ --tasks $taskArn `
+ --query "tasks[0].attachments[0].details[?name=='networkInterfaceId'].value" `
+ --output text
+
+aws ec2 describe-network-interfaces `
+ --network-interface-ids $networkInterfaceId `
+ --query "NetworkInterfaces[0].{SubnetId:SubnetId,PrivateIp:PrivateIpAddress,PublicIp:Association.PublicIp,Status:Status}" `
+ --output table
+```
+
+Expected:
+
+```text
+PublicIp = None
+SubnetId = private app subnet ID
+```
+
+---
+
+## Manual CD deployment
+
+Manual CD is run from GitHub Actions.
+
+Path:
+
+```text
+GitHub
+-> Actions
+-> Deploy to AWS ECS
+-> Run workflow
+-> Branch: main
+-> Run workflow
+```
+
+The workflow:
+
+```text
+GitHub OIDC
+-> assumes AWS deploy role
+-> builds Docker image
+-> tags image with Git commit SHA and latest
+-> pushes image to ECR
+-> registers new ECS task definition revision
+-> updates ECS service
+-> waits for service stability
+-> checks /health
+```
+
+Verify deployed task definition:
+
+```powershell
+aws ecs describe-services `
+ --cluster aws-python-service-platform-dev-cluster `
+ --services aspsp-dev-app-service `
+ --query "services[0].taskDefinition" `
+ --output text
+```
+
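+Inspect the image (a bare family name resolves to the latest `ACTIVE` revision; pass the task definition ARN returned above instead to inspect the exact revision the service is running):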
+
+```powershell
+aws ecs describe-task-definition `
+ --task-definition aspsp-dev-app-task `
+ --query "taskDefinition.containerDefinitions[0].image" `
+ --output text
+```
+
+Expected: the image reference ends with a Git commit SHA tag, not `latest`.
+
+---
+
+## Run AWS migrations
+
+Use only when the RDS schema needs applying/updating.
+
+From project root:
+
+```powershell
+.\scripts\run-aws-migrations-task.ps1
+```
+
+This starts a one-off ECS/Fargate task that runs:
+
+```text
+python scripts/run_aws_migrations.py
+```
+
+Check logs:
+
+```powershell
+aws logs tail /ecs/aws-python-service-platform-dev-app --since 15m
+```
+
+Expected:
+
+```text
+All migrations completed successfully.
+```
+
+---
+
+## Register or rotate AWS dev agent credential
+
+Use this to create or rotate the AWS dev credential used for MCP smoke tests.
+
+From project root:
+
+```powershell
+.\scripts\register-aws-dev-agent-task.ps1
+```
+
+Then check CloudWatch logs:
+
+```powershell
+aws logs tail /ecs/aws-python-service-platform-dev-app --since 15m
+```
+
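+Copy the newly printed raw dev API key into a local/private location only. The key remains readable in the CloudWatch log stream until those log events expire, so treat read access to this log group as access to the key.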
+
+Do not commit it.
+
+---
+
+## Run deployed MCP smoke tests
+
+Requires a current raw AWS dev agent API key.
+
+Allow-path check:
+
+```powershell
+.\scripts\smoke-aws-docs-tool.ps1 -RawApiKey "AWS_DEV_AGENT_KEY_HERE" -DocumentId "doc1"
+```
+
+Expected:
+
+```text
+Tool response status:
+200
+
+document_id:
+doc1
+```
+
+Deny-path check:
+
+```powershell
+.\scripts\smoke-aws-docs-tool.ps1 -RawApiKey "AWS_DEV_AGENT_KEY_HERE" -DocumentId "doc2"
+```
+
+Expected:
+
+```text
+decision:
+deny
+
+rationale:
+DEFAULT_DENY
+```
+
+---
+
+## Check CloudWatch logs
+
+```powershell
+aws logs tail /ecs/aws-python-service-platform-dev-app --since 15m
+```
+
+Use this for:
+
+- app startup checks
+- migration task output
+- credential registration task output
+- runtime errors
+- ECS task command failures
+
+---
+
+## Check cost drivers
+
+Main ongoing cost drivers while infrastructure exists:
+
+```text
+ALB
+VPC public IPv4
+VPC interface endpoints
+ECS/Fargate tasks when running
+RDS when running
+Secrets Manager
+CloudWatch logs
+ECR image storage
+```
+
+Check monthly cost by service:
+
+```text
+AWS Console
+-> Billing and Cost Management
+-> Cost Explorer
+-> Cost analysis
+-> Group by: Service
+```
+
+Pausing ECS and RDS reduces runtime cost, but ALB, public IPv4, VPC interface endpoints, Secrets Manager, CloudWatch, and ECR storage costs can remain until the infrastructure is destroyed.
+
+---
+
+## Destroy AWS infrastructure
+
+Use only when the live AWS environment is no longer needed.
+
+From `infra/terraform`:
+
+```powershell
+terraform plan -destroy
+```
+
+Review the plan carefully.
+
+Then:
+
+```powershell
+terraform destroy
+```
+
+This removes the Terraform-managed AWS resources.
+
+Do not run this if you still need the live ALB/ECS/RDS environment for testing or demonstration.
+
+---
+
+## Current known development limitations
+
+The current AWS environment is a development/portfolio deployment.
+
+Known non-production limitations:
+
+- HTTP only; HTTPS/ACM is not yet configured.
+- Terraform state is currently local, not remote S3-backed state.
+- RDS migration version tracking is not yet implemented.
+- Dev credential registration is operational-script based, not a production admin workflow.
+- No NAT Gateway is currently deployed; this is acceptable for the current AWS-service-only egress model, but would need revisiting if the app calls external APIs.
+- Post-deploy smoke testing currently checks `/health`; fuller MCP smoke checks remain manual.
\ No newline at end of file
diff --git a/docs/aws_terraform_deployment_sequence.md b/docs/aws_terraform_deployment_sequence.md
index 66d95d5..7055507 100644
--- a/docs/aws_terraform_deployment_sequence.md
+++ b/docs/aws_terraform_deployment_sequence.md
@@ -31,6 +31,7 @@ Current relevant Terraform files:
- `infra/terraform/resources.tf`
- `infra/terraform/network.tf`
- `infra/terraform/security_groups.tf`
+- `infra/terraform/vpc_endpoints.tf`
- `infra/terraform/load_balancer.tf`
- `infra/terraform/rds.tf`
- `infra/terraform/terraform.tfvars.example`
@@ -72,15 +73,21 @@ File purposes:
- private app subnets
- private DB subnets
- public route table
+ - private app route table
- route table associations
- RDS DB subnet group
- `security_groups.tf`
- ALB security group
- app/ECS task security group
+ - AWS service interface endpoint security group
- DB/RDS security group
- security group ingress and egress rules
+- `vpc_endpoints.tf`
+ - interface VPC endpoints for ECR API, ECR Docker registry, CloudWatch Logs, and Secrets Manager
+ - S3 gateway endpoint associated with the private app route table
+
- `load_balancer.tf`
- Application Load Balancer
- target group
@@ -115,6 +122,7 @@ Files:
- `infra/terraform/resources.tf`
- `infra/terraform/network.tf`
- `infra/terraform/security_groups.tf`
+- `infra/terraform/vpc_endpoints.tf`
- `infra/terraform/load_balancer.tf`
- `infra/terraform/rds.tf`
- `infra/terraform/terraform.tfvars.example`
@@ -687,8 +695,10 @@ Completed:
- private DB subnets
- internet gateway
- public route table
+- private app route table
- DB subnet group
- security groups
+- VPC endpoints
- ALB
- target group
- HTTP listener
@@ -723,29 +733,78 @@ Verified smoke-test outcomes:
- `doc1` is public and returns the document body.
- `doc2` is private and returns a denied MCP result with `DEFAULT_DENY`.
-Current intentional dev limitation:
+Current networking posture:
-- ECS app tasks currently run in public subnets with `assignPublicIp=ENABLED`.
-- This avoids adding NAT Gateway or VPC endpoints during the first runnable AWS vertical slice.
-- Inbound access is still controlled through security groups:
- - Internet -> ALB on port `80`
- - ALB -> ECS app task on port `8000`
- - ECS app task -> RDS on port `5432`
+- ECS app tasks run in private app subnets with `assignPublicIp=DISABLED`.
+- Running ECS app tasks have no public IP.
+- ALB nodes remain in public subnets and forward to the private task IPs registered in the target group.
+- Required AWS-service access from private ECS tasks uses VPC endpoints:
+ - interface endpoints for ECR API, ECR Docker registry, CloudWatch Logs, and Secrets Manager
+ - S3 gateway endpoint associated with the private app route table
+- App task egress is restricted to RDS, the AWS service interface endpoint security group, and the S3 endpoint prefix list.
+- No NAT Gateway is currently deployed; this remains deferred until there is a real requirement for general external egress.
Deferred production hardening:
- HTTPS listener with ACM certificate
- optional HTTP `80 -> 443` redirect
-- private app subnets without public task IPs
-- NAT Gateway or VPC endpoints
-- immutable image tags instead of `latest`
- Terraform remote state
- production credential registry/admin process
- migration version table
+- CI-before-deploy safety clarification and deployment guardrails
+- Terraform image tag handling alignment with SHA-based CD
+
+---
+
+## 19. Added private ECS task networking and VPC endpoints
+
+Files:
+
+- `infra/terraform/network.tf`
+- `infra/terraform/security_groups.tf`
+- `infra/terraform/vpc_endpoints.tf`
+- `infra/terraform/ecs_service.tf`
+
+Why:
+
+- ECS/Fargate app tasks should not need public IP addresses.
+- The ALB should remain the public entry point.
+- Private ECS tasks still need AWS-service access for image pulls, runtime secrets, and logs.
+- The project does not currently require general outbound internet access, so VPC endpoints provide narrower, AWS-service-only egress than a NAT Gateway would for this phase.
+
+Implemented changes:
+
+- Added an explicit private app route table for the private app subnets.
+- Added an AWS service interface endpoint security group.
+- Added interface VPC endpoints for:
+ - ECR API
+ - ECR Docker registry
+ - CloudWatch Logs
+ - Secrets Manager
+- Added an S3 gateway endpoint associated with the private app route table.
+- Moved the ECS service network configuration from public subnets to private app subnets.
+- Changed the ECS service network configuration to `assign_public_ip = false`.
+- Replaced broad app HTTPS egress to `0.0.0.0/0` with narrower egress rules:
+ - app task security group to AWS service interface endpoint security group on port `443`
+ - app task security group to the S3 endpoint prefix list on port `443`
+
+Verified result:
+
+- ECS service reports `assignPublicIp = DISABLED`.
+- Running task network interface has no public IP.
+- VPC endpoints are available for ECR API, ECR Docker registry, CloudWatch Logs, Secrets Manager, and S3.
+- `/health` through the ALB returns `{"status":"ok"}` after the change.
+- Terraform plan is clean after apply.
+
+Important distinction:
+
+- The Internet Gateway remains because the ALB is public.
+- No NAT Gateway is currently used.
+- Private ECS task AWS-service access is through VPC endpoint resources, not through public task IPs.
---
-## 19. Implemented AWS runtime mapping
+## 20. Implemented AWS runtime mapping
Local Docker Compose mapping:
@@ -760,8 +819,9 @@ AWS mapping now implemented:
- ECS task definition injects secret values from Secrets Manager.
- RDS supplies the database endpoint.
- ECS service keeps the app task running.
-- ECS registers Fargate task IPs with the ALB target group.
-- ALB forwards public traffic to healthy ECS app tasks.
+- ECS registers private Fargate task IPs with the ALB target group.
+- ALB forwards public traffic to healthy private ECS app tasks.
+- Private ECS tasks use VPC endpoints for ECR image pulls, CloudWatch Logs, Secrets Manager, and S3-backed ECR image layers.
- CloudWatch receives app logs.
- One-off ECS tasks run operational scripts inside the same AWS runtime boundary.
@@ -778,7 +838,7 @@ The application settings code continues reading the same variable names. The dep
---
-## 20. Local-to-AWS environment mapping
+## 21. Local-to-AWS environment mapping
Local app configuration:
@@ -839,7 +899,7 @@ Example smoke checks:
---
-## 21. Why so many explicit resources are required
+## 22. Why so many explicit resources are required
AWS does not infer the runtime wiring automatically.
diff --git a/docs/production_secret_and_state_security.md b/docs/production_secret_and_state_security.md
index a270d46..ae58952 100644
--- a/docs/production_secret_and_state_security.md
+++ b/docs/production_secret_and_state_security.md
@@ -17,7 +17,7 @@ This keeps the infrastructure-as-code layer separate from the sensitive runtime
Terraform should be responsible for:
-- VPC, subnets, route tables, and security groups
+- VPC, subnets, route tables, security groups, and VPC endpoints
- ECR repositories
- ECS clusters, services, and task definitions
- ALB, listeners, and target groups
@@ -25,6 +25,7 @@ Terraform should be responsible for:
- IAM roles and permissions
- CloudWatch log groups
- references to existing Secrets Manager secrets
+- private AWS-service access paths needed by ECS tasks
Terraform should not be responsible for:
@@ -275,6 +276,7 @@ Example permission intent:
Allow ECS task execution role to read:
DB password secret
agent credential hash secret
+```
It should not have broad access such as:
diff --git a/infra/terraform/ecs_service.tf b/infra/terraform/ecs_service.tf
index ddfc4c0..901237e 100644
--- a/infra/terraform/ecs_service.tf
+++ b/infra/terraform/ecs_service.tf
@@ -9,17 +9,24 @@ resource "aws_ecs_service" "app_service" {
health_check_grace_period_seconds = 60
+ lifecycle {
+ ignore_changes = [
+ task_definition,
+ desired_count,
+ ]
+ }
+
network_configuration {
subnets = [
- aws_subnet.public_a.id,
- aws_subnet.public_b.id,
+ aws_subnet.private_app_a.id,
+ aws_subnet.private_app_b.id,
]
security_groups = [
aws_security_group.app.id,
]
- assign_public_ip = true
+ assign_public_ip = false
}
load_balancer {
diff --git a/infra/terraform/network.tf b/infra/terraform/network.tf
index 1bf80ea..1dfd01f 100644
--- a/infra/terraform/network.tf
+++ b/infra/terraform/network.tf
@@ -108,6 +108,24 @@ resource "aws_route_table_association" "public_b" {
route_table_id = aws_route_table.public.id
}
+resource "aws_route_table" "private_app_subnets" {
+ vpc_id = aws_vpc.app.id
+
+ tags = merge(local.common_tags, {
+ Name = "${local.name_prefix}-private-app-rt"
+ })
+}
+
+resource "aws_route_table_association" "private_app_a_to_private_app_route_table" {
+ subnet_id = aws_subnet.private_app_a.id
+ route_table_id = aws_route_table.private_app_subnets.id
+}
+
+resource "aws_route_table_association" "private_app_b_to_private_app_route_table" {
+ subnet_id = aws_subnet.private_app_b.id
+ route_table_id = aws_route_table.private_app_subnets.id
+}
+
resource "aws_db_subnet_group" "app" {
name = "${local.name_prefix}-db-subnet-group"
diff --git a/infra/terraform/security_groups.tf b/infra/terraform/security_groups.tf
index 280d467..306e888 100644
--- a/infra/terraform/security_groups.tf
+++ b/infra/terraform/security_groups.tf
@@ -28,6 +28,26 @@ resource "aws_security_group" "db" {
})
}
+resource "aws_security_group" "aws_service_interface_endpoints" { # SG attached to the interface VPC endpoints; no inline rules, rules are separate resources
+ name = "${local.name_prefix}-aws-service-endpoints-sg"
+ description = "Allow private app tasks to reach AWS service interface endpoints."
+ vpc_id = aws_vpc.app.id
+
+ tags = merge(local.common_tags, {
+ Name = "${local.name_prefix}-aws-service-endpoints-sg"
+ })
+}
+
+resource "aws_vpc_security_group_ingress_rule" "aws_service_interface_endpoints_from_app_tasks" { # ingress on the endpoint SG
+ security_group_id = aws_security_group.aws_service_interface_endpoints.id
+ description = "Allow app tasks to connect to AWS service interface endpoints over HTTPS."
+
+ referenced_security_group_id = aws_security_group.app.id # source is the app task SG rather than a CIDR range
+ ip_protocol = "tcp"
+ from_port = 443 # single port: from_port and to_port are both 443
+ to_port = 443
+}
+
resource "aws_vpc_security_group_ingress_rule" "alb_http_from_internet" {
security_group_id = aws_security_group.alb.id
description = "Allow HTTP traffic from the internet to the ALB."
@@ -78,12 +98,22 @@ resource "aws_vpc_security_group_ingress_rule" "db_from_app" {
to_port = 5432
}
-resource "aws_vpc_security_group_egress_rule" "app_to_https_internet" {
+resource "aws_vpc_security_group_egress_rule" "app_to_aws_service_interface_endpoints" { # replaces the former 0.0.0.0/0 HTTPS egress rule
security_group_id = aws_security_group.app.id
- description = "Allow app tasks to reach AWS service endpoints for ECR, Secrets Manager, and CloudWatch Logs."
+ description = "Allow app tasks to reach AWS service interface endpoints over HTTPS."
- cidr_ipv4 = "0.0.0.0/0"
- ip_protocol = "tcp"
- from_port = 443
- to_port = 443
+ referenced_security_group_id = aws_security_group.aws_service_interface_endpoints.id # destination is the endpoint SG instead of a CIDR range
+ ip_protocol = "tcp"
+ from_port = 443 # single port: from_port and to_port are both 443
+ to_port = 443
+}
+
+resource "aws_vpc_security_group_egress_rule" "app_to_s3_gateway_endpoint" { # egress on the app task SG toward S3
+ security_group_id = aws_security_group.app.id
+ description = "Allow app tasks to reach S3 through the private app S3 gateway endpoint."
+
+ prefix_list_id = aws_vpc_endpoint.s3_gateway_for_private_app_route_table.prefix_list_id # destination is the prefix list exported by the S3 gateway endpoint
+ ip_protocol = "tcp"
+ from_port = 443 # single port: from_port and to_port are both 443
+ to_port = 443
}
\ No newline at end of file
diff --git a/infra/terraform/vpc_endpoints.tf b/infra/terraform/vpc_endpoints.tf
new file mode 100644
index 0000000..d7d7979
--- /dev/null
+++ b/infra/terraform/vpc_endpoints.tf
@@ -0,0 +1,95 @@
+data "aws_region" "current" {} # its region attribute is interpolated into every endpoint service_name in this file
+
+resource "aws_vpc_endpoint" "ecr_api_interface" { # Interface endpoint for the regional ECR API service
+ vpc_id = aws_vpc.app.id
+ service_name = "com.amazonaws.${data.aws_region.current.region}.ecr.api"
+ vpc_endpoint_type = "Interface"
+ private_dns_enabled = true # NOTE(review): presumably relies on DNS support/hostnames being enabled on aws_vpc.app - confirm
+
+ subnet_ids = [ # the private app subnets that the ECS service network_configuration also uses
+ aws_subnet.private_app_a.id,
+ aws_subnet.private_app_b.id,
+ ]
+
+ security_group_ids = [ # shared endpoint SG; its ingress rule admits TCP 443 from the app task SG
+ aws_security_group.aws_service_interface_endpoints.id,
+ ]
+
+ tags = merge(local.common_tags, {
+ Name = "${local.name_prefix}-ecr-api-endpoint"
+ })
+}
+
+resource "aws_vpc_endpoint" "ecr_docker_registry_interface" { # Interface endpoint for the regional ECR Docker registry service
+ vpc_id = aws_vpc.app.id
+ service_name = "com.amazonaws.${data.aws_region.current.region}.ecr.dkr"
+ vpc_endpoint_type = "Interface"
+ private_dns_enabled = true # NOTE(review): presumably relies on DNS support/hostnames being enabled on aws_vpc.app - confirm
+
+ subnet_ids = [ # the private app subnets that the ECS service network_configuration also uses
+ aws_subnet.private_app_a.id,
+ aws_subnet.private_app_b.id,
+ ]
+
+ security_group_ids = [ # shared endpoint SG; its ingress rule admits TCP 443 from the app task SG
+ aws_security_group.aws_service_interface_endpoints.id,
+ ]
+
+ tags = merge(local.common_tags, {
+ Name = "${local.name_prefix}-ecr-dkr-endpoint"
+ })
+}
+
+resource "aws_vpc_endpoint" "cloudwatch_logs_interface" { # Interface endpoint for the regional CloudWatch Logs service
+ vpc_id = aws_vpc.app.id
+ service_name = "com.amazonaws.${data.aws_region.current.region}.logs"
+ vpc_endpoint_type = "Interface"
+ private_dns_enabled = true # NOTE(review): presumably relies on DNS support/hostnames being enabled on aws_vpc.app - confirm
+
+ subnet_ids = [ # the private app subnets that the ECS service network_configuration also uses
+ aws_subnet.private_app_a.id,
+ aws_subnet.private_app_b.id,
+ ]
+
+ security_group_ids = [ # shared endpoint SG; its ingress rule admits TCP 443 from the app task SG
+ aws_security_group.aws_service_interface_endpoints.id,
+ ]
+
+ tags = merge(local.common_tags, {
+ Name = "${local.name_prefix}-logs-endpoint"
+ })
+}
+
+resource "aws_vpc_endpoint" "secrets_manager_interface" { # Interface endpoint for the regional Secrets Manager service
+ vpc_id = aws_vpc.app.id
+ service_name = "com.amazonaws.${data.aws_region.current.region}.secretsmanager"
+ vpc_endpoint_type = "Interface"
+ private_dns_enabled = true # NOTE(review): presumably relies on DNS support/hostnames being enabled on aws_vpc.app - confirm
+
+ subnet_ids = [ # the private app subnets that the ECS service network_configuration also uses
+ aws_subnet.private_app_a.id,
+ aws_subnet.private_app_b.id,
+ ]
+
+ security_group_ids = [ # shared endpoint SG; its ingress rule admits TCP 443 from the app task SG
+ aws_security_group.aws_service_interface_endpoints.id,
+ ]
+
+ tags = merge(local.common_tags, {
+ Name = "${local.name_prefix}-secretsmanager-endpoint"
+ })
+}
+
+resource "aws_vpc_endpoint" "s3_gateway_for_private_app_route_table" { # Gateway endpoint for regional S3; its prefix_list_id feeds the app SG S3 egress rule
+ vpc_id = aws_vpc.app.id
+ service_name = "com.amazonaws.${data.aws_region.current.region}.s3"
+ vpc_endpoint_type = "Gateway" # attached through route tables below; no subnet_ids or security_group_ids are set
+
+ route_table_ids = [ # only the private app route table is associated
+ aws_route_table.private_app_subnets.id,
+ ]
+
+ tags = merge(local.common_tags, {
+ Name = "${local.name_prefix}-s3-gateway-endpoint"
+ })
+}
\ No newline at end of file