diff --git a/.github/workflows/k8s-challenger-lint.yml b/.github/workflows/k8s-challenger-lint.yml new file mode 100644 index 0000000..47f26ad --- /dev/null +++ b/.github/workflows/k8s-challenger-lint.yml @@ -0,0 +1,39 @@ +name: k8s-challenger-lint + +on: + push: + branches: [main] + paths: + - "kubernetes/**" + - ".github/workflows/k8s-challenger-lint.yml" + pull_request: + branches: [main] + paths: + - "kubernetes/**" + +jobs: + lint: + runs-on: ubuntu-latest + name: Lint k8s_challenger Python code + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.14" + cache: "pip" + cache-dependency-path: "kubernetes/pyproject.toml" + + - name: Install dependencies + working-directory: kubernetes + run: pip install -e ".[dev]" + + - name: Lint with Ruff + working-directory: kubernetes + run: ruff check cli/ + + - name: Type check with ty + working-directory: kubernetes + run: ty check cli/ \ No newline at end of file diff --git a/kubernetes/.gitignore b/kubernetes/.gitignore new file mode 100644 index 0000000..6158b5b --- /dev/null +++ b/kubernetes/.gitignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg-info/ +dist/ +build/ +*.egg + +# Virtual environments +venv/ +env/ +ENV/ +.venv + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Testing and linting +.pytest_cache/ +.coverage +.mypy_cache/ +.ruff_cache/ +htmlcov/ + +# Docker volumes and runtime +kubeconfig/ +*.log + +# Environment files +.env +.env.local + +# Temporary files +*.tmp +*.bak \ No newline at end of file diff --git a/kubernetes/CONTRIBUTING.md b/kubernetes/CONTRIBUTING.md new file mode 100644 index 0000000..e802e1c --- /dev/null +++ b/kubernetes/CONTRIBUTING.md @@ -0,0 +1,261 @@ +# Contributing to Flagsmith Interview Challenges + +This guide explains how to create, validate, and maintain interview challenges for the Flagsmith challenge system. 
+ +## Requirements + +Every challenge **MUST** consist of: + +1. **Challenge Definition** (`{name}.yaml`) + - Name, description, estimates, helm values, kubectl patches + +2. **Reviewer Manual** (`{name}.md`) + - Step-by-step solution guide for the interviewer + - Educational learning points + - Debugging techniques and prevention strategies + +3. **Required YAML Fields** + ```yaml + name: "Challenge Title" + description: "Candidate must [action] to fix [issue] causing [symptom]" + estimates: + Lv1: # Bronze: 0-2 years K8s experience + Lv2: # Silver: 2-4 years K8s experience + Lv3: # Gold: 4+ years K8s experience + helm_values: {} # Helm chart configuration + kubectl_patches: [] # Optional kubectl JSON patches + ``` + +All fields are mandatory. The system will fail immediately if any required field is missing or if the corresponding `.md` manual doesn't exist. + +## Creating a Challenge + +### Step 1: Define the Challenge + +Create a new YAML file in `challenges/` with your challenge definition. + +**Requirements:** +- Filename must be kebab-case: `my-challenge.yaml` +- Must include all required fields +- `name` should be descriptive and clear +- `description` should follow the pattern: "Candidate must [action] to fix [issue] causing [symptom]" +- Time estimates should be realistic integers (not `∞` unless unsuitable for that level) + +**Example:** +```yaml +name: "Database Connection Timeout" +description: "Candidate must identify and fix connection pool exhaustion causing timeout errors." 
+estimates: + Lv1: 30 + Lv2: 15 + Lv3: 8 + +helm_values: + api: + replicaCount: 1 + resources: + limits: + cpu: "100m" + memory: "256Mi" + +kubectl_patches: + - resource: "deployment" + name: "flagsmith-api" + namespace: "flagsmith" + patch_type: "json" + patch: + - op: "add" + path: "/spec/template/spec/containers/0/env/-" + value: + name: "DB_POOL_SIZE" + value: "1" +``` + +### Step 2: Write the Reviewer Manual + +Create a corresponding `.md` file with the same name: `my-challenge.md` + +**Required sections:** + +1. **Problem Description** - What's broken and why +2. **Step-by-Step Solution** - Each debugging step with commands and expected output +3. **Key Learning Points** - Educational objectives (5-7 bullet points) +4. **Prevention** - How to avoid this in production +5. **Additional Debugging Commands** - Extra tools and techniques (optional but recommended) + +**Structure Example:** +```markdown +# Challenge: Database Connection Timeout - Solution + +## Problem Description +[Explain what's broken and the root cause] + +## Step-by-Step Solution + +### Step 1: Identify the Problem +[Commands and expected symptoms] + +### Step 2: Investigate the Configuration +[Debugging commands with output] + +[Continue with numbered steps...] + +## Key Learning Points + +1. [Educational point 1] +2. [Educational point 2] +[etc...] + +## Prevention + +- [Prevention strategy 1] +- [Prevention strategy 2] + +## Additional Debugging Commands + +[Optional reference commands] +``` + +### Step 3: Estimate Time Accurately + +Time estimates guide interviewer expectations and candidate assessment. 
+ +**Guidelines:** +- **Lv1 (Bronze)**: 0-2 years K8s experience, may need to Google commands +- **Lv2 (Silver)**: 2-4 years, solid troubleshooting, familiar with tools +- **Lv3 (Gold)**: 4+ years, senior/staff level, can diagnose quickly + +**Mark as Unsuitable:** +Use `"∞"` (infinity symbol) for Lv1 if the challenge is unsuitable for beginners: +```yaml +estimates: + Lv1: "∞" # Senior-level challenge, unsuitable for beginners + Lv2: 20 + Lv3: 9 +``` + +**Estimation Guidelines:** +- Simple challenge (e.g., single configuration error): 15-25 minutes for Lv1 +- Moderate challenge (multi-step debugging): 30-45 minutes for Lv1 +- Complex challenge (multiple concurrent issues): 45-60 minutes for Lv1, or mark as `"∞"` +- Senior-level candidates (Lv3): Typically complete challenges in 5-15 minutes + +## Validation Checklist + +Before submitting a challenge: + +- [ ] Both files exist: `challenges/my-challenge.yaml` and `challenges/my-challenge.md` +- [ ] YAML contains all required fields: `name`, `description`, `estimates`, `helm_values`, `kubectl_patches` +- [ ] Time estimates are positive integers or `"∞"` where appropriate +- [ ] Helm values deploy successfully +- [ ] kubectl patches use valid JSON syntax +- [ ] Challenge deployment is actually broken (health check fails) +- [ ] Solution manual is technically accurate and matches the challenge +- [ ] Time estimates are realistic (verified through manual walkthrough) + +## Testing Your Challenge + +Run the system and test your challenge: + +```bash +make run +``` + +Then: +1. Select your challenge from the menu +2. Verify it deploys successfully +3. Confirm the challenge is actually broken (health check fails) +4. Work through the solution steps in your manual +5. Verify time estimates are realistic for each skill level +6. 
Confirm the health endpoint returns 200 after your fixes + +## Code Quality + +All Python code must pass linting and type checking: + +```bash +ruff check cli/ +ty check cli/ +``` + +Challenge files are discovered automatically. No code changes are needed when adding new challenges. + +## Challenge Consistency + +### Naming Conventions +- Use kebab-case for filenames: `network-issue.yaml`, not `NetworkIssue.yaml` +- Use Title Case for `name` field in YAML +- Keep names concise but descriptive + +### Description Format +Follow this pattern: +> "Candidate must **[action]** to fix **[issue]** causing **[symptom]**." + +**Good examples:** +- "Candidate must identify and fix a typo in the database service name causing connection failures." +- "Candidate must remove a CPU-intensive sidecar container causing API performance issues." +- "Candidate must adjust memory limits that are set too low causing OOMKilled pods." + +**Bad examples:** +- "There's a broken pod" (vague) +- "Fix this" (unclear) +- "Challenge about networking" (not action-oriented) + +### Patch Format +Use JSON patches with proper escaping for special characters: +```yaml +# Correct: Use ~1 for forward slash in JSON pointer +path: "/spec/selector/app.kubernetes.io~1name" + +# Wrong: Don't use literal forward slash +path: "/spec/selector/app.kubernetes.io/name" +``` + +## Educational Standards + +### What Makes a Good Challenge? 
+ +**Characteristics of effective challenges:** +- Tests real-world debugging skills and systematic troubleshooting +- Clear root cause that candidates can identify through investigation +- Teaches production-relevant concepts and best practices +- Multiple valid approaches to reach the solution +- Objective success criteria (health endpoint returns HTTP 200) + +**What to avoid:** +- Ambiguous symptoms that could indicate multiple unrelated issues +- Contrived scenarios that wouldn't occur in real environments +- Challenges requiring deep Flagsmith application knowledge (focus on Kubernetes) +- Overly trivial fixes (unless teaching a specific lesson) +- Subjective or unclear completion criteria + +### Learning Progression + +Design challenges across a spectrum of difficulty: + +1. **Beginner-friendly (Lv1)**: Single, well-defined issue with clear symptoms +2. **Intermediate (Lv2)**: Multiple related issues requiring systematic debugging +3. **Advanced (Lv3)**: Complex scenarios with concurrent problems, demanding deep expertise + +Each difficulty level should introduce new concepts or techniques, not just expect faster execution. + +## Maintenance + +### When to Update Challenges + +Update challenges when: +- Time estimates prove inaccurate based on candidate performance +- Debugging techniques or tools change +- New Kubernetes best practices emerge +- Technical inaccuracies are discovered in the solution + +### When to Mark as Unsuitable + +Use `"∞"` for specific levels when: +- New tooling makes the challenge trivial for that skill level +- The scenario no longer reflects realistic production issues +- The educational value no longer justifies the time investment + +## Questions? + +Review existing challenges in the `challenges/` directory for working examples. The system enforces these requirements to maintain consistency and educational quality across all challenges. 
\ No newline at end of file diff --git a/kubernetes/Makefile b/kubernetes/Makefile new file mode 100644 index 0000000..2756126 --- /dev/null +++ b/kubernetes/Makefile @@ -0,0 +1,29 @@ +.PHONY: run help clean + +# Default target +help: + @echo "Available commands:" + @echo " make run - Start the Flagsmith interview challenge system" + @echo " make clean - Clean up Docker resources" + @echo " make help - Show this help message" + +# Clean up project resources +clean: + @echo "🧹 Cleaning up k8s_challenger resources..." + -docker-compose down --volumes --remove-orphans + -docker ps -a --filter "name=k8s_challenger" --format "{{.ID}}" | xargs -r docker rm -f + -docker volume ls --filter "name=k8s_challenger" --format "{{.Name}}" | xargs -r docker volume rm + @echo "✅ Cleanup complete" + +# Start the interview system with cleanup before and after +run: clean + @echo "🚀 Starting Flagsmith Interview Challenge System..." + # Build images + docker-compose build cli candidate-env + # Start k3s in background + docker-compose up -d k3s + # Run CLI interactively + docker-compose run --rm cli + # Cleanup when done + $(MAKE) clean + @echo "✅ Session complete" \ No newline at end of file diff --git a/kubernetes/README.md b/kubernetes/README.md new file mode 100644 index 0000000..a67eed4 --- /dev/null +++ b/kubernetes/README.md @@ -0,0 +1,154 @@ +# Flagsmith Interview Challenge System + +A technical interview system that creates broken Flagsmith deployments for infrastructure debugging challenges. + +## Overview + +This system provides a self-contained Docker environment where candidates can debug real Flagsmith infrastructure issues. The interviewer selects a challenge, and the system creates a broken Kubernetes deployment that the candidate must fix using standard debugging tools. + +## Requirements + +- Docker (with Docker Compose) + +## Quick Start + +1. **Clone and start the system:** + ```bash + git clone + cd flagsmith-interviews + make run + ``` + +2. 
**Select a challenge** from the interactive menu with time estimates + +3. **Share the tmate session URL** with the candidate + +4. **Monitor progress** via the CLI health checks and timer + +5. **End the session** with Ctrl+C when complete + +## CLI Demo + +The interviewer interface guides you through the complete workflow with three main screens: + +### 1. Challenge Selection +![Challenge Selector](docs/cli-challenge-selector.png) + +Select from available debugging scenarios with time estimates for different skill levels. + +### 2. Session Management +![Session Started](docs/cli-session-started.png) + +Monitor the candidate session with real-time health checks and connection details. + +### 3. Challenge Completion +![Challenge Complete](docs/cli-challenge-complete.png) + +Track progress and verify successful resolution of the infrastructure issue. + +**Key Features:** +- **Time Estimates**: Bronze/Silver/Gold time targets for performance benchmarking +- **Real-time Monitoring**: Health checks and session status during candidate sessions +- **Automatic Management**: Handles deployment, monitoring, and cleanup + +## Architecture + +The system consists of three main components: + +- **CLI Container**: Python application for challenge management and monitoring +- **K3s Cluster**: Lightweight Kubernetes cluster running Flagsmith +- **Candidate Environment**: tmate session with kubectl, helm, and debugging tools + +## Available Challenges + +The system automatically discovers challenges from YAML files in the `challenges/` directory. + +For more information on which challenges are available, please refer to that directory, or run the program. + +## Contributing Challenges + +**See [CONTRIBUTING.md](./CONTRIBUTING.md) for complete guidelines on creating challenges.** + +### Quick Overview + +Each challenge requires two files: + +1. 
**Challenge Definition** (`{name}.yaml`) + - Defines the broken Kubernetes setup using Helm values and kubectl patches + - Required fields: `name`, `description`, `estimates`, `helm_values`, `kubectl_patches` + +2. **Reviewer Manual** (`{name}.md`) + - Step-by-step solution guide for interviewers + - Educational learning points and prevention strategies + - Must exist alongside every challenge YAML file + +### Requirements + +- Both YAML and markdown files must be present +- All YAML fields are required (no optional fields) +- Time estimates should be realistic integers, or `"∞"` when unsuitable for a skill level +- Manuals are automatically referenced when challenges are selected +- The system will fail immediately if required files or fields are missing + +See [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed guidelines on naming conventions, validation, testing, and educational standards. + +## Candidate Experience + +### Available Tools +- `kubectl` - Kubernetes CLI +- `helm` - Helm package manager +- `curl` - HTTP client for testing +- `jq` - JSON processor +- `nano`/`vim` - Text editors +- Standard Unix tools (`grep`, `awk`, `sed`, etc.) + +### Typical Workflow +1. **Connect** to the provided tmate session +2. **Explore** the cluster: `kubectl get pods -n flagsmith` +3. **Investigate** issues: `kubectl logs <pod-name>`, `kubectl describe pod <pod-name>` +4. **Identify** the root cause through systematic debugging +5. **Fix** the configuration: `kubectl edit deployment flagsmith-api` +6. **Verify** health: `kubectl port-forward svc/flagsmith-api 8000:8000` + +### Success Criteria +The Flagsmith API should respond with HTTP 200 on the `/health/liveness/` endpoint. 
+ +## Commands + +```bash +make run # Start the interview system (includes automatic cleanup) +``` + +## Development + +### Project Structure +``` +├── cli/ # Python CLI application with health monitoring +├── candidate-env/ # Candidate debugging environment with tmate +├── challenges/ # Challenge YAML files with solutions +├── docker-compose.yml # Service orchestration +├── Makefile # Convenience commands +└── pyproject.toml # Python dependencies +``` + +### Architecture Notes +- K3s runs in Docker with kubeconfig shared via volumes +- CLI container has Docker socket access for candidate container management +- Candidate environment is ephemeral per challenge +- Health checks use kubectl port-forward for reliability +- Challenges use Helm values + optional kubectl patches + +## Performance + +- K3s startup: ~30 seconds +- Flagsmith deployment: ~2-3 minutes +- Challenge setup: ~1 minute +- **Total time to candidate access: ~4 minutes** + +## Security + +- Candidates have full kubectl access within the isolated cluster +- No access to host system or Docker daemon +- Network traffic isolated within Docker network +- Sessions are ephemeral and automatically cleaned up +- Each challenge runs in a fresh environment diff --git a/kubernetes/candidate-env/Dockerfile b/kubernetes/candidate-env/Dockerfile new file mode 100644 index 0000000..442fc6d --- /dev/null +++ b/kubernetes/candidate-env/Dockerfile @@ -0,0 +1,67 @@ +FROM alpine:3.21 + +# ============================================================================== +# ROOT OPERATIONS - Install packages and system setup +# ============================================================================== + +# Install system dependencies and tools from Alpine repos +RUN apk add --no-cache \ + bash \ + curl \ + wget \ + git \ + vim \ + nano \ + jq \ + netcat-openbsd \ + bind-tools \ + iputils \ + tree \ + htop \ + less \ + openssh-client \ + ca-certificates \ + kubectl \ + helm \ + sudo \ + shadow + +# TODO: Use OS repos when 
tmate becomes available in stable Alpine packages +# Install tmate (currently only available in testing, so use direct download) +RUN curl -L https://github.com/tmate-io/tmate/releases/download/2.4.0/tmate-2.4.0-static-linux-amd64.tar.xz \ + | tar -xJ -C /tmp \ + && mv /tmp/tmate-2.4.0-static-linux-amd64/tmate /usr/local/bin/tmate \ + && rm -rf /tmp/tmate-* \ + && chmod +x /usr/local/bin/tmate + +# Create candidate user and configure sudo access +RUN adduser -D -s /bin/bash candidate \ + && echo 'candidate ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +# Copy system scripts and make them executable +COPY --chown=root:root start-session.sh /opt/ +RUN chmod +x /opt/start-session.sh + +# ============================================================================== +# CANDIDATE USER OPERATIONS - User environment and files +# ============================================================================== + +# Switch to candidate user for all remaining operations +USER candidate +WORKDIR /home/candidate + +# Generate SSH key for tmate +RUN mkdir -p ~/.ssh \ + && ssh-keygen -t rsa -b 2048 -f ~/.ssh/id_rsa -N "" \ + && chmod 700 ~/.ssh \ + && chmod 600 ~/.ssh/id_rsa* + +# Copy challenge files directly to home directory +COPY --chown=candidate:candidate challenge-files/ ./ + +# Copy and run candidate environment setup +COPY --chown=candidate:candidate setup-environment.sh /tmp/ +RUN chmod +x /tmp/setup-environment.sh && /tmp/setup-environment.sh && rm /tmp/setup-environment.sh + +# Start the candidate session - this is the main purpose of this container +CMD ["/opt/start-session.sh"] \ No newline at end of file diff --git a/kubernetes/candidate-env/challenge-files/challenge-guide.md b/kubernetes/candidate-env/challenge-files/challenge-guide.md new file mode 100644 index 0000000..05c90ca --- /dev/null +++ b/kubernetes/candidate-env/challenge-files/challenge-guide.md @@ -0,0 +1,40 @@ +# Flagsmith Interview Challenge + +## Your Task +Debug and fix the broken Flagsmith deployment 
in this Kubernetes cluster. + +**Important:** This is a production environment scenario. You must diagnose and fix the issue using kubectl and standard debugging techniques. Redeploying the entire application from scratch is not an option. + +## Available Tools +- `kubectl` - Kubernetes CLI +- `helm` - Helm package manager +- `curl` - HTTP testing +- Standard Unix tools (grep, awk, sed, etc.) + +## Quick Start Commands +```bash +# Check overall status +kubectl get pods -n flagsmith +kubectl get svc -n flagsmith + +# View pod logs +kubectl logs <pod-name> -n flagsmith + +# Check health endpoint +kubectl port-forward svc/flagsmith-api 8000:8000 -n flagsmith +curl http://localhost:8000/health/liveness/ + +# Manual exploration +kubectl get pods -n flagsmith +kubectl describe pod <pod-name> -n flagsmith +helm list -A +``` + +## Key Files +- `~/challenge-guide.md` - This guide (you're reading it now) +- `~/kubectl-cheatsheet.md` - Kubernetes debugging commands reference + +## Success Criteria +The Flagsmith API should respond with HTTP 200 on the `/health/liveness/` endpoint. + +Good luck! 
🚀 diff --git a/kubernetes/candidate-env/challenge-files/kubectl-cheatsheet.md b/kubernetes/candidate-env/challenge-files/kubectl-cheatsheet.md new file mode 100644 index 0000000..f409121 --- /dev/null +++ b/kubernetes/candidate-env/challenge-files/kubectl-cheatsheet.md @@ -0,0 +1,52 @@ +# Kubernetes Debugging Cheatsheet + +## Pod Investigation +```bash +kubectl get pods -n flagsmith # List pods +kubectl describe pod <pod-name> -n flagsmith # Pod details +kubectl logs <pod-name> -n flagsmith # Current logs +kubectl logs <pod-name> -n flagsmith --previous # Previous container logs +kubectl logs <pod-name> -n flagsmith --tail=50 # Last 50 lines +kubectl exec -it <pod-name> -n flagsmith -- /bin/sh # Shell into pod +``` + +## Service & Networking +```bash +kubectl get svc -n flagsmith # List services +kubectl describe svc <service-name> -n flagsmith # Service details +kubectl get endpoints -n flagsmith # Service endpoints +kubectl port-forward svc/<service-name> 8080:8000 -n flagsmith # Port forward +``` + +## Helm Operations +```bash +helm list -A # All releases +helm status flagsmith -n flagsmith # Release status +helm get values flagsmith -n flagsmith # Current values +helm get manifest flagsmith -n flagsmith # Generated manifests +``` + +## Resource Investigation +```bash +kubectl get events -n flagsmith --sort-by='.lastTimestamp' # Recent events +kubectl top pods -n flagsmith # Resource usage +kubectl get pvc -n flagsmith # Persistent volumes +kubectl get configmaps -n flagsmith # Config maps +kubectl get secrets -n flagsmith # Secrets +``` + +## Troubleshooting Common Issues +```bash +# Check resource limits +kubectl describe pod <pod-name> -n flagsmith | grep -A 5 "Limits\|Requests" + +# Check if image can be pulled +kubectl describe pod <pod-name> -n flagsmith | grep -A 5 "Events" + +# Check service selector matches pod labels +kubectl get pods -n flagsmith --show-labels +kubectl describe svc <service-name> -n flagsmith + +# Test connectivity between pods +kubectl run debug --image=busybox --rm -it --restart=Never -- nslookup <service-name>.<namespace>
+``` diff --git a/kubernetes/candidate-env/setup-environment.sh b/kubernetes/candidate-env/setup-environment.sh new file mode 100644 index 0000000..7521d1e --- /dev/null +++ b/kubernetes/candidate-env/setup-environment.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Setup minimal environment for the candidate debugging session + +# Set up minimal environment +cat >> ~/.bashrc << 'EOF' +# Set default namespace for convenience +export KUBECONFIG=~/.kube/config + +# Set editor +export EDITOR=nano +EOF + +# Create a simple .nanorc for better editing experience +cat > ~/.nanorc << 'EOF' +set tabsize 2 +set autoindent +set linenumbers +set mouse +EOF + +echo "✅ Candidate environment setup completed" \ No newline at end of file diff --git a/kubernetes/candidate-env/start-session.sh b/kubernetes/candidate-env/start-session.sh new file mode 100644 index 0000000..68e9534 --- /dev/null +++ b/kubernetes/candidate-env/start-session.sh @@ -0,0 +1,139 @@ +#!/bin/bash +# This script runs as root initially to do system setup, then switches to candidate user +set -e + +echo "🚀 Starting Flagsmith Interview Challenge System..." + +# ============================================================================== +# ROOT SETUP - Runtime system configuration +# ============================================================================== + +echo "📁 Setting up tmate info directory..." +mkdir -p /tmp/tmate-info +chown candidate:candidate /tmp/tmate-info + +echo "🔧 Setting up kubeconfig..." 
+ +# Setup kubectl config for candidate user +mkdir -p /home/candidate/.kube + +# Find and copy kubeconfig +if [ -f "/home/candidate/.kube-shared/kubeconfig.yaml" ]; then + # Use shared kubeconfig from k3s + cp /home/candidate/.kube-shared/kubeconfig.yaml /home/candidate/.kube/config +elif [ -f "/etc/rancher/k3s/k3s.yaml" ]; then + # Fallback to direct k3s config + cp /etc/rancher/k3s/k3s.yaml /home/candidate/.kube/config +else + echo "❌ Kubeconfig not found in expected locations" + exit 1 +fi + +# Set proper ownership and permissions +chown candidate:candidate /home/candidate/.kube/config +chmod 600 /home/candidate/.kube/config + +# Update server address to point to k3s-server container +sed -i 's/127.0.0.1:6443/k3s-server:6443/g' /home/candidate/.kube/config +sed -i 's/localhost:6443/k3s-server:6443/g' /home/candidate/.kube/config +sed -i 's|server: https://k3s:6443|server: https://k3s-server:6443|g' /home/candidate/.kube/config + +# Verify the replacement worked +echo "📋 Kubeconfig server URL: $(grep 'server:' /home/candidate/.kube/config)" + +echo "✅ Kubeconfig setup complete" + +# Quick connectivity test (should work since k3s is healthy) +if kubectl cluster-info >/dev/null 2>&1; then + echo "✅ Kubernetes cluster connectivity verified" +else + echo "⚠️ Kubernetes cluster not immediately ready (may need a moment)" +fi + +# ============================================================================== +# CANDIDATE SESSION - Switch to candidate user and start tmate +# ============================================================================== + +# Switch to candidate user for the actual tmate session +echo "👤 Starting tmate session as candidate user..." + +# Create a temporary script to avoid nested quote issues +cat > /tmp/tmate_session.sh << 'SCRIPT_EOF' +#!/bin/bash +set -e + +echo "🚀 Starting tmate session..." 
+ +# Initialize status +echo "starting" > /tmp/tmate-info/status + +# Quick connectivity verification (should work since k3s is healthy) +if kubectl cluster-info >/dev/null 2>&1; then + echo "✅ Kubernetes cluster is ready!" +else + echo "⚠️ Kubernetes cluster not immediately ready, continuing anyway..." +fi +echo "🔗 Starting tmate session (this may take a moment)..." + +# Define socket path +TMATE_SOCKET="/tmp/tmate-$USER.sock" + +# Start tmate with socket +echo "🔧 Starting tmate with socket: $TMATE_SOCKET" +/usr/local/bin/tmate -S "$TMATE_SOCKET" new-session -d +echo "📊 Tmate session created" + +# Wait for tmate to be ready +echo "⏳ Waiting for tmate to be ready..." +if ! /usr/local/bin/tmate -S "$TMATE_SOCKET" wait tmate-ready; then + echo "❌ Tmate failed to become ready" + echo "failed" > /tmp/tmate-info/status + exit 1 +fi + +echo "✅ Tmate is ready!" + +# Get the tmate session URLs +TMATE_SSH=$(/usr/local/bin/tmate -S "$TMATE_SOCKET" display -p '#{tmate_ssh}' 2>/dev/null || echo "") +TMATE_WEB=$(/usr/local/bin/tmate -S "$TMATE_SOCKET" display -p '#{tmate_web}' 2>/dev/null || echo "") + +if [ -n "$TMATE_SSH" ]; then + echo "" + echo "🎯 CANDIDATE CONNECTION INFO:" + echo "================================================" + echo "SSH: $TMATE_SSH" + if [ -n "$TMATE_WEB" ]; then + echo "Web: $TMATE_WEB" + fi + echo "================================================" + echo "" + echo "✨ Session is ready! Candidate can now connect." + + # Write connection info directly to shared volume for CLI to read + echo "$TMATE_SSH" > /tmp/tmate-info/ssh + if [ -n "$TMATE_WEB" ]; then + echo "$TMATE_WEB" > /tmp/tmate-info/web + fi + echo "ready" > /tmp/tmate-info/status +else + echo "⚠️ Could not retrieve tmate session info after 30 seconds" + echo "timeout" > /tmp/tmate-info/status + exit 1 +fi + +# Set up cleanup on exit +cleanup() { + echo "🧹 Cleaning up tmate session..." 
+ echo "stopped" > /tmp/tmate-info/status 2>/dev/null || true + /usr/local/bin/tmate -S "$TMATE_SOCKET" kill-session 2>/dev/null || true +} +trap cleanup EXIT + +# Keep the session alive - just sleep to maintain container +echo "🔄 Tmate session is now available. Container will stay alive." + +exec sleep infinity +SCRIPT_EOF + +chmod +x /tmp/tmate_session.sh +exec su - candidate -c '/tmp/tmate_session.sh' \ No newline at end of file diff --git a/kubernetes/challenges/database-typo.md b/kubernetes/challenges/database-typo.md new file mode 100644 index 0000000..03d6fc8 --- /dev/null +++ b/kubernetes/challenges/database-typo.md @@ -0,0 +1,110 @@ +# Challenge 001: Database Connection Issue - Solution + +## Problem Description +The Flagsmith API pods are crashing with database connection errors. There's a subtle typo in the PostgreSQL service name that prevents the API from connecting to the database. + +## Step-by-Step Solution + +### Step 1: Identify the Problem +Check the overall pod status to see what's broken: +```bash +kubectl get pods -n flagsmith +``` + +You should see a new API pod in `CrashLoopBackOff` or `Error` status, while an older pod may still be running. + +### Step 2: Examine Pod Logs +Get the logs from the failing pod to understand the error: +```bash +# Get the failing pod name (look for CrashLoopBackOff or Error status) +BROKEN_POD=$(kubectl get pods -n flagsmith | grep -E 'CrashLoop|Error' | awk '{print $1}') + +# Check the logs +kubectl logs $BROKEN_POD -n flagsmith --tail=20 +``` + +**Key Error Message:** +``` +OperationalError: could not translate host name "dev-postgresl" to address: Name or service not known +``` + +### Step 3: Investigate the Configuration +Check the deployment configuration to find the source of the wrong hostname: +```bash +kubectl describe pod $BROKEN_POD -n flagsmith | grep -A 20 'Environment:' +``` + +Look for the `DATABASE_URL` environment variable. 
You'll find it set to a hardcoded value with a typo: +``` +DATABASE_URL: postgresql://postgres:flagsmith@dev-postgresl:5432/flagsmith +``` + +This hardcoded value overrides the correct DATABASE_URL that would normally come from a secret, and contains the typo `dev-postgresl` instead of `dev-postgresql`. + +### Step 4: Identify the Root Cause +The hostname `dev-postgresl` is missing the letter 'q' - it should be `dev-postgresql`. + +Verify the correct service name exists: +```bash +kubectl get services -n flagsmith | grep postgresql +``` + +You should see `flagsmith-dev-postgresql`. + +### Step 5: Apply the Fix +Open the deployment in an editor to remove the incorrect environment variable: + +```bash +kubectl edit deployment flagsmith-api -n flagsmith +``` + +This will open the deployment configuration in your default editor. Look for the `env:` section under `spec.template.spec.containers[0]` and find the hardcoded `DATABASE_URL` environment variable at the bottom of the list: + +```yaml +- name: DATABASE_URL + value: postgresql://postgres:flagsmith@dev-postgresl:5432/flagsmith +``` + +Delete these two lines entirely, then save and exit the editor. This removes the hardcoded DATABASE_URL that overrides the correct one from the secret. + + + +### Step 6: Verify the Fix +Wait for the deployment to roll out: +```bash +kubectl rollout status deployment/flagsmith-api -n flagsmith --timeout=120s +``` + +Check that all pods are running: +```bash +kubectl get pods -n flagsmith +``` + +Test the health endpoint: +```bash +kubectl port-forward svc/flagsmith-api 8080:8000 -n flagsmith & +curl http://localhost:8080/health/liveness/ +kill %1 +``` + +Expected response: `{"status": "ok"}` + + + +## Key Learning Points + +1. **DNS Resolution Errors**: When you see "could not translate host name" errors, it usually means a typo in service names or the service doesn't exist. + +2. **Environment Variable Priority**: Later environment variables can override earlier ones. 
The broken `DATABASE_URL` was added after the correct one from the secret, causing the application to use the incorrect value. + +3. **Service Discovery**: Kubernetes services are accessible via DNS using the format `..svc.cluster.local` or just `` within the same namespace. + +4. **Debugging Strategy**: Always start with pod logs, then examine configuration, and verify the resources that the application is trying to connect to actually exist. + +5. **JSON Patch Operations**: The `kubectl patch` command with `--type json` allows precise modifications using operations like `add`, `remove`, and `replace` on specific paths in the resource structure. + +## Prevention +- Use proper naming conventions and double-check service names +- Validate connectivity between services before deployment +- Use health checks and monitoring to catch issues early +- Consider using tools like `nslookup` or `dig` from within pods to test DNS resolution diff --git a/kubernetes/challenges/database-typo.yaml b/kubernetes/challenges/database-typo.yaml new file mode 100644 index 0000000..61f6e69 --- /dev/null +++ b/kubernetes/challenges/database-typo.yaml @@ -0,0 +1,21 @@ +name: "Database Connection Issue" +description: "Candidate must identify and fix a typo in the database service name causing connection failures." 
+estimates: + Lv1: 35 + Lv2: 12 + Lv3: 7 + +helm_values: + devPostgresql: + enabled: true + nameOverride: dev-postgresql + auth: + postgresPassword: flagsmith + database: flagsmith + api: + extraEnv: + # This broken DATABASE_URL will override the correct one from the secret + DATABASE_URL: "postgresql://postgres:flagsmith@dev-postgresl:5432/flagsmith" + +# No kubectl patches needed - broken config is deployed directly +kubectl_patches: [] \ No newline at end of file diff --git a/kubernetes/challenges/memory-limit.md b/kubernetes/challenges/memory-limit.md new file mode 100644 index 0000000..beb969d --- /dev/null +++ b/kubernetes/challenges/memory-limit.md @@ -0,0 +1,150 @@ +# Challenge 002: Resource Configuration Issue - Solution + +## Problem Description +The Flagsmith API pods are failing to start due to insufficient memory resources. There's a misconfiguration in the resource limits that causes pods to be killed by the OOMKiller during startup. + +## Step-by-Step Solution + +### Step 1: Identify the Problem +Check the overall pod status to see what's broken: +```bash +kubectl get pods -n flagsmith +``` + +You should see the API pod in `CrashLoopBackOff` or `OOMKilled` status. 
+ +### Step 2: Examine Pod Events and Status +Get detailed information about why the pod is failing: +```bash +# Get the failing pod name +BROKEN_POD=$(kubectl get pods -n flagsmith | grep -E "(CrashLoop|OOMKilled|Error)" | awk '{print $1}') + +# Check pod events +kubectl describe pod $BROKEN_POD -n flagsmith | grep -A 10 'Events:' +``` + +**Key Error Messages to Look For:** +- `OOMKilled` status +- `Last State: Terminated (Reason: OOMKilled)` +- Memory limit exceeded events + +### Step 3: Check Resource Configuration +Examine the deployment's resource configuration: +```bash +kubectl describe deployment flagsmith-api -n flagsmith | grep -A 10 'Limits:' +``` + +Or get the full resource configuration: +```bash +kubectl get deployment flagsmith-api -n flagsmith -o yaml | grep -A 10 'resources:' +``` + +You'll find the memory limit is set too low: +```yaml +resources: + limits: + cpu: 500m + memory: "64Mi" # This is too low! + requests: + cpu: 300m + memory: "32Mi" # Request too low for startup +``` + +### Step 4: Identify the Root Cause +The issue is that: +1. Memory limit (64Mi) is extremely low for a Django application +2. Memory request (32Mi) is too low for startup - the pod can be scheduled but will immediately crash +3. Flagsmith API typically needs at least 256Mi-512Mi to start properly + +### Step 5: Apply the Fix +Edit the deployment to fix the memory configuration: + +```bash +kubectl edit deployment flagsmith-api -n flagsmith +``` + +This opens the deployment YAML in your editor. Find the `resources:` section under `spec.template.spec.containers[0]` and update it: + +```yaml +resources: + limits: + cpu: 500m + memory: "512Mi" # Increase from 64Mi + requests: + cpu: 300m + memory: "256Mi" # Reduce from 300Mi to be less than limit +``` + +Save and exit the editor. + +**Alternative: Remove resource limits entirely** (for development): +Delete the entire `resources:` section in the editor. 
+ +### Step 6: Verify the Fix +Wait for the deployment to roll out: +```bash +kubectl rollout status deployment/flagsmith-api -n flagsmith --timeout=120s +``` + +Check that all pods are running and ready: +```bash +kubectl get pods -n flagsmith +``` + +Monitor resource usage to ensure it's within acceptable limits: +```bash +kubectl top pods -n flagsmith +``` + +Test the health endpoint: +```bash +kubectl port-forward svc/flagsmith-api 8080:8000 -n flagsmith & +curl http://localhost:8080/health/liveness/ +kill %1 +``` + +Expected response: `{"status": "ok"}` + +## Key Learning Points + +1. **Resource Limits vs Requests**: + - Requests: Guaranteed resources Kubernetes will allocate + - Limits: Maximum resources the container can use + - Limits must be >= requests + +2. **OOMKiller Behavior**: When a container exceeds its memory limit, the Linux OOMKiller terminates it with status `OOMKilled`. + +3. **Sizing Guidelines for Django Apps**: + - Minimum: 256Mi memory + - Recommended: 512Mi-1Gi for production + - Monitor actual usage and adjust accordingly + +4. 
**Debugging Memory Issues**: + - Check pod events for OOMKilled messages + - Use `kubectl top pods` to monitor actual usage + - Look at historical resource usage patterns + +## Prevention + +- **Capacity Planning**: Profile your application's memory usage under load +- **Monitoring**: Set up alerts for high memory usage (>80% of limits) +- **Testing**: Test resource changes in development first +- **Resource Quotas**: Use namespace resource quotas to prevent resource exhaustion +- **Horizontal Pod Autoscaling**: Scale pods based on resource utilization rather than just increasing limits + +## Additional Commands for Investigation + +Check cluster resource availability: +```bash +kubectl describe nodes | grep -A 5 "Allocated resources" +``` + +View resource usage across all pods: +```bash +kubectl top pods -A --sort-by=memory +``` + +Check if there are resource quotas limiting the namespace: +```bash +kubectl describe resourcequota -n flagsmith +``` diff --git a/kubernetes/challenges/memory-limit.yaml b/kubernetes/challenges/memory-limit.yaml new file mode 100644 index 0000000..cd8dbe3 --- /dev/null +++ b/kubernetes/challenges/memory-limit.yaml @@ -0,0 +1,27 @@ +name: "Resource Configuration Issue" +description: "Candidate must diagnose OOMKilled pods and adjust memory limits that are set too low." 
+estimates: + Lv1: 40 + Lv2: 15 + Lv3: 8 + +helm_values: + api: + replicaCount: 1 + resources: {} + +kubectl_patches: + - resource: "deployment" + name: "flagsmith-api" + namespace: "flagsmith" + patch_type: "json" + patch: + - op: "add" + path: "/spec/template/spec/containers/0/resources" + value: + limits: + cpu: 500m + memory: "64Mi" + requests: + cpu: 300m + memory: "32Mi" \ No newline at end of file diff --git a/kubernetes/challenges/network-issue.md b/kubernetes/challenges/network-issue.md new file mode 100644 index 0000000..90b7ef2 --- /dev/null +++ b/kubernetes/challenges/network-issue.md @@ -0,0 +1,209 @@ +# Challenge 006: Network Connectivity Issue - Solution + +## Problem Description +The Flagsmith API cannot connect to the PostgreSQL database due to network configuration problems. DNS resolution is failing because of incorrect service names and broken service selectors that prevent proper endpoint discovery. + +## Step-by-Step Solution + +### Step 1: Identify the Problem +Check the overall pod status to see what's broken: +```bash +kubectl get pods -n flagsmith +``` + +You should see API pods in `CrashLoopBackOff` or `Init:CrashLoopBackOff` status while the PostgreSQL pod is running fine. 
+ +### Step 2: Examine Connection Errors +Get the logs from the failing pod to understand the error: +```bash +# Get the failing pod name +BROKEN_POD=$(kubectl get pods -n flagsmith | grep -E "(CrashLoop|Init)" | awk '{print $1}') + +# Check init container logs for database connection errors +kubectl logs $BROKEN_POD -n flagsmith -c migrate-db --tail=20 +``` + +**Key Error Messages:** +``` +could not translate host name "flagsmith-postgresql" to address: Name or service not known +connection to server at "flagsmith-postgresql" (x.x.x.x), port 5432 failed: Connection timed out +``` + +### Step 3: Investigate DNS and Service Configuration +Check if the database service exists and is properly configured: +```bash +# Check all services in the namespace +kubectl get svc -n flagsmith + +# Check if the service the app is trying to connect to exists +kubectl get svc flagsmith-postgresql -n flagsmith 2>/dev/null || echo "Service not found" + +# Check the correct service name +kubectl get svc -n flagsmith | grep postgresql +``` + +You'll discover: +1. The app is trying to connect to `flagsmith-postgresql` +2. But the actual service is named `flagsmith-dev-postgresql` +3. The service selector may be broken + +### Step 4: Check Service Endpoints +Verify if the service has healthy endpoints: +```bash +# Check the correct service +kubectl describe svc flagsmith-dev-postgresql -n flagsmith + +# Check service endpoints +kubectl get endpoints flagsmith-dev-postgresql -n flagsmith + +# Check what the service selector is looking for vs what pods have +kubectl get pods -n flagsmith --show-labels | grep postgresql +``` + +**Key Issues to Look For:** +- Service selector doesn't match pod labels +- No endpoints listed for the service +- Incorrect service name in DATABASE_URL + +### Step 5: Identify the Root Causes +The network issues are caused by: +1. **Wrong service name**: DATABASE_URL references `flagsmith-postgresql` instead of `flagsmith-dev-postgresql` +2. 
**Broken service selector**: Service selector has wrong label value preventing endpoint discovery +3. **DNS configuration**: Aggressive DNS timeout settings causing premature failures + +### Step 6: Apply the Fixes + +**Fix 1: Fix the PostgreSQL service selector** +```bash +kubectl edit svc flagsmith-dev-postgresql -n flagsmith +``` + +This opens the service YAML in your editor. Find the `selector:` section and fix the broken label: + +```yaml +selector: + app.kubernetes.io/name: postgresql-wrong # Change this +``` + +Change `postgresql-wrong` to `postgresql` to match the actual pod labels. + +**Fix 2: Remove the incorrect DATABASE_URL override** +```bash +kubectl edit deployment flagsmith-api -n flagsmith +``` + +This opens the deployment YAML. Find the `env:` section under `spec.template.spec.containers[0]` and look for the hardcoded DATABASE_URL at the bottom: + +```yaml +- name: DATABASE_URL + value: postgresql://postgres:flagsmith@flagsmith-postgresql:5432/flagsmith +``` + +Delete these two lines entirely. Also remove any `dnsConfig:` section if present. + +Save and exit the editor for both files. 
+ +### Step 7: Verify Service Connectivity +Check that the service now has proper endpoints: +```bash +# Verify service endpoints are populated +kubectl get endpoints flagsmith-dev-postgresql -n flagsmith + +# Test DNS resolution from a pod +kubectl run dns-test --image=busybox --rm -it --restart=Never -n flagsmith -- nslookup flagsmith-dev-postgresql + +# Test connectivity to the database +kubectl run netcat-test --image=busybox --rm -it --restart=Never -n flagsmith -- nc -zv flagsmith-dev-postgresql 5432 +``` + +### Step 8: Verify the Fix +Wait for the deployment to roll out and test connectivity: +```bash +# Wait for rollout to complete +kubectl rollout status deployment/flagsmith-api -n flagsmith --timeout=120s + +# Check that all pods are running +kubectl get pods -n flagsmith + +# Test the health endpoint +kubectl port-forward svc/flagsmith-api 8080:8000 -n flagsmith & +curl http://localhost:8080/health/liveness/ +kill %1 +``` + +Expected response: `{"status": "ok"}` + +### Step 9: Test End-to-End Connectivity +Verify database connectivity is working: +```bash +# Check that the API can connect to the database +API_POD=$(kubectl get pods -n flagsmith -l app.kubernetes.io/component=api -o jsonpath='{.items[0].metadata.name}') + +# Test database connection from the API pod +kubectl exec -it $API_POD -n flagsmith -- python manage.py dbshell --command="SELECT 1;" +``` + +## Key Learning Points + +1. **Service Discovery**: Kubernetes uses DNS for service discovery - service names must match exactly +2. **Service Selectors**: Services must have correct selectors to discover and route to pods +3. **DNS Debugging**: Use `nslookup` and `dig` to debug DNS resolution issues +4. **Network Connectivity**: Test both DNS resolution AND actual connectivity (ports, firewalls, etc.) +5. 
**Endpoint Verification**: Always check service endpoints to ensure pods are being discovered + +## Common Network Issues in Kubernetes + +- **Wrong service names**: Typos in service names cause DNS resolution failures +- **Broken selectors**: Service selectors that don't match pod labels result in no endpoints +- **Namespace issues**: Services in different namespaces require FQDN (service.namespace.svc.cluster.local) +- **Port mismatches**: Service ports don't match container ports +- **Network policies**: Restrictive network policies blocking traffic + +## Prevention + +- **Service Validation**: Always verify service selectors match pod labels +- **DNS Testing**: Test service DNS resolution during deployment +- **Connectivity Monitoring**: Monitor service endpoint health and connectivity +- **Network Policies**: Document and test network policy impacts +- **Service Mesh**: Consider using service mesh for advanced traffic management and observability + +## Production Considerations + +- **Testing**: Test network changes in staging before production +- **Monitoring**: Set up alerts for DNS resolution failures and connection timeouts +- **Circuit Breakers**: Implement connection retry logic with backoff +- **Network Observability**: Use tools like Istio or Linkerd for network insights +- **Disaster Recovery**: Document network dependencies for incident response + +## Additional Debugging Commands + +Test service connectivity from different contexts: +```bash +# Test from within the cluster +kubectl run debug-pod --image=busybox --rm -it --restart=Never -n flagsmith + +# Check service discovery across namespaces +kubectl get svc --all-namespaces | grep postgresql + +# Test basic connectivity +kubectl run test-pod --image=busybox --rm -it --restart=Never -n flagsmith -- ping flagsmith-dev-postgresql +``` + +Check DNS configuration: +```bash +# View DNS configuration in pods +kubectl exec -it -n flagsmith -- cat /etc/resolv.conf + +# Test different DNS queries +kubectl 
run dns-debug --image=busybox --rm -it --restart=Never -n flagsmith -- nslookup flagsmith-dev-postgresql +``` + +Debug service mesh issues (if applicable): +```bash +# Check for service mesh sidecars +kubectl get pods -n flagsmith -o jsonpath='{.items[*].spec.containers[*].name}' + +# Check network policies +kubectl get networkpolicies -n flagsmith +kubectl describe networkpolicy -n flagsmith +``` diff --git a/kubernetes/challenges/network-issue.yaml b/kubernetes/challenges/network-issue.yaml new file mode 100644 index 0000000..f48046c --- /dev/null +++ b/kubernetes/challenges/network-issue.yaml @@ -0,0 +1,49 @@ +name: "Network Connectivity Issue" +description: "Candidate must fix broken service selectors and DNS configuration preventing API-database connectivity." +estimates: + Lv1: "∞" + Lv2: 20 + Lv3: 9 + +helm_values: + api: + replicaCount: 1 + devPostgresql: + enabled: true + nameOverride: dev-postgresql + auth: + postgresPassword: flagsmith + database: flagsmith + +kubectl_patches: + - resource: "service" + name: "flagsmith-dev-postgresql" + namespace: "flagsmith" + patch_type: "json" + patch: + # Break the service selector to prevent endpoint discovery + - op: "replace" + path: "/spec/selector/app.kubernetes.io~1name" + value: "postgresql-wrong" + + - resource: "deployment" + name: "flagsmith-api" + namespace: "flagsmith" + patch_type: "json" + patch: + # Override DATABASE_URL to use incorrect service name (missing -dev-) + - op: "add" + path: "/spec/template/spec/containers/0/env/-" + value: + name: "DATABASE_URL" + value: "postgresql://postgres:flagsmith@flagsmith-postgresql:5432/flagsmith" + + # Add incorrect DNS search configuration + - op: "add" + path: "/spec/template/spec/dnsConfig" + value: + options: + - name: "ndots" + value: "1" + - name: "timeout" + value: "2" \ No newline at end of file diff --git a/kubernetes/challenges/performance-issue.md b/kubernetes/challenges/performance-issue.md new file mode 100644 index 0000000..30cbb2b --- /dev/null 
+++ b/kubernetes/challenges/performance-issue.md @@ -0,0 +1,149 @@ +# Challenge 004: Performance Degradation Issue - Solution + +## Problem Description +The Flagsmith API is experiencing severe performance issues with response times over 10 seconds. There are resource constraints and a CPU-intensive sidecar container consuming system resources unnecessarily. + +## Step-by-Step Solution + +### Step 1: Identify the Problem +Check the overall pod status and resource usage: +```bash +kubectl get pods -n flagsmith +kubectl top pods -n flagsmith +``` + +You should see high CPU usage on the API pods and potentially slow response times. + +### Step 2: Examine Pod Resource Usage +Get detailed resource information: +```bash +# Check current resource limits and requests +kubectl describe pod -n flagsmith | grep -A 10 'Limits:\|Requests:' + +# Monitor real-time resource usage +kubectl top pods -n flagsmith --containers +``` + +**Key Issues to Look For:** +- Very low CPU limits (100m) for the main application +- High CPU usage approaching or exceeding limits +- Additional containers consuming resources unnecessarily + +### Step 3: Investigate Container Configuration +Examine the deployment configuration: +```bash +kubectl get deployment flagsmith-api -n flagsmith -o yaml | grep -A 20 'containers:' +``` + +You'll discover: +1. **Insufficient CPU limits**: 100m CPU limit is too low for a web application under load +2. **CPU-intensive sidecar**: A "cpu-hog" container running unnecessary CPU-intensive operations +3. **No horizontal scaling**: Single replica handling all traffic + +### Step 4: Identify the Root Causes +The performance issues are caused by: +1. **Resource starvation**: CPU limits too restrictive for application needs +2. **Resource competition**: Sidecar container consuming available CPU +3. 
**Lack of scaling**: Single pod handling all requests without autoscaling + +### Step 5: Apply the Fixes + +**Fix 1: Edit the deployment to remove the sidecar and fix resources** +```bash +kubectl edit deployment flagsmith-api -n flagsmith +``` + +This opens the deployment YAML. You'll see two containers in the `containers:` section: + +1. The main `flagsmith-api` container (first) +2. A `cpu-hog` container (second) - **delete this entire container block** + +In the main container's `resources:` section, update: +```yaml +resources: + limits: + cpu: 500m # Increase from 100m + memory: "256Mi" + requests: + cpu: 250m # Increase from 50m + memory: "128Mi" +``` + +Save and exit the editor. + +**Fix 3: Scale up replicas for better load distribution** +```bash +kubectl scale deployment flagsmith-api -n flagsmith --replicas=2 +``` + +### Step 6: Verify the Fix +Wait for the deployment to roll out and test performance: +```bash +# Wait for rollout to complete +kubectl rollout status deployment/flagsmith-api -n flagsmith --timeout=120s + +# Check resource usage after fix +kubectl top pods -n flagsmith + +# Test response time +kubectl port-forward svc/flagsmith-api 8080:8000 -n flagsmith & +time curl http://localhost:8080/health/liveness/ +kill %1 +``` + +Expected results: +- Response time should drop to under 1 second +- CPU usage should be more reasonable and stable +- Multiple pods should be handling traffic + +### Step 7: Monitor Stability +```bash +# Watch pods for stability +kubectl get pods -n flagsmith -w + +# Monitor resource usage over time +watch kubectl top pods -n flagsmith +``` + +## Key Learning Points + +1. **Resource Right-sizing**: Web applications typically need 250m-1000m CPU for reasonable performance +2. **Container Efficiency**: Remove unnecessary sidecar containers that consume resources without adding value +3. **Horizontal Scaling**: Multiple replicas distribute load and improve resilience +4. 
**Performance Monitoring**: Use `kubectl top` to identify resource bottlenecks +5. **Systematic Approach**: Check resource usage before making assumptions about performance issues + +## Prevention + +- **Resource Planning**: Profile applications under expected load to set appropriate limits +- **Monitoring**: Set up alerts for high resource utilization (>80% of limits) +- **Horizontal Pod Autoscaling**: Configure HPA to automatically scale based on CPU/memory usage +- **Load Testing**: Regular performance testing to identify bottlenecks before they reach production +- **Resource Reviews**: Periodic review of resource allocation vs. actual usage + +## Production Considerations + +- **Gradual Scaling**: In production, scale replicas gradually and monitor impact +- **Resource Quotas**: Ensure namespace has sufficient CPU/memory quotas for increased allocation +- **Monitoring Integration**: Set up proper APM tools for ongoing performance monitoring +- **SLA Impact**: Document how resource changes affect application SLAs +- **Change Management**: Test resource changes carefully and monitor impact + +## Additional Debugging Commands + +Check cluster resource availability: +```bash +kubectl describe nodes | grep -A 5 "Allocated resources" +``` + +View historical resource usage (if metrics-server is available): +```bash +kubectl top pods -n flagsmith --sort-by=cpu +kubectl top nodes +``` + +Test application performance: +```bash +# Simple load test +for i in {1..10}; do curl -w "%{time_total}s\n" -o /dev/null -s http://localhost:8080/health/liveness/; done +``` diff --git a/kubernetes/challenges/performance-issue.yaml b/kubernetes/challenges/performance-issue.yaml new file mode 100644 index 0000000..5b54e45 --- /dev/null +++ b/kubernetes/challenges/performance-issue.yaml @@ -0,0 +1,39 @@ +name: "Performance Degradation Issue" +description: "Candidate must identify and remove a CPU-intensive sidecar container causing API performance issues." 
+estimates: + Lv1: 50 + Lv2: 22 + Lv3: 12 + +helm_values: + api: + replicaCount: 1 + resources: + limits: + cpu: "100m" + memory: "256Mi" + requests: + cpu: "50m" + memory: "128Mi" + # Disable autoscaling + autoscaling: + enabled: false + +kubectl_patches: + - resource: "deployment" + name: "flagsmith-api" + namespace: "flagsmith" + patch_type: "json" + patch: + # Add a CPU-intensive sidecar container to create load + - op: "add" + path: "/spec/template/spec/containers/-" + value: + name: "cpu-hog" + image: "alpine:3.21" + command: ["/bin/sh"] + args: ["-c", "while true; do dd if=/dev/zero of=/dev/null bs=1M count=100; sleep 1; done"] + resources: + requests: + cpu: "200m" + memory: "64Mi" \ No newline at end of file diff --git a/kubernetes/cli/Dockerfile b/kubernetes/cli/Dockerfile new file mode 100644 index 0000000..4b1e59c --- /dev/null +++ b/kubernetes/cli/Dockerfile @@ -0,0 +1,33 @@ +FROM python:3.14-alpine + +# Install system dependencies and tools from Alpine repos +RUN apk add --no-cache \ + curl \ + git \ + patch \ + wget \ + bash \ + ca-certificates \ + docker-cli \ + docker-compose \ + helm \ + kubectl + +# Upgrade pip to latest version for pyproject.toml support +RUN pip install --upgrade pip + +# Set working directory +WORKDIR /app + +# Copy Python project files +COPY pyproject.toml ./ +COPY cli/ ./cli/ + +# Install Python dependencies +RUN pip install -e . + +# Create challenges directory +RUN mkdir -p /app/challenges + +# Default command +CMD ["python", "-m", "cli.main"] diff --git a/kubernetes/cli/__init__.py b/kubernetes/cli/__init__.py new file mode 100644 index 0000000..f2cd8ef --- /dev/null +++ b/kubernetes/cli/__init__.py @@ -0,0 +1,9 @@ +""" +Flagsmith Infrastructure Challenger + +Production-grade Kubernetes troubleshooting scenarios for technical interviews. 
+""" + +__version__ = "0.1.0" +__author__ = "Flagsmith Team" +__description__ = "Production-grade Kubernetes troubleshooting scenarios" diff --git a/kubernetes/cli/challenges.py b/kubernetes/cli/challenges.py new file mode 100644 index 0000000..bb0bdfa --- /dev/null +++ b/kubernetes/cli/challenges.py @@ -0,0 +1,505 @@ +""" +Challenge discovery and management system. + +Automatically discovers challenges from patch files in the challenges/ directory +and provides functionality to apply patches, manage deployments, and run health checks. +""" + +import atexit +import json +import os +import shlex +import signal +import subprocess +import time +import uuid +from pathlib import Path +from typing import Any + +import requests +import yaml +from requests.exceptions import ConnectionError as RequestsConnectionError +from requests.exceptions import ConnectTimeout +from requests.exceptions import RequestException +from requests.exceptions import Timeout + + +class Challenge: + """Represents a single challenge with metadata, helm values, and kubectl patches.""" + + def __init__( + self, + filename: str, + name: str, + description: str, + helm_values: dict[str, Any], + kubectl_patches: list[dict[str, Any]], + estimates: dict[str, str], + challenge_dir: Path, + ): + self.filename = filename + self.name = name + self.description = description + self.helm_values = helm_values + self.kubectl_patches = kubectl_patches + self.estimates = estimates + self.challenge_dir = challenge_dir + self._manual: str | None = None + + @property + def manual(self) -> str: + """Load and cache the manual for this challenge. + + Every challenge requires a corresponding .md manual file. + The file path is constructed from the YAML filename. 
+ """ + if self._manual is not None: + return self._manual + + manual_filename = self.filename.replace('.yaml', '.md') + manual_path = self.challenge_dir / manual_filename + + self._manual = manual_path.read_text() + return self._manual + + def __str__(self) -> str: + return f"{self.name}" + + +class ChallengeManager: + """Manages challenge discovery, deployment, and health checking.""" + + def __init__(self, challenges_dir: str = "/app/challenges"): + self.challenges_dir = Path(challenges_dir) + self.challenges: list[Challenge] = [] + self.current_challenge: Challenge | None = None + self.flagsmith_namespace = "flagsmith" + self.helm_release_name = "flagsmith" + # Generate unique container name using UUID hash + unique_id = str(uuid.uuid4())[:12] + self.candidate_container_name = f"k8s_challenger-candidate-env-{unique_id}" + + # Register cleanup on exit and signals + atexit.register(self._cleanup) + signal.signal(signal.SIGINT, self._signal_cleanup) + signal.signal(signal.SIGTERM, self._signal_cleanup) + + def _run_command(self, cmd: str, check: bool = True, verbose: bool = False) -> subprocess.CompletedProcess: + """ + Run shell command with consistent interface. 
+ + Args: + cmd: Command to run + check: Whether to raise on non-zero exit + verbose: Whether to show output in real-time + + Returns: + CompletedProcess result + """ + return subprocess.run( + shlex.split(cmd), + check=check, + text=True, + capture_output=not verbose, + ) + + def _signal_cleanup(self, signum, frame): + """Handle cleanup on signal.""" + self._cleanup() + + def _cleanup(self): + """Clean up project Docker resources.""" + self._run_command("docker-compose down --volumes --remove-orphans", check=False) + # Also clean up any dynamically created candidate containers + self._force_cleanup_containers() + + def _force_cleanup_containers(self): + """Force cleanup of any remaining candidate containers.""" + try: + # Find and remove any candidate containers (including uniquely named ones) + cmd = 'docker ps -a --filter "name=k8s_challenger-candidate-env" --format "{{.ID}}"' + result = self._run_command(cmd, check=False) + if result.returncode == 0 and result.stdout.strip(): + container_ids = result.stdout.strip().split('\n') + for container_id in container_ids: + if container_id.strip(): + self._run_command(f"docker rm -f {container_id.strip()}", check=False) + except Exception: + # Don't fail cleanup due to container cleanup issues + pass + + def discover_challenges(self) -> None: + """Automatically discover challenges from patch files.""" + self.challenges = [] + + if not self.challenges_dir.exists(): + return + + yaml_files = sorted(self.challenges_dir.glob("*.yaml")) + + for yaml_file in yaml_files: + try: + challenge = self._parse_challenge_file(yaml_file) + if challenge: + self.challenges.append(challenge) + except Exception: + pass + + def _parse_challenge_file(self, filepath: Path) -> Challenge | None: + """Parse a YAML challenge file and extract metadata. + + Challenge YAML files must include all required fields: + name, description, helm_values, kubectl_patches, estimates. 
+ """ + try: + data = yaml.safe_load(filepath.read_text()) + except (yaml.YAMLError, OSError): + return None + + # Validate all required fields + required_fields = ['name', 'description', 'helm_values', 'kubectl_patches', 'estimates'] + if not isinstance(data, dict) or not all(field in data for field in required_fields): + return None + + return Challenge( + filename=filepath.name, + name=data['name'], + description=data['description'], + helm_values=data['helm_values'], + kubectl_patches=data['kubectl_patches'], + estimates=data['estimates'], + challenge_dir=filepath.parent, + ) + + def list_challenges(self) -> list[Challenge]: + """Return list of available challenges.""" + return self.challenges + + def get_challenge(self, index: int) -> Challenge | None: + """Get challenge by index.""" + if 0 <= index < len(self.challenges): + return self.challenges[index] + return None + + def setup_challenge(self, challenge: Challenge) -> bool: + """Set up a challenge by applying its patch to the Flagsmith deployment.""" + self.current_challenge = challenge + + try: + # Cleanup before starting + self._cleanup() + + # Wait for cleanup to complete + time.sleep(2) + + # Create namespace + self._run_command(f"kubectl create namespace {self.flagsmith_namespace}", check=False, verbose=True) + + # Clone Flagsmith charts if not exists + if not Path("/tmp/flagsmith-charts").exists(): + self._run_command( + "git clone https://github.com/Flagsmith/flagsmith-charts.git /tmp/flagsmith-charts", + verbose=True, + ) + + # Prepare values file + values_file = "/tmp/flagsmith-values.yaml" + + if challenge.helm_values: + # Use custom values from challenge + Path(values_file).write_text(yaml.dump(challenge.helm_values)) + else: + # Use original values + original_values = "/tmp/flagsmith-charts/charts/flagsmith/values.yaml" + self._run_command(f"cp {original_values} {values_file}", check=True) + + # Add Flagsmith Helm repo + self._run_command( + "helm repo add flagsmith 
https://flagsmith.github.io/flagsmith-charts/", + check=False, + verbose=True, + ) + # Add common dependency repos used by the chart + self._run_command( + "helm repo add bitnami https://charts.bitnami.com/bitnami", + check=False, + verbose=True, + ) + self._run_command( + "helm repo add influxdata https://helm.influxdata.com/", + check=False, + verbose=True, + ) + self._run_command( + "helm repo add kiwigrid https://kiwigrid.github.io/helm-charts/", + check=False, + verbose=True, + ) + self._run_command("helm repo update", check=False, verbose=True) + + # Optionally build local chart dependencies (not required when using remote chart) + self._run_command( + "helm dependency build /tmp/flagsmith-charts/charts/flagsmith", + check=False, + verbose=True, + ) + + # Deploy using Helm (no --wait to allow broken deployments) + helm_cmd = ( + f"helm upgrade --install {self.helm_release_name} flagsmith/flagsmith " + f"-f {values_file} -n {self.flagsmith_namespace} --create-namespace" + ) + + self._run_command(helm_cmd, check=True, verbose=True) + + # Apply kubectl patches after successful helm deployment + if challenge.kubectl_patches: + self._apply_kubectl_patches(challenge.kubectl_patches) + + return True + + except Exception: + return False + + def _apply_kubectl_patches(self, patches: list[dict[str, Any]]) -> None: + """Apply kubectl patches after helm deployment.""" + for patch in patches: + try: + resource = patch.get('resource') + name = patch.get('name') + namespace = patch.get('namespace', self.flagsmith_namespace) + patch_data = patch.get('patch', {}) + patch_type = patch.get('patch_type', 'merge') # Default to merge + + if not resource or not name or not patch_data: + continue + + # Convert patch data to JSON + patch_json = json.dumps(patch_data) + + kubectl_cmd = ( + f"kubectl patch {resource} {name} -n {namespace} " + f"--type {patch_type} -p '{patch_json}'" + ) + + self._run_command(kubectl_cmd, check=True, verbose=True) + + except Exception as e: + # Continue 
with other patches even if one fails + print(f"Warning: Failed to apply patch to {resource}/{name}: {e}") + continue + + def _cleanup_deployment(self) -> None: + """Clean up existing Flagsmith deployment.""" + # Cleanup handles everything + self._cleanup() + + + + def check_health(self) -> tuple[bool, str, int | None]: + """ + Check if Flagsmith is healthy using kubectl port-forward. + + Returns: + Tuple of (is_healthy, status_message, http_status_code) + """ + try: + # First check if API service exists + result = self._run_command( + f"kubectl get service {self.helm_release_name}-api -n {self.flagsmith_namespace}", + check=False, + ) + + if result.returncode != 0: + return False, "Service not deployed", None + + # Use kubectl port-forward to create a reliable connection + local_port = 8080 + port_forward_cmd = f"kubectl port-forward svc/{self.helm_release_name}-api {local_port}:8000 -n {self.flagsmith_namespace}" + + # Start port-forward in background + port_forward_proc = subprocess.Popen( + shlex.split(port_forward_cmd), + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + try: + # Give port-forward time to establish + time.sleep(2) + + # Check if port-forward is still running + if port_forward_proc.poll() is not None: + return False, "Port-forward failed", None + + # Make health check request to localhost + health_url = f"http://localhost:{local_port}/health/liveness/" + response = requests.get(health_url, timeout=5) + + is_healthy = response.status_code == 200 + + if is_healthy: + return True, "Healthy", response.status_code + elif response.status_code == 404: + return False, "Endpoint not found", response.status_code + elif response.status_code == 500: + return False, "Server error", response.status_code + elif response.status_code == 503: + return False, "Service unavailable", response.status_code + else: + return False, f"HTTP {response.status_code}", response.status_code + + finally: + # Always clean up port-forward process + try: + 
port_forward_proc.terminate() + port_forward_proc.wait(timeout=5) + except Exception: + try: + port_forward_proc.kill() + except Exception: + pass + + except ConnectTimeout: + return False, "Connection timeout", None + except RequestsConnectionError: + return False, "Cannot connect to service", None + except Timeout: + return False, "Request timeout", None + except RequestException: + return False, "Network error", None + except Exception: + return False, "Health check failed", None + + def get_flagsmith_info(self) -> dict[str, Any]: + """Get information about the current Flagsmith deployment.""" + try: + # Get pods + pods_result = self._run_command(f"kubectl get pods -n {self.flagsmith_namespace} -o json") + + info: dict[str, Any] = { + "namespace": self.flagsmith_namespace, + "release": self.helm_release_name, + "pods": [], + "services": [], + } + + if pods_result.returncode == 0: + pods_data: Any = json.loads(pods_result.stdout) + if isinstance(pods_data, dict): + items: Any = pods_data.get("items", []) + if isinstance(items, list): + for pod in items: + info["pods"].append({ + "name": pod["metadata"]["name"], + "status": pod["status"]["phase"], + "ready": self._is_pod_ready(pod), + }) + + # Get services + services_result = self._run_command(f"kubectl get services -n {self.flagsmith_namespace} -o json") + + if services_result.returncode == 0: + services_data: Any = json.loads(services_result.stdout) + if isinstance(services_data, dict): + items = services_data.get("items", []) + if isinstance(items, list): + for svc in items: + info["services"].append({ + "name": svc["metadata"]["name"], + "type": svc["spec"]["type"], + "ports": svc["spec"]["ports"], + }) + + return info + + except Exception as e: + return {"error": str(e)} + + def _is_pod_ready(self, pod: dict[str, Any]) -> bool: + """Check if a pod is ready.""" + conditions = pod.get("status", {}).get("conditions", []) + for condition in conditions: + if condition.get("type") == "Ready": + return 
condition.get("status") == "True" + return False + + def get_tmate_info(self) -> tuple[bool, str | None, str | None]: + """ + Get tmate connection information from candidate environment. + + Returns: + Tuple of (is_ready, ssh_url, web_url) + """ + try: + status_file = Path("/tmp/tmate-info/status") + ssh_file = Path("/tmp/tmate-info/ssh") + web_file = Path("/tmp/tmate-info/web") + + if not status_file.exists(): + return False, None, None + + status = status_file.read_text().strip() + if status != "ready": + return False, None, None + + ssh_url = ssh_file.read_text().strip() if ssh_file.exists() else None + web_url = web_file.read_text().strip() if web_file.exists() else None + + return True, ssh_url, web_url + + except Exception: + return False, None, None + + def start_candidate_environment(self) -> None: + """Start a new candidate environment container for debugging.""" + # Cleanup first + self._cleanup() + time.sleep(2) + + # Get network name from existing k3s container + k3s_inspect = self._run_command( + 'docker inspect k8s_challenger-k3s-1 --format "{{range .NetworkSettings.Networks}}{{.NetworkID}}{{end}}"', + check=False, + ) + network_id = k3s_inspect.stdout.strip() if k3s_inspect.returncode == 0 else "k8s_challenger_flagsmith-network" + + # Start fresh candidate environment container with auto-remove on stop + docker_cmd = ( + f"docker run -d --name {self.candidate_container_name} --network {network_id} " + f"--user root --rm -v k8s_challenger_k3s-server:/etc/rancher/k3s:ro " + f"-v k8s_challenger_kubeconfig-data:/home/candidate/.kube-shared:ro " + f"-v k8s_challenger_tmate-info:/tmp/tmate-info " + f"--env KUBECONFIG=/home/candidate/.kube/config k8s_challenger-candidate-env" + ) + + self._run_command(docker_cmd, check=True) + + def stop_candidate_environment(self) -> None: + """Stop and remove the candidate environment container for cleanup.""" + self._cleanup() + + def _cleanup_tmate_info(self) -> None: + """Clean up tmate info files safely.""" + try: + 
tmate_info_dir = Path("/tmp/tmate-info") + if tmate_info_dir.exists(): + # Remove files individually instead of entire directory to avoid resource busy + for file_path in tmate_info_dir.glob("*"): + try: + os.remove(file_path) + except OSError: + pass # Ignore individual file removal errors + # Try to remove directory, but don't fail if it's busy + try: + tmate_info_dir.rmdir() + except OSError: + pass # Directory might still be in use by volume mount + except Exception: + pass # Don't fail cleanup due to minor issues + + def cleanup(self) -> None: + """Clean up current challenge deployment and candidate environment.""" + # Cleanup handles everything + self._cleanup() + self.current_challenge = None diff --git a/kubernetes/cli/main.py b/kubernetes/cli/main.py new file mode 100644 index 0000000..c05d1a3 --- /dev/null +++ b/kubernetes/cli/main.py @@ -0,0 +1,515 @@ +#!/usr/bin/env python3 +""" +Flagsmith Infrastructure Challenger - Main CLI Application +Production-grade Kubernetes troubleshooting scenarios +""" + +import argparse +import atexit +import signal +import sys +import time +from datetime import datetime +from pathlib import Path + +from rich.console import Console +from rich.panel import Panel +from rich.prompt import Prompt +from rich.text import Text + +from cli.challenges import Challenge +from cli.challenges import ChallengeManager + +console = Console() + + +def handle_error(error: Exception) -> None: + """Display error with confirmation message.""" + console.print() + error_panel = Panel( + f"[red bold]Error:[/red bold] {error!s}\n\n" + f"[dim]Type: {type(error).__name__}[/dim]", + title="❌ System Error", + border_style="red", + padding=(1, 2), + ) + console.print(error_panel) + console.print() + console.print("[yellow]Press Enter to exit...[/yellow]") + try: + input() + except (EOFError, KeyboardInterrupt): + pass + + +def get_safe_width(console: Console) -> int: + """Get safe panel width that prevents border wrapping.""" + return console.size.width 
- 2 + + +def create_panel(content: str, title: str = "", style: str = "blue", title_align: str = "left") -> Panel: + """Create a consistent panel with standard styling.""" + return Panel( + content, + title=title, + title_align=title_align, + border_style=style, + padding=(0, 1), + width=get_safe_width(console), + ) + + +def create_tip_panel(message: str) -> Panel: + """Create a tip panel with consistent yellow styling.""" + return Panel( + f"[yellow]💡 Tip:[/yellow] {message}", + border_style="yellow", + padding=(0, 1), + width=get_safe_width(console), + ) + + +class InterviewSession: + """Manages interview session lifecycle and health monitoring.""" + + def __init__(self, challenge_manager: ChallengeManager): + self.challenge_manager = challenge_manager + self.start_time: datetime | None = None + self.current_challenge: Challenge | None = None + self.session_active = False + + def start_session(self, challenge: Challenge) -> bool: + """Start interview session with selected challenge.""" + self.current_challenge = challenge + self.start_time = datetime.now() + self.session_active = True + + # Display challenge info in panel + panel_text = Text() + panel_text.append(challenge.name, style="bold") + panel_text.append("\n") + panel_text.append(challenge.description) + + bronze_min = challenge.estimates["Lv1"] + silver_min = challenge.estimates["Lv2"] + gold_min = challenge.estimates["Lv3"] + panel_text.append("\n") + panel_text.append("Time Estimates: ", style="cyan") + panel_text.append("🥉", style="dim") + panel_text.append(str(bronze_min), style="red") + panel_text.append("/", style="dim") + panel_text.append(str(silver_min), style="white") + panel_text.append("/", style="dim") + panel_text.append("🥇", style="dim") + panel_text.append(str(gold_min), style="yellow") + panel_text.append("min", style="dim") + + challenge_panel = Panel( + panel_text, + title="Challenge", + title_align="left", + border_style="cyan", + padding=(0, 1), + width=get_safe_width(console), + ) + 
console.print() + console.print(challenge_panel) + console.print() + + # Show reference to manual + _ = challenge.manual # Ensure manual exists, will raise if missing + manual_filename = Path(challenge.filename).stem + manual_reference = Panel( + f"📖 Reviewer manual: [blue underline]challenges/{manual_filename}.md[/blue underline]", + title_align="left", + border_style="blue", + padding=(0, 1), + width=get_safe_width(console), + ) + console.print(manual_reference) + console.print() + + # Deploy challenge + console.print("📦 [bold]Step 1:[/bold] Deploying challenge environment") + self.challenge_manager.setup_challenge(challenge) + + # Start candidate environment + console.print("🔗 [bold]Step 2:[/bold] Starting candidate session") + self.challenge_manager.start_candidate_environment() + + if not self._setup_candidate_connection(): + console.print("[red]Connection setup failed[/red]") + return False + + # Begin monitoring + console.print("🏥 [bold]Step 3:[/bold] Monitoring active") + self._start_health_monitoring() + + return True + + def _setup_candidate_connection(self) -> bool: + """Setup and display candidate connection information.""" + max_retries = 2 + + for retry in range(max_retries + 1): + if retry > 0: + console.print(f"Retry {retry}/{max_retries}") + self.challenge_manager.start_candidate_environment() + time.sleep(3) + + console.print("Initializing session...") + + for attempt in range(10): # 10 second timeout + is_ready, ssh_url, web_url = self.challenge_manager.get_tmate_info() + + if is_ready and ssh_url: + # Connection info panel + connection_info = f"[bold]SSH:[/bold] [cyan]{ssh_url}[/cyan]" + if web_url: + connection_info += f"\n[bold]Web:[/bold] [cyan]{web_url}[/cyan]" + + instructions = ( + "📋 [bold]Candidate instructions:[/bold]\n" + " 1. Connect using SSH command above\n" + " 2. Explore cluster with kubectl\n" + " 3. Debug and fix the deployment\n" + " 4. 
Verify health endpoint returns HTTP 200" + ) + + panel_content = f"{connection_info}\n\n{instructions}" + panel = create_panel(panel_content, "Session Ready", "blue") + console.print(panel) + return True + + # Check for session failure + if self._check_session_failure(): + break + + if attempt % 3 == 0 and attempt > 0: + console.print(f"Still initializing... ({attempt}s)") + + time.sleep(1) + + if retry < max_retries: + console.print("[yellow]Session failed, retrying...[/yellow]") + else: + console.print("[red]Session initialization failed[/red]") + + return False + + def _check_session_failure(self) -> bool: + """Check if session initialization failed.""" + status_file = Path("/tmp/tmate-info/status") + if status_file.exists(): + status = status_file.read_text().strip() + if status in ["failed", "timeout"]: + console.print(f"[red]Session {status}[/red]") + return True + return False + + def _start_health_monitoring(self): + """Start manual health monitoring with space bar.""" + console.print("Session active. 
[bold]Press ENTER to check health[/bold] (Ctrl+C to exit)") + + try: + while self.session_active: + user_input = input() + if user_input.strip() == "": # Enter pressed + console.print("[dim]Probing health endpoint...[/dim]") + self._check_health() + else: + console.print("[yellow]Press ENTER to check health (Ctrl+C to exit)[/yellow]") + except KeyboardInterrupt: + pass + + def _check_health(self): + """Perform health check and display result.""" + elapsed = self._format_elapsed_time() + is_healthy, status_msg, _ = self.challenge_manager.check_health() + console.print(f"{elapsed} | {status_msg.strip()}") + + if is_healthy: + success_panel = Panel( + "[green]Challenge completed successfully!\n" + "Health endpoint is responding correctly.\n\n" + "[yellow]Press Ctrl+C to exit[/yellow]", + title="Success", + title_align="center", + border_style="green", + padding=(0, 1), + width=get_safe_width(console), + ) + console.print(success_panel) + else: + console.print("[bold]Press ENTER to check again[/bold] (Ctrl+C to exit)") + + def _format_elapsed_time(self) -> str: + """Format session elapsed time.""" + assert self.start_time is not None, "start_time must be set when session is active" + elapsed = datetime.now() - self.start_time + total_seconds = int(elapsed.total_seconds()) + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + seconds = total_seconds % 60 + + return f"{hours:02d}:{minutes:02d}:{seconds:02d}".strip() + + def stop_session(self): + """Clean up session and exit.""" + if not self.session_active: + return + + console.print("\nEnding session...") + self.session_active = False + + # Clean up resources + self.challenge_manager.cleanup() + + elapsed = self._format_elapsed_time() + console.print(f"Session duration: [bold]{elapsed}[/bold]") + console.print("Environment cleaned up") + + self.current_challenge = None + self.start_time = None + + +class InterviewCLI: + """Main CLI application.""" + + def __init__(self): + self.challenge_manager = 
ChallengeManager() + self.session = InterviewSession(self.challenge_manager) + + # Setup cleanup handlers + signal.signal(signal.SIGINT, self._handle_exit) + signal.signal(signal.SIGTERM, self._handle_exit) + atexit.register(self._cleanup) + + def _handle_exit(self, signum, frame): + """Handle exit signals gracefully.""" + if self.session.session_active: + self.session.stop_session() + + console.print("\n👋 [bold]Goodbye[/bold]") + self._cleanup() + sys.exit(0) + + def _cleanup(self): + """Perform final cleanup.""" + try: + self.challenge_manager._force_cleanup_containers() + except Exception: + pass + + def run(self): + """Run the CLI application.""" + self._show_banner() + + # Load challenges + self.challenge_manager.discover_challenges() + + if not self.challenge_manager.challenges: + console.print("[red]No challenges found[/red]") + return + + # Select challenge + if not (challenge := self._select_challenge()): + return + + self._run_challenge(challenge) + + def run_specific_challenge(self, challenge_id: str) -> None: + """Run a specific challenge by its filename (e.g., 'database-typo').""" + self._show_banner() + + # Load challenges + self.challenge_manager.discover_challenges() + + if not self.challenge_manager.challenges: + console.print("[red]No challenges found[/red]") + console.print("Add .yaml files to the challenges/ directory") + return + + # Find challenge by filename prefix + if not (challenge := self._find_challenge(challenge_id)): + console.print(f"[red]Challenge '{challenge_id}' not found[/red]") + self._list_available_challenges() + return + + console.print(f"[bold]Running challenge:[/bold] {challenge.name}") + console.print() + + self._run_challenge(challenge) + + def _find_challenge(self, challenge_id: str) -> Challenge | None: + """Find challenge by matching its filename.""" + for challenge in self.challenge_manager.challenges: + if Path(challenge.filename).stem == challenge_id: + return challenge + return None + + def 
_list_available_challenges(self): + """List available challenges for error messages.""" + console.print("\n[bold]Available challenges:[/bold]") + for challenge in self.challenge_manager.challenges: + console.print(f" {Path(challenge.filename).stem}: {challenge.name}") + console.print() + + def _show_banner(self): + """Display application banner.""" + banner_content = Text() + banner_content.append("Flagsmith Infrastructure Challenger", style="bold blue") + banner_content.append("\n") + banner_content.append("Production-grade Kubernetes troubleshooting scenarios", style="dim") + + banner = Panel( + banner_content, + border_style="blue", + padding=(0, 1), + width=get_safe_width(console), + ) + console.print() + console.print(banner) + console.print() + + def _select_challenge(self) -> Challenge | None: + """Display challenges and get user selection.""" + console.print(f"[bold]Available challenges[/bold] ({len(self.challenge_manager.challenges)} found)") + console.print() + + # List challenges + from textwrap import fill + + for i, challenge in enumerate(self.challenge_manager.challenges, 1): + # Challenge title + title = Text() + title.append(f" {i}. 
", style="bold") + title.append(challenge.name, style="cyan") + # Show time estimates + bronze_min = challenge.estimates["Lv1"] + silver_min = challenge.estimates["Lv2"] + gold_min = challenge.estimates["Lv3"] + title.append(" (", style="dim") + title.append(str(bronze_min), style="red") + title.append("/", style="dim") + title.append(str(silver_min), style="white") + title.append("/", style="dim") + title.append(str(gold_min), style="yellow") + title.append("min)", style="dim") + console.print(title) + + # Description with proper wrapping + wrapped_desc = fill( + challenge.description, + width=70, + initial_indent=" ", + subsequent_indent=" ", + ) + console.print(wrapped_desc, style="dim") + console.print() + + # Exit tip + tip = create_tip_panel("Press [bold]Ctrl+C[/bold] anytime to exit") + console.print(tip) + console.print() + + # Get selection + while True: + try: + choice = Prompt.ask("Select challenge number") + except (EOFError, KeyboardInterrupt): + return None + + if not choice.isdigit(): + console.print("[red]Enter a number[/red]") + continue + + num = int(choice) + if not (1 <= num <= len(self.challenge_manager.challenges)): + console.print(f"[red]Enter 1-{len(self.challenge_manager.challenges)}[/red]") + continue + + return self.challenge_manager.get_challenge(num - 1) + + + + def _run_challenge(self, challenge: Challenge) -> None: + """Execute selected challenge.""" + # Run session + if self.session.start_session(challenge): + # Wait for completion or interruption + while self.session.session_active: + try: + time.sleep(1) + except KeyboardInterrupt: + break + + +def setup_kubeconfig() -> None: + """Simple kubeconfig setup - k3s is already healthy via docker-compose.""" + import shutil + + kube_dir = Path("/root/.kube") + kube_dir.mkdir(exist_ok=True, parents=True) + + # K3s writes to this location via K3S_KUBECONFIG_OUTPUT + source = Path("/root/.kube/kubeconfig.yaml") # from kubeconfig-data volume + target = Path("/root/.kube/config") + + if not 
source.exists(): + # Fallback to direct k3s location + source = Path("/etc/rancher/k3s/k3s.yaml") + + if not source.exists(): + raise FileNotFoundError("Kubeconfig not found in expected locations") + + shutil.copy2(source, target) + + # Fix server URL for container networking + content = target.read_text() + content = content.replace('127.0.0.1:6443', 'k3s-server:6443') + content = content.replace('localhost:6443', 'k3s-server:6443') + target.write_text(content) + + +def main() -> None: + """Main entry point.""" + parser = argparse.ArgumentParser(description='Flagsmith Infrastructure Challenger') + parser.add_argument( + '--challenges-dir', + default='/app/challenges', + help='Challenges directory path', + ) + parser.add_argument( + '--challenge', + help='Run specific challenge by name (e.g., "database-typo")', + ) + + args = parser.parse_args() + + # Setup kubeconfig (k3s is already healthy via docker-compose) + try: + setup_kubeconfig() + except Exception as e: + console.print(f"[red]Failed to setup kubeconfig: {e}[/red]") + sys.exit(1) + + # Run application with top-level exception handler + try: + cli = InterviewCLI() + cli.challenge_manager.challenges_dir = Path(args.challenges_dir) + + if args.challenge: + cli.run_specific_challenge(args.challenge) + else: + cli.run() + except KeyboardInterrupt: + console.print("\n[yellow]Interrupted by user[/yellow]") + sys.exit(0) + except Exception as e: + handle_error(e) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/kubernetes/docker-compose.yml b/kubernetes/docker-compose.yml new file mode 100644 index 0000000..ffcfcd2 --- /dev/null +++ b/kubernetes/docker-compose.yml @@ -0,0 +1,90 @@ +name: k8s_challenger + +services: + # K3s Kubernetes cluster + k3s: + image: rancher/k3s:v1.34.1-k3s1 + command: >- + server + --disable=traefik + --disable=servicelb + --write-kubeconfig-mode=644 + --node-name=k3s-server + privileged: true + restart: unless-stopped + environment: + K3S_KUBECONFIG_OUTPUT: 
/output/kubeconfig.yaml + K3S_KUBECONFIG_MODE: "666" + volumes: + - k3s-server:/var/lib/rancher/k3s + - kubeconfig-data:/output + ports: + - "6443:6443" + networks: + flagsmith-network: + aliases: + - k3s-server + healthcheck: + test: ["CMD", "kubectl", "get", "nodes"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 30s + + cli: + build: + context: . + dockerfile: cli/Dockerfile + volumes: + - ./challenges:/app/challenges:ro + - /var/run/docker.sock:/var/run/docker.sock + - k3s-server:/etc/rancher/k3s:ro + - kubeconfig-data:/root/.kube + - tmate-info:/tmp/tmate-info + environment: + KUBECONFIG: /root/.kube/config + CHALLENGES_DIR: /app/challenges + depends_on: + k3s: + condition: service_healthy + networks: + - flagsmith-network + stdin_open: true + tty: true + restart: "no" + command: ["python", "-m", "cli.main"] + + # Candidate debugging environment with tmate + candidate-env: + build: + context: ./candidate-env + dockerfile: Dockerfile + volumes: + - k3s-server:/etc/rancher/k3s:ro + - kubeconfig-data:/home/candidate/.kube-shared:ro + - tmate-info:/tmp/tmate-info + environment: + KUBECONFIG: /home/candidate/.kube/config + depends_on: + k3s: + condition: service_healthy + networks: + - flagsmith-network + ports: + - "2222:22" # SSH port for tmate + restart: "no" + +volumes: + k3s-server: + driver: local + kubeconfig-data: + driver: local + tmate-info: + driver: local + +networks: + flagsmith-network: + driver: bridge + ipam: + config: + - subnet: 172.20.0.0/16 diff --git a/kubernetes/docs/cli-challenge-complete.png b/kubernetes/docs/cli-challenge-complete.png new file mode 100644 index 0000000..c45c3ca Binary files /dev/null and b/kubernetes/docs/cli-challenge-complete.png differ diff --git a/kubernetes/docs/cli-challenge-selector.png b/kubernetes/docs/cli-challenge-selector.png new file mode 100644 index 0000000..31b5ada Binary files /dev/null and b/kubernetes/docs/cli-challenge-selector.png differ diff --git a/kubernetes/docs/cli-session-started.png 
b/kubernetes/docs/cli-session-started.png new file mode 100644 index 0000000..d11ad03 Binary files /dev/null and b/kubernetes/docs/cli-session-started.png differ diff --git a/kubernetes/pyproject.toml b/kubernetes/pyproject.toml new file mode 100644 index 0000000..f1f85be --- /dev/null +++ b/kubernetes/pyproject.toml @@ -0,0 +1,56 @@ +[project] +name = "flagsmith-interviews" +version = "0.1.0" +description = "Technical interview challenge system for Flagsmith infrastructure debugging" +requires-python = ">=3.14" +dependencies = [ + "requests>=2.28.0", + "pyyaml>=6.0", + "rich>=13.0.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["cli"] + +[project.optional-dependencies] +dev = [ + "ruff>=0.6.0", + "ty>=0.0.1a25", + "types-requests>=2.32.0", + "types-PyYAML>=6.0.0", +] + +[project.scripts] +flagsmith-challenger = "cli.main:main" + +[tool.ruff] +line-length = 88 +target-version = "py314" + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # Pyflakes + "I", # isort + "COM", # flake8-commas (trailing commas) + "UP", # pyupgrade + "B", # flake8-bugbear + "A", # flake8-builtins + "C4", # flake8-comprehensions + "RUF", # Ruff-specific rules +] +ignore = [ + "E501", # line too long (handled by line-length) +] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401"] + +[tool.ruff.lint.isort] +force-single-line = true +known-first-party = ["cli"] \ No newline at end of file diff --git a/readme.md b/readme.md index 013fad9..c6d3dde 100644 --- a/readme.md +++ b/readme.md @@ -3,6 +3,21 @@ Repository containing technical tests for applicants to work at Flagsmith. We currently have the following tests available: 1. [Server-side/Python/Django](/python-django/) +2. 
[Infrastructure/Kubernetes/DevOps](/kubernetes/) - Interactive debugging challenges for infrastructure roles

## Kubernetes Interview System

The `kubernetes/` directory contains a self-contained technical interview system for infrastructure and DevOps roles. It creates broken Flagsmith deployments that candidates must debug in real time.

**Quick start:**

```bash
cd kubernetes/
make run
```

This launches an interactive system where you can select debugging challenges and share a terminal session with candidates. The system provides a complete Kubernetes environment with Flagsmith deployed in various broken states.

For detailed setup instructions, available challenges, and architecture information, see [kubernetes/README.md](kubernetes/README.md).

## Instructions