Skip to content

Commit 5aebda9

Browse files
TytaniumDevclaude
andauthored
chore: add structured logging to API and worker (#81)
* chore: add structured logging to API and worker (#10) Add lightweight `createLogger(component)` utility for both API and worker packages. Format: `[Component] message {context}` with no external dependencies — just formatted console output. Adopted in highest-traffic files: - api/lib/job-store-factory.ts (startup, recovery, cleanup) - api/app/api/jobs/[id]/simulations/[simId]/route.ts (sim PATCH) - worker/src/worker.ts (startup, heartbeat) - worker/src/docker-runner.ts (container lifecycle, capacity) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: add jobId context to cleanup log calls Addresses Gemini review feedback to include jobId in all fire-and-forget cleanup log contexts within aggregateJobResults. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent cfcda7b commit 5aebda9

6 files changed

Lines changed: 124 additions & 44 deletions

File tree

api/app/api/jobs/[id]/simulations/[simId]/route.ts

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,11 @@ import { updateJobProgress, updateSimProgress } from '@/lib/rtdb';
55
import { GAMES_PER_CONTAINER, type SimulationState } from '@/lib/types';
66
import { canSimTransition, isTerminalSimState } from '@shared/types/state-machine';
77
import * as Sentry from '@sentry/nextjs';
8+
import { createLogger } from '@/lib/logger';
89
import { errorResponse, badRequestResponse } from '@/lib/api-response';
910

11+
const log = createLogger('SimPatch');
12+
1013
interface RouteParams {
1114
params: Promise<{ id: string; simId: string }>;
1215
}
@@ -93,7 +96,7 @@ export async function PATCH(request: NextRequest, { params }: RouteParams) {
9396
}
9497

9598
// Fire-and-forget RTDB write for simulation progress
96-
updateSimProgress(id, simId, update).catch(err => console.warn('[RTDB] updateSimProgress failed:', err instanceof Error ? err.message : err));
99+
updateSimProgress(id, simId, update).catch(err => log.warn('RTDB updateSimProgress failed', { jobId: id, simId, error: err instanceof Error ? err.message : err }));
97100

98101
// Auto-detect job lifecycle transitions
99102
if (state === 'RUNNING') {
@@ -104,7 +107,7 @@ export async function PATCH(request: NextRequest, { params }: RouteParams) {
104107
status: 'RUNNING',
105108
startedAt: new Date().toISOString(),
106109
workerName: workerName ?? null,
107-
}).catch(err => console.warn('[RTDB] job RUNNING transition failed:', err instanceof Error ? err.message : err));
110+
}).catch(err => log.warn('RTDB job RUNNING transition failed', { jobId: id, error: err instanceof Error ? err.message : err }));
108111
}
109112
}
110113

@@ -116,7 +119,7 @@ export async function PATCH(request: NextRequest, { params }: RouteParams) {
116119
// Fire-and-forget RTDB progress update
117120
// Estimate gamesCompleted from completedCount (not exact for CANCELLED, but close enough for UI)
118121
const gamesCompleted = completedSimCount * GAMES_PER_CONTAINER;
119-
updateJobProgress(id, { completedCount: completedSimCount, gamesCompleted }).catch(err => console.warn('[RTDB] progress count update failed:', err instanceof Error ? err.message : err));
122+
updateJobProgress(id, { completedCount: completedSimCount, gamesCompleted }).catch(err => log.warn('RTDB progress count update failed', { jobId: id, error: err instanceof Error ? err.message : err }));
120123

121124
if (completedSimCount >= totalSimCount && totalSimCount > 0) {
122125
// Set flag before fire-and-forget aggregation
@@ -125,18 +128,18 @@ export async function PATCH(request: NextRequest, { params }: RouteParams) {
125128
updateJobProgress(id, {
126129
status: 'COMPLETED',
127130
completedAt: new Date().toISOString(),
128-
}).catch(err => console.warn('[RTDB] job COMPLETED status update failed:', err instanceof Error ? err.message : err));
131+
}).catch(err => log.warn('RTDB job COMPLETED status update failed', { jobId: id, error: err instanceof Error ? err.message : err }));
129132

130133
jobStore.aggregateJobResults(id).catch(err => {
131-
console.error(`[Aggregation] Failed for job ${id}:`, err);
134+
log.error('Aggregation failed', { jobId: id, error: err instanceof Error ? err.message : String(err) });
132135
Sentry.captureException(err, { tags: { component: 'sim-aggregation', jobId: id } });
133136
});
134137
}
135138
}
136139

137140
return NextResponse.json({ updated: true });
138141
} catch (error) {
139-
console.error('PATCH /api/jobs/[id]/simulations/[simId] error:', error);
142+
log.error('PATCH error', { error: error instanceof Error ? error.message : String(error) });
140143
return errorResponse(error instanceof Error ? error.message : 'Failed to update simulation', 500);
141144
}
142145
}

api/lib/job-store-factory.ts

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,20 @@ import * as workerStore from './worker-store-factory';
1010
import { deleteJobProgress } from './rtdb';
1111
import { cancelRecoveryCheck } from './cloud-tasks';
1212
import * as Sentry from '@sentry/nextjs';
13+
import { createLogger } from './logger';
14+
15+
const log = createLogger('JobStore');
16+
const recoveryLog = createLogger('Recovery');
1317

1418
const USE_FIRESTORE = typeof process.env.GOOGLE_CLOUD_PROJECT === 'string' && process.env.GOOGLE_CLOUD_PROJECT.length > 0;
1519

1620
// Log mode detection at startup
17-
console.log(`[Job Store] Running in ${USE_FIRESTORE ? 'GCP' : 'LOCAL'} mode`);
21+
log.info('Running in mode', { mode: USE_FIRESTORE ? 'GCP' : 'LOCAL' });
1822
if (USE_FIRESTORE) {
19-
console.log(`[Job Store] Project: ${process.env.GOOGLE_CLOUD_PROJECT}`);
20-
console.log(`[Job Store] Using: Firestore + Cloud Storage + Pub/Sub`);
23+
log.info('Project', { project: process.env.GOOGLE_CLOUD_PROJECT });
24+
log.info('Using Firestore + Cloud Storage + Pub/Sub');
2125
} else {
22-
console.log(`[Job Store] Using: SQLite + local filesystem`);
26+
log.info('Using SQLite + local filesystem');
2327
}
2428

2529
export function isGcpMode(): boolean {
@@ -312,7 +316,7 @@ async function recoverStaleSimulations(
312316
const jobStartedMs = job.startedAt ? job.startedAt.getTime() : job.createdAt.getTime();
313317
const pendingForMs = now - jobStartedMs;
314318
if (pendingForMs > STALE_PENDING_THRESHOLD_MS) {
315-
console.log(`[Recovery] Job ${jobId} sim ${sim.simId} stuck PENDING for ${Math.round(pendingForMs / 1000)}s, republishing`);
319+
recoveryLog.info('Sim stuck PENDING, republishing', { jobId, simId: sim.simId, pendingSec: Math.round(pendingForMs / 1000) });
316320
simsToRepublish.push(sim);
317321
recovered = true;
318322
}
@@ -322,7 +326,7 @@ async function recoverStaleSimulations(
322326
if (sim.state === 'RUNNING' && sim.startedAt) {
323327
const runningForMs = now - new Date(sim.startedAt).getTime();
324328
if (runningForMs > STALE_RUNNING_THRESHOLD_MS) {
325-
console.log(`[Recovery] Job ${jobId} sim ${sim.simId} stuck RUNNING for ${Math.round(runningForMs / 60000)}min, marking FAILED for retry`);
329+
recoveryLog.info('Sim stuck RUNNING, marking FAILED for retry', { jobId, simId: sim.simId, runningMin: Math.round(runningForMs / 60000) });
326330
const updated = await conditionalUpdateSimulationStatus(jobId, sim.simId, ['RUNNING'], {
327331
state: 'FAILED',
328332
errorMessage: `Simulation timed out after ${Math.round(runningForMs / 60000)} minutes`,
@@ -337,7 +341,7 @@ async function recoverStaleSimulations(
337341

338342
// Case 3: RUNNING sim whose specific worker is dead
339343
if (sim.state === 'RUNNING' && sim.workerId && !activeWorkerIds.has(sim.workerId)) {
340-
console.log(`[Recovery] Job ${jobId} sim ${sim.simId} worker ${sim.workerId} is dead, marking FAILED for retry`);
344+
recoveryLog.info('Sim worker is dead, marking FAILED for retry', { jobId, simId: sim.simId, workerId: sim.workerId });
341345
const updated = await conditionalUpdateSimulationStatus(jobId, sim.simId, ['RUNNING'], {
342346
state: 'FAILED',
343347
errorMessage: 'Worker lost connection',
@@ -351,7 +355,7 @@ async function recoverStaleSimulations(
351355

352356
// Case 4: FAILED sim — retry by resetting to PENDING + republish
353357
if (sim.state === 'FAILED' && activeWorkers.length > 0) {
354-
console.log(`[Recovery] Job ${jobId} sim ${sim.simId} is FAILED, retrying`);
358+
recoveryLog.info('Sim is FAILED, retrying', { jobId, simId: sim.simId });
355359
simsToRepublish.push(sim);
356360
recovered = true;
357361
}
@@ -377,9 +381,9 @@ async function recoverStaleSimulations(
377381
});
378382
});
379383
await Promise.all(promises);
380-
console.log(`[Recovery] Republished ${simsToRepublish.length} simulation messages for job ${jobId}`);
384+
recoveryLog.info('Republished simulation messages', { jobId, count: simsToRepublish.length });
381385
} catch (err) {
382-
console.warn(`[Recovery] Failed to republish sims for job ${jobId}:`, err);
386+
recoveryLog.warn('Failed to republish sims', { jobId, error: err instanceof Error ? err.message : String(err) });
383387
}
384388
}
385389

@@ -394,7 +398,7 @@ async function recoverStaleSimulations(
394398
const needsRetrigger = job.status === 'RUNNING' || job.needsAggregation === true;
395399
if (needsRetrigger) {
396400
aggregateJobResults(jobId).catch((err) => {
397-
console.error(`[Recovery] Aggregation failed for job ${jobId}:`, err);
401+
recoveryLog.error('Aggregation failed', { jobId, error: err instanceof Error ? err.message : String(err) });
398402
Sentry.captureException(err, { tags: { component: 'recovery-aggregation', jobId } });
399403
});
400404
}
@@ -427,7 +431,7 @@ async function recoverStaleQueuedJob(jobId: string, job: Job): Promise<boolean>
427431
const activeWorkers = await workerStore.getActiveWorkers();
428432
if (activeWorkers.length === 0) return false;
429433

430-
console.log(`[Recovery] Job ${jobId} stuck in QUEUED for ${Math.round(queuedForMs / 1000)}s, re-publishing to Pub/Sub`);
434+
recoveryLog.info('Job stuck in QUEUED, re-publishing to Pub/Sub', { jobId, queuedSec: Math.round(queuedForMs / 1000) });
431435
requeueCooldowns.set(jobId, Date.now());
432436

433437
try {
@@ -456,12 +460,12 @@ async function recoverStaleQueuedJob(jobId: string, job: Job): Promise<boolean>
456460
return topic.publishMessage({ json: msg });
457461
});
458462
await Promise.all(promises);
459-
console.log(`[Recovery] Re-published ${pendingSims.length} pending simulation messages for job ${jobId}`);
463+
recoveryLog.info('Re-published pending simulation messages', { jobId, count: pendingSims.length });
460464
}
461465
}
462466
return true;
463467
} catch (err) {
464-
console.warn(`[Recovery] Failed to re-publish queued job ${jobId}:`, err);
468+
recoveryLog.warn('Failed to re-publish queued job', { jobId, error: err instanceof Error ? err.message : String(err) });
465469
return false;
466470
}
467471
}
@@ -563,22 +567,22 @@ export async function aggregateJobResults(jobId: string): Promise<void> {
563567
if (Array.isArray(job.deckIds) && job.deckIds.length === 4 && structuredData?.games?.length) {
564568
const { processJobForRatings } = await import('./trueskill-service');
565569
processJobForRatings(jobId, job.deckIds, structuredData.games).catch((err) => {
566-
console.error(`[TrueSkill] Rating update failed for job ${jobId} (non-fatal):`, err);
570+
log.error('TrueSkill rating update failed (non-fatal)', { jobId, error: err instanceof Error ? err.message : String(err) });
567571
Sentry.captureException(err, { tags: { component: 'trueskill', jobId } });
568572
});
569573
}
570574

571575
// Don't overwrite CANCELLED status — logs are ingested above, but status stays CANCELLED
572576
if (job.status === 'CANCELLED') {
573577
await setNeedsAggregation(jobId, false);
574-
deleteJobProgress(jobId).catch(err => console.warn('[Cleanup] fire-and-forget failed:', err instanceof Error ? err.message : err));
578+
deleteJobProgress(jobId).catch(err => log.warn('Cleanup fire-and-forget failed', { jobId, error: err instanceof Error ? err.message : err }));
575579
return;
576580
}
577581

578582
const allCancelled = sims.every(s => s.state === 'CANCELLED');
579583
if (allCancelled) {
580584
await setNeedsAggregation(jobId, false);
581-
deleteJobProgress(jobId).catch(err => console.warn('[Cleanup] fire-and-forget failed:', err instanceof Error ? err.message : err));
585+
deleteJobProgress(jobId).catch(err => log.warn('Cleanup fire-and-forget failed', { jobId, error: err instanceof Error ? err.message : err }));
582586
return; // Already handled by cancel flow
583587
}
584588

@@ -588,7 +592,7 @@ export async function aggregateJobResults(jobId: string): Promise<void> {
588592
await setNeedsAggregation(jobId, false);
589593

590594
// Clean up RTDB ephemeral data and cancel recovery task
591-
cancelRecoveryCheck(jobId).catch(err => console.warn('[Cleanup] fire-and-forget failed:', err instanceof Error ? err.message : err));
592-
deleteJobProgress(jobId).catch(err => console.warn('[Cleanup] fire-and-forget failed:', err instanceof Error ? err.message : err));
595+
cancelRecoveryCheck(jobId).catch(err => log.warn('Cleanup fire-and-forget failed', { jobId, error: err instanceof Error ? err.message : err }));
596+
deleteJobProgress(jobId).catch(err => log.warn('Cleanup fire-and-forget failed', { jobId, error: err instanceof Error ? err.message : err }));
593597
}
594598

api/lib/logger.ts

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/**
2+
* Lightweight structured logger.
3+
* Format: [Component] message {context}
4+
* No external dependencies — wraps console methods with consistent formatting.
5+
*/
6+
7+
type LogContext = Record<string, unknown>;
8+
9+
function formatContext(ctx?: LogContext): string {
10+
if (!ctx || Object.keys(ctx).length === 0) return '';
11+
return ' ' + JSON.stringify(ctx);
12+
}
13+
14+
function formatMessage(component: string, message: string, ctx?: LogContext): string {
15+
return `[${component}] ${message}${formatContext(ctx)}`;
16+
}
17+
18+
export interface Logger {
19+
info(message: string, ctx?: LogContext): void;
20+
warn(message: string, ctx?: LogContext): void;
21+
error(message: string, ctx?: LogContext): void;
22+
}
23+
24+
export function createLogger(component: string): Logger {
25+
return {
26+
info(message: string, ctx?: LogContext) {
27+
console.log(formatMessage(component, message, ctx));
28+
},
29+
warn(message: string, ctx?: LogContext) {
30+
console.warn(formatMessage(component, message, ctx));
31+
},
32+
error(message: string, ctx?: LogContext) {
33+
console.error(formatMessage(component, message, ctx));
34+
},
35+
};
36+
}

worker/src/docker-runner.ts

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ import * as os from 'os';
99
import { execFile } from 'child_process';
1010
import { spawn } from 'child_process';
1111
import { GAMES_PER_CONTAINER } from './constants.js';
12+
import { createLogger } from './logger.js';
13+
14+
const log = createLogger('DockerRunner');
1215

1316
// ============================================================================
1417
// Types
@@ -63,11 +66,11 @@ export async function cleanupOrphanedContainers(): Promise<void> {
6366
'ps', '-a', '--filter', 'name=sim-', '--format', '{{.Names}}',
6467
]);
6568
if (!output) {
66-
console.log('Startup cleanup: no orphaned sim containers found');
69+
log.info('Startup cleanup: no orphaned sim containers found');
6770
return;
6871
}
6972
const names = output.split('\n').filter(Boolean);
70-
console.log(`Startup cleanup: removing ${names.length} orphaned sim container(s)...`);
73+
log.info('Startup cleanup: removing orphaned sim containers', { count: names.length });
7174
for (const name of names) {
7275
try {
7376
await execDockerCommand(['rm', '-f', name]);
@@ -88,13 +91,13 @@ export async function cleanupOrphanedContainers(): Promise<void> {
8891
export async function pruneDockerResources(): Promise<void> {
8992
try {
9093
const containerOutput = await execDockerCommand(['container', 'prune', '-f']);
91-
console.log('Docker container prune:', containerOutput || '(nothing to prune)');
94+
log.info('Docker container prune', { result: containerOutput || '(nothing to prune)' });
9295
} catch (err) {
9396
console.warn('Docker container prune failed:', err instanceof Error ? err.message : err);
9497
}
9598
try {
9699
const imageOutput = await execDockerCommand(['image', 'prune', '-f']);
97-
console.log('Docker image prune:', imageOutput || '(nothing to prune)');
100+
log.info('Docker image prune', { result: imageOutput || '(nothing to prune)' });
98101
} catch (err) {
99102
console.warn('Docker image prune failed:', err instanceof Error ? err.message : err);
100103
}
@@ -184,7 +187,7 @@ export async function runSimulationContainer(
184187
let timedOut = false;
185188
const timeout = setTimeout(() => {
186189
timedOut = true;
187-
console.error(`[${simId}] Container timed out after ${CONTAINER_TIMEOUT_MS / 1000}s, killing...`);
190+
log.error('Container timed out', { simId, timeoutMs: CONTAINER_TIMEOUT_MS });
188191
// Kill the docker run process
189192
proc.kill('SIGTERM');
190193
// Also force-remove the container in case SIGTERM doesn't propagate
@@ -261,10 +264,7 @@ export function calculateLocalCapacity(): number {
261264

262265
const capacity = Math.max(1, Math.min(memSlots, cpuSlots, MAX_CONCURRENT_SIMS));
263266

264-
console.log(
265-
`Local capacity: ${totalMemMB}MB RAM, ${cpuCount} CPUs → ` +
266-
`memSlots=${memSlots}, cpuSlots=${cpuSlots}, cap=${MAX_CONCURRENT_SIMS}, using=${capacity}`,
267-
);
267+
log.info('Local capacity', { totalMemMB, cpuCount, memSlots, cpuSlots, cap: MAX_CONCURRENT_SIMS, using: capacity });
268268

269269
return capacity;
270270
}

worker/src/logger.ts

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/**
2+
* Lightweight structured logger for the worker.
3+
* Format: [Component] message {context}
4+
*/
5+
6+
type LogContext = Record<string, unknown>;
7+
8+
function formatContext(ctx?: LogContext): string {
9+
if (!ctx || Object.keys(ctx).length === 0) return '';
10+
return ' ' + JSON.stringify(ctx);
11+
}
12+
13+
function formatMessage(component: string, message: string, ctx?: LogContext): string {
14+
return `[${component}] ${message}${formatContext(ctx)}`;
15+
}
16+
17+
export interface Logger {
18+
info(message: string, ctx?: LogContext): void;
19+
warn(message: string, ctx?: LogContext): void;
20+
error(message: string, ctx?: LogContext): void;
21+
}
22+
23+
export function createLogger(component: string): Logger {
24+
return {
25+
info(message: string, ctx?: LogContext) {
26+
console.log(formatMessage(component, message, ctx));
27+
},
28+
warn(message: string, ctx?: LogContext) {
29+
console.warn(formatMessage(component, message, ctx));
30+
},
31+
error(message: string, ctx?: LogContext) {
32+
console.error(formatMessage(component, message, ctx));
33+
},
34+
};
35+
}

worker/src/worker.ts

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ import {
4949
} from './condenser.js';
5050
import { startWorkerApi, stopWorkerApi } from './worker-api.js';
5151
import { GAMES_PER_CONTAINER } from './constants.js';
52+
import { createLogger } from './logger.js';
53+
54+
const log = createLogger('Worker');
5255

5356

5457
const SECRET_NAME = 'simulation-worker-config';
@@ -557,7 +560,7 @@ async function sendHeartbeat(status?: 'idle' | 'busy' | 'updating', timeoutMs?:
557560
signal: AbortSignal.timeout(timeoutMs ?? API_TIMEOUT_MS),
558561
});
559562
if (!res.ok) {
560-
console.warn(`Heartbeat failed: HTTP ${res.status} ${res.statusText}`);
563+
log.warn('Heartbeat failed', { status: res.status, statusText: res.statusText });
561564
return;
562565
}
563566

@@ -567,7 +570,7 @@ async function sendHeartbeat(status?: 'idle' | 'busy' | 'updating', timeoutMs?:
567570
applyOverride(data.maxConcurrentOverride ?? null);
568571
}
569572
} catch (err) {
570-
console.warn('Heartbeat error:', err instanceof Error ? err.message : err);
573+
log.warn('Heartbeat error', { error: err instanceof Error ? err.message : err });
571574
}
572575
}
573576

@@ -928,14 +931,13 @@ async function main(): Promise<void> {
928931

929932
currentWorkerName = getWorkerName();
930933
currentWorkerId = getWorkerId();
931-
console.log('Worker ID:', currentWorkerId);
932-
console.log('Worker Name:', currentWorkerName);
934+
log.info('Worker identity', { workerId: currentWorkerId, workerName: currentWorkerName });
933935

934-
console.log('Worker starting...');
935-
console.log('Mode: Per-Simulation (docker run --rm)');
936-
console.log('Transport:', usePubSub ? 'Pub/Sub' : 'Polling');
937-
console.log('API URL:', getApiUrl());
938-
console.log('Simulation image:', SIMULATION_IMAGE);
936+
log.info('Worker starting');
937+
log.info('Mode', { mode: 'Per-Simulation (docker run --rm)' });
938+
log.info('Transport', { transport: usePubSub ? 'Pub/Sub' : 'Polling' });
939+
log.info('API URL', { url: getApiUrl() });
940+
log.info('Simulation image', { image: SIMULATION_IMAGE });
939941

940942
// Verify Docker is accessible
941943
await verifyDockerAvailable();
@@ -1004,6 +1006,6 @@ async function main(): Promise<void> {
10041006
}
10051007

10061008
main().catch((error) => {
1007-
console.error('Worker failed to start:', error);
1009+
log.error('Failed to start', { error: error instanceof Error ? error.message : String(error) });
10081010
process.exit(1);
10091011
});

0 commit comments

Comments
 (0)