diff --git a/services/kiloclaw/src/config.ts b/services/kiloclaw/src/config.ts index 3c775e76bc..2a39a5e033 100644 --- a/services/kiloclaw/src/config.ts +++ b/services/kiloclaw/src/config.ts @@ -85,8 +85,17 @@ export const RESTARTING_TIMEOUT_MS = 5 * 60 * 1000; // 5 min export const RESTARTING_MAX_TIMEOUT_MS = 15 * 60 * 1000; // 15 min /** Maximum time to stay in 'recovering' before surfacing a timeout */ export const RECOVERING_TIMEOUT_MS = 10 * 60 * 1000; // 10 min -/** Destroying: retry pending deletes quickly */ +/** Destroying: initial retry interval for pending deletes */ export const ALARM_INTERVAL_DESTROYING_MS = 60 * 1000; // 1 min +/** Volume deletion retry tiers; the last tier repeats until the retry cap. */ +export const DESTROY_VOLUME_RETRY_DELAYS_MS = [ + 60 * 1000, + 5 * 60 * 1000, + 15 * 60 * 1000, + 60 * 60 * 1000, + 6 * 60 * 60 * 1000, + 24 * 60 * 60 * 1000, +] as const; /** Pending destroy age before emitting stuck-destroy telemetry */ export const DESTROY_STUCK_THRESHOLD_MS = 15 * 60 * 1000; // 15 min /** Minimum interval between repeated stuck-destroy telemetry events */ diff --git a/services/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts b/services/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts index 0d5135b67a..56dc031f8c 100644 --- a/services/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts +++ b/services/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts @@ -145,6 +145,7 @@ vi.mock('../utils/encryption', async () => { import { KiloClawInstance } from './kiloclaw-instance'; import { buildChannelConfigPatch } from './kiloclaw-instance/channel-config'; +import { destroyRetryDelay } from './kiloclaw-instance/log'; import * as flyClient from '../fly/client'; import { FlyApiError } from '../fly/client'; import * as db from '../db'; @@ -1394,7 +1395,17 @@ describe('destroy error tracking', () => { }); }); -describe('destroy volume: max-retry abandon', () => { +describe('destroy volume: retry backoff and abandon', () => { + it('uses tiered proportional jitter and caps at the daily tier', () => { + expect(destroyRetryDelay(1, 0.5)).toBe(60 * 1000); + expect(destroyRetryDelay(2, 0.5)).toBe(5 * 60 * 1000); + expect(destroyRetryDelay(3, 0.5)).toBe(15 * 60 * 1000); + expect(destroyRetryDelay(4, 0.5)).toBe(60 * 60 * 1000); + expect(destroyRetryDelay(5, 0.5)).toBe(6 * 60 * 60 * 1000); + expect(destroyRetryDelay(6, 0.5)).toBe(24 * 60 * 60 * 1000); + expect(destroyRetryDelay(100, 0)).toBe(12 * 60 * 60 * 1000); + expect(destroyRetryDelay(100, 1)).toBe(36 * 60 * 60 * 1000); + }); // vi.clearAllMocks() in the global beforeEach clears call history but not // implementations. Without this reset, a previous test in the file that // used `.mockResolvedValue([volumes...])` on listVolumes would leak its @@ -1442,6 +1453,34 @@ describe('destroy volume: max-retry abandon', () => { expect(storage._store.get('pendingDestroyVolumeId')).toBe('vol-1'); }); + it('emits retry escalation telemetry before reaching the cap', async () => { + const env = createFakeEnv(); + const { storage } = createInstance(createFakeStorage(), env); + await seedProvisioned(storage, { + status: 'destroying', + flyMachineId: null, + flyVolumeId: 'vol-1', + pendingDestroyMachineId: null, + pendingDestroyVolumeId: 'vol-1', + destroyVolumeAttempts: 5, + }); + (flyClient.getVolume as Mock).mockResolvedValue({ + id: 'vol-1', + attached_machine_id: null, + state: 'detached', + }); + (flyClient.deleteVolume as Mock).mockRejectedValue( + new FlyApiError('persistent failure', 503, '{}') + ); + + const { instance } = createInstance(storage, env); + await instance.alarm(); + + expect(storage._store.get('destroyVolumeAttempts')).toBe(6); + expect(storage._store.get('pendingDestroyVolumeId')).toBe('vol-1'); + expect(analyticsEventsByName(env, 'reconcile.destroy_volume_retry_escalated')).toHaveLength(1); + }); + it('emits destroy_volume_abandoned_after_max_retries and clears state at the cap', async () => { const env = createFakeEnv(); const { storage } = createInstance(createFakeStorage(), env); @@ -1452,7 +1491,7 @@ describe('destroy volume: max-retry abandon', () => { flyVolumeId: 'vol-1', pendingDestroyMachineId: null, pendingDestroyVolumeId: 'vol-1', - destroyVolumeAttempts: 49, + destroyVolumeAttempts: 99, }); (flyClient.getVolume as Mock).mockResolvedValue({ @@ -1500,7 +1539,7 @@ describe('destroy volume: max-retry abandon', () => { flyVolumeId: 'vol-1', pendingDestroyMachineId: null, pendingDestroyVolumeId: 'vol-1', - destroyVolumeAttempts: 49, + destroyVolumeAttempts: 99, }); (flyClient.getVolume as Mock).mockResolvedValue({ diff --git a/services/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts b/services/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts index 1852522580..53f3f790e6 100644 --- a/services/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts +++ b/services/kiloclaw/src/durable-objects/kiloclaw-instance/index.ts @@ -401,7 +401,7 @@ export class KiloClawInstance extends DurableObject { private async scheduleAlarm(): Promise { if (!this.s.status) return; - await this.ctx.storage.setAlarm(nextAlarmTime(this.s.status)); + await this.ctx.storage.setAlarm(nextAlarmTime(this.s.status, this.s.destroyVolumeAttempts)); } private recoveryRuntime(): RecoveryRuntime { diff --git a/services/kiloclaw/src/durable-objects/kiloclaw-instance/log.ts b/services/kiloclaw/src/durable-objects/kiloclaw-instance/log.ts index 8e7c0c58db..77d7671f71 100644 --- a/services/kiloclaw/src/durable-objects/kiloclaw-instance/log.ts +++ b/services/kiloclaw/src/durable-objects/kiloclaw-instance/log.ts @@ -8,6 +8,7 @@ import { ALARM_INTERVAL_DESTROYING_MS, ALARM_INTERVAL_IDLE_MS, ALARM_JITTER_MS, + DESTROY_VOLUME_RETRY_DELAYS_MS, } from '../../config'; import { writeEvent, eventContextFromState } from '../../utils/analytics'; @@ -233,9 +234,18 @@ export function alarmIntervalForStatus(status: InstanceStatus): number { } } +export function destroyRetryDelay(attempt: number, random = Math.random()): number { + const index = Math.min(Math.max(attempt, 1) - 1, DESTROY_VOLUME_RETRY_DELAYS_MS.length - 1); + const baseDelay = DESTROY_VOLUME_RETRY_DELAYS_MS[index]; + return baseDelay * (0.5 + random); +} + /** * Next alarm time with jitter. */ -export function nextAlarmTime(status: InstanceStatus): number { +export function nextAlarmTime(status: InstanceStatus, destroyVolumeAttempts = 0): number { + if (status === 'destroying' && destroyVolumeAttempts > 0) { + return Date.now() + destroyRetryDelay(destroyVolumeAttempts); + } return Date.now() + alarmIntervalForStatus(status) + Math.random() * ALARM_JITTER_MS; } diff --git a/services/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts b/services/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts index b7a7147d55..46910ba16c 100644 --- a/services/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts +++ b/services/kiloclaw/src/durable-objects/kiloclaw-instance/reconcile.ts @@ -1543,8 +1543,9 @@ export async function tryDeleteMachine( /** * Cap on retries against a single `pendingDestroyVolumeId` before the DO gives - * up. At the current ~1 retry/minute alarm cadence, 50 attempts is roughly an - * hour of wall-clock retries. Past this point the volume is treated as + * up. Retry alarms back off from one minute to a jittered daily cadence, so this + * cap represents a long-lived provider failure rather than a short outage. + * Past this point the volume is treated as * permanently stuck — the DO emits `destroy_volume_abandoned_after_max_retries` * (for alerting), clears the pending pointer so the destroy loop can finalize, * and the volume will be picked up by the org-wide volume janitor (if any). @@ -1563,7 +1564,16 @@ export async function tryDeleteMachine( * needs human attention" rather than "this volume is leaked," and re-check * actual Fly state before acting. */ -const MAX_DESTROY_VOLUME_ATTEMPTS = 50; +const MAX_DESTROY_VOLUME_ATTEMPTS = 100; +/** + * Attempt at which to emit `destroy_volume_retry_escalated` as an early signal. + * The retry backoff reaches its daily tier when scheduling after attempt 6 + * (~7h of accumulated retries), which is the point a short outage becomes a + * long-lived provider failure worth alerting on. Firing later (e.g. at attempt + * 10, ~4 days in) would just trail the daily-tier transition instead of + * surfacing it while still actionable. + */ +const DESTROY_VOLUME_ESCALATION_ATTEMPTS = 6; export async function tryDeleteVolume( flyConfig: FlyClientConfig, @@ -1597,6 +1607,14 @@ export async function tryDeleteVolume( await persistDestroyError(ctx, state, 'volume', status, message); const attempts = state.destroyVolumeAttempts + 1; + if (attempts === DESTROY_VOLUME_ESCALATION_ATTEMPTS) { + rctx.log('destroy_volume_retry_escalated', { + volume_id: state.pendingDestroyVolumeId, + attempts, + last_error: message, + last_status: status, + }); + } if (attempts >= MAX_DESTROY_VOLUME_ATTEMPTS) { rctx.log('destroy_volume_abandoned_after_max_retries', { volume_id: state.pendingDestroyVolumeId,