Skip to content

Commit 8d2067e

Browse files
committed
fix(db): detect wrapped lock-timeout errors, jittered retries, direct migration DSN support
1 parent 3cd7eaf commit 8d2067e

2 files changed

Lines changed: 35 additions & 7 deletions

File tree

packages/db/.env.example

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,8 @@
44
# primary — never set a replica URL here.
55

66
DATABASE_URL="postgresql://postgres:postgres@localhost:5432/simstudio"
7+
8+
# Direct (non-pooled) DSN for db:migrate. Required when DATABASE_URL points at
9+
# a transaction-pooling PgBouncer: session advisory locks and session SETs are
10+
# unsupported through transaction pooling. When unset, db:migrate uses DATABASE_URL.
11+
# MIGRATION_DATABASE_URL=""

packages/db/scripts/migrate.ts

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { sleep } from '@sim/utils/helpers'
2+
import { backoffWithJitter } from '@sim/utils/retry'
23
import { drizzle } from 'drizzle-orm/postgres-js'
34
import { migrate } from 'drizzle-orm/postgres-js/migrator'
45
import postgres from 'postgres'
@@ -34,7 +35,21 @@ import postgres from 'postgres'
3435
* `warnOnInvalidIndexes` check below logs any such index after every run.
3536
*/
3637

37-
const url = process.env.DATABASE_URL
38+
/**
39+
* Migrations must run on a DIRECT Postgres connection, never through a
40+
* transaction-pooling PgBouncer. Session-level advisory locks, session `SET`s
41+
* (`statement_timeout`/`lock_timeout` below), and `pg_advisory_unlock` are all
42+
* officially unsupported in transaction pooling — statements can land on
43+
* different server connections, so the lock may not guard the migration, the
44+
* unlock can strand the lock on a pooled connection (wedging the NEXT deploy
45+
* for the full acquisition deadline), and timeout settings can leak into app
46+
* traffic. This is the same reason Prisma requires `directUrl` for migrate.
47+
*
48+
* Set MIGRATION_DATABASE_URL to the direct (non-pooled) DSN in environments
49+
* where DATABASE_URL points at a PgBouncer; it falls back to DATABASE_URL for
50+
* dev/self-hosted setups that connect directly anyway.
51+
*/
52+
const url = process.env.MIGRATION_DATABASE_URL || process.env.DATABASE_URL
3853
if (!url) {
3954
console.error('ERROR: Missing DATABASE_URL environment variable.')
4055
console.error('Ensure packages/db/.env is configured.')
@@ -74,8 +89,8 @@ const LOCK_RETRY_INTERVAL_MS = 5_000
7489
* world unblocked; we retry below, then let the deploy retry.
7590
*/
7691
const DDL_LOCK_TIMEOUT = '5s'
77-
const MAX_MIGRATE_ATTEMPTS = 3
78-
const MIGRATE_RETRY_DELAY_MS = 15_000
92+
const MAX_MIGRATE_ATTEMPTS = 8
93+
const MIGRATE_RETRY_BACKOFF = { baseMs: 2_000, maxMs: 30_000 } as const
7994

8095
try {
8196
// statement_timeout=0: index builds (esp. CONCURRENTLY on large tables) can run
@@ -130,11 +145,12 @@ async function runMigrationsWithRetry(): Promise<void> {
130145
return
131146
} catch (error) {
132147
if (!isLockNotAvailable(error) || attempt >= MAX_MIGRATE_ATTEMPTS) throw error
148+
const delayMs = Math.round(backoffWithJitter(attempt, null, MIGRATE_RETRY_BACKOFF))
133149
console.warn(
134150
`WARN: migration DDL hit lock_timeout (attempt ${attempt}/${MAX_MIGRATE_ATTEMPTS}); ` +
135-
`retrying in ${MIGRATE_RETRY_DELAY_MS}ms.`
151+
`retrying in ${delayMs}ms.`
136152
)
137-
await sleep(MIGRATE_RETRY_DELAY_MS)
153+
await sleep(delayMs)
138154
// Re-assert: a migration file's post-COMMIT `SET lock_timeout = 0` (the
139155
// CONCURRENTLY convention above) is session-level and would otherwise
140156
// leak into the retry.
@@ -145,10 +161,17 @@ async function runMigrationsWithRetry(): Promise<void> {
145161

146162
/**
 * Reports whether `error`, or anything on its `cause` chain, carries
 * SQLSTATE 55P03 (`lock_not_available`) — raised when `lock_timeout` expires
 * while a statement queues for a lock.
 *
 * drizzle's `migrate()` wraps driver failures (e.g. `DrizzleQueryError` with
 * the Postgres error on `cause`), so the whole cause chain is walked rather
 * than only the outermost error.
 *
 * Links are duck-typed (any non-null object) instead of requiring
 * `instanceof Error`, so a code carried on a plain-object or cross-realm error
 * is still detected instead of silently ending the walk.
 *
 * @param error - The value caught from a failed migration attempt.
 * @returns `true` if one of the first 10 links has `code === '55P03'`,
 *   otherwise `false`.
 */
function isLockNotAvailable(error: unknown): boolean {
  let current: unknown = error
  // The depth cap bounds the walk so a self-referential `cause` cannot loop forever.
  for (let depth = 0; depth < 10; depth++) {
    if (typeof current !== 'object' || current === null) return false
    const link = current as { code?: unknown; cause?: unknown }
    if (link.code === '55P03') return true
    current = link.cause
  }
  return false
}
153176

154177
/**

0 commit comments

Comments
 (0)