11import { sleep } from '@sim/utils/helpers'
2+ import { backoffWithJitter } from '@sim/utils/retry'
23import { drizzle } from 'drizzle-orm/postgres-js'
34import { migrate } from 'drizzle-orm/postgres-js/migrator'
45import postgres from 'postgres'
@@ -34,7 +35,21 @@ import postgres from 'postgres'
3435 * `warnOnInvalidIndexes` check below logs any such index after every run.
3536 */
3637
37- const url = process . env . DATABASE_URL
38+ /**
39+ * Migrations must run on a DIRECT Postgres connection, never through a
40+ * transaction-pooling PgBouncer. Session-level advisory locks, session `SET`s
41+ * (`statement_timeout`/`lock_timeout` below), and `pg_advisory_unlock` are all
42+ * officially unsupported in transaction pooling — statements can land on
43+ * different server connections, so the lock may not guard the migration, the
44+ * unlock can strand the lock on a pooled connection (wedging the NEXT deploy
45+ * for the full acquisition deadline), and timeout settings can leak into app
46+ * traffic. This is the same reason Prisma requires `directUrl` for migrate.
47+ *
48+ * Set MIGRATION_DATABASE_URL to the direct (non-pooled) DSN in environments
49+ * where DATABASE_URL points at a PgBouncer; it falls back to DATABASE_URL for
50+ * dev/self-hosted setups that connect directly anyway.
51+ */
52+ const url = process . env . MIGRATION_DATABASE_URL || process . env . DATABASE_URL
3853if ( ! url ) {
3954 console . error ( 'ERROR: Missing DATABASE_URL environment variable.' )
4055 console . error ( 'Ensure packages/db/.env is configured.' )
@@ -74,8 +89,8 @@ const LOCK_RETRY_INTERVAL_MS = 5_000
7489 * world unblocked; we retry below, then let the deploy retry.
7590 */
7691const DDL_LOCK_TIMEOUT = '5s'
77- const MAX_MIGRATE_ATTEMPTS = 3
78- const MIGRATE_RETRY_DELAY_MS = 15_000
92+ const MAX_MIGRATE_ATTEMPTS = 8
93+ const MIGRATE_RETRY_BACKOFF = { baseMs : 2_000 , maxMs : 30_000 } as const
7994
8095try {
8196 // statement_timeout=0: index builds (esp. CONCURRENTLY on large tables) can run
@@ -130,11 +145,12 @@ async function runMigrationsWithRetry(): Promise<void> {
130145 return
131146 } catch ( error ) {
132147 if ( ! isLockNotAvailable ( error ) || attempt >= MAX_MIGRATE_ATTEMPTS ) throw error
148+ const delayMs = Math . round ( backoffWithJitter ( attempt , null , MIGRATE_RETRY_BACKOFF ) )
133149 console . warn (
134150 `WARN: migration DDL hit lock_timeout (attempt ${ attempt } /${ MAX_MIGRATE_ATTEMPTS } ); ` +
135- `retrying in ${ MIGRATE_RETRY_DELAY_MS } ms.`
151+ `retrying in ${ delayMs } ms.`
136152 )
137- await sleep ( MIGRATE_RETRY_DELAY_MS )
153+ await sleep ( delayMs )
138154 // Re-assert: a migration file's post-COMMIT `SET lock_timeout = 0` (the
139155 // CONCURRENTLY convention above) is session-level and would otherwise
140156 // leak into the retry.
@@ -145,10 +161,17 @@ async function runMigrationsWithRetry(): Promise<void> {
145161
/**
 * Reports whether `error` is — or wraps — a Postgres `lock_not_available`
 * failure (SQLSTATE 55P03), which is raised when `lock_timeout` expires while
 * a statement queues for a lock.
 *
 * drizzle's `migrate()` wraps driver failures (e.g. `DrizzleQueryError` with
 * the Postgres error on `cause`), so the whole `cause` chain is walked looking
 * for the code rather than inspecting only the outermost error.
 *
 * @param error - The value caught from a failed migration attempt.
 * @returns `true` if any `Error` within the first few links of the cause chain
 *   carries `code === '55P03'`; `false` otherwise, including for non-`Error`
 *   values.
 */
function isLockNotAvailable(error: unknown): boolean {
  // Bounded walk: the cap also guarantees termination on a cyclic `cause` chain.
  const maxCauseDepth = 10
  let current: unknown = error
  for (let depth = 0; depth < maxCauseDepth && current instanceof Error; depth++) {
    // Read `code`/`cause` structurally: `code` is a driver-specific property and
    // `Error.cause` is only typed from the ES2022 lib onward.
    const link = current as { code?: unknown; cause?: unknown }
    if (link.code === '55P03') return true
    current = link.cause
  }
  return false
}
153176
154177/**
0 commit comments