@@ -6,57 +6,26 @@ import { migrate } from 'drizzle-orm/postgres-js/migrator'
66import postgres from 'postgres'
77
88/**
9- * Concurrent-index convention (avoid write-blocking index builds on large tables)
10- * --------------------------------------------------------------------------------
11- * drizzle-kit emits plain `CREATE INDEX`, which takes a SHARE lock and blocks all
12- * writes for the build duration — on a big, write-hot table (e.g.
13- * workflow_execution_logs, usage_log) that stalls every in-flight workflow
14- * completion for minutes. drizzle runs migrations inside a transaction, and
15- * `CREATE INDEX CONCURRENTLY` cannot run inside a transaction block.
16- *
17- * So, after generating a migration that adds an index on a large/hot table, edit
18- * the generated SQL to end drizzle's transaction first, clear the session
19- * `lock_timeout` (set below for fail-fast DDL; it would cancel CONCURRENTLY's
20- * legitimate waits on old transactions and leave an INVALID index), build
21- * concurrently and idempotently, then RESTORE the fail-fast timeout so later
22- * statements and files in the same run keep the protection (the SET is
23- * session-level and would otherwise persist):
9+ * Concurrent-index convention: plain `CREATE INDEX` write-blocks large/hot
10+ * tables, and CONCURRENTLY cannot run inside drizzle's migration transaction.
11+ * For indexes on big tables, edit the generated SQL to:
2412 *
2513 * COMMIT;--> statement-breakpoint
2614 * SET lock_timeout = 0;--> statement-breakpoint
2715 * CREATE INDEX CONCURRENTLY IF NOT EXISTS "idx_name" ON "table" (...);--> statement-breakpoint
2816 * SET lock_timeout = '5s';
2917 *
30- * Notes:
31- * - Put the `COMMIT` breakpoint AFTER all transactional DDL (ALTER TABLE/TYPE)
32- * in the file and only the concurrent CREATE INDEX statements below it.
33- * - drizzle's migrate() wraps ALL pending files in ONE transaction, so the
34- * embedded `COMMIT` ends that batch transaction: everything after it — in
35- * this file AND any later pending files — runs in autocommit, one statement
36- * at a time. This is why EVERY statement in a file using this convention,
37- * and in any file that can follow it in a batch, must be idempotent
38- * (`IF NOT EXISTS` / `ADD COLUMN IF NOT EXISTS` / `ADD VALUE IF NOT EXISTS`):
39- * a failure after the COMMIT cannot roll back, and the re-run replays every
40- * file whose journal record has not yet committed.
41- * - CONCURRENTLY only takes a SHARE UPDATE EXCLUSIVE lock (allows reads/writes).
42- * - Always validate on staging before prod; a failed CONCURRENTLY build can
43- * leave an INVALID index that must be dropped and rebuilt — the
44- * `warnOnInvalidIndexes` check below logs any such index after every run.
18+ * The embedded COMMIT ends the batch transaction, so everything after it (in
19+ * this and later pending files) runs in autocommit and must be idempotent
20+ * (`IF NOT EXISTS` etc.) — a failed run replays unjournaled files from the top.
21+ * A failed CONCURRENTLY build leaves an INVALID index that `IF NOT EXISTS`
22+ * skips; `warnOnInvalidIndexes` below surfaces those.
4523 */
4624
4725/**
48- * Migrations must run on a DIRECT Postgres connection, never through a
49- * transaction-pooling PgBouncer. Session-level advisory locks, session `SET`s
50- * (`statement_timeout`/`lock_timeout` below), and `pg_advisory_unlock` are all
51- * officially unsupported in transaction pooling — statements can land on
52- * different server connections, so the lock may not guard the migration, the
53- * unlock can strand the lock on a pooled connection (wedging the NEXT deploy
54- * for the full acquisition deadline), and timeout settings can leak into app
55- * traffic. This is the same reason Prisma requires `directUrl` for migrate.
56- *
57- * Set MIGRATION_DATABASE_URL to the direct (non-pooled) DSN in environments
58- * where DATABASE_URL points at a PgBouncer; it falls back to DATABASE_URL for
59- * dev/self-hosted setups that connect directly anyway.
26+ * Prefer a direct (non-pooled) DSN: session advisory locks and session `SET`s
27+ * are unsupported through PgBouncer transaction pooling. Falls back to
28+ * DATABASE_URL for setups that connect directly anyway.
6029 */
6130const url = process . env . MIGRATION_DATABASE_URL || process . env . DATABASE_URL
6231if ( ! url ) {
@@ -66,63 +35,38 @@ if (!url) {
6635}
6736
6837/**
69- * The backend-pid session guard below is only sound on a DIRECT connection:
70- * through a transaction-pooling PgBouncer, consecutive statements can
71- * legitimately run on different server backends, so a pid change does not mean
72- * the session was lost and the guard would false-positive on every run.
73- * MIGRATION_DATABASE_URL is by contract the direct DSN; when falling back to
74- * DATABASE_URL (which may be pooled), the guard is skipped.
38+ * The pid guard is only sound on a direct connection — through transaction
39+ * pooling, consecutive statements legitimately land on different backends.
7540 */
7641const hasDirectMigrationUrl = Boolean ( process . env . MIGRATION_DATABASE_URL )
7742
7843/**
79- * `max_lifetime: null` is load-bearing: postgres-js defaults to recycling the
80- * connection after a randomized 30–60 minutes, and a transparent reconnect
81- * silently drops the session advisory lock and session `SET`s. The migration
82- * session must live exactly as long as the run.
44+ * `max_lifetime: null` pins the session for the whole run: the postgres-js
45+ * default recycles the connection after 30–60 min, silently dropping the
46+ * session advisory lock and `SET`s.
8347 */
8448const client = postgres ( url , { max : 1 , connect_timeout : 10 , max_lifetime : null } )
8549
8650/**
87- * Cross-process migration lock key (a stable, app-wide 64-bit constant).
88- *
89- * drizzle's `migrate()` has no built-in lock, so when a deployment starts N app
90- * replicas at once — each with a migration sidecar — all N read
91- * `__drizzle_migrations`, all see the same migration pending, and all try to apply
92- * it concurrently. One wins; the losers run the same DDL against already-mutated
93- * state and die (e.g. `DROP TABLE "form"` → `table "form" does not exist`,
94- * exit 1 / TaskFailedToStart).
95- *
96- * Acquisition is a bounded `pg_try_advisory_lock` retry loop rather than a plain
97- * `pg_advisory_lock`: an unbounded wait meant one wedged runner silently hung
98- * every other deploy sidecar (and the whole ECS deployment) behind it. With a
99- * deadline, a stuck winner turns into a visible non-zero exit on the losers that
100- * the deploy orchestrator can retry or surface. Session locks auto-release if
101- * the connection drops, so a crashed runner never wedges the lock.
51+ * Cross-process migration lock. drizzle's `migrate()` has no built-in lock, so
52+ * concurrent runners (one per app replica at deploy time) must be serialized.
53+ * Acquisition is a bounded try-lock loop: a plain `pg_advisory_lock` wait let
54+ * one wedged runner silently hang every other runner and the whole deploy.
10255 */
10356const MIGRATION_LOCK_KEY = 4_961_002_270n
10457const LOCK_ACQUIRE_DEADLINE_MS = 30 * 60_000
10558const LOCK_RETRY_INTERVAL_MS = 5_000
10659
10760/**
108- * How long any single migration statement may QUEUE for a table lock before
109- * failing with SQLSTATE 55P03. Without this, DDL needing an AccessExclusiveLock
110- * (e.g. `DROP TABLE ... CASCADE`) queues indefinitely behind long-running reads
111- * — and every other query on that table queues behind the pending exclusive
112- * lock, stalling all reads/writes table-wide until the DDL gets its turn
113- * (observed in production: a ~15-minute full stall). Failing fast keeps the
114- * world unblocked; we retry below, then let the deploy retry.
61+ * Max time a migration statement may queue for a table lock (SQLSTATE 55P03 on
62+ * expiry). Without it, DDL waiting on an AccessExclusiveLock queues every other
63+ * query on the table behind it — a table-wide stall for the whole wait.
11564 */
11665const DDL_LOCK_TIMEOUT = '5s'
11766const MAX_MIGRATE_ATTEMPTS = 8
11867const MIGRATE_RETRY_BACKOFF = { baseMs : 2_000 , maxMs : 30_000 } as const
11968
120- /**
121- * Backend pid of the session that acquired the advisory lock. Re-checked at the
122- * top of every migration attempt: if the connection was silently replaced
123- * (server restart, network failure), the new session does NOT hold the lock,
124- * and running migrations on it would break mutual exclusion — abort loudly.
125- */
69+ /** Backend pid of the lock-holding session; a change means the lock was lost. */
12670let lockSessionPid = 0
12771
12872try {
@@ -166,25 +110,12 @@ async function acquireMigrationLock(): Promise<void> {
166110}
167111
168112/**
169- * Run pending migrations, retrying when a statement loses the `lock_timeout`
170- * race (SQLSTATE 55P03, detected anywhere in the error's `cause` chain since
171- * drizzle wraps driver failures).
172- *
173- * Every attempt starts by verifying the session still holds the advisory lock
174- * (backend pid unchanged) and re-asserting the session timeouts:
175- * `statement_timeout = 0` because index builds (esp. CONCURRENTLY on large
176- * tables) can run far longer than any app default and must never be killed
177- * mid-build, and the fail-fast `lock_timeout` because a prior attempt's
178- * migration file may have left the session at `lock_timeout = 0` (the
179- * CONCURRENTLY convention above). `SET` is rejected by Postgres when
180- * parameterized, so the constants are inlined via `client.unsafe`.
181- *
182- * Retry safety: drizzle wraps the whole pending batch in one transaction, so a
183- * lock-timeout failure rolls the batch back and the retry resumes from the
184- * first file whose journal record has not committed. Files using the
185- * embedded-`COMMIT` CONCURRENTLY convention break out of that transaction —
186- * their post-COMMIT statements are required to be idempotent (see the
187- * convention notes above) precisely so a replay is safe.
113+ * Run pending migrations, retrying on lock timeout (55P03, found anywhere in
114+ * the wrapped `cause` chain). Each attempt re-verifies the lock session (pid)
115+ * and re-asserts the session timeouts — a migration file may have changed them,
116+ * and `SET` cannot be parameterized, hence `client.unsafe` with constants.
117+ * Replays are safe: drizzle rolls the batch back on failure, and post-COMMIT
118+ * CONCURRENTLY statements are idempotent by convention.
188119 */
189120async function runMigrationsWithRetry ( ) : Promise < void > {
190121 for ( let attempt = 1 ; ; attempt ++ ) {
@@ -216,10 +147,8 @@ async function runMigrationsWithRetry(): Promise<void> {
216147}
217148
218149/**
219- * A `CREATE INDEX CONCURRENTLY` that fails partway leaves an INVALID index that
220- * `IF NOT EXISTS` then silently skips on every future run. Surface any such
221- * index loudly (warn, don't fail — the migration itself committed) so it can be
222- * dropped and rebuilt.
150+ * A failed CONCURRENTLY build leaves an INVALID index that `IF NOT EXISTS`
151+ * silently skips forever — surface it (warn only; the migration committed).
223152 */
224153async function warnOnInvalidIndexes ( ) : Promise < void > {
225154 try {
@@ -242,11 +171,8 @@ async function warnOnInvalidIndexes(): Promise<void> {
242171}
243172
244173/**
245- * Release the advisory lock without ever failing the process. The session-level
246- * lock auto-releases when the connection closes, so a thrown unlock — e.g. the
247- * connection dropped right after `migrate()` committed — must be swallowed.
248- * Letting it reach the outer `catch` would exit 1 and falsely report a
249- * successful migration as failed to the deploy orchestrator.
174+ * Unlock errors are swallowed: the session lock auto-releases on disconnect,
175+ * and a thrown unlock would falsely report a committed migration as failed.
250176 */
251177async function releaseMigrationLock ( ) : Promise < void > {
252178 try {
0 commit comments