diff --git a/.server-changes/realtime-runs-subscription-scalability.md b/.server-changes/realtime-runs-subscription-scalability.md new file mode 100644 index 00000000000..5de00aae675 --- /dev/null +++ b/.server-changes/realtime-runs-subscription-scalability.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add a new backend for the realtime runs feed (single runs, tags, and batches) that scales under high concurrency, available behind a feature flag diff --git a/apps/supervisor/package.json b/apps/supervisor/package.json index 7a3537dbc04..2725fe2b729 100644 --- a/apps/supervisor/package.json +++ b/apps/supervisor/package.json @@ -18,7 +18,7 @@ "@kubernetes/client-node": "^1.0.0", "@trigger.dev/core": "workspace:*", "dockerode": "^4.0.6", - "ioredis": "^5.3.2", + "ioredis": "~5.6.0", "p-limit": "^6.2.0", "prom-client": "^15.1.0", "socket.io": "4.7.4", diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index 9996eb7b30a..8cc23bff089 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -27,6 +27,7 @@ import { registerRunEngineEventBusHandlers, setupBatchQueueCallbacks, } from "./v3/runEngineHandlers.server"; +import { registerRunChangeNotifierHandlers } from "./services/realtime/runChangeNotifierHandlers.server"; // Touch the sessions replication singleton at entry so it boots deterministically // on webapp startup. The singleton's initializer wires start (gated on // `clickhouseFactory.isReady()`) and SIGTERM/SIGINT shutdown — mirrors @@ -269,6 +270,9 @@ process.on("uncaughtException", (error, origin) => { singleton("RunEngineEventBusHandlers", registerRunEngineEventBusHandlers); singleton("SetupBatchQueueCallbacks", setupBatchQueueCallbacks); +// Attach the run-changed notifier delegations to the engine event bus. +// No-ops (registers nothing) unless REALTIME_NOTIFIER_ENABLED=1. 
+singleton("RunChangeNotifierHandlers", registerRunChangeNotifierHandlers); // Wrapped in singleton() so Remix's dev-mode CJS reloads don't append // duplicate copies of the processor — Sentry's processor list lives in diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index c55bb424001..f01e8285916 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -300,6 +300,47 @@ const EnvironmentSchema = z .int() .default(24 * 60 * 60 * 1000), // 1 day in milliseconds + // Master switch for the notifier-backed realtime feed. + // "0" (default) = the existing realtime path serves everything, publishes are + // no-ops, and no notifier Redis connections are opened (zero-overhead off). + // "1" = run-changed signals are published and the per-org `realtimeBackend` + // feature flag selects the backend per request. + REALTIME_NOTIFIER_ENABLED: z.string().default("0"), + // Backstop wait before a live notifier request refetches the run (ms). Matches + // Electric's ~20s live long-poll hold so the client polling cadence is unchanged + // across backends (a ±15% jitter is applied per request to avoid refetch herds). + REALTIME_NOTIFIER_LIVE_POLL_TIMEOUT_MS: z.coerce.number().int().default(20_000), + // Hard cap on the tag-list snapshot size served by the notifier feed. + REALTIME_NOTIFIER_MAX_LIST_RESULTS: z.coerce.number().int().default(1_000), + // Short-TTL coalescing cache for the multi-run (tag-list/batch) resolve+hydrate. + // Concurrent same-filter feeds share one ClickHouse resolve + Postgres hydrate + // within this window, so an env-wide wake doesn't fan out into per-feed queries. + // Staleness budget: a newly-matching run is visible within ~ttl + poll interval. 
+ REALTIME_NOTIFIER_RUNSET_CACHE_TTL_MS: z.coerce.number().int().default(1_000), + REALTIME_NOTIFIER_RUNSET_CACHE_MAX_ENTRIES: z.coerce.number().int().default(5_000), + // Cap on the per-handle working-set cache (runId -> updatedAt) the notifier keeps + // for diffing multi-run live polls. + REALTIME_NOTIFIER_WORKING_SET_MAX_ENTRIES: z.coerce.number().int().default(10_000), + // Quantize the tag-list createdAt lower bound to this epoch-aligned bucket (ms) so + // same-tag feeds that pin their window within the same bucket share one resolve+ + // hydrate cache entry. Floored, so the window only ever widens by < bucket. 0 + // disables bucketing (each feed keeps its exact lower bound). + REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS: z.coerce.number().int().default(60_000), + // Leading-edge throttle (ms) on the per-env wake channel: a busy env's run-change + // firehose is collapsed to at most one feed-wake per window, decoupling wake load + // from run throughput. Lossless because consumers refetch current state on a wake. + // 0 disables coalescing (every change wakes immediately). + REALTIME_NOTIFIER_ENV_WAKE_COALESCE_WINDOW_MS: z.coerce.number().int().default(100), + // When "1", a multi-run live poll woken by a change irrelevant to its filter keeps + // holding the long-poll (re-resolving cheaply) instead of returning an empty + // up-to-date the client would immediately re-issue. "0" reverts to per-wake replies. + REALTIME_NOTIFIER_HOLD_ON_EMPTY: z.string().default("1"), + // Max concurrent fresh ClickHouse resolves (cache misses) per instance. Caps the + // distinct-filter reconnect stampede: a mass reconnect of N feeds on N different filters + // queues to this many concurrent CH queries instead of firing all N at once. Same-filter + // bursts collapse via the single-flight cache before taking a permit. 0 disables the gate. 
+ REALTIME_NOTIFIER_RESOLVE_ADMISSION_LIMIT: z.coerce.number().int().default(16), + PUBSUB_REDIS_HOST: z .string() .optional() @@ -332,6 +373,41 @@ const EnvironmentSchema = z PUBSUB_REDIS_TLS_DISABLED: z.string().default(process.env.REDIS_TLS_DISABLED ?? "false"), PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z.string().default("0"), + // Dedicated pub/sub Redis for the realtime runs feed's run-changed notifier, so + // its publish/subscribe traffic can run on its own instance. Each value falls + // back to the shared PUBSUB_REDIS_* (then REDIS_*) when unset, so the default is + // unchanged until explicitly pointed at a dedicated instance. + REALTIME_RUNS_PUBSUB_REDIS_HOST: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_HOST ?? process.env.REDIS_HOST), + REALTIME_RUNS_PUBSUB_REDIS_PORT: z.coerce + .number() + .optional() + .transform((v) => { + if (v !== undefined) return v; + const raw = process.env.PUBSUB_REDIS_PORT ?? process.env.REDIS_PORT; + return raw ? parseInt(raw) : undefined; + }), + REALTIME_RUNS_PUBSUB_REDIS_USERNAME: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_USERNAME ?? process.env.REDIS_USERNAME), + REALTIME_RUNS_PUBSUB_REDIS_PASSWORD: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_PASSWORD ?? process.env.REDIS_PASSWORD), + REALTIME_RUNS_PUBSUB_REDIS_TLS_DISABLED: z + .string() + .default(process.env.PUBSUB_REDIS_TLS_DISABLED ?? process.env.REDIS_TLS_DISABLED ?? "false"), + REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z + .string() + .default(process.env.PUBSUB_REDIS_CLUSTER_MODE_ENABLED ?? "0"), + // Use sharded pub/sub (SSUBSCRIBE/SPUBLISH) when in cluster mode, so a busy env's + // traffic stays on one shard instead of broadcasting to every node. Only takes + // effect alongside CLUSTER_MODE_ENABLED. "0" forces classic pub/sub on the cluster. 
+ REALTIME_RUNS_PUBSUB_REDIS_SHARDED_ENABLED: z.string().default("1"), + DEFAULT_ENV_EXECUTION_CONCURRENCY_LIMIT: z.coerce.number().int().default(100), DEFAULT_ENV_EXECUTION_CONCURRENCY_BURST_FACTOR: z.coerce.number().default(1.0), DEFAULT_ORG_EXECUTION_CONCURRENCY_LIMIT: z.coerce.number().int().default(300), @@ -1608,6 +1684,20 @@ const EnvironmentSchema = z .enum(["log", "error", "warn", "info", "debug"]) .default("info"), RUN_ENGINE_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), + // ClickHouse client used by the realtime runs feed for tag/batch id resolution. + // Kept on its own URL + pool so the feed's reads can't contend with the main + // analytics client (CLICKHOUSE_URL). Falls back to the main URL when unset. + REALTIME_RUNS_CLICKHOUSE_URL: z + .string() + .optional() + .transform((v) => v ?? process.env.CLICKHOUSE_URL), + REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED: z.string().default("1"), + REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS: z.coerce.number().int().optional(), + REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS: z.coerce.number().int().default(10), + REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL: z + .enum(["log", "error", "warn", "info", "debug"]) + .default("info"), + REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), EVENTS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(1000), EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS: z.coerce.number().int().default(1000), METRICS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(10000), diff --git a/apps/webapp/app/models/runtimeEnvironment.server.ts b/apps/webapp/app/models/runtimeEnvironment.server.ts index 64b1da3be49..be05adaa8a7 100644 --- a/apps/webapp/app/models/runtimeEnvironment.server.ts +++ b/apps/webapp/app/models/runtimeEnvironment.server.ts @@ -237,10 +237,20 @@ export async function findEnvironmentBySlug( return environment ? toAuthenticated(environment) : null; } +// The authenticated environment plus the run scalars the realtime publish needs. 
+// Both come from one taskRun read — see findEnvironmentFromRun. +export type EnvironmentFromRun = { + environment: AuthenticatedEnvironment; + runTags: string[]; + batchId: string | null; +}; + export async function findEnvironmentFromRun( runId: string, tx?: PrismaClientOrTransaction -): Promise { +): Promise { + // The include (no select) already pulls every taskRun scalar, so runTags/batchId + // ride along for free — no extra query for the realtime publish to send a full record. const taskRun = await (tx ?? $replica).taskRun.findFirst({ where: { id: runId, @@ -249,7 +259,14 @@ export async function findEnvironmentFromRun( runtimeEnvironment: { include: authIncludeBase }, }, }); - return taskRun?.runtimeEnvironment ? toAuthenticated(taskRun.runtimeEnvironment) : null; + if (!taskRun?.runtimeEnvironment) { + return null; + } + return { + environment: toAuthenticated(taskRun.runtimeEnvironment), + runTags: taskRun.runTags, + batchId: taskRun.batchId, + }; } export async function createNewSession( diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts index ceae1efb4b4..c88009a84a4 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts @@ -12,6 +12,7 @@ import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { updateMetadataService } from "~/services/metadata/updateMetadataInstance.server"; +import { publishChangeRecord } from "~/services/realtime/runChangeNotifierInstance.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; import { ServiceValidationError } from "~/v3/services/common.server"; import { applyMetadataMutationToBufferedRun } from "~/v3/mollifier/applyMetadataMutation.server"; @@ -184,7 +185,10 @@ const { action } = 
createActionApiRoute( return json({ error: "Internal Server Error" }, { status: 500 }); } if (pgResult) { - return json(pgResult, { status: 200 }); + // Reflect metadata.set() on a live feed before the next lifecycle event. Publish the + // internal id (the router keys single-run feeds by it, not the friendly id from the URL). + publishChangeRecord({ runId: pgResult.runId, envId: env.id, batchId: pgResult.batchId }); + return json({ metadata: pgResult.metadata }, { status: 200 }); } // PG miss. Target run is either buffered or genuinely absent. diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts index ef7f3180bf3..c8fa5ea37d2 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts @@ -7,6 +7,7 @@ import { MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { logger } from "~/services/logger.server"; +import { publishChangeRecord } from "~/services/realtime/runChangeNotifierInstance.server"; import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; // Pull the existing tags out of a buffer entry's serialised payload so @@ -90,6 +91,13 @@ export async function action({ request, params }: ActionFunctionArgs) { }, data: { runTags: { push: newTags } }, }); + // Publish a run-changed record with the NEW tag set so tag feeds reindex + // (no-op unless enabled). + publishChangeRecord({ + runId: taskRun.id, + envId: env.id, + tags: existing.concat(newTags), + }); return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 }); }, // Buffer-applied patch path. 
The mutateSnapshot Lua deduplicates diff --git a/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts b/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts index 2b8fb106681..973cd5f96cd 100644 --- a/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts +++ b/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts @@ -1,7 +1,7 @@ import { z } from "zod"; import { $replica } from "~/db.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server"; const ParamsSchema = z.object({ @@ -33,7 +33,11 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: batchRun, apiVersion }) => { - return realtimeClient.streamBatch( + // Pick the Electric proxy or the notifier-backed batch feed + // per org (defaults to Electric). Both implement streamBatch. 
+ const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamBatch( request.url, authentication.environment, batchRun.id, diff --git a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts index e03787c6200..3e224ddedf2 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts @@ -2,7 +2,7 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { $replica } from "~/db.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute, @@ -48,7 +48,12 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: run, apiVersion }) => { - return realtimeClient.streamRun( + // Pick the Electric proxy or the notifier-backed shim per org (defaults to + // Electric; controlled by REALTIME_NOTIFIER_ENABLED + the realtimeBackend + // feature flag). Both implement the same streamRun contract. 
+ const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamRun( request.url, authentication.environment, run.id, diff --git a/apps/webapp/app/routes/realtime.v1.runs.ts b/apps/webapp/app/routes/realtime.v1.runs.ts index b04c2d55bbc..436f4ef48d8 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.ts @@ -1,6 +1,6 @@ import { z } from "zod"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute, @@ -39,7 +39,11 @@ export const loader = createLoaderApiRoute( }, }, async ({ searchParams, authentication, request, apiVersion }) => { - return realtimeClient.streamRuns( + // Pick the Electric proxy or the notifier-backed tag-list feed per org + // (defaults to Electric). Both implement streamRuns. + const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamRuns( request.url, authentication.environment, searchParams, diff --git a/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts b/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts index fb7f384fd27..c563621408c 100644 --- a/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts +++ b/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts @@ -211,6 +211,36 @@ function initializeRunEngineClickhouseClient(): ClickHouse { }); } +/** Realtime runs feed tag/batch id resolution (`REALTIME_RUNS_CLICKHOUSE_URL`); + * falls back to the default client if unset. 
*/ +const defaultRealtimeClickhouseClient = singleton( + "realtimeClickhouseClient", + initializeRealtimeClickhouseClient +); + +function initializeRealtimeClickhouseClient(): ClickHouse { + if (!env.REALTIME_RUNS_CLICKHOUSE_URL) { + return defaultClickhouseClient; + } + + const url = new URL(env.REALTIME_RUNS_CLICKHOUSE_URL); + url.searchParams.delete("secure"); + + return new ClickHouse({ + url: url.toString(), + name: "realtime-runs-clickhouse", + keepAlive: { + enabled: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", + idleSocketTtl: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, + }, + logLevel: env.REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL, + compression: { + request: env.REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST === "1", + }, + maxOpenConnections: env.REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS, + }); +} + /** Task events (`EVENTS_CLICKHOUSE_URL`); not exported — accessed via factory. */ const defaultEventsClickhouseClient = singleton( "eventsClickhouseClient", @@ -257,7 +287,8 @@ export type ClientType = | "logs" | "query" | "admin" - | "engine"; + | "engine" + | "realtime"; function buildOrgClickhouseClient(url: string, clientType: ClientType): ClickHouse { const parsed = new URL(url); @@ -330,6 +361,20 @@ function buildOrgClickhouseClient(url: string, clientType: ClientType): ClickHou }, maxOpenConnections: env.RUN_ENGINE_CLICKHOUSE_MAX_OPEN_CONNECTIONS, }); + case "realtime": + return new ClickHouse({ + url: parsed.toString(), + name, + keepAlive: { + enabled: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", + idleSocketTtl: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, + }, + logLevel: env.REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL, + compression: { + request: env.REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST === "1", + }, + maxOpenConnections: env.REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS, + }); case "standard": case "query": case "admin": @@ -398,6 +443,8 @@ export class ClickhouseFactory { return 
defaultAdminClickhouseClient; case "engine": return defaultRunEngineClickhouseClient; + case "realtime": + return defaultRealtimeClickhouseClient; } } diff --git a/apps/webapp/app/services/metadata/updateMetadata.server.ts b/apps/webapp/app/services/metadata/updateMetadata.server.ts index cfb946a1024..6422e3c5666 100644 --- a/apps/webapp/app/services/metadata/updateMetadata.server.ts +++ b/apps/webapp/app/services/metadata/updateMetadata.server.ts @@ -308,6 +308,7 @@ export class UpdateMetadataService { }, select: { id: true, + batchId: true, completedAt: true, status: true, metadata: true, @@ -355,6 +356,9 @@ export class UpdateMetadataService { return { metadata: newMetadata, + // Internal id + batchId, so callers can publish realtime records keyed how the router indexes feeds. + runId: taskRun.id, + batchId: taskRun.batchId, }; } diff --git a/apps/webapp/app/services/realtime/boundedTtlCache.ts b/apps/webapp/app/services/realtime/boundedTtlCache.ts new file mode 100644 index 00000000000..8efcde55609 --- /dev/null +++ b/apps/webapp/app/services/realtime/boundedTtlCache.ts @@ -0,0 +1,59 @@ +/** + * Tiny in-process bounded TTL cache shared by the realtime feeds. + * + * Entries expire after `ttlMs`. An expired entry is evicted when read (`get`); on + * write, if the cache is at `maxEntries`, expired entries are swept and, if it's + * still full (pathologically all live), the oldest insertion is dropped. Node is + * single-threaded so no locking is needed. Used where a miss is cheap and + * correctness-safe (read-through hydration, per-handle working sets, per-org flag + * resolution). + * + * A stored value of `undefined` cannot be distinguished from a miss; callers that + * need to cache "absence" should store an explicit sentinel (e.g. `null`). 
+ */ +export class BoundedTtlCache { + readonly #entries = new Map(); + + constructor( + private readonly ttlMs: number, + private readonly maxEntries: number + ) {} + + get(key: string): V | undefined { + const entry = this.#entries.get(key); + if (!entry) { + return undefined; + } + if (entry.expiresAt > Date.now()) { + return entry.value; + } + // Evict on read so expired entries don't linger until the next at-capacity + // sweep — important for read-heavy / low-churn caches (per-handle working sets). + this.#entries.delete(key); + return undefined; + } + + set(key: string, value: V): void { + // Only run capacity eviction when inserting a NEW key — updating an existing key + // doesn't grow the map, so it must never drop an unrelated live entry. + if (!this.#entries.has(key) && this.#entries.size >= this.maxEntries) { + const now = Date.now(); + for (const [key, entry] of this.#entries) { + if (entry.expiresAt <= now) { + this.#entries.delete(key); + } + } + if (this.#entries.size >= this.maxEntries) { + const oldest = this.#entries.keys().next().value; + if (oldest !== undefined) { + this.#entries.delete(oldest); + } + } + } + this.#entries.set(key, { value, expiresAt: Date.now() + this.ttlMs }); + } + + get size(): number { + return this.#entries.size; + } +} diff --git a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts new file mode 100644 index 00000000000..003646bb74a --- /dev/null +++ b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts @@ -0,0 +1,43 @@ +import { type ClickHouse } from "@internal/clickhouse"; +import { type PrismaClientOrTransaction } from "~/db.server"; +import { RunsRepository } from "~/services/runsRepository/runsRepository.server"; +import { type RunListFilter, type RunListResolver } from "./runReader.server"; + +export type ClickHouseRunListResolverOptions = { + /** Resolves the per-organization ClickHouse client 
(multi-tenant routing). */ + getClickhouse: (organizationId: string) => Promise; + prisma: PrismaClientOrTransaction; +}; + +/** + * Resolves the realtime tag/list filter into matching run ids via ClickHouse + * `listRunIds`. Tag matching is contains-ANY (OR), the same + * semantics the dashboard runs list uses. Filter-only: ids only, hydrated from + * Postgres by id afterward. This keeps the realtime tag feed off the Postgres + * `runTags` GIN index entirely. + * + * (Multi-tag subscribeToRunsWithTag is therefore OR, not the AND that Electric's + * `runTags @> ARRAY[...]` shape used. Restoring AND is a follow-up: add a + * `hasAll` mode to the ClickHouse runs filter and use it here.) + */ +export class ClickHouseRunListResolver implements RunListResolver { + constructor(private readonly options: ClickHouseRunListResolverOptions) {} + + async resolveMatchingRunIds(filter: RunListFilter): Promise { + const clickhouse = await this.options.getClickhouse(filter.organizationId); + const repository = new RunsRepository({ clickhouse, prisma: this.options.prisma }); + + const { runIds } = await repository.listRunIds({ + organizationId: filter.organizationId, + projectId: filter.projectId, + environmentId: filter.environmentId, + tags: filter.tags && filter.tags.length > 0 ? filter.tags : undefined, + batchId: filter.batchId, + from: filter.createdAtAfter?.getTime(), + page: { size: filter.limit }, + }); + + // listRunIds is keyset-paginated; runIds is already capped to page.size (= limit). + return runIds; + } +} diff --git a/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts b/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts new file mode 100644 index 00000000000..6a276bcb03d --- /dev/null +++ b/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts @@ -0,0 +1,321 @@ +/** + * Electric HTTP shape-stream wire protocol serializer for the single-run feed. 
+ * + * This re-emits the exact wire shape that the deployed `@electric-sql/client` + * (1.0.14 modern + 0.4.0 legacy) and the SDK's `SubscribeRunRawShape` expect, + * so the notifier-backed realtime feed stays byte-faithful to what those clients + * already expect. + * + * The module is intentionally pure: no DB, Redis, or env access, so the wire + * contract can be unit-tested by round-tripping through the real client parser + * + the SDK schema. Header rewrites, tokens, and transport live in the client. + * + * Wire facts this encodes (verified against @electric-sql/client@1.0.14): + * - Response body is a JSON array of messages; an empty body is treated as `[]`. + * - Each column value is wire-encoded as a STRING (or null); the client decodes + * it back using the per-column `electric-schema` header. Columns absent from + * the schema are passed through unparsed (so text/timestamp stay strings). + * - `up-to-date` is the only control message that makes the client emit rows. + * - Re-sending the full row each cycle is idempotent: the client merges by `key`. + */ + +export type ElectricColumnType = + | "text" + | "timestamp" + | "int4" + | "int8" + | "float8" + | "bool" + | "jsonb"; + +type ElectricColumn = { + name: string; + type: ElectricColumnType; + /** Array dimensionality. 1 => `type[]` (Postgres `{a,b}` literal). */ + dims?: number; + /** + * Array columns only. True when the Postgres column has NO default, so an + * empty/absent value is stored as SQL NULL (Electric emits `null`) rather than + * an empty-array literal `{}`. Prisma erases this distinction — it coerces both + * NULL and `{}` to `[]` on read — so we re-derive the wire form from the column's + * known schema. `runTags` has no default; `realtimeStreams` has `@default([])`. 
+ */ + emptyArrayAsNull?: boolean; +}; + +/** + * The columns the realtime run feed exposes, mirroring `DEFAULT_ELECTRIC_COLUMNS` + * in `realtimeClient.server.ts` and their Postgres types from the `TaskRun` + * Prisma model. The `type`/`dims` drive both the `electric-schema` header and + * the value encoding. Keep in sync with `DEFAULT_ELECTRIC_COLUMNS`. + */ +export const RUN_ELECTRIC_COLUMNS: ReadonlyArray = [ + { name: "id", type: "text" }, + { name: "taskIdentifier", type: "text" }, + { name: "createdAt", type: "timestamp" }, + { name: "updatedAt", type: "timestamp" }, + { name: "startedAt", type: "timestamp" }, + { name: "delayUntil", type: "timestamp" }, + { name: "queuedAt", type: "timestamp" }, + { name: "expiredAt", type: "timestamp" }, + { name: "completedAt", type: "timestamp" }, + { name: "friendlyId", type: "text" }, + { name: "number", type: "int4" }, + { name: "isTest", type: "bool" }, + { name: "status", type: "text" }, + { name: "usageDurationMs", type: "int4" }, + { name: "costInCents", type: "float8" }, + { name: "baseCostInCents", type: "float8" }, + { name: "ttl", type: "text" }, + { name: "payload", type: "text" }, + { name: "payloadType", type: "text" }, + { name: "metadata", type: "text" }, + { name: "metadataType", type: "text" }, + { name: "output", type: "text" }, + { name: "outputType", type: "text" }, + { name: "runTags", type: "text", dims: 1, emptyArrayAsNull: true }, + { name: "error", type: "jsonb" }, + { name: "realtimeStreams", type: "text", dims: 1 }, +]; + +/** Columns that can never be skipped via `skipColumns` (mirrors realtimeClient). */ +export const RESERVED_COLUMNS = ["id", "taskIdentifier", "friendlyId", "status", "createdAt"]; + +/** + * Shape of a single run hydrated for the realtime feed. Structurally compatible + * with the Prisma `TaskRun` projection produced by `RunHydrator`. 
+ */ +export type RealtimeRunRow = { + id: string; + taskIdentifier: string; + createdAt: Date; + updatedAt: Date; + startedAt: Date | null; + delayUntil: Date | null; + queuedAt: Date | null; + expiredAt: Date | null; + completedAt: Date | null; + friendlyId: string; + number: number; + isTest: boolean; + status: string; + usageDurationMs: number; + costInCents: number; + baseCostInCents: number; + ttl: string | null; + payload: string; + payloadType: string; + metadata: string | null; + metadataType: string; + output: string | null; + outputType: string; + runTags: string[]; + error: unknown; + realtimeStreams: string[]; +}; + +type Operation = "insert" | "update" | "delete"; + +type ChangeMessage = { + key: string; + value: Record; + headers: { operation: Operation }; +}; + +type ControlMessage = { + headers: { control: "up-to-date" | "must-refetch" }; +}; + +type ShapeMessage = ChangeMessage | ControlMessage; + +const UP_TO_DATE: ControlMessage = { headers: { control: "up-to-date" } }; + +function effectiveSkipColumns(skipColumns: string[]): Set { + return new Set(skipColumns.filter((c) => c !== "" && !RESERVED_COLUMNS.includes(c))); +} + +function quoteArrayElement(value: string): string { + return `"${value.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`; +} + +function pgArrayLiteral(values: unknown[]): string { + if (values.length === 0) { + return "{}"; + } + return `{${values.map((v) => quoteArrayElement(String(v))).join(",")}}`; +} + +function serializeValue(value: unknown, column: ElectricColumn): string | null { + if (value === null || value === undefined) { + return null; + } + + if (column.dims && column.dims > 0) { + if (!Array.isArray(value)) { + return null; + } + // A no-default array column stores NULL when empty, so Electric emits `null` + // (not `{}`); match that here since Prisma handed us `[]` for the NULL value. 
+ if (value.length === 0 && column.emptyArrayAsNull) { + return null; + } + return pgArrayLiteral(value); + } + + switch (column.type) { + case "bool": + // Postgres text representation; the client's parseBool accepts "t"/"f". + return value ? "t" : "f"; + case "timestamp": + // The SDK's RawShapeDate appends "Z" before parsing, so we emit the ISO + // string WITHOUT the trailing "Z". + return value instanceof Date ? value.toISOString().slice(0, -1) : String(value); + case "jsonb": + return JSON.stringify(value); + case "int4": + case "int8": + case "float8": + case "text": + default: + return String(value); + } +} + +/** The merge key the client uses to reassemble a row across insert/update cycles. */ +export function runShapeKey(runId: string): string { + return `"public"."TaskRun"/"${runId}"`; +} + +/** Encode a single run row into the wire `value` object (column -> string|null). */ +export function serializeRunRow( + row: RealtimeRunRow, + skipColumns: string[] = [] +): Record { + const skip = effectiveSkipColumns(skipColumns); + const value: Record = {}; + + for (const column of RUN_ELECTRIC_COLUMNS) { + if (skip.has(column.name)) { + continue; + } + value[column.name] = serializeValue((row as Record)[column.name], column); + } + + return value; +} + +/** The `electric-schema` response header value for the (optionally trimmed) column set. */ +export function buildElectricSchemaHeader(skipColumns: string[] = []): string { + const skip = effectiveSkipColumns(skipColumns); + const schema: Record = {}; + + for (const column of RUN_ELECTRIC_COLUMNS) { + if (skip.has(column.name)) { + continue; + } + schema[column.name] = column.dims ? { type: column.type, dims: column.dims } : { type: column.type }; + } + + return JSON.stringify(schema); +} + +/** + * Initial snapshot body: a single `insert` for the row (if it exists) followed by + * `up-to-date`. An absent row emits a bare `up-to-date` (an empty shape), which is + * how Electric represents "no rows match". 
+ */
+export function buildSnapshotBody(row: RealtimeRunRow | null, skipColumns: string[] = []): string {
+  const messages: ShapeMessage[] = [];
+  if (row) {
+    messages.push({
+      key: runShapeKey(row.id),
+      value: serializeRunRow(row, skipColumns),
+      headers: { operation: "insert" },
+    });
+  }
+  // The control message is always the last element of the JSON array.
+  messages.push(UP_TO_DATE);
+  return JSON.stringify(messages);
+}
+
+/**
+ * Live body when the row advanced: a full-row `update` followed by `up-to-date`.
+ *
+ * @param row The current row; all non-skipped columns are re-sent, not a column delta.
+ * @param skipColumns Columns to omit from the emitted `value`.
+ * @returns The JSON-encoded two-message array.
+ */
+export function buildUpdateBody(row: RealtimeRunRow, skipColumns: string[] = []): string {
+  const messages: ShapeMessage[] = [
+    {
+      key: runShapeKey(row.id),
+      value: serializeRunRow(row, skipColumns),
+      headers: { operation: "update" },
+    },
+    UP_TO_DATE,
+  ];
+  return JSON.stringify(messages);
+}
+
+/** Live body when nothing advanced: a bare `up-to-date` (no row emission). */
+export function buildUpToDateBody(): string {
+  return JSON.stringify([UP_TO_DATE]);
+}
+
+/** A hydrated row paired with the operation it should be emitted under. */
+export type RowChange = { row: RealtimeRunRow; operation: "insert" | "update" };
+
+/**
+ * Multi-row body for the tag-list feed: one change message per row (insert for
+ * rows new to the shape, update for rows that advanced) followed by `up-to-date`.
+ * An empty `changes` array emits a bare `up-to-date`. The client merges every row
+ * by key, so re-emitting a full row is idempotent.
+ *
+ * @param changes Rows to emit, in order, each with its per-row operation.
+ * @param skipColumns Columns to omit from each row's `value`.
+ * @returns The JSON-encoded message array.
+ */
+export function buildRowsBody(changes: RowChange[], skipColumns: string[] = []): string {
+  const messages: ShapeMessage[] = changes.map((change) => ({
+    key: runShapeKey(change.row.id),
+    value: serializeRunRow(change.row, skipColumns),
+    headers: { operation: change.operation },
+  }));
+  messages.push(UP_TO_DATE);
+  return JSON.stringify(messages);
+}
+
+/** A row change whose wire `value` was already serialized (once, shared across feeds by
+ * the EnvChangeRouter); the per-feed `operation` is applied here.
*/
+export type SerializedRowChange = {
+  runId: string;
+  value: Record;
+  operation: "insert" | "update";
+};
+
+/** Like `buildRowsBody`, but from values serialized once per (runId, columnSet) upstream,
+ * so a run matching many feeds is serialized once and reused across their bodies.
+ * No column skipping happens here: `change.value` is emitted exactly as supplied. */
+export function buildRowsBodyFromSerialized(changes: SerializedRowChange[]): string {
+  const messages: ShapeMessage[] = changes.map((change) => ({
+    key: runShapeKey(change.runId),
+    value: change.value,
+    headers: { operation: change.operation },
+  }));
+  messages.push(UP_TO_DATE);
+  return JSON.stringify(messages);
+}
+
+/** The offset value that marks a request as an initial snapshot (no prior offset). */
+export const INITIAL_OFFSET = "-1";
+
+/**
+ * Opaque offset token, formatted to satisfy the client's `${number}_${number}`
+ * type. The first segment is the row's `updatedAt` epoch-ms (lets a live request
+ * detect whether the replica row has advanced past what the client already has);
+ * the second is a per-connection sequence counter.
+ *
+ * Both inputs are truncated toward zero before formatting.
+ */
+export function encodeOffset(updatedAtMs: number, seq: number): string {
+  return `${Math.trunc(updatedAtMs)}_${Math.trunc(seq)}`;
+}
+
+/**
+ * Extract the `updatedAt` epoch-ms a client last saw from its echoed offset.
+ *
+ * Returns 0 when the offset is missing/empty, or when its first `_`-separated segment
+ * is not a finite number greater than zero (so `INITIAL_OFFSET` parses to 0).
+ */
+export function parseOffsetUpdatedAtMs(offset: string | null | undefined): number {
+  if (!offset) {
+    return 0;
+  }
+  // Only the first segment matters here; the sequence counter is ignored.
+  const [first] = offset.split("_");
+  const value = Number(first);
+  return Number.isFinite(value) && value > 0 ? value : 0;
+}
+
+/** Mirror of realtimeClient's DEQUEUED->EXECUTING rewrite for non-current API versions.
*/
+export function rewriteBodyForLegacyApiVersion(body: string): string {
+  // Textual rewrite of the already-serialized body. The pattern requires unescaped
+  // quotes and no whitespace around the colon, i.e. the form JSON.stringify emits for
+  // an object property; a `status` pair nested inside a JSON-encoded string value has
+  // escaped quotes and is left untouched.
+  return body.replace(/"status":"DEQUEUED"/g, '"status":"EXECUTING"');
+}
diff --git a/apps/webapp/app/services/realtime/envChangeRouter.server.ts b/apps/webapp/app/services/realtime/envChangeRouter.server.ts
new file mode 100644
index 00000000000..0c68140e58b
--- /dev/null
+++ b/apps/webapp/app/services/realtime/envChangeRouter.server.ts
@@ -0,0 +1,347 @@
+import { type ChangeRecord } from "./runChangeNotifier.server";
+import { type RealtimeRunRow, serializeRunRow } from "./electricStreamProtocol.server";
+
+/**
+ * EnvChangeRouter — the per-instance routing layer that turns "feeds as predicates over
+ * one env stream" into cheap fan-out.
+ *
+ * It owns ONE subscription per environment (over the RunChangeNotifier) and an inverted
+ * index of the feeds currently held by THIS instance: `runId -> feeds`, `tag -> feeds`,
+ * `batchId -> feeds`. On a coalesced batch of ChangeRecords it:
+ *   1. routes each record to only the matching held feeds via the index (O(record-tags),
+ *      not O(feeds)) — a record that matches nothing costs nothing;
+ *   2. batch-hydrates the matched runs from Postgres ONCE per column set (collapsing the
+ *      hot-shared-tag fan-out: one run matching N feeds = one `hydrateByIds`, not N);
+ *   3. serializes each row's wire value ONCE per column set, reused across all matching
+ *      feeds;
+ *   4. resolves each matching feed's pending wait with its hydrated+serialized rows.
+ *
+ * It is stateless across reconnects: the index is rebuilt from whatever feeds this
+ * instance happens to hold, so no shape affinity or cross-poll memory is required. The
+ * per-handle working-set diff (insert vs update) stays in the consumer; the router only
+ * decides membership, hydrates, and serializes.
+ */
+
+/** Why a feed's wait resolved: a matching change batch ("notify"), the wait's own timer
+ * ("timeout"), or the abort signal / the registration being closed ("abort"). */
+export type WakeReason = "notify" | "timeout" | "abort";
+
+/** A feed's membership predicate over the env stream.
*/
+export type FeedFilter =
+  | { kind: "run"; runId: string }
+  | { kind: "tag"; tags: string[]; createdAtFloorMs?: number }
+  | { kind: "batch"; batchId: string };
+
+/** A matched run handed to a feed: the hydrated row (for the feed's working-set diff) and
+ * its wire `value` serialized once for this feed's column set (shared across feeds). */
+export type MatchedRow = { row: RealtimeRunRow; value: Record<string, string | null> };
+
+export type WaitResult = { reason: WakeReason; rows: MatchedRow[] };
+
+/** Minimal deps so the router is unit-testable without Redis/Postgres. */
+export interface EnvChangeSource {
+  subscribeToEnv(environmentId: string, onBatch: (records: ChangeRecord[]) => void): () => void;
+}
+export interface RowHydrator {
+  hydrateByIds(
+    environmentId: string,
+    ids: string[],
+    skipColumns: string[]
+  ): Promise<RealtimeRunRow[]>;
+}
+
+export type EnvChangeRouterOptions = {
+  source: EnvChangeSource;
+  hydrator: RowHydrator;
+  /** Observability: a hydrate-by-id batch ran (count = runs hydrated this tick). */
+  onHydrate?: (runCount: number) => void;
+  /** Observability: routing a batch failed (e.g. the hydrate rejected). The failure is
+   * otherwise swallowed so the affected feeds fall through to their timeout backstop. */
+  onError?: (error: unknown) => void;
+};
+
+/** Handle a feed holds for the duration of one long-poll. */
+export type FeedRegistration = {
+  /** Wait for the next batch matching this feed (or timeout/abort), with the matched runs
+   * hydrated + serialized for this feed's columns. One wait active at a time. */
+  waitForMatch(signal: AbortSignal | undefined, timeoutMs: number): Promise<WaitResult>;
+  /** Deregister from the index; unsubscribes the env when the last feed leaves. */
+  close(): void;
+};
+
+type Feed = {
+  filter: FeedFilter;
+  skipColumns: string[];
+  /** Sorted, comma-joined skip list; feeds with the same signature share one hydrate. */
+  columnSig: string;
+  /** The currently-waiting poll's resolver (null between polls). */
+  resolve: ((result: WaitResult) => void) | null;
+};
+
+type EnvState = {
+  unsubscribe: () => void;
+  feeds: Set<Feed>;
+  byRunId: Map<string, Set<Feed>>;
+  byTag: Map<string, Set<Feed>>;
+  byBatchId: Map<string, Set<Feed>>;
+  /** All tag feeds, for routing partial records (no tags) as hydrate-to-classify candidates. */
+  tagFeeds: Set<Feed>;
+};
+
+/** Add `feed` under `key`, creating the bucket on first use. */
+function addToIndex(index: Map<string, Set<Feed>>, key: string, feed: Feed) {
+  let set = index.get(key);
+  if (!set) {
+    set = new Set();
+    index.set(key, set);
+  }
+  set.add(feed);
+}
+
+/** Remove `feed` from `key`'s bucket, dropping the bucket once it is empty. */
+function removeFromIndex(index: Map<string, Set<Feed>>, key: string, feed: Feed) {
+  const set = index.get(key);
+  if (set) {
+    set.delete(feed);
+    if (set.size === 0) {
+      index.delete(key);
+    }
+  }
+}
+
+export class EnvChangeRouter {
+  readonly #envs = new Map<string, EnvState>();
+
+  constructor(private readonly options: EnvChangeRouterOptions) {}
+
+  /**
+   * Register a feed for `environmentId` and index it by its filter. The env subscription
+   * is opened on the first feed and released when the last one is closed.
+   *
+   * @returns A handle exposing `waitForMatch` (one long-poll wait) and `close`.
+   */
+  register(environmentId: string, filter: FeedFilter, skipColumns: string[]): FeedRegistration {
+    const env = this.#ensureEnv(environmentId);
+    const feed: Feed = {
+      filter,
+      skipColumns,
+      columnSig: skipColumns.length > 0 ? [...skipColumns].sort().join(",") : "",
+      resolve: null,
+    };
+
+    env.feeds.add(feed);
+    this.#indexFeed(env, feed);
+
+    const waitForMatch = (signal: AbortSignal | undefined, timeoutMs: number) =>
+      new Promise<WaitResult>((resolve) => {
+        if (signal?.aborted) {
+          resolve({ reason: "abort", rows: [] });
+          return;
+        }
+        let settled = false;
+        let timer: ReturnType<typeof setTimeout> | undefined;
+        let onAbort: (() => void) | undefined;
+        // Single exit point: whichever of notify/timeout/abort fires first wins, and the
+        // timer + abort listener are always released.
+        const settle = (result: WaitResult) => {
+          if (settled) return;
+          settled = true;
+          feed.resolve = null;
+          if (timer) clearTimeout(timer);
+          if (signal && onAbort) signal.removeEventListener("abort", onAbort);
+          resolve(result);
+        };
+        feed.resolve = settle;
+        timer = setTimeout(() => settle({ reason: "timeout", rows: [] }), timeoutMs);
+        // Don't let a pending long-poll keep the process alive on shutdown.
+        timer.unref?.();
+        if (signal) {
+          onAbort = () => settle({ reason: "abort", rows: [] });
+          signal.addEventListener("abort", onAbort, { once: true });
+        }
+      });
+
+    const close = () => {
+      if (!env.feeds.has(feed)) {
+        return;
+      }
+      env.feeds.delete(feed);
+      this.#deindexFeed(env, feed);
+      // Resolve any in-flight wait so the poll doesn't hang.
+      feed.resolve?.({ reason: "abort", rows: [] });
+      feed.resolve = null;
+      if (env.feeds.size === 0) {
+        this.#envs.delete(environmentId);
+        env.unsubscribe();
+      }
+    };
+
+    return { waitForMatch, close };
+  }
+
+  /** Distinct environments currently routed (for metrics). */
+  get activeEnvCount(): number {
+    return this.#envs.size;
+  }
+
+  /** Return the env's routing state, creating it and opening its subscription on first use. */
+  #ensureEnv(environmentId: string): EnvState {
+    const existing = this.#envs.get(environmentId);
+    if (existing) {
+      return existing;
+    }
+    const env: EnvState = {
+      unsubscribe: () => {},
+      feeds: new Set(),
+      byRunId: new Map(),
+      byTag: new Map(),
+      byBatchId: new Map(),
+      tagFeeds: new Set(),
+    };
+    this.#envs.set(environmentId, env);
+    try {
+      env.unsubscribe = this.options.source.subscribeToEnv(environmentId, (records) => {
+        // Fire-and-forget; the notifier doesn't await us. A routing/hydrate failure must be
+        // caught here: an unhandled rejection can terminate the process, whereas swallowing
+        // it simply leaves the waiters to time out into their backstop full resolve.
+        void this.#onBatch(environmentId, env, records).catch((error: unknown) => {
+          this.#reportError(error);
+        });
+      });
+    } catch (error) {
+      // Don't leave an env with no subscription in the map: a later register() would
+      // reuse it and never be notified.
+      this.#envs.delete(environmentId);
+      throw error;
+    }
+    return env;
+  }
+
+  /** Forward a routing failure to the optional hook; the hook itself must never throw out. */
+  #reportError(error: unknown) {
+    try {
+      this.options.onError?.(error);
+    } catch {
+      // Observability must not break routing.
+    }
+  }
+
+  #indexFeed(env: EnvState, feed: Feed) {
+    switch (feed.filter.kind) {
+      case "run":
+        addToIndex(env.byRunId, feed.filter.runId, feed);
+        break;
+      case "batch":
+        addToIndex(env.byBatchId, feed.filter.batchId, feed);
+        break;
+      case "tag":
+        env.tagFeeds.add(feed);
+        for (const tag of feed.filter.tags) {
+          addToIndex(env.byTag, tag, feed);
+        }
+        break;
+    }
+  }
+
+  #deindexFeed(env: EnvState, feed: Feed) {
+    switch (feed.filter.kind) {
+      case "run":
+        removeFromIndex(env.byRunId, feed.filter.runId, feed);
+        break;
+      case "batch":
+        removeFromIndex(env.byBatchId, feed.filter.batchId, feed);
+        break;
+      case "tag":
+        env.tagFeeds.delete(feed);
+        for (const tag of feed.filter.tags) {
+          removeFromIndex(env.byTag, tag, feed);
+        }
+        break;
+    }
+  }
+
+  /**
+   * Route one batch of change records: match records to waiting feeds, hydrate the matched
+   * runs once per column set, and resolve each feed's wait with its rows. Rejects if the
+   * hydrator rejects; the caller is responsible for handling that rejection.
+   */
+  async #onBatch(environmentId: string, env: EnvState, records: ChangeRecord[]) {
+    // 1. Route each record to the held feeds it matches; collect matched runIds per feed.
+    const matchedRunIdsByFeed = new Map<Feed, Set<string>>();
+    const addMatch = (feed: Feed, runId: string) => {
+      if (!feed.resolve) {
+        // Feed isn't currently waiting (between polls). Drop — its backstop catches gaps.
+        return;
+      }
+      let set = matchedRunIdsByFeed.get(feed);
+      if (!set) {
+        set = new Set();
+        matchedRunIdsByFeed.set(feed, set);
+      }
+      set.add(runId);
+    };
+
+    for (const record of records) {
+      // run feeds: exact runId match.
+      const runFeeds = env.byRunId.get(record.runId);
+      if (runFeeds) {
+        for (const feed of runFeeds) addMatch(feed, record.runId);
+      }
+
+      // batch feeds: exact batchId match (only when the record carries one).
+      if (record.batchId) {
+        const batchFeeds = env.byBatchId.get(record.batchId);
+        if (batchFeeds) {
+          for (const feed of batchFeeds) addMatch(feed, record.runId);
+        }
+      }
+
+      // tag feeds.
+      if (record.tags !== undefined) {
+        // Full record: prune via the tag index; only feeds whose filter intersects match.
+        const seen = new Set<Feed>();
+        for (const tag of record.tags) {
+          const tagFeeds = env.byTag.get(tag);
+          if (!tagFeeds) continue;
+          for (const feed of tagFeeds) {
+            if (seen.has(feed)) continue;
+            seen.add(feed);
+            addMatch(feed, record.runId);
+          }
+        }
+      } else {
+        // Partial record (no membership data): route to every tag feed as a candidate to
+        // hydrate-and-classify (rare; the publish side emits full records in practice).
+        for (const feed of env.tagFeeds) addMatch(feed, record.runId);
+      }
+    }
+
+    if (matchedRunIdsByFeed.size === 0) {
+      return;
+    }
+
+    // 2. Batch-hydrate ONCE per column set, then 3. serialize ONCE per (runId, column set).
+    const runIdsByColumnSig = new Map<string, { skipColumns: string[]; runIds: Set<string> }>();
+    for (const [feed, runIds] of matchedRunIdsByFeed) {
+      let group = runIdsByColumnSig.get(feed.columnSig);
+      if (!group) {
+        group = { skipColumns: feed.skipColumns, runIds: new Set() };
+        runIdsByColumnSig.set(feed.columnSig, group);
+      }
+      for (const id of runIds) group.runIds.add(id);
+    }
+
+    const hydratedByColumnSig = new Map<string, Map<string, MatchedRow>>();
+    await Promise.all(
+      [...runIdsByColumnSig.entries()].map(async ([columnSig, group]) => {
+        const ids = [...group.runIds];
+        const rows = await this.options.hydrator.hydrateByIds(
+          environmentId,
+          ids,
+          group.skipColumns
+        );
+        this.options.onHydrate?.(rows.length);
+        const map = new Map<string, MatchedRow>();
+        for (const row of rows) {
+          map.set(row.id, { row, value: serializeRunRow(row, group.skipColumns) });
+        }
+        hydratedByColumnSig.set(columnSig, map);
+      })
+    );
+
+    // 4. Assemble each feed's matched rows (post-filtering tag feeds against the
+    //    authoritative hydrated row) and resolve its pending wait.
+    for (const [feed, runIds] of matchedRunIdsByFeed) {
+      if (!feed.resolve) {
+        continue; // stopped waiting while we hydrated; its next poll/backstop covers it
+      }
+      const hydrated = hydratedByColumnSig.get(feed.columnSig);
+      if (!hydrated) continue;
+
+      const rows: MatchedRow[] = [];
+      for (const runId of runIds) {
+        const matched = hydrated.get(runId);
+        if (!matched) continue; // run not found / left the table
+        if (feed.filter.kind === "tag" && !this.#tagRowMatches(matched.row, feed.filter)) {
+          continue; // re-confirm tags + createdAt floor against the authoritative row
+        }
+        rows.push(matched);
+      }
+
+      if (rows.length > 0) {
+        feed.resolve({ reason: "notify", rows });
+      }
+      // No surviving rows (e.g. a partial-record candidate that didn't actually match):
+      // leave the feed waiting; nothing relevant changed for it.
+    }
+  }
+
+  /** Authoritative re-check for tag feeds: the hydrated row's tags intersect the filter
+   * and its createdAt is within the feed's window. Handles partial-record candidates and
+   * guards record/row tag skew. */
+  #tagRowMatches(row: RealtimeRunRow, filter: Extract<FeedFilter, { kind: "tag" }>): boolean {
+    if (filter.createdAtFloorMs !== undefined && row.createdAt.getTime() < filter.createdAtFloorMs) {
+      return false;
+    }
+    const rowTags = row.runTags ?? [];
+    return filter.tags.some((tag) => rowTags.includes(tag));
+  }
+}
diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts
new file mode 100644
index 00000000000..8d5d597c65b
--- /dev/null
+++ b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts
@@ -0,0 +1,1026 @@
+import { json } from "@remix-run/server-runtime";
+import { safeParseNaturalLanguageDurationAgo } from "@trigger.dev/core/v3/isomorphic";
+import { randomUUID } from "node:crypto";
+import { API_VERSIONS, CURRENT_API_VERSION } from "~/api/versions";
+import {
+  type CachedLimitProvider,
+  type RealtimeEnvironment,
+  type RealtimeRequestOptions,
+  type RealtimeRunsParams,
+} from "../realtimeClient.server";
+import { logger } from "../logger.server";
+import {
+  buildElectricSchemaHeader,
+  buildRowsBody,
+  buildRowsBodyFromSerialized,
+  buildSnapshotBody,
+  buildUpdateBody,
+  buildUpToDateBody,
+  encodeOffset,
+  INITIAL_OFFSET,
+  parseOffsetUpdatedAtMs,
+  type RealtimeRunRow,
+  rewriteBodyForLegacyApiVersion,
+  RESERVED_COLUMNS,
+  type RowChange,
+  type SerializedRowChange,
+} from "./electricStreamProtocol.server";
+import { BoundedTtlCache } from "./boundedTtlCache";
+import {
+  type EnvChangeRouter,
+  type FeedFilter,
+  type MatchedRow,
+} from "./envChangeRouter.server";
+import { type RunHydrator, type RunListResolver } from "./runReader.server";
+import { type RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server";
+
+/** The tag-list feed resolves ids via ClickHouse, which needs org + project + env.
+ * `authentication.environment` (AuthenticatedEnvironment) provides projectId, so
+ * widening here avoids touching the Electric client's RealtimeEnvironment type. */
+export type RealtimeListEnvironment = RealtimeEnvironment & { projectId: string };
+
+/** The realtime feeds the run routes depend on (single-run, tag-list, batch). Both
+ * the Electric client and this notifier client satisfy it, so the routes can switch
+ * between them behind a flag. */
+export interface RealtimeStreamClient {
+  streamRun(
+    url: URL | string,
+    environment: RealtimeEnvironment,
+    runId: string,
+    apiVersion: API_VERSIONS,
+    requestOptions?: RealtimeRequestOptions,
+    clientVersion?: string,
+    signal?: AbortSignal
+  ): Promise;
+  streamRuns(
+    url: URL | string,
+    environment: RealtimeListEnvironment,
+    params: RealtimeRunsParams,
+    apiVersion: API_VERSIONS,
+    requestOptions?: RealtimeRequestOptions,
+    clientVersion?: string,
+    signal?: AbortSignal
+  ): Promise;
+  streamBatch(
+    url: URL | string,
+    environment: RealtimeListEnvironment,
+    batchId: string,
+    apiVersion: API_VERSIONS,
+    requestOptions?: RealtimeRequestOptions,
+    clientVersion?: string,
+    signal?: AbortSignal
+  ): Promise;
+}
+
+/** Why a live request woke; reported through the `onWakeup` hook. */
+export type WakeupReason = "notify" | "timeout" | "abort";
+
+/** How a live poll resolved, for observability:
+ *  - `fast-hydrate`: the router woke this feed with matched rows (hydrated by id, NO
+ *    ClickHouse). Non-matching changes never wake the feed, so they cost nothing.
+ *  - `full-resolve`: the backstop timeout did a ClickHouse resolve (the correctness net). */
+export type LivePollPath = "fast-hydrate" | "full-resolve";
+
+export type NotifierRealtimeClientOptions = {
+  runReader: RunHydrator;
+  /** Resolves the tag/list filter into the matching id-set (filter-only). */
+  runListResolver: RunListResolver;
+  /** Per-instance routing layer over the single env change channel. */
+  router: EnvChangeRouter;
+  limiter: RealtimeConcurrencyLimiter;
+  cachedLimitProvider: CachedLimitProvider;
+  /** Backstop wait before refetching on a live request (ms). Presumably defaults to
+   * DEFAULT_LIVE_POLL_TIMEOUT_MS (20000) — the fallback is applied outside this section;
+   * the previous "5000" did not match that constant. */
+  livePollTimeoutMs?: number;
+  /** Ceiling for the tag-list createdAt lookback window (ms). */
+  maximumCreatedAtFilterAgeMs: number;
+  /** Hard cap on tag-list snapshot size. Defaults to 1000. */
+  maxListResults?: number;
+  /** TTL (ms) for the multi-run resolve+hydrate coalescing cache (initial + backstop).
+   * Defaults to 1000. */
+  runSetResolveCacheTtlMs?: number;
+  /** Max entries in the resolve+hydrate cache. Defaults to 5000. */
+  runSetResolveCacheMaxEntries?: number;
+  /** Max entries in the per-handle working-set cache. Defaults to 10000. */
+  listCacheMaxEntries?: number;
+  /** Epoch-aligned bucket (ms) the tag-list createdAt lower bound is floored to, so
+   * same-tag feeds pinned within the same bucket share a cache entry. Defaults to
+   * 60000. 0 disables bucketing. */
+  runSetCreatedAtBucketMs?: number;
+  /** When true (default), a multi-run live poll holds the connection until a real delta
+   * or the backstop, rather than returning an empty up-to-date the client would re-issue. */
+  holdOnEmpty?: boolean;
+  /** Max concurrent fresh ClickHouse resolves (cache misses) across this instance. Bounds a
+   * distinct-filter reconnect stampede so it queues instead of hammering ClickHouse. Defaults
+   * to 16; 0 disables the gate (unbounded). */
+  resolveAdmissionLimit?: number;
+  /** Observability hook: why a live request woke (notify vs timeout vs abort). */
+  onWakeup?: (reason: WakeupReason) => void;
+  /** Observability hook: how a live poll resolved (fast path vs full resolve). */
+  onLivePollPath?: (path: LivePollPath) => void;
+  /** Observability hook: whether a multi-run resolve (initial/backstop) hit the cache,
+   * coalesced onto an in-flight resolve, or missed (fresh ClickHouse + Postgres). */
+  onRunSetResolve?: (result: "hit" | "miss" | "coalesced") => void;
+  /** Observability hook: latency (ms) of the ClickHouse resolve / Postgres hydrate. */
+  onRunSetQuery?: (stage: "resolve" | "hydrate", ms: number) => void;
+  /** Observability hook: a fresh resolve had to wait `ms` for an admission permit (the gate
+   * engaged — i.e. a stampede was throttled). Not called when a permit is free. */
+  onResolveAdmissionWait?: (ms: number) => void;
+};
+
+const DEFAULT_CONCURRENCY_LIMIT = 100_000;
+// Matches Electric's ~20s live long-poll hold (jittered ±15% per request).
+const DEFAULT_LIVE_POLL_TIMEOUT_MS = 20_000;
+const DEFAULT_MAX_LIST_RESULTS = 1_000;
+// Working-set cache: entry TTL (5 minutes) and default max entries.
+const LIST_CACHE_TTL_MS = 5 * 60_000;
+const LIST_CACHE_MAX_ENTRIES = 10_000;
+const DEFAULT_RUNSET_CACHE_TTL_MS = 1_000;
+const DEFAULT_RUNSET_CACHE_MAX_ENTRIES = 5_000;
+const DEFAULT_RUNSET_CREATED_AT_BUCKET_MS = 60_000;
+const DEFAULT_RESOLVE_ADMISSION_LIMIT = 16;
+
+/**
+ * Fair FIFO semaphore bounding how many fresh ClickHouse resolves run concurrently. It sits
+ * BEHIND the single-flight + TTL cache, so only genuine cache-miss resolves take a permit: a
+ * same-filter reconnect stampede still collapses to one in-flight resolve (one permit), while
+ * a distinct-filter stampede — where every filter is a different cache key and so can't
+ * coalesce — is throttled to `limit` concurrent CH queries instead of firing all N at the
+ * database at once. Trades a little connect latency under a stampede for bounded CH load.
+ *
+ * NOTE(review): `acquire()` takes no abort signal, so a queued waiter cannot be withdrawn;
+ * confirm callers always reach `release()` (e.g. via `finally`).
+ */
+class ResolveAdmissionGate {
+  #available: number;
+  #inUse = 0;
+  readonly #waiters: Array<() => void> = [];
+
+  constructor(limit: number) {
+    this.#available = limit;
+  }
+
+  /** Permits currently held (for a metrics gauge); never exceeds the limit. */
+  get inUse(): number {
+    return this.#inUse;
+  }
+
+  /** Take a permit: resolves immediately when one is free, otherwise queues (FIFO) until
+   * `release()` hands one over. */
+  async acquire(): Promise {
+    if (this.#available > 0) {
+      this.#available--;
+      this.#inUse++;
+      return;
+    }
+    await new Promise((resolve) => this.#waiters.push(resolve));
+    // The permit was handed over directly by release(), so #available is not touched here.
+    this.#inUse++;
+  }
+
+  /** Return a permit, passing it to the oldest waiter if there is one. */
+  release(): void {
+    this.#inUse--;
+    const next = this.#waiters.shift();
+    if (next) {
+      next(); // hand the freed permit straight to the next waiter (FIFO, no count churn)
+    } else {
+      this.#available++;
+    }
+  }
+}
+
+/** A multi-run feed's filter. Tag-list sets `tags` (+ pinned `createdAtAfter`);
+ * the batch feed sets `batchId`. Both resolve to an id-set via the resolver. */
+type RunSetFilter = {
+  tags?: string[];
+  batchId?: string;
+  createdAtAfter?: Date;
+};
+
+/** Per-handle working set: runId -> last-emitted updatedAt (ms), so live polls
+ * emit only rows that advanced. */
+type WorkingSet = Map;
+
+/** Token values placed on a response; `cursor` and `schema` are optional. */
+type ResponseHeaderInput = {
+  offset: string;
+  handle: string;
+  cursor?: string;
+  schema?: string;
+};
+
+/**
+ * Notifier-backed implementation of the realtime run feeds. All three feeds are
+ * predicates over ONE per-environment change stream (the EnvChangeRouter); the router
+ * decides membership, hydrates the matched runs from a read replica, and serializes their
+ * wire values once. This client owns the snapshot, the per-handle working-set diff, the
+ * ClickHouse-backed backstop, and the wire response.
+ *
+ * Single-run (`streamRun`):
+ *   - initial (`offset=-1`): hydrate + emit `insert` + `up-to-date` (with schema).
+ *   - live: the router wakes this feed when its run changes; emit a full-row `update` when
+ *     `updatedAt` advanced past what the client has, else a bare `up-to-date`. The backstop
+ *     re-checks via `getRunById`.
+ *
+ * Multi-run feeds (`streamRuns` tag-list, `streamBatch`):
+ *   - initial: resolve the matching id-set via ClickHouse (filter-only), hydrate by-id from
+ *     Postgres, emit N `insert`s, seed the working set.
+ * - live: the router wakes the feed with the matched runs already hydrated + serialized; + * diff them on the authoritative Postgres `updatedAt` against the per-handle working + * set and emit only new/advanced rows. The backstop (timeout) does a full ClickHouse + * resolve — the correctness net that catches gaps and drops departed runs. + * + * Tokens are opaque: `offset` = `_`, `handle` is per-shape, `cursor` + * is a live-only counter. The wire format is produced by `electricStreamProtocol`. + */ +export class NotifierRealtimeClient implements RealtimeStreamClient { + #seq = 0; + readonly #workingSetCache: BoundedTtlCache; + /** Coalescing cache for the multi-run (resolveIds -> hydrateByIds) pair used by the + * initial snapshot and the backstop, keyed by (env, filter, columns). Collapses a + * reconnect/snapshot stampede of identical filters into one shared resolve+hydrate. */ + readonly #runSetCache: BoundedTtlCache; + readonly #runSetInflight = new Map>(); + /** Bounds concurrent fresh CH resolves (undefined => unbounded). */ + readonly #admissionGate?: ResolveAdmissionGate; + + constructor(private readonly options: NotifierRealtimeClientOptions) { + this.#workingSetCache = new BoundedTtlCache( + LIST_CACHE_TTL_MS, + options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES + ); + this.#runSetCache = new BoundedTtlCache( + options.runSetResolveCacheTtlMs ?? DEFAULT_RUNSET_CACHE_TTL_MS, + options.runSetResolveCacheMaxEntries ?? DEFAULT_RUNSET_CACHE_MAX_ENTRIES + ); + const admissionLimit = options.resolveAdmissionLimit ?? DEFAULT_RESOLVE_ADMISSION_LIMIT; + if (admissionLimit > 0) { + this.#admissionGate = new ResolveAdmissionGate(admissionLimit); + } + } + + /** Current size of the per-handle working-set cache (for a metrics gauge). */ + get workingSetCacheSize(): number { + return this.#workingSetCache.size; + } + + /** Fresh CH resolves currently holding an admission permit (for a metrics gauge). 
*/ + get resolveAdmissionInUse(): number { + return this.#admissionGate?.inUse ?? 0; + } + + async streamRun( + url: URL | string, + environment: RealtimeEnvironment, + runId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + + // Initial snapshot — no prior offset/handle. + if (offset === INITIAL_OFFSET || !handle) { + const row = await this.options.runReader.getRunById(environment.id, runId); + return this.#snapshotResponse(runId, row, skipColumns, apiVersion, clientVersion); + } + + if (isLive) { + return this.#liveResponse({ + environment, + runId, + offset, + handle, + skipColumns, + apiVersion, + clientVersion, + signal, + }); + } + + // Non-live catch-up with a handle: re-emit the current snapshot (idempotent). + const row = await this.options.runReader.getRunById(environment.id, runId); + return this.#snapshotResponse(runId, row, skipColumns, apiVersion, clientVersion, handle); + } + + async streamRuns( + url: URL | string, + environment: RealtimeListEnvironment, + params: RealtimeRunsParams, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + const tags = params.tags ?? []; + + // Initial snapshot — pin the createdAt window in a fresh handle. + if (offset === INITIAL_OFFSET || !handle) { + const createdAtFilterMs = this.#computeCreatedAtFilter(params.createdAt).getTime(); + return this.#runSetSnapshotResponse( + environment, + { tags, createdAtAfter: new Date(createdAtFilterMs) }, + this.#mintListHandle(createdAtFilterMs), + skipColumns, + apiVersion, + clientVersion + ); + } + + // Recover the pinned window from the handle so the lower bound never drifts. 
+ // Re-clamp the recovered value to the max-age floor so a stale or crafted handle + // can't widen the lookback past the configured ceiling. + const recoveredMs = this.#filterMsFromHandle(handle); + const filter: RunSetFilter = { + tags, + createdAtAfter: new Date( + recoveredMs !== undefined + ? this.#clampCreatedAtFloor(recoveredMs) + : this.#computeCreatedAtFilter(params.createdAt).getTime() + ), + }; + + if (isLive) { + return this.#runSetLiveResponse( + environment, + filter, + handle, + offset, + skipColumns, + apiVersion, + clientVersion, + signal + ); + } + + // Non-live catch-up under the same handle. + return this.#runSetSnapshotResponse( + environment, + filter, + handle, + skipColumns, + apiVersion, + clientVersion + ); + } + + async streamBatch( + url: URL | string, + environment: RealtimeListEnvironment, + batchId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + + const filter: RunSetFilter = { batchId }; + + if (offset !== INITIAL_OFFSET && handle && isLive) { + return this.#runSetLiveResponse( + environment, + filter, + handle, + offset, + skipColumns, + apiVersion, + clientVersion, + signal + ); + } + + // Initial snapshot + non-live catch-up. The handle must be per-connection, never + // derived from the batchId: working sets are keyed by handle, and a shared handle + // lets one subscriber's emit permanently suppress the same row for another. + return this.#runSetSnapshotResponse( + environment, + filter, + handle ?? this.#mintBatchHandle(batchId), + skipColumns, + apiVersion, + clientVersion + ); + } + + #snapshotResponse( + runId: string, + row: Awaited>, + skipColumns: string[], + apiVersion: API_VERSIONS, + clientVersion?: string, + existingHandle?: string + ): Response { + const body = buildSnapshotBody(row, skipColumns); + const offset = row ? 
encodeOffset(row.updatedAt.getTime(), this.#nextSeq()) : encodeOffset(0, 0); + return this.#buildResponse(body, apiVersion, clientVersion, { + offset, + handle: existingHandle ?? this.#mintHandle(runId), + schema: buildElectricSchemaHeader(skipColumns), + }); + } + + /** + * Live poll for a single-run feed. The router wakes this feed when its run changes, + * with the run already hydrated + serialized (no ClickHouse, ever). On the backstop + * timeout it re-checks via `getRunById`. Only-on-advance: emit a full-row `update` when + * the row moved past what the client already has; else a bare `up-to-date`. + */ + async #liveResponse(params: { + environment: RealtimeEnvironment; + runId: string; + offset: string; + handle: string; + skipColumns: string[]; + apiVersion: API_VERSIONS; + clientVersion?: string; + signal?: AbortSignal; + }): Promise { + const { environment, runId, offset, handle, skipColumns, apiVersion, clientVersion, signal } = + params; + + return this.#withConcurrencySlot(environment, async () => { + const lastSeenMs = parseOffsetUpdatedAtMs(offset); + const registration = this.options.router.register( + environment.id, + { kind: "run", runId }, + skipColumns + ); + + try { + const { reason, rows } = await registration.waitForMatch(signal, this.#jitteredTimeout()); + this.options.onWakeup?.(reason); + + if (reason === "abort") { + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(this.#nextSeq()), + }); + } + + if (reason === "notify" && rows.length > 0) { + // The router hydrated + serialized this run; emit it (only on advance). 
+ this.options.onLivePollPath?.("fast-hydrate"); + const matched = rows[0]; + const updatedAtMs = matched.row.updatedAt.getTime(); + const seq = this.#nextSeq(); + if (updatedAtMs > lastSeenMs) { + return this.#buildResponse( + buildRowsBodyFromSerialized([ + { runId: matched.row.id, value: matched.value, operation: "update" }, + ]), + apiVersion, + clientVersion, + { offset: encodeOffset(updatedAtMs, seq), handle, cursor: String(seq) } + ); + } + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(seq), + }); + } + + // Backstop timeout: re-check the run directly (no ClickHouse for the single-run feed). + this.options.onLivePollPath?.("full-resolve"); + const row = await this.options.runReader.getRunById(environment.id, runId); + const seq = this.#nextSeq(); + if (row && row.updatedAt.getTime() > lastSeenMs) { + return this.#buildResponse(buildUpdateBody(row, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(row.updatedAt.getTime(), seq), + handle, + cursor: String(seq), + }); + } + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(seq), + }); + } finally { + registration.close(); + } + }); + } + + /** Initial (and non-live catch-up) snapshot for a multi-run feed: resolve the + * id-set, hydrate, emit every row as an `insert`, and seed the working set. */ + async #runSetSnapshotResponse( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + handle: string, + skipColumns: string[], + apiVersion: API_VERSIONS, + clientVersion?: string + ): Promise { + const rows = await this.#resolveAndHydrate(environment, filter, skipColumns); + + const changes: RowChange[] = rows.map((row) => ({ row, operation: "insert" as const })); + + // updatedAt comes from the authoritative Postgres hydrate, not ClickHouse. 
+ const seen: WorkingSet = new Map(); + let maxUpdatedAt = 0; + for (const row of rows) { + const updatedAtMs = row.updatedAt.getTime(); + seen.set(row.id, updatedAtMs); + maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); + } + this.#workingSetCache.set(this.#workingSetKey(environment.id, handle), seen); + + return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, this.#nextSeq()), + handle, + schema: buildElectricSchemaHeader(skipColumns), + }); + } + + /** + * Live poll for a multi-run feed. Two paths: + * - Fast path (router notify): the router woke us with the matched runs already + * membership-confirmed, hydrated, and serialized (no ClickHouse). Diff them against + * the per-handle working set and emit new/advanced rows. + * - Backstop (timeout): a full ClickHouse resolve + hydrate. The correctness net — + * catches members missed during a gap and drops runs that left the filter. + * With hold-on-empty (default) the connection holds until a real delta or the backstop + * rather than returning an empty response the client would re-issue. + */ + async #runSetLiveResponse( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + handle: string, + offset: string, + skipColumns: string[], + apiVersion: API_VERSIONS, + clientVersion: string | undefined, + signal: AbortSignal | undefined + ): Promise { + return this.#withConcurrencySlot(environment, async () => { + const offsetFloorMs = parseOffsetUpdatedAtMs(offset); + // Total time to hold this long-poll, jittered to avoid synchronized refetch herds. + const deadline = Date.now() + this.#jitteredTimeout(); + const holdOnEmpty = this.options.holdOnEmpty ?? true; + + // Working set we diff against: seeded from the cache (or the offset floor on a + // miss) and advanced on each refetch within this held request. 
+ const workingSetKey = this.#workingSetKey(environment.id, handle); + let prevSeen = this.#workingSetCache.get(workingSetKey); + + const emitFromSerialized = (changes: SerializedRowChange[], maxUpdatedAt: number): Response => { + const seq = this.#nextSeq(); + return this.#buildResponse(buildRowsBodyFromSerialized(changes), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + }; + const emitFromRows = (changes: RowChange[], maxUpdatedAt: number): Response => { + const seq = this.#nextSeq(); + return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + }; + const emitUpToDate = (maxUpdatedAt: number): Response => { + const seq = this.#nextSeq(); + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + }; + + const registration = this.options.router.register( + environment.id, + this.#feedFilter(filter), + skipColumns + ); + + try { + while (true) { + const remaining = deadline - Date.now(); + const { reason, rows } = + remaining > 0 + ? await registration.waitForMatch(signal, remaining) + : { reason: "timeout" as const, rows: [] as MatchedRow[] }; + this.options.onWakeup?.(reason); + + if (reason === "abort") { + return emitUpToDate(offsetFloorMs); + } + + // FAST PATH: the router already confirmed membership + the createdAt window and + // hydrated/serialized the matched runs. Just diff against the working set. + if (reason === "notify") { + this.options.onLivePollPath?.("fast-hydrate"); + const { changes, maxUpdatedAt, touched } = this.#diffMatched( + rows, + prevSeen, + offsetFloorMs + ); + // Merge (not replace): the router only surfaced the changed subset, so keep the + // rest of the working set intact. The backstop full-resolve rebuilds it. 
+ const merged = this.#mergeWorkingSet(prevSeen, touched); + this.#workingSetCache.set(workingSetKey, merged); + prevSeen = merged; + + if (changes.length > 0) { + return emitFromSerialized(changes, maxUpdatedAt); + } + // Matched but no row advanced (already seen). Keep holding. + if (holdOnEmpty) { + continue; + } + return emitUpToDate(maxUpdatedAt); + } + + // BACKSTOP: full ClickHouse resolve + hydrate. Replaces the working set so runs + // that left the filter stop being tracked (the client keeps showing them). + this.options.onLivePollPath?.("full-resolve"); + const resolved = await this.#resolveAndHydrate(environment, filter, skipColumns); + const { changes, maxUpdatedAt, touched } = this.#diffRows( + resolved, + prevSeen, + offsetFloorMs + ); + this.#workingSetCache.set(workingSetKey, touched); + prevSeen = touched; + + if (changes.length > 0) { + return emitFromRows(changes, maxUpdatedAt); + } + // Empty backstop diff: timeout returns up-to-date; (holdOnEmpty never reaches + // here on a notify — those are handled in the fast path above). + return emitUpToDate(maxUpdatedAt); + } + } finally { + registration.close(); + } + }); + } + + /** Translate a multi-run filter into the router's membership predicate. */ + #feedFilter(filter: RunSetFilter): FeedFilter { + if (filter.batchId !== undefined) { + return { kind: "batch", batchId: filter.batchId }; + } + return { + kind: "tag", + tags: filter.tags ?? [], + createdAtFloorMs: filter.createdAtAfter?.getTime(), + }; + } + + /** Diff router-matched rows (already serialized) against the prior working set, pairing + * each row's shared `value` with this feed's operation. 
*/
+  #diffMatched(
+    matched: MatchedRow[],
+    prevSeen: WorkingSet | undefined,
+    offsetFloorMs: number
+  ): { changes: SerializedRowChange[]; maxUpdatedAt: number; touched: WorkingSet } {
+    const changes: SerializedRowChange[] = [];
+    const touched: WorkingSet = new Map();
+    let maxUpdatedAt = offsetFloorMs;
+    for (const { row, value } of matched) {
+      const updatedAtMs = row.updatedAt.getTime();
+      // Every matched row is recorded in `touched`, whether or not it yields a change.
+      touched.set(row.id, updatedAtMs);
+      maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs);
+
+      const operation = this.#classifyChange(row.id, updatedAtMs, prevSeen, offsetFloorMs);
+      if (operation !== undefined) {
+        changes.push({ runId: row.id, value, operation });
+      }
+    }
+    return { changes, maxUpdatedAt, touched };
+  }
+
+  /**
+   * Classification rule shared by `#diffMatched` and `#diffRows`, kept in one place so the
+   * two diff paths cannot drift apart.
+   *
+   * With a working set: a run absent from it is an `insert`, a run whose `updatedAt` moved
+   * past the recorded value is an `update`, anything else is unchanged. Without a working
+   * set: a run past the offset floor is an `update`, anything else is unchanged.
+   *
+   * @returns the operation to emit, or `undefined` when the row must not be emitted.
+   */
+  #classifyChange(
+    runId: string,
+    updatedAtMs: number,
+    prevSeen: WorkingSet | undefined,
+    offsetFloorMs: number
+  ): "insert" | "update" | undefined {
+    if (!prevSeen) {
+      return updatedAtMs > offsetFloorMs ? "update" : undefined;
+    }
+    const prior = prevSeen.get(runId);
+    if (prior === undefined) {
+      return "insert";
+    }
+    return updatedAtMs > prior ? "update" : undefined;
+  }
+
+  /**
+   * Diff hydrated rows against the prior working set on the authoritative Postgres
+   * `updatedAt`: a run not in the set is an `insert`, one whose `updatedAt` advanced is an
+   * `update`. On a working-set miss, anything past the offset floor is a merge-safe
+   * `update`. Used by the snapshot and the backstop full-resolve.
+   */
+  #diffRows(
+    rows: RealtimeRunRow[],
+    prevSeen: WorkingSet | undefined,
+    offsetFloorMs: number
+  ): { changes: RowChange[]; maxUpdatedAt: number; touched: WorkingSet } {
+    const changes: RowChange[] = [];
+    const touched: WorkingSet = new Map();
+    let maxUpdatedAt = offsetFloorMs;
+    for (const row of rows) {
+      const updatedAtMs = row.updatedAt.getTime();
+      // Every resolved row is recorded in `touched`, whether or not it yields a change.
+      touched.set(row.id, updatedAtMs);
+      maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs);
+
+      const operation = this.#classifyChange(row.id, updatedAtMs, prevSeen, offsetFloorMs);
+      if (operation !== undefined) {
+        changes.push({ row, operation });
+      }
+    }
+    return { changes, maxUpdatedAt, touched };
+  }
+
+  /** Merge fast-path touched rows into the prior working set. The fast path only saw the
+   * changed subset, so we keep the rest (the backstop full-resolve does the exact rebuild). */
+  #mergeWorkingSet(prevSeen: WorkingSet | undefined, touched: WorkingSet): WorkingSet {
+    // `new Map(undefined)` yields an empty map, so a working-set miss needs no special case.
+    const merged: WorkingSet = new Map(prevSeen);
+    for (const [id, updatedAtMs] of touched) {
+      merged.set(id, updatedAtMs);
+    }
+    return merged;
+  }
+
+  /**
+   * Resolve the filter's id-set (ClickHouse) and hydrate the rows (Postgres), coalesced +
+   * short-TTL cached by (env, filter, columns). Used by the initial snapshot and the
+   * backstop. A reconnect/snapshot stampede of identical filters shares ONE resolve+hydrate
+   * (concurrent callers await the in-flight one; callers within the TTL reuse the rows).
+ */ + async #resolveAndHydrate( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + skipColumns: string[] + ): Promise { + const key = this.#runSetCacheKey(environment.id, filter, skipColumns); + + const cached = this.#runSetCache.get(key); + if (cached) { + this.options.onRunSetResolve?.("hit"); + return cached; + } + + const existing = this.#runSetInflight.get(key); + if (existing) { + this.options.onRunSetResolve?.("coalesced"); + return existing; + } + + this.options.onRunSetResolve?.("miss"); + // Registered in #runSetInflight synchronously below, so same-filter callers that arrive + // while this is still waiting for an admission permit coalesce onto it (one permit, not N). + const promise = this.#admitAndResolveUncached(environment, filter, skipColumns) + .then((rows) => { + this.#runSetCache.set(key, rows); + return rows; + }) + .finally(() => { + this.#runSetInflight.delete(key); + }); + + this.#runSetInflight.set(key, promise); + return promise; + } + + /** Acquire an admission permit (if the gate is enabled) before the fresh CH+PG resolve, so + * a distinct-filter stampede is throttled to the configured concurrency. 
*/ + async #admitAndResolveUncached( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + skipColumns: string[] + ): Promise { + if (!this.#admissionGate) { + return this.#resolveAndHydrateUncached(environment, filter, skipColumns); + } + const waitStart = Date.now(); + await this.#admissionGate.acquire(); + const waited = Date.now() - waitStart; + if (waited > 0) { + this.options.onResolveAdmissionWait?.(waited); + } + try { + return await this.#resolveAndHydrateUncached(environment, filter, skipColumns); + } finally { + this.#admissionGate.release(); + } + } + + async #resolveAndHydrateUncached( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + skipColumns: string[] + ): Promise { + const resolveStart = Date.now(); + const ids = await this.#resolveIds(environment, filter); + this.options.onRunSetQuery?.("resolve", Date.now() - resolveStart); + + const hydrateStart = Date.now(); + const rows = await this.options.runReader.hydrateByIds(environment.id, ids, skipColumns); + this.options.onRunSetQuery?.("hydrate", Date.now() - hydrateStart); + + return rows; + } + + /** Stable cache key for the resolve+hydrate cache. Same key => same id-set and the + * same projected columns, so cached rows always match the requesting feed. */ + #runSetCacheKey(environmentId: string, filter: RunSetFilter, skipColumns: string[]): string { + // JSON-encode the arrays (not a join) so a value containing the separators — + // e.g. a tag with a comma — can't collide: ["a,b"] must not key the same as + // ["a","b"], which are different ClickHouse filters. + const tags = filter.tags && filter.tags.length > 0 ? JSON.stringify([...filter.tags].sort()) : ""; + const cols = skipColumns.length > 0 ? JSON.stringify([...skipColumns].sort()) : ""; + const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS; + return `${environmentId}|${tags}|${filter.batchId ?? ""}|${ + filter.createdAtAfter?.getTime() ?? 
"" + }|${maxListResults}|${cols}`; + } + + async #resolveIds(environment: RealtimeListEnvironment, filter: RunSetFilter): Promise { + const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS; + const ids = await this.options.runListResolver.resolveMatchingRunIds({ + organizationId: environment.organizationId, + projectId: environment.projectId, + environmentId: environment.id, + tags: filter.tags, + batchId: filter.batchId, + createdAtAfter: filter.createdAtAfter, + limit: maxListResults, + }); + + if (ids.length >= maxListResults) { + logger.warn("[notifierRealtimeClient] run-set feed hit the result cap", { + environmentId: environment.id, + filter, + cap: maxListResults, + }); + } + + return ids; + } + + #computeCreatedAtFilter(createdAt: string | undefined): Date { + // Clamp to the maximum lookback window, mirroring realtimeClient. + const floor = new Date(Date.now() - this.options.maximumCreatedAtFilterAgeMs); + const parsed = safeParseNaturalLanguageDurationAgo(createdAt ?? "24h"); + const resolved = !parsed || parsed < floor ? floor : parsed; + // Quantize the lower bound to a coarse epoch-aligned bucket and pin THAT in the + // handle, so same-tag feeds whose windows land in the same bucket resolve to the + // same filter -> same coalescing cache key -> one shared ClickHouse + Postgres + // query instead of one per feed. Floored (rounds the bound earlier), so the + // window only ever widens by < bucket and never drops a run the client should see. + return new Date(this.#bucketCreatedAtMs(resolved.getTime())); + } + + #bucketCreatedAtMs(ms: number): number { + const bucket = this.options.runSetCreatedAtBucketMs ?? DEFAULT_RUNSET_CREATED_AT_BUCKET_MS; + return bucket > 0 ? Math.floor(ms / bucket) * bucket : ms; + } + + /** Clamp a handle-recovered createdAt lower bound up to the max-age floor (so a + * stale or crafted handle can't widen the window past the ceiling), then re-bucket. 
*/ + #clampCreatedAtFloor(ms: number): number { + const floorMs = Date.now() - this.options.maximumCreatedAtFilterAgeMs; + return this.#bucketCreatedAtMs(Math.max(ms, floorMs)); + } + + #mintListHandle(createdAtFilterMs: number): string { + // Pins the createdAt threshold in the opaque handle so live polls reuse the + // same lower bound even on a working-set cache miss. + return `runs_${Math.trunc(createdAtFilterMs)}_${this.#mintUniqueSuffix()}`; + } + + #mintBatchHandle(batchId: string): string { + return `batch_${batchId}_${this.#mintUniqueSuffix()}`; + } + + #mintUniqueSuffix(): string { + // The seq alone isn't unique across instances/restarts; behind a non-sticky ALB a + // collision would land two connections on one working-set cache entry. + return `${this.#nextSeq()}_${randomUUID().slice(0, 8)}`; + } + + #workingSetKey(environmentId: string, handle: string): string { + // The handle is client-echoed; env-prefix the key so a foreign handle can never + // read or overwrite another tenant's working set. + return `${environmentId}:${handle}`; + } + + #filterMsFromHandle(handle: string): number | undefined { + const parts = handle.split("_"); + if (parts[0] !== "runs") { + return undefined; + } + const ms = Number(parts[1]); + return Number.isFinite(ms) && ms > 0 ? ms : undefined; + } + + #parseStreamRequest( + url: URL | string, + requestOptions?: RealtimeRequestOptions + ): { offset: string; handle: string | null; isLive: boolean; skipColumns: string[] } { + const $url = new URL(url.toString()); + return { + offset: $url.searchParams.get("offset") ?? INITIAL_OFFSET, + handle: $url.searchParams.get("handle") ?? $url.searchParams.get("shape_id"), + isLive: $url.searchParams.get("live") === "true", + skipColumns: this.#resolveSkipColumns($url, requestOptions), + }; + } + + /** + * Runs `work` inside a per-env concurrency slot: acquires a slot (429 if over the + * org limit, 500 if the limit can't be read) and always releases it afterward. 
+   */
+  async #withConcurrencySlot(
+    environment: RealtimeEnvironment,
+    work: () => Promise<Response>
+  ): Promise<Response> {
+    // Unique member id for this request in the limiter's per-environment set.
+    const requestId = randomUUID();
+    const concurrencyLimit = await this.options.cachedLimitProvider.getCachedLimit(
+      environment.organizationId,
+      DEFAULT_CONCURRENCY_LIMIT
+    );
+
+    if (concurrencyLimit == null) {
+      logger.error("[notifierRealtimeClient] Failed to get concurrency limit", {
+        organizationId: environment.organizationId,
+      });
+      return json({ error: "Failed to get concurrency limit" }, { status: 500 });
+    }
+
+    const canProceed = await this.options.limiter.incrementAndCheck(
+      environment.id,
+      requestId,
+      concurrencyLimit
+    );
+
+    if (!canProceed) {
+      return json({ error: "Too many concurrent requests" }, { status: 429 });
+    }
+
+    try {
+      return await work();
+    } finally {
+      // Best-effort release: a rejection thrown from `finally` would replace the response
+      // (or the original error) produced by `work()`, so a failed release is logged
+      // instead of propagated.
+      try {
+        await this.options.limiter.decrement(environment.id, requestId);
+      } catch (error) {
+        logger.error("[notifierRealtimeClient] Failed to release concurrency slot", {
+          environmentId: environment.id,
+          requestId,
+          error,
+        });
+      }
+    }
+  }
+
+  #jitteredTimeout(): number {
+    const base = this.options.livePollTimeoutMs ?? DEFAULT_LIVE_POLL_TIMEOUT_MS;
+    // +/-15% jitter to avoid synchronized refetch herds.
+    return Math.round(base * (0.85 + Math.random() * 0.3));
+  }
+
+  #buildResponse(
+    body: string,
+    apiVersion: API_VERSIONS,
+    clientVersion: string | undefined,
+    headers: ResponseHeaderInput
+  ): Response {
+    const finalBody =
+      apiVersion === CURRENT_API_VERSION ? body : rewriteBodyForLegacyApiVersion(body);
+
+    const responseHeaders = new Headers();
+    responseHeaders.set("content-type", "application/json");
+    responseHeaders.set("cache-control", "no-store");
+
+    // Carry CORS on the response itself, mirroring how the Electric upstream does
+    // (apiCors passes a response through untouched once it has allow-origin). Browsers
+    // can only read the electric-* headers cross-origin if they're explicitly exposed;
+    // without this the deployed react-hooks fail with MissingHeadersError. Bearer-token
+    // requests are non-credentialed, so a wildcard is safe.
+ responseHeaders.set("access-control-allow-origin", "*"); + responseHeaders.set("access-control-expose-headers", "*"); + + // Modern clients (1.0.14) send `x-trigger-electric-version` and read the + // lowercase `electric-*` headers. Legacy clients (0.4.0) omit the version and + // read `electric-shape-id`/`electric-chunk-last-offset` (case-insensitive), + // matching realtimeClient's rewriteResponseHeaders behavior exactly. + if (clientVersion) { + responseHeaders.set("electric-offset", headers.offset); + responseHeaders.set("electric-handle", headers.handle); + } else { + responseHeaders.set("electric-chunk-last-offset", headers.offset); + responseHeaders.set("electric-shape-id", headers.handle); + } + + if (headers.cursor !== undefined) { + responseHeaders.set("electric-cursor", headers.cursor); + } + if (headers.schema !== undefined) { + responseHeaders.set("electric-schema", headers.schema); + } + + return new Response(finalBody, { status: 200, headers: responseHeaders }); + } + + #mintHandle(runId: string): string { + // Stable per-run handle: the single-run shape never changes columns, so the + // client never needs a must-refetch from a handle change. + return `run-${runId}`; + } + + #nextSeq(): number { + this.#seq = (this.#seq + 1) % Number.MAX_SAFE_INTEGER; + return this.#seq; + } + + #resolveSkipColumns(url: URL, requestOptions?: RealtimeRequestOptions): string[] { + const raw = requestOptions?.skipColumns ?? url.searchParams.get("skipColumns")?.split(",") ?? 
[]; + return raw.map((c) => c.trim()).filter((c) => c !== "" && !RESERVED_COLUMNS.includes(c)); + } +} diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts new file mode 100644 index 00000000000..24d5f13b0c6 --- /dev/null +++ b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts @@ -0,0 +1,142 @@ +import { Counter, Gauge, Histogram } from "prom-client"; +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { metricsRegister } from "~/metrics.server"; +import { singleton } from "~/utils/singleton"; +import { getCachedLimit } from "../platform.v3.server"; +import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; +import { ClickHouseRunListResolver } from "./clickHouseRunListResolver.server"; +import { EnvChangeRouter } from "./envChangeRouter.server"; +import { NotifierRealtimeClient } from "./notifierRealtimeClient.server"; +import { RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; +import { getRunChangeNotifier } from "./runChangeNotifierInstance.server"; +import { RunHydrator } from "./runReader.server"; + +/** + * Process-singleton wiring for the notifier-backed realtime client. Only + * constructed when a request actually routes to the + * notifier backend, so a disabled webapp never instantiates it. + */ +function initializeNotifierRealtimeClient(): NotifierRealtimeClient { + const wakeups = new Counter({ + name: "realtime_notifier_wakeups_total", + help: "Live realtime notifier wakeups by reason. A rising 'timeout' share suggests a write site is missing its publishChangeRecord delegate.", + labelNames: ["reason"] as const, + registers: [metricsRegister], + }); + + const runSetResolves = new Counter({ + name: "realtime_notifier_runset_resolve_total", + help: "Multi-run (tag-list/batch) resolve+hydrate outcomes. 
'hit'/'coalesced' vs 'miss' shows how effectively concurrent same-filter feeds share a single ClickHouse + Postgres query under an env-wide wake.", + labelNames: ["result"] as const, + registers: [metricsRegister], + }); + + const runSetQueryMs = new Histogram({ + name: "realtime_notifier_runset_query_ms", + help: "Latency of the multi-run resolve (ClickHouse) and hydrate (Postgres) stages.", + labelNames: ["stage"] as const, + buckets: [1, 5, 10, 25, 50, 100, 250, 500, 1_000, 2_500, 5_000], + registers: [metricsRegister], + }); + + const livePollPaths = new Counter({ + name: "realtime_notifier_live_poll_total", + help: "How live polls resolved. 'fast-hydrate' = the router woke the feed with matched runs hydrated by id (no ClickHouse); 'full-resolve' = the backstop timeout did a ClickHouse resolve. A high fast-path share is the local-membership routing working.", + labelNames: ["path"] as const, + registers: [metricsRegister], + }); + + const routerHydrates = new Counter({ + name: "realtime_notifier_router_hydrated_runs_total", + help: "Runs hydrated by the EnvChangeRouter's batch-hydrate (one query per column set per wake, shared across all feeds matching the same run — the hot-shared-tag fan-out collapse).", + registers: [metricsRegister], + }); + + const resolveAdmissionWaits = new Counter({ + name: "realtime_notifier_resolve_admission_waits_total", + help: "Fresh ClickHouse resolves that had to queue for an admission permit. 
A rising count means a distinct-filter reconnect stampede is being throttled (the gate is doing its job).", + registers: [metricsRegister], + }); + + const limiter = new RealtimeConcurrencyLimiter({ + keyPrefix: "tr:realtime:notifier:concurrency", + redis: { + port: env.RATE_LIMIT_REDIS_PORT, + host: env.RATE_LIMIT_REDIS_HOST, + username: env.RATE_LIMIT_REDIS_USERNAME, + password: env.RATE_LIMIT_REDIS_PASSWORD, + tlsDisabled: env.RATE_LIMIT_REDIS_TLS_DISABLED === "true", + clusterMode: env.RATE_LIMIT_REDIS_CLUSTER_MODE_ENABLED === "1", + }, + }); + + // One RunHydrator shared by the router (fast-path batch-hydrate) and the client + // (snapshot + backstop), so its single-flight + short-TTL cache covers both. + const runReader = new RunHydrator({ replica: $replica }); + + const router = new EnvChangeRouter({ + source: getRunChangeNotifier(), + hydrator: runReader, + onHydrate: (runCount) => routerHydrates.inc(runCount), + }); + + const client = new NotifierRealtimeClient({ + runReader, + runListResolver: new ClickHouseRunListResolver({ + getClickhouse: (organizationId) => + clickhouseFactory.getClickhouseForOrganization(organizationId, "realtime"), + prisma: $replica, + }), + router, + limiter, + cachedLimitProvider: { + async getCachedLimit(organizationId, defaultValue) { + const result = await getCachedLimit( + organizationId, + "realtimeConcurrentConnections", + defaultValue + ); + return result.val; + }, + }, + livePollTimeoutMs: env.REALTIME_NOTIFIER_LIVE_POLL_TIMEOUT_MS, + maximumCreatedAtFilterAgeMs: env.REALTIME_MAXIMUM_CREATED_AT_FILTER_AGE_IN_MS, + maxListResults: env.REALTIME_NOTIFIER_MAX_LIST_RESULTS, + runSetResolveCacheTtlMs: env.REALTIME_NOTIFIER_RUNSET_CACHE_TTL_MS, + runSetResolveCacheMaxEntries: env.REALTIME_NOTIFIER_RUNSET_CACHE_MAX_ENTRIES, + listCacheMaxEntries: env.REALTIME_NOTIFIER_WORKING_SET_MAX_ENTRIES, + runSetCreatedAtBucketMs: env.REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS, + holdOnEmpty: env.REALTIME_NOTIFIER_HOLD_ON_EMPTY === "1", + 
resolveAdmissionLimit: env.REALTIME_NOTIFIER_RESOLVE_ADMISSION_LIMIT, + onWakeup: (reason) => wakeups.inc({ reason }), + onLivePollPath: (path) => livePollPaths.inc({ path }), + onRunSetResolve: (result) => runSetResolves.inc({ result }), + onRunSetQuery: (stage, ms) => runSetQueryMs.observe({ stage }, ms), + onResolveAdmissionWait: () => resolveAdmissionWaits.inc(), + }); + + new Gauge({ + name: "realtime_notifier_working_set_size", + help: "Entries in the per-handle working-set cache (one per active multi-run feed session).", + registers: [metricsRegister], + collect() { + this.set(client.workingSetCacheSize); + }, + }); + + new Gauge({ + name: "realtime_notifier_resolve_admission_in_use", + help: "Fresh ClickHouse resolves currently holding an admission permit (live concurrency against the gate's limit).", + registers: [metricsRegister], + collect() { + this.set(client.resolveAdmissionInUse); + }, + }); + + return client; +} + +export function getNotifierRealtimeClient(): NotifierRealtimeClient { + return singleton("notifierRealtimeClient", initializeNotifierRealtimeClient); +} diff --git a/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts b/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts new file mode 100644 index 00000000000..a935858fef0 --- /dev/null +++ b/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts @@ -0,0 +1,111 @@ +import { Callback, Result } from "ioredis"; +import { createRedisClient, RedisClient, RedisWithClusterOptions } from "~/redis.server"; +import { logger } from "../logger.server"; + +export type RealtimeConcurrencyLimiterOptions = { + redis: RedisWithClusterOptions; + keyPrefix: string; + /** How long a tracked request lives before it's swept as stale (seconds). */ + expiryTimeInSeconds?: number; + connectionName?: string; +}; + +/** + * Per-environment concurrent-connection limiter for realtime long-polls. 
+ * + * This is a standalone copy of the limiter embedded in `realtimeClient.server.ts` + * (Electric path), so the notifier-backed client can enforce the same per-env cap + * WITHOUT modifying the existing Electric client. The Lua + key shape are + * identical; only the key prefix differs, so the two paths track independently. + */ +export class RealtimeConcurrencyLimiter { + private redis: RedisClient; + private expiryTimeInSeconds: number; + + constructor(private options: RealtimeConcurrencyLimiterOptions) { + this.redis = createRedisClient( + options.connectionName ?? "trigger:realtime:notifier:concurrency", + options.redis + ); + this.expiryTimeInSeconds = options.expiryTimeInSeconds ?? 60 * 5; + this.#registerCommands(); + } + + async incrementAndCheck(environmentId: string, requestId: string, limit: number): Promise { + const key = this.#getKey(environmentId); + const now = Date.now(); + + const result = await this.redis.incrementAndCheckRealtimeNotifierConcurrency( + key, + now.toString(), + requestId, + this.expiryTimeInSeconds.toString(), + (now - this.expiryTimeInSeconds * 1000).toString(), + limit.toString() + ); + + return result === 1; + } + + async decrement(environmentId: string, requestId: string): Promise { + const key = this.#getKey(environmentId); + await this.redis.zrem(key, requestId); + } + + #getKey(environmentId: string): string { + return `${this.options.keyPrefix}:${environmentId}`; + } + + #registerCommands() { + this.redis.defineCommand("incrementAndCheckRealtimeNotifierConcurrency", { + numberOfKeys: 1, + lua: /* lua */ ` + local concurrencyKey = KEYS[1] + + local timestamp = tonumber(ARGV[1]) + local requestId = ARGV[2] + local expiryTime = tonumber(ARGV[3]) + local cutoffTime = tonumber(ARGV[4]) + local limit = tonumber(ARGV[5]) + + -- Remove expired entries + redis.call('ZREMRANGEBYSCORE', concurrencyKey, '-inf', cutoffTime) + + -- Add the new request to the sorted set + redis.call('ZADD', concurrencyKey, timestamp, requestId) + + 
-- Set the expiry time on the key + redis.call('EXPIRE', concurrencyKey, expiryTime) + + -- Get the total number of concurrent requests + local totalRequests = redis.call('ZCARD', concurrencyKey) + + -- Check if the limit has been exceeded + if totalRequests > limit then + redis.call('ZREM', concurrencyKey, requestId) + return 0 + end + + return 1 + `, + }); + + this.redis.on("error", (error) => { + logger.error("[realtimeConcurrencyLimiter] redis error", { error }); + }); + } +} + +declare module "ioredis" { + interface RedisCommander { + incrementAndCheckRealtimeNotifierConcurrency( + key: string, + timestamp: string, + requestId: string, + expiryTime: string, + cutoffTime: string, + limit: string, + callback?: Callback + ): Result; + } +} diff --git a/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts b/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts new file mode 100644 index 00000000000..220f79f9308 --- /dev/null +++ b/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts @@ -0,0 +1,86 @@ +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { FEATURE_FLAG } from "~/v3/featureFlags"; +import { makeFlag } from "~/v3/featureFlags.server"; +import { logger } from "../logger.server"; +import { type RealtimeEnvironment } from "../realtimeClient.server"; +import { realtimeClient } from "../realtimeClientGlobal.server"; +import { BoundedTtlCache } from "./boundedTtlCache"; +import { type RealtimeStreamClient } from "./notifierRealtimeClient.server"; +import { getNotifierRealtimeClient } from "./notifierRealtimeClientInstance.server"; +import { getShadowRealtimeClient } from "./shadowRealtimeClientInstance.server"; + +type RealtimeBackend = "electric" | "notifier" | "shadow"; + +/** + * Chooses which backend serves a realtime run request. + * + * Two gates, both defaulting to the Electric path: + * 1. `REALTIME_NOTIFIER_ENABLED` (env master switch). 
When off, this returns the + * Electric client immediately — no flag read, no notifier client construction, + * byte-identical to pre-Electric-Sunset behavior. + * 2. the `realtimeBackend` feature flag (global + per-org, org wins), resolved per + * org and cached in-process for 30s so the long-poll feed doesn't hit the DB + * on every request. + */ +const notifierEnabled = env.REALTIME_NOTIFIER_ENABLED === "1"; +const BACKEND_CACHE_TTL_MS = 30_000; +// Org count is bounded, but cap to avoid unbounded growth. +const BACKEND_CACHE_MAX_ENTRIES = 50_000; + +const flag = makeFlag($replica); +const backendCache = new BoundedTtlCache( + BACKEND_CACHE_TTL_MS, + BACKEND_CACHE_MAX_ENTRIES +); + +export async function resolveRealtimeStreamClient( + environment: RealtimeEnvironment +): Promise { + if (!notifierEnabled) { + return realtimeClient; + } + + switch (await getRealtimeBackend(environment.organizationId)) { + case "notifier": + return getNotifierRealtimeClient(); + case "shadow": + // Client is still served Electric; the notifier path is diffed in the background. + return getShadowRealtimeClient(); + case "electric": + default: + return realtimeClient; + } +} + +async function getRealtimeBackend(organizationId: string): Promise { + const cached = backendCache.get(organizationId); + if (cached !== undefined) { + return cached; + } + + let backend: RealtimeBackend = "electric"; + + try { + const org = await $replica.organization.findFirst({ + where: { id: organizationId }, + select: { featureFlags: true }, + }); + + backend = await flag({ + key: FEATURE_FLAG.realtimeBackend, + defaultValue: "electric", + overrides: (org?.featureFlags as Record) ?? {}, + }); + } catch (error) { + // Never let a flag lookup failure break the realtime feed — fall back to Electric. 
+ logger.error("[resolveRealtimeStreamClient] failed to resolve realtimeBackend flag", { + organizationId, + error, + }); + backend = "electric"; + } + + backendCache.set(organizationId, backend); + return backend; +} diff --git a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts new file mode 100644 index 00000000000..f975af05723 --- /dev/null +++ b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts @@ -0,0 +1,370 @@ +import { createRedisClient, RedisClient, RedisWithClusterOptions } from "~/redis.server"; +import { logger } from "../logger.server"; + +export const CHANGE_RECORD_VERSION = 1; + +/** + * A run-change fact, published once to the run's environment channel. Self-describing: + * - `envId` routes it to its channel (mandatory). + * - `tags` / `batchId` let a tag/batch feed decide membership LOCALLY, without a + * ClickHouse re-resolve. `tags` present (even `[]`) marks a "full" record; `tags` + * absent marks a "partial" record (envId+runId only) that a tag feed must hydrate to + * classify. `batchId` present only when the run is in a batch. + * - `runId` lets a single-run feed match; `createdAtMs` lets a tag feed apply its + * createdAt floor locally; `updatedAtMs`/`status` are hints. + * Row state (payload/output/...) is never on the wire — it's refetched from Postgres. + */ +export type ChangeRecord = { + v: number; + runId: string; + envId: string; + tags?: string[]; + batchId?: string | null; + createdAtMs?: number; + updatedAtMs?: number; + status?: string; +}; + +/** What a publish site provides; the notifier stamps the version. */ +export type ChangeRecordInput = Omit; + +export function encodeChangeRecord(record: ChangeRecord): string { + return JSON.stringify(record); +} + +/** Decode a wire message into a ChangeRecord. 
Tolerant of a bare runId (no membership + * data) so a malformed/legacy frame degrades to a partial record (hydrate-to-classify) + * rather than throwing. */ +export function decodeChangeRecord(message: string): ChangeRecord { + if (message.length === 0 || message[0] !== "{") { + return { v: 0, runId: message, envId: "" }; + } + try { + const parsed = JSON.parse(message) as Partial; + if (parsed && typeof parsed.runId === "string") { + return { + v: parsed.v ?? 0, + runId: parsed.runId, + envId: parsed.envId ?? "", + tags: parsed.tags, + batchId: parsed.batchId, + createdAtMs: parsed.createdAtMs, + updatedAtMs: parsed.updatedAtMs, + status: parsed.status, + }; + } + } catch { + // fall through to the bare-runId fallback + } + return { v: 0, runId: message, envId: "" }; +} + +export type RunChangeNotifierOptions = { + redis: RedisWithClusterOptions; + /** Channel name prefix; the envId is appended inside a hash-tag for slot locality. */ + channelPrefix?: string; + connectionName?: string; + /** + * Leading-edge throttle (ms) for the per-env channel: deliver the first wake + * immediately, then at most one more per window while changes keep arriving. Bounds the + * wake rate per env regardless of run throughput. Defaults to 100ms. 0 disables it. + */ + envWakeCoalesceWindowMs?: number; + /** + * Use Redis sharded pub/sub (SSUBSCRIBE/SPUBLISH) instead of classic pub/sub. Only + * valid against a Redis Cluster (channels are hash-tagged by envId, so each lands on one + * shard) and requires the client built with `clusterOptions.shardedSubscribers: true`. + * Classic PUBLISH in a cluster broadcasts to every node, so sharded pub/sub is what + * actually distributes the load. Defaults to false (classic, for single-node / local). 
+ */ + shardedPubSub?: boolean; +}; + +const DEFAULT_CHANNEL_PREFIX = "realtime:"; +const DEFAULT_ENV_WAKE_COALESCE_WINDOW_MS = 100; + +/** + * RunChangeNotifier — carries "run X changed" facts from write sites to the realtime + * feed over ONE per-environment channel. + * + * Design constraints baked in here: + * - ONE channel type, `env:{}`. A change is one fact published once; who + * cares about it is a predicate evaluated by the consumer (the EnvChangeRouter), not a + * second channel. Single-run, tag, and batch feeds all read this one stream. + * - Minimal wire data (a self-describing `ChangeRecord` of small keys), never row + * columns. Row state is always refetched from Postgres. + * - ONE shared, multiplexed subscriber connection per process with a refcounted + * `Map>`. The RunQueue pattern, deliberately NOT the + * per-subscribe-connection pattern of ZodPubSub/tracePubSub (which would exhaust + * ElastiCache `maxclients`). + * - Connections are created lazily: a process that never publishes or subscribes (the + * default, flag-off state) opens no Redis connections at all. + * - `publish` is fire-and-forget and never throws; a dropped publish only costs latency + * because the consumer has a timeout backstop. + * + * Channels are hash-tagged (`env:{}`) so an env's traffic lands on one + * cluster slot. With `shardedPubSub` (cluster only) the feed uses SSUBSCRIBE/SPUBLISH so + * each env's traffic stays on one shard rather than broadcasting cluster-wide. + */ +export class RunChangeNotifier { + #publisher: RedisClient | undefined; + #subscriber: RedisClient | undefined; + readonly #listeners = new Map void>>(); + /** + * Per-channel accumulator of records since the last delivery, deduped by runId. A + * coalesced env window collapses many publishes into one wake; this holds the batch so + * the wake carries every run that moved, not just the last one (latest record per run + * wins, keeping the freshest keys). 
+ */ + readonly #pending = new Map>(); + readonly #channelPrefix: string; + readonly #connectionName: string; + readonly #coalesceWindowMs: number; + /** When true, use sharded pub/sub (SSUBSCRIBE/SPUBLISH/smessage) — see options. */ + readonly #sharded: boolean; + /** Active coalescing windows per channel. */ + readonly #coalesceTimers = new Map>(); + /** Channels that received a message while their window was open (need a trailing wake). */ + readonly #coalesceDirty = new Set(); + + constructor(private readonly options: RunChangeNotifierOptions) { + this.#channelPrefix = options.channelPrefix ?? DEFAULT_CHANNEL_PREFIX; + this.#connectionName = options.connectionName ?? "trigger:realtime:run-change-notifier"; + this.#coalesceWindowMs = options.envWakeCoalesceWindowMs ?? DEFAULT_ENV_WAKE_COALESCE_WINDOW_MS; + this.#sharded = options.shardedPubSub ?? false; + } + + /** + * Fire-and-forget publish of a run-changed fact to the run's environment channel. Never + * throws. The notifier stamps the record version. + */ + publish(input: ChangeRecordInput): void { + const record: ChangeRecord = { v: CHANGE_RECORD_VERSION, ...input }; + this.#publishToChannel(this.#channelForEnv(record.envId), encodeChangeRecord(record)); + } + + /** Fire-and-forget publish of many run-changed facts. Never throws. */ + publishMany(inputs: ChangeRecordInput[]): void { + for (const input of inputs) { + this.publish(input); + } + } + + #publishToChannel(channel: string, payload: string): void { + try { + const publisher = this.#ensurePublisher(); + // Sharded pub/sub (SPUBLISH) routes to the channel's slot owner; classic PUBLISH + // broadcasts cluster-wide. The channel is hash-tagged by envId. + const result = this.#sharded + ? 
publisher.spublish(channel, payload) + : publisher.publish(channel, payload); + if (typeof (result as Promise)?.catch === "function") { + (result as Promise).catch((error) => { + logger.error("[runChangeNotifier] Failed to publish run-changed notification", { + error, + channel, + }); + }); + } + } catch (error) { + logger.error("[runChangeNotifier] Failed to publish run-changed notification", { + error, + channel, + }); + } + } + + /** + * Subscribe (persistently) to an environment's run-change stream. `onBatch` is invoked + * with the coalesced batch of records on every wake until the returned unsubscribe is + * called. Refcounted over the shared subscriber: the first listener for an env issues + * SUBSCRIBE, the last one UNSUBSCRIBE. + */ + subscribeToEnv(environmentId: string, onBatch: (records: ChangeRecord[]) => void): () => void { + const channel = this.#channelForEnv(environmentId); + const subscriber = this.#ensureSubscriber(); + + let listeners = this.#listeners.get(channel); + if (!listeners) { + listeners = new Set(); + this.#listeners.set(channel, listeners); + this.#subscribeChannel(subscriber, channel).catch((error) => { + logger.error("[runChangeNotifier] Failed to subscribe to run-change channel", { + error, + channel, + }); + }); + } + listeners.add(onBatch); + + let unsubscribed = false; + return () => { + if (unsubscribed) { + return; + } + unsubscribed = true; + + const current = this.#listeners.get(channel); + if (!current) { + return; + } + current.delete(onBatch); + if (current.size === 0) { + // Drop the channel from the map only AFTER Redis confirms UNSUBSCRIBE, and only if + // no new listener re-subscribed while it was in flight. The map entry's existence + // mirrors "subscribed (or subscribe in flight) in Redis", so the subscribe path + // safely reuses it without a duplicate SUBSCRIBE. 
+ this.#unsubscribeChannel(subscriber, channel) + .then(() => { + const latest = this.#listeners.get(channel); + if (!latest) { + return; + } + if (latest.size === 0) { + this.#listeners.delete(channel); + } else { + // A listener arrived during the in-flight UNSUBSCRIBE; the channel is now + // unsubscribed in Redis but has live listeners. Re-subscribe so they keep + // receiving messages (the long-poll backstop covers the gap). + this.#subscribeChannel(subscriber, channel).catch((error) => { + logger.error("[runChangeNotifier] Failed to re-subscribe to run-change channel", { + error, + channel, + }); + }); + } + }) + .catch((error) => { + // UNSUBSCRIBE failed: the channel is likely still subscribed in Redis. Keep the + // (empty) map entry so a future subscriber reuses it without a duplicate + // SUBSCRIBE and #onMessage stays consistent with Redis state. + logger.error("[runChangeNotifier] Failed to unsubscribe from run-change channel", { + error, + channel, + }); + }); + } + }; + } + + /** Number of distinct env channels currently subscribed (for metrics). 
*/ + get activeSubscriptionCount(): number { + return this.#listeners.size; + } + + async quit(): Promise { + for (const timer of this.#coalesceTimers.values()) { + clearTimeout(timer); + } + this.#coalesceTimers.clear(); + this.#coalesceDirty.clear(); + this.#pending.clear(); + await Promise.allSettled([this.#subscriber?.quit(), this.#publisher?.quit()]); + this.#subscriber = undefined; + this.#publisher = undefined; + this.#listeners.clear(); + } + + #ensurePublisher(): RedisClient { + if (!this.#publisher) { + this.#publisher = createRedisClient(`${this.#connectionName}:pub`, this.options.redis); + } + return this.#publisher; + } + + #ensureSubscriber(): RedisClient { + if (!this.#subscriber) { + const subscriber = createRedisClient(`${this.#connectionName}:sub`, this.options.redis); + const onMessage = (channel: string, message: string) => this.#onMessage(channel, message); + // Classic pub/sub delivers "message"; sharded pub/sub delivers "smessage". Register + // both so the delivery path is identical regardless of mode. + subscriber.on("message", onMessage); + subscriber.on("smessage", onMessage); + this.#subscriber = subscriber; + } + return this.#subscriber; + } + + /** SUBSCRIBE (classic) vs SSUBSCRIBE (sharded, cluster-only). */ + #subscribeChannel(subscriber: RedisClient, channel: string): Promise { + return this.#sharded ? subscriber.ssubscribe(channel) : subscriber.subscribe(channel); + } + + /** UNSUBSCRIBE (classic) vs SUNSUBSCRIBE (sharded, cluster-only). */ + #unsubscribeChannel(subscriber: RedisClient, channel: string): Promise { + return this.#sharded ? subscriber.sunsubscribe(channel) : subscriber.unsubscribe(channel); + } + + #onMessage(channel: string, message: string) { + // Accumulate the decoded record (deduped by runId) before delivering, so a coalesced + // wake carries every run that moved during the window. 
+ this.#addPending(channel, decodeChangeRecord(message)); + + if (this.#coalesceWindowMs > 0) { + this.#deliverCoalesced(channel); + return; + } + this.#deliver(channel); + } + + /** Accumulate a record into the channel's pending batch, deduped by runId (a later + * record for the same run replaces the earlier one, keeping the freshest keys). */ + #addPending(channel: string, record: ChangeRecord) { + let batch = this.#pending.get(channel); + if (!batch) { + batch = new Map(); + this.#pending.set(channel, batch); + } + batch.set(record.runId, record); + } + + /** Drain the channel's pending batch and hand it to every current listener. A listener + * that throws is logged and skipped so the remaining listeners still receive the wake. */ + #deliver(channel: string) { + // Drain the accumulated batch (and clear it) so listeners woken now get every run that + // changed since the last delivery, and a later message starts a fresh batch. + const batchMap = this.#pending.get(channel); + const batch = batchMap ? [...batchMap.values()] : []; + this.#pending.delete(channel); + + const listeners = this.#listeners.get(channel); + if (!listeners || batch.length === 0) { + return; + } + // Iterate over a copy so a listener that unsubscribes during delivery doesn't disturb + // the iteration. + for (const onBatch of [...listeners]) { + // Isolate each listener: a throwing feed must not starve the remaining listeners of + // this wake, nor escape into the Redis message handler or the coalescing timer (where + // it would also skip re-opening the coalescing window). + try { + onBatch(batch); + } catch (error) { + logger.error("[runChangeNotifier] Run-change listener threw while handling a batch", { + error, + channel, + }); + } + } + } + + /** + * Leading-edge throttle: deliver the first wake immediately, then suppress further wakes + * for the window, delivering one trailing wake if any messages arrived during it (and + * re-opening while activity continues). Caps the wake rate per env to ~1/window no + * matter how fast runs change. Lossless: the batch accumulates across the window. + */ + #deliverCoalesced(channel: string) { + if (this.#coalesceTimers.has(channel)) { + this.#coalesceDirty.add(channel); + return; + } + this.#deliver(channel); + this.#openCoalesceWindow(channel); + } + + #openCoalesceWindow(channel: string) { + const timer = setTimeout(() => { + this.#coalesceTimers.delete(channel); + if (this.#coalesceDirty.delete(channel)) { + this.#deliver(channel); + this.#openCoalesceWindow(channel); + } + }, this.#coalesceWindowMs); + // Don't let a pending coalescing window hold the process open at shutdown.
+ timer.unref?.(); + this.#coalesceTimers.set(channel, timer); + } + + // Hash-tagged (`...{}`) so all of an env's traffic maps to one cluster slot (one + // shard) under sharded pub/sub. + #channelForEnv(environmentId: string): string { + return `${this.#channelPrefix}env:{${environmentId}}`; + } +} diff --git a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts new file mode 100644 index 00000000000..fa5f5681f90 --- /dev/null +++ b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts @@ -0,0 +1,101 @@ +import { env } from "~/env.server"; +import { engine } from "~/v3/runEngine.server"; +import { logger } from "../logger.server"; +import { publishChangeRecord } from "./runChangeNotifierInstance.server"; + +/** + * ChangeRecordBuilder — builds and publishes a self-describing `ChangeRecord` to the run's + * environment channel for the lifecycle events whose engine-bus payload already carries + * env + tags + batchId. One publish per change; `envId` is always present. + * + * The terminal transitions (runSucceeded/runFailed/runExpired/runCancelled), + * runAttemptFailed, and runMetadataUpdated publish from `runEngineHandlers.server.ts` + * instead — those events don't carry env/tags/batchId on the bus, but that file already + * re-reads the run (or resolves the env) for each, so the publish piggybacks on the + * existing read rather than widening the event bus. So fully disabling publishing is the + * env master switch (`REALTIME_NOTIFIER_ENABLED`), not just deleting this file. + * + * Coverage is intentionally not exhaustive: a dropped or uncovered transition only adds + * latency because the consumer has a periodic backstop full-resolve. 
+ */ +export function registerRunChangeNotifierHandlers() { + // Return a truthy value in every path so the singleton() wrapper (which uses ??=) caches + // the result and never re-runs this factory — re-running would attach duplicate + // engine-bus listeners on each Remix dev-mode reload. + if (env.REALTIME_NOTIFIER_ENABLED !== "1") { + return true; + } + + // Run created (trigger). The first signal a tag/batch feed gets for a brand-new run: a + // freshly-created run is born QUEUED with no status transition, so without this it only + // surfaces on the consumer's periodic backstop resolve (and not at all before ClickHouse + // ingests it). Routing the create record hydrates the new run by id straight from Postgres. + engine.eventBus.on("runCreated", ({ run, environment }) => { + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); + }); + + // Status transitions (checkpoint suspend/resume, pending version, dequeue). + engine.eventBus.on("runStatusChanged", ({ run, environment }) => { + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); + }); + + // Dequeue/lock (sets startedAt) and attempt start (DEQUEUED -> EXECUTING) — the + // most-watched "my run started" transitions. + engine.eventBus.on("runLocked", ({ run, environment }) => { + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); + }); + engine.eventBus.on("runAttemptStarted", ({ run, environment }) => { + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); + }); + + engine.eventBus.on("runRetryScheduled", ({ run, environment }) => { + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); + }); + + // Delay lifecycle (delayUntil / queued-after-delay changes). 
+ engine.eventBus.on("runDelayRescheduled", ({ run, environment }) => { + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); + }); + engine.eventBus.on("runEnqueuedAfterDelay", ({ run, environment }) => { + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); + }); + + logger.info("[runChangeNotifier] realtime change-record builder registered"); + + return true; +} diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts new file mode 100644 index 00000000000..ed1d1ce12b2 --- /dev/null +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -0,0 +1,74 @@ +import { Gauge } from "prom-client"; +import { env } from "~/env.server"; +import { metricsRegister } from "~/metrics.server"; +import { singleton } from "~/utils/singleton"; +import { RunChangeNotifier, type ChangeRecordInput } from "./runChangeNotifier.server"; + +/** + * Process-singleton wiring for the RunChangeNotifier plus the thin, gated + * convenience functions that write sites and the realtime route delegate to. + * + * The notifier is constructed lazily (only on the first publish/subscribe when + * enabled), so a webapp running with `REALTIME_NOTIFIER_ENABLED=0` (the default) + * opens no Redis connections and registers no metrics for this subsystem. + */ +const notifierEnabled = env.REALTIME_NOTIFIER_ENABLED === "1"; + +function initializeRunChangeNotifier(): RunChangeNotifier { + const clusterMode = env.REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1"; + // Sharded pub/sub only works against a cluster; classic pub/sub there would + // broadcast every message to every node, so this is what actually shards load. 
+ const shardedPubSub = clusterMode && env.REALTIME_RUNS_PUBSUB_REDIS_SHARDED_ENABLED === "1"; + + const notifier = new RunChangeNotifier({ + redis: { + host: env.REALTIME_RUNS_PUBSUB_REDIS_HOST, + port: env.REALTIME_RUNS_PUBSUB_REDIS_PORT, + username: env.REALTIME_RUNS_PUBSUB_REDIS_USERNAME, + password: env.REALTIME_RUNS_PUBSUB_REDIS_PASSWORD, + tlsDisabled: env.REALTIME_RUNS_PUBSUB_REDIS_TLS_DISABLED === "true", + clusterMode, + // One subscriber connection per shard so SSUBSCRIBE routes to the slot owner. + ...(shardedPubSub ? { clusterOptions: { shardedSubscribers: true } } : {}), + }, + envWakeCoalesceWindowMs: env.REALTIME_NOTIFIER_ENV_WAKE_COALESCE_WINDOW_MS, + shardedPubSub, + }); + + // The value is notifier.activeSubscriptionCount, sampled at scrape time. The help text + // describes it as subscribed environment channels rather than individual runs. + new Gauge({ + name: "realtime_run_change_notifier_active_subscriptions", + help: "Distinct environment channels currently subscribed for realtime change notifications", + collect() { + this.set(notifier.activeSubscriptionCount); + }, + registers: [metricsRegister], + }); + + return notifier; +} + +/** Lazily construct (and memoize) the notifier singleton. */ +export function getRunChangeNotifier(): RunChangeNotifier { + return singleton("runChangeNotifier", initializeRunChangeNotifier); +} + +/** Whether the notifier subsystem is enabled for this process. */ +export function isRunChangeNotifierEnabled(): boolean { + return notifierEnabled; +} + +/** Fire-and-forget publish of a run-changed record. No-op (and no notifier construction) + * when disabled, so publish sites can call it unconditionally.
*/ +export function publishChangeRecord(input: ChangeRecordInput): void { + if (!notifierEnabled) { + return; + } + getRunChangeNotifier().publish(input); +} + +export function publishManyChangeRecords(inputs: ChangeRecordInput[]): void { + if (!notifierEnabled) { + return; + } + getRunChangeNotifier().publishMany(inputs); +} diff --git a/apps/webapp/app/services/realtime/runReader.server.ts b/apps/webapp/app/services/realtime/runReader.server.ts new file mode 100644 index 00000000000..4135e94366b --- /dev/null +++ b/apps/webapp/app/services/realtime/runReader.server.ts @@ -0,0 +1,191 @@ +import { type Prisma, type PrismaClient } from "@trigger.dev/database"; +import { BoundedTtlCache } from "./boundedTtlCache"; +import { RESERVED_COLUMNS, type RealtimeRunRow } from "./electricStreamProtocol.server"; + +/** + * RunReader — the pluggable read half of the notifier-backed realtime feed. + * + * The mandate: ClickHouse is filter-only and resolves IDs, + * Postgres always hydrates row columns. This file owns the Postgres hydration + * half (`RunHydrator`, by-id) and the `RunListResolver` interface (the tag/list + * filter -> id-set seam, implemented over ClickHouse). + * + * Splitting hydration behind this small surface keeps the realtime feed + * decoupled from where runs physically live, ready for a future `TaskRunFast` + * table or a non-Postgres row store. + */ + +/** The TaskRun columns the realtime feed projects (mirrors DEFAULT_ELECTRIC_COLUMNS). 
*/ +export const RUN_HYDRATOR_SELECT = { + id: true, + taskIdentifier: true, + createdAt: true, + updatedAt: true, + startedAt: true, + delayUntil: true, + queuedAt: true, + expiredAt: true, + completedAt: true, + friendlyId: true, + number: true, + isTest: true, + status: true, + usageDurationMs: true, + costInCents: true, + baseCostInCents: true, + ttl: true, + payload: true, + payloadType: true, + metadata: true, + metadataType: true, + output: true, + outputType: true, + runTags: true, + error: true, + realtimeStreams: true, +} satisfies Prisma.TaskRunSelect; + +/** + * Columns the feed needs internally regardless of the client's `skipColumns`: + * `id` keys the row, `updatedAt` drives the offset and the live working-set diff. + * Everything else can be projected away when the client skips it (see + * `buildHydratorSelect`), so the replica doesn't ship large `payload`/`output`/ + * `metadata`/`error` columns the response will drop anyway. + */ +const ALWAYS_HYDRATED_COLUMNS = new Set(["id", "updatedAt", ...RESERVED_COLUMNS]); + +/** Project `RUN_HYDRATOR_SELECT` down to the columns the client didn't skip (plus + * the always-needed ones). An empty skip set returns the full select unchanged. */ +export function buildHydratorSelect(skipColumns: string[] = []): Prisma.TaskRunSelect { + if (skipColumns.length === 0) { + return RUN_HYDRATOR_SELECT; + } + const skip = new Set(skipColumns); + const select: Record = {}; + for (const column of Object.keys(RUN_HYDRATOR_SELECT)) { + if (ALWAYS_HYDRATED_COLUMNS.has(column) || !skip.has(column)) { + select[column] = true; + } + } + return select as Prisma.TaskRunSelect; +} + +export type RunListFilter = { + organizationId: string; + projectId: string; + environmentId: string; + /** Contains-ANY tag match (OR). Omit/empty for non-tag feeds. */ + tags?: string[]; + /** Restrict to a single batch (internal batch id) — the batch feed. 
*/ + batchId?: string; + /** Lower bound on createdAt (the tag-list feed pins this; batch omits it). */ + createdAtAfter?: Date; + /** Hard cap on the result set so a broad filter can't unbound the snapshot. */ + limit: number; +}; + +/** + * Resolves a tag/list filter into the matching run id-set, filter-only (no row + * columns; rows are hydrated from Postgres by id afterward). Pluggable so the + * resolution source can change without touching the feed. The ClickHouse + * implementation lives in `clickHouseRunListResolver.server.ts`. + */ +export interface RunListResolver { + resolveMatchingRunIds(filter: RunListFilter): Promise; +} + +export type RunHydratorOptions = { + /** A read-replica Prisma client (`$replica`). Always Postgres. */ + replica: Pick; + /** + * Read-through cache TTL (ms) to collapse duplicate refetches across a burst + * of live polls for the same run. Fan-in is low in practice, so this is + * insurance, not load-bearing. Set to 0 to disable. Defaults to 250ms. + */ + cacheTtlMs?: number; + /** Hard cap on cache entries before expired entries are swept. */ + maxCacheEntries?: number; +}; + +const DEFAULT_CACHE_TTL_MS = 250; +const DEFAULT_MAX_CACHE_ENTRIES = 5_000; + +/** + * Hydrates a single run by id from the read replica, projected to the realtime + * columns. Concurrent refetches for the same (env, run) are single-flighted, and + * a short TTL cache collapses rapid repeats. + */ +export class RunHydrator { + readonly #inflight = new Map>(); + readonly #cache: BoundedTtlCache; + readonly #cacheTtlMs: number; + + constructor(private readonly options: RunHydratorOptions) { + this.#cacheTtlMs = options.cacheTtlMs ?? DEFAULT_CACHE_TTL_MS; + this.#cache = new BoundedTtlCache( + this.#cacheTtlMs, + options.maxCacheEntries ?? 
DEFAULT_MAX_CACHE_ENTRIES + ); + } + + async getRunById(environmentId: string, runId: string): Promise { + const key = `${environmentId}:${runId}`; + + if (this.#cacheTtlMs > 0) { + // A cached null is a valid "run not found" hit; only undefined is a miss. + const cached = this.#cache.get(key); + if (cached !== undefined) { + return cached; + } + } + + const existing = this.#inflight.get(key); + if (existing) { + return existing; + } + + const promise = this.#fetch(environmentId, runId).finally(() => this.#inflight.delete(key)); + this.#inflight.set(key, promise); + + const row = await promise; + + if (this.#cacheTtlMs > 0) { + this.#cache.set(key, row); + } + + return row; + } + + /** Hydrate many runs by id in one query (tag/list feed). Order is not guaranteed. + * `skipColumns` projects the SELECT so the replica doesn't ship columns the client + * dropped (notably the large `payload`/`output`/`metadata`/`error` columns). */ + async hydrateByIds( + environmentId: string, + ids: string[], + skipColumns: string[] = [] + ): Promise { + if (ids.length === 0) { + return []; + } + const rows = await this.options.replica.taskRun.findMany({ + where: { + runtimeEnvironmentId: environmentId, + id: { in: ids }, + }, + select: buildHydratorSelect(skipColumns), + }); + return rows as unknown as RealtimeRunRow[]; + } + + async #fetch(environmentId: string, runId: string): Promise { + const run = await this.options.replica.taskRun.findFirst({ + where: { + id: runId, + runtimeEnvironmentId: environmentId, + }, + select: RUN_HYDRATOR_SELECT, + }); + + return (run ?? 
null) as RealtimeRunRow | null; + } +} diff --git a/apps/webapp/app/services/realtime/shadowCompare.server.ts b/apps/webapp/app/services/realtime/shadowCompare.server.ts new file mode 100644 index 00000000000..b24540bfca3 --- /dev/null +++ b/apps/webapp/app/services/realtime/shadowCompare.server.ts @@ -0,0 +1,297 @@ +import { + type ElectricColumnType, + RUN_ELECTRIC_COLUMNS, + serializeRunRow, +} from "./electricStreamProtocol.server"; +import { type RunHydrator, type RunListFilter, type RunListResolver } from "./runReader.server"; + +/** + * Dual-run shadow-compare. + * + * The client is always served the Electric response; in the background this + * re-derives what the notifier path WOULD emit and diffs the two, so we can prove + * parity on real production traffic before any cutover. + * + * Two kinds of divergence are checked: + * - serialization: for each run Electric emitted, re-hydrate it and serialize via + * the notifier serializer, then compare SEMANTICALLY (decode both sides per + * column type) so equivalent-but-differently-encoded wire values (timestamp + * format, bool t/true, number formatting) are not false positives. The compare + * is gated on same-version (matching updatedAt) so a row that changed between + * Electric's emit and our refetch is recorded as "skew", not a divergence. + * - membership (tag/batch initial snapshot only): the set of run ids Electric + * emitted vs the set the notifier resolver returns. This is where the known + * tag OR-vs-AND difference shows up. + * + * Pure except for the injected RunHydrator/RunListResolver, so it's unit-testable. 
+ */ + +export type ShadowFeed = "run" | "runs" | "batch"; + +type WireValue = Record; + +type ShapeMessage = { + key?: string; + value?: WireValue; + headers: { operation?: string; control?: string }; +}; + +const COLUMN_BY_NAME = new Map(RUN_ELECTRIC_COLUMNS.map((column) => [column.name, column])); + +export type ColumnDiff = { + runId: string; + column: string; + electric: string | null; + notifier: string | null; +}; + +export type ShadowCompareOutcome = { + feed: ShadowFeed; + /** Runs whose every emitted column matched (same-version). */ + serializationMatched: number; + /** Runs with at least one semantic column divergence (same-version). */ + serializationDiverged: number; + /** Runs that changed between Electric's emit and our refetch (not a divergence). */ + serializationSkew: number; + /** Per-column divergences (capped) for logging. */ + diffs: ColumnDiff[]; + /** Set membership (tag/batch initial snapshot only). undefined when not checked. */ + membershipMatch?: boolean; + missingInNotifier?: string[]; + extraInNotifier?: string[]; +}; + +export type ShadowCompareInput = { + feed: ShadowFeed; + /** The served Electric response body (a JSON array of messages, or "" / "[]"). */ + electricBody: string; + environment: { id: string }; + skipColumns: string[]; + /** True when this was an initial snapshot request (offset=-1); enables membership compare. */ + isInitialSnapshot: boolean; + /** When set (tag/batch initial snapshot), compare the resolved id-set. 
*/ + membershipFilter?: RunListFilter; +}; + +const MAX_DIFFS = 20; + +export class RealtimeShadowComparator { + constructor( + private readonly options: { runReader: RunHydrator; runListResolver: RunListResolver } + ) {} + + async compare(input: ShadowCompareInput): Promise { + const messages = parseBody(input.electricBody); + const changes = messages.filter( + (m): m is ShapeMessage & { value: WireValue } => + typeof m.headers?.operation === "string" && !!m.value && m.headers.operation !== "delete" + ); + + const outcome: ShadowCompareOutcome = { + feed: input.feed, + serializationMatched: 0, + serializationDiverged: 0, + serializationSkew: 0, + diffs: [], + }; + + // Bulk-hydrate every emitted run in one query rather than a per-message round + // trip, so shadow mode doesn't inflate the very replica load it's measuring. + const emittedIds = changes + .map((m) => m.value.id) + .filter((id): id is string => typeof id === "string"); + const hydrated = await this.options.runReader.hydrateByIds(input.environment.id, emittedIds); + const rowsById = new Map(hydrated.map((row) => [row.id, row])); + + for (const message of changes) { + const runId = message.value.id ?? undefined; + if (!runId) { + continue; + } + + const row = rowsById.get(runId); + if (!row) { + // Run no longer readable (deleted / replica miss). Not a serialization divergence. + outcome.serializationSkew++; + continue; + } + + const notifierValue = serializeRunRow(row, input.skipColumns); + + // Only compare rows at the same version; otherwise the row advanced between + // Electric's emit and our refetch (timing skew, not a divergence). + if (!sameInstant(message.value.updatedAt, notifierValue.updatedAt)) { + outcome.serializationSkew++; + continue; + } + + let rowDiverged = false; + for (const [column, electricRaw] of Object.entries(message.value)) { + const meta = COLUMN_BY_NAME.get(column); + if (!meta) { + continue; + } + const notifierRaw = notifierValue[column] ?? 
null; + if (!valuesEqual(electricRaw, notifierRaw, meta.type, meta.dims, column)) { + rowDiverged = true; + if (outcome.diffs.length < MAX_DIFFS) { + outcome.diffs.push({ runId, column, electric: electricRaw, notifier: notifierRaw }); + } + } + } + + if (rowDiverged) { + outcome.serializationDiverged++; + } else { + outcome.serializationMatched++; + } + } + + if (input.isInitialSnapshot && input.membershipFilter) { + const electricIds = new Set( + changes.map((m) => m.value.id).filter((id): id is string => typeof id === "string") + ); + const notifierIds = new Set( + await this.options.runListResolver.resolveMatchingRunIds(input.membershipFilter) + ); + + outcome.missingInNotifier = [...electricIds].filter((id) => !notifierIds.has(id)); + outcome.extraInNotifier = [...notifierIds].filter((id) => !electricIds.has(id)); + outcome.membershipMatch = + outcome.missingInNotifier.length === 0 && outcome.extraInNotifier.length === 0; + } + + return outcome; + } +} + +function parseBody(body: string): ShapeMessage[] { + const text = body.trim(); + if (!text) { + return []; + } + try { + const parsed = JSON.parse(text); + return Array.isArray(parsed) ? (parsed as ShapeMessage[]) : []; + } catch { + return []; + } +} + +/** Status carries a known legacy rewrite (DEQUEUED -> EXECUTING) applied equally to + * both paths for non-current API versions; treat them as equivalent. */ +function normalizeStatus(value: string): string { + return value === "DEQUEUED" ? "EXECUTING" : value; +} + +function sameInstant(a: string | null | undefined, b: string | null | undefined): boolean { + if (a == null || b == null) { + return a == null && b == null; + } + // Mirror the SDK's RawShapeDate (`new Date(val + "Z")`). 
+ return new Date(`${a}Z`).getTime() === new Date(`${b}Z`).getTime(); +} + +function valuesEqual( + electricRaw: string | null, + notifierRaw: string | null, + type: ElectricColumnType, + dims: number | undefined, + column: string +): boolean { + if (electricRaw == null || notifierRaw == null) { + return electricRaw == null && notifierRaw == null; + } + + if (dims && dims > 0) { + return arraysEqual(parsePgTextArray(electricRaw), parsePgTextArray(notifierRaw)); + } + + switch (type) { + case "timestamp": + return new Date(`${electricRaw}Z`).getTime() === new Date(`${notifierRaw}Z`).getTime(); + case "bool": + return parseBool(electricRaw) === parseBool(notifierRaw); + case "int4": + case "int8": + case "float8": + return Number(electricRaw) === Number(notifierRaw); + case "jsonb": + return jsonEqual(electricRaw, notifierRaw); + case "text": + default: + if (column === "status") { + return normalizeStatus(electricRaw) === normalizeStatus(notifierRaw); + } + return electricRaw === notifierRaw; + } +} + +function parseBool(value: string): boolean { + return value === "t" || value === "true"; +} + +function jsonEqual(a: string, b: string): boolean { + try { + return deepEqual(JSON.parse(a), JSON.parse(b)); + } catch { + return a === b; + } +} + +function deepEqual(a: unknown, b: unknown): boolean { + if (a === b) return true; + if (typeof a !== typeof b || a === null || b === null) return false; + if (Array.isArray(a) && Array.isArray(b)) { + return a.length === b.length && a.every((v, i) => deepEqual(v, b[i])); + } + if (typeof a === "object" && typeof b === "object") { + const ak = Object.keys(a as object).sort(); + const bk = Object.keys(b as object).sort(); + return ( + ak.length === bk.length && + ak.every((k, i) => k === bk[i]) && + ak.every((k) => deepEqual((a as any)[k], (b as any)[k])) + ); + } + return false; +} + +function arraysEqual(a: string[], b: string[]): boolean { + return a.length === b.length && a.every((v, i) => v === b[i]); +} + +/** Parse a 
Postgres text-array literal (`{"a","b"}` / `{}`). Mirrors the client's pgArrayParser. */ +function parsePgTextArray(literal: string): string[] { + if (literal === "{}" || literal === "") { + return []; + } + const inner = literal.startsWith("{") && literal.endsWith("}") ? literal.slice(1, -1) : literal; + const result: string[] = []; + let i = 0; + while (i < inner.length) { + if (inner[i] === '"') { + i++; + let s = ""; + while (i < inner.length && inner[i] !== '"') { + if (inner[i] === "\\") { + i++; + } + s += inner[i]; + i++; + } + result.push(s); + i++; + if (inner[i] === ",") i++; + } else { + let s = ""; + while (i < inner.length && inner[i] !== ",") { + s += inner[i]; + i++; + } + result.push(s); + if (inner[i] === ",") i++; + } + } + return result; +} diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts new file mode 100644 index 00000000000..b66b70e7ad5 --- /dev/null +++ b/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts @@ -0,0 +1,194 @@ +import { API_VERSIONS } from "~/api/versions"; +import { logger } from "../logger.server"; +import { + type RealtimeEnvironment, + type RealtimeRequestOptions, + type RealtimeRunsParams, +} from "../realtimeClient.server"; +import { RESERVED_COLUMNS } from "./electricStreamProtocol.server"; +import { + type RealtimeListEnvironment, + type RealtimeStreamClient, +} from "./notifierRealtimeClient.server"; +import { type RunListFilter } from "./runReader.server"; +import { + type RealtimeShadowComparator, + type ShadowCompareOutcome, + type ShadowFeed, +} from "./shadowCompare.server"; + +export type ShadowRealtimeClientOptions = { + /** The path actually served to the client (Electric). */ + electric: RealtimeStreamClient; + comparator: RealtimeShadowComparator; + /** createdAt window (ms) used to resolve tag-list membership for the compare. 
*/ + maximumCreatedAtFilterAgeMs: number; + /** Cap for the membership resolve. */ + maxListResults: number; + /** Metrics sink for compare outcomes. */ + onOutcome?: (outcome: ShadowCompareOutcome) => void; +}; + +/** + * Dual-run gate: a transparent wrapper that serves the Electric + * response unchanged and, in the background, diffs what the notifier path would emit + * against it. The shadow work is fire-and-forget — it never blocks or fails the + * client's request — and it exercises the read replica so the notifier's real load + * can be measured before cutover. + */ +export class ShadowRealtimeClient implements RealtimeStreamClient { + constructor(private readonly options: ShadowRealtimeClientOptions) {} + + async streamRun( + url: URL | string, + environment: RealtimeEnvironment, + runId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const response = await this.options.electric.streamRun( + url, + environment, + runId, + apiVersion, + requestOptions, + clientVersion, + signal + ); + this.#shadow("run", response, url, environment, requestOptions); + return response; + } + + async streamRuns( + url: URL | string, + environment: RealtimeListEnvironment, + params: RealtimeRunsParams, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const response = await this.options.electric.streamRuns( + url, + environment, + params, + apiVersion, + requestOptions, + clientVersion, + signal + ); + this.#shadow("runs", response, url, environment, requestOptions, { tags: params.tags ?? 
[] }); + return response; + } + + async streamBatch( + url: URL | string, + environment: RealtimeListEnvironment, + batchId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const response = await this.options.electric.streamBatch( + url, + environment, + batchId, + apiVersion, + requestOptions, + clientVersion, + signal + ); + this.#shadow("batch", response, url, environment, requestOptions, { batchId }); + return response; + } + + /** Fire-and-forget; never blocks the served response, never throws into the request. */ + #shadow( + feed: ShadowFeed, + electricResponse: Response, + url: URL | string, + environment: RealtimeEnvironment & { projectId?: string }, + requestOptions?: RealtimeRequestOptions, + membership?: { tags?: string[]; batchId?: string } + ): void { + // Clone synchronously before the client consumes the body. + let bodyClone: Response; + try { + if (electricResponse.status !== 200) { + return; + } + bodyClone = electricResponse.clone(); + } catch { + return; + } + + void this.#runShadow(feed, bodyClone, url, environment, requestOptions, membership).catch( + (error) => logger.debug("[shadowRealtime] compare failed", { feed, error }) + ); + } + + async #runShadow( + feed: ShadowFeed, + bodyClone: Response, + url: URL | string, + environment: RealtimeEnvironment & { projectId?: string }, + requestOptions: RealtimeRequestOptions | undefined, + membership: { tags?: string[]; batchId?: string } | undefined + ): Promise { + const $url = new URL(url.toString()); + const offset = $url.searchParams.get("offset") ?? "-1"; + const handle = $url.searchParams.get("handle") ?? 
$url.searchParams.get("shape_id"); + const isInitialSnapshot = offset === "-1" || !handle; + const skipColumns = resolveSkipColumns($url, requestOptions); + const electricBody = await bodyClone.text(); + + let membershipFilter: RunListFilter | undefined; + if (isInitialSnapshot && membership && environment.projectId) { + membershipFilter = { + organizationId: environment.organizationId, + projectId: environment.projectId, + environmentId: environment.id, + tags: membership.tags, + batchId: membership.batchId, + createdAtAfter: membership.batchId + ? undefined + : new Date(Date.now() - this.options.maximumCreatedAtFilterAgeMs), + limit: this.options.maxListResults, + }; + } + + const outcome = await this.options.comparator.compare({ + feed, + electricBody, + environment: { id: environment.id }, + skipColumns, + isInitialSnapshot, + membershipFilter, + }); + + this.options.onOutcome?.(outcome); + + if (outcome.serializationDiverged > 0 || outcome.membershipMatch === false) { + logger.warn("[shadowRealtime] divergence detected", { + feed, + serializationDiverged: outcome.serializationDiverged, + serializationMatched: outcome.serializationMatched, + serializationSkew: outcome.serializationSkew, + membershipMatch: outcome.membershipMatch, + missingInNotifier: outcome.missingInNotifier?.slice(0, 20), + extraInNotifier: outcome.extraInNotifier?.slice(0, 20), + // Log only which run/column diverged, never the raw cell values — they can + // include run payload/output/metadata and must not leak into logs. + diffs: outcome.diffs.map(({ runId, column }) => ({ runId, column })), + }); + } + } +} + +function resolveSkipColumns(url: URL, requestOptions?: RealtimeRequestOptions): string[] { + const raw = requestOptions?.skipColumns ?? url.searchParams.get("skipColumns")?.split(",") ?? 
[]; + return raw.map((c) => c.trim()).filter((c) => c !== "" && !RESERVED_COLUMNS.includes(c)); +} diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts new file mode 100644 index 00000000000..95edc82620d --- /dev/null +++ b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts @@ -0,0 +1,66 @@ +import { Counter } from "prom-client"; +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { metricsRegister } from "~/metrics.server"; +import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; +import { singleton } from "~/utils/singleton"; +import { realtimeClient } from "../realtimeClientGlobal.server"; +import { ClickHouseRunListResolver } from "./clickHouseRunListResolver.server"; +import { RunHydrator } from "./runReader.server"; +import { RealtimeShadowComparator } from "./shadowCompare.server"; +import { ShadowRealtimeClient } from "./shadowRealtimeClient.server"; + +/** + * Process-singleton wiring for the shadow-compare client. Only constructed + * when an org's `realtimeBackend` flag is set to "shadow". + */ +function initializeShadowRealtimeClient(): ShadowRealtimeClient { + const compares = new Counter({ + name: "realtime_shadow_compare_total", + help: "Dual-run shadow-compare outcomes (Electric vs notifier). 
kind=serialization|membership, result=match|diverge|skew.", + labelNames: ["feed", "kind", "result"] as const, + registers: [metricsRegister], + }); + + const comparator = new RealtimeShadowComparator({ + runReader: new RunHydrator({ replica: $replica }), + runListResolver: new ClickHouseRunListResolver({ + getClickhouse: (organizationId) => + clickhouseFactory.getClickhouseForOrganization(organizationId, "realtime"), + prisma: $replica, + }), + }); + + return new ShadowRealtimeClient({ + electric: realtimeClient, + comparator, + maximumCreatedAtFilterAgeMs: env.REALTIME_MAXIMUM_CREATED_AT_FILTER_AGE_IN_MS, + maxListResults: env.REALTIME_NOTIFIER_MAX_LIST_RESULTS, + onOutcome: (outcome) => { + const { feed } = outcome; + if (outcome.serializationMatched) { + compares.inc({ feed, kind: "serialization", result: "match" }, outcome.serializationMatched); + } + if (outcome.serializationDiverged) { + compares.inc( + { feed, kind: "serialization", result: "diverge" }, + outcome.serializationDiverged + ); + } + if (outcome.serializationSkew) { + compares.inc({ feed, kind: "serialization", result: "skew" }, outcome.serializationSkew); + } + if (outcome.membershipMatch !== undefined) { + compares.inc({ + feed, + kind: "membership", + result: outcome.membershipMatch ? 
"match" : "diverge", + }); + } + }, + }); +} + +export function getShadowRealtimeClient(): ShadowRealtimeClient { + return singleton("shadowRealtimeClient", initializeShadowRealtimeClient); +} diff --git a/apps/webapp/app/v3/featureFlags.ts b/apps/webapp/app/v3/featureFlags.ts index 9a5d75cfe25..55b30a8396e 100644 --- a/apps/webapp/app/v3/featureFlags.ts +++ b/apps/webapp/app/v3/featureFlags.ts @@ -10,6 +10,7 @@ export const FEATURE_FLAG = { hasPrivateConnections: "hasPrivateConnections", mollifierEnabled: "mollifierEnabled", workerQueueScheduledSplitEnabled: "workerQueueScheduledSplitEnabled", + realtimeBackend: "realtimeBackend", } as const; export const FeatureFlagCatalog = { @@ -22,6 +23,10 @@ export const FeatureFlagCatalog = { [FEATURE_FLAG.hasPrivateConnections]: z.coerce.boolean(), [FEATURE_FLAG.mollifierEnabled]: z.coerce.boolean(), [FEATURE_FLAG.workerQueueScheduledSplitEnabled]: z.coerce.boolean(), + // Which backend serves the realtime run feed. Controllable + // globally and per-org (org wins). Defaults to "electric" when unset. + // "shadow" serves Electric but diffs the notifier path in the background. 
+ [FEATURE_FLAG.realtimeBackend]: z.enum(["electric", "notifier", "shadow"]), }; export type FeatureFlagKey = keyof typeof FeatureFlagCatalog; diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts index 3277d74ba6e..7ef4efdef82 100644 --- a/apps/webapp/app/v3/runEngineHandlers.server.ts +++ b/apps/webapp/app/v3/runEngineHandlers.server.ts @@ -20,11 +20,12 @@ import { createExceptionPropertiesFromError } from "./eventRepository/common.ser import { getEventRepositoryForStore, recordRunDebugLog } from "./eventRepository/index.server"; import { roomFromFriendlyRunId, socketIo } from "./handleSocketIo.server"; import { engine } from "./runEngine.server"; +import { publishChangeRecord } from "~/services/realtime/runChangeNotifierInstance.server"; import { PerformTaskRunAlertsService } from "./services/alerts/performTaskRunAlerts.server"; import { TaskRunErrorCodes } from "@trigger.dev/core/v3"; export function registerRunEngineEventBusHandlers() { - engine.eventBus.on("runSucceeded", async ({ time, run, organization }) => { + engine.eventBus.on("runSucceeded", async ({ time, run, organization, environment }) => { const [taskRunError, taskRun] = await tryCatch( $replica.taskRun.findFirstOrThrow({ where: { @@ -45,6 +46,11 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read so the + // per-env channel carries the membership keys (no separate query). No-op when + // the notifier is disabled. + runTags: true, + batchId: true, }, }) ); @@ -57,6 +63,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( run.taskEventStore, taskRun.organizationId ?? 
organization.id @@ -91,7 +104,7 @@ export function registerRunEngineEventBusHandlers() { }); // Handle events - engine.eventBus.on("runFailed", async ({ time, run, organization }) => { + engine.eventBus.on("runFailed", async ({ time, run, organization, environment }) => { const sanitizedError = sanitizeError(run.error); const exception = createExceptionPropertiesFromError(sanitizedError); @@ -115,6 +128,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the notifier is disabled). + runTags: true, + batchId: true, }, }) ); @@ -127,6 +144,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( run.taskEventStore, taskRun.organizationId ?? organization.id @@ -172,6 +196,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the notifier is disabled). 
+ runTags: true, + batchId: true, }, }) ); @@ -184,6 +212,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: taskRun.runtimeEnvironmentId, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + if (!taskRun.organizationId) { logger.error("[runAttemptFailed] Task run has no organization id", { runId: run.id, @@ -328,7 +363,7 @@ export function registerRunEngineEventBusHandlers() { } ); - engine.eventBus.on("runExpired", async ({ time, run, organization }) => { + engine.eventBus.on("runExpired", async ({ time, run, organization, environment }) => { if (!run.ttl) { return; } @@ -353,6 +388,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the notifier is disabled). + runTags: true, + batchId: true, }, }) ); @@ -365,6 +404,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( taskRun.taskEventStore, taskRun.organizationId ?? organization.id @@ -386,7 +432,7 @@ export function registerRunEngineEventBusHandlers() { } }); - engine.eventBus.on("runCancelled", async ({ time, run, organization }) => { + engine.eventBus.on("runCancelled", async ({ time, run, organization, environment }) => { const [taskRunError, taskRun] = await tryCatch( $replica.taskRun.findFirstOrThrow({ where: { @@ -407,6 +453,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the notifier is disabled). 
+ runTags: true, + batchId: true, }, }) ); @@ -419,6 +469,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( taskRun.taskEventStore, taskRun.organizationId ?? organization.id @@ -505,15 +562,20 @@ export function registerRunEngineEventBusHandlers() { }); engine.eventBus.on("runMetadataUpdated", async ({ time, run }) => { - const env = await findEnvironmentFromRun(run.id); + const result = await findEnvironmentFromRun(run.id); - if (!env) { + if (!result) { logger.error("[runMetadataUpdated] Failed to find environment", { runId: run.id }); return; } + const { environment, runTags, batchId } = result; + try { - await updateMetadataService.call(run.id, run.metadata, env); + await updateMetadataService.call(run.id, run.metadata, environment); + // Realtime run-changed publish, after the write so the router's hydrate sees the new + // row. A full record (env + tags + batchId), so feeds route by index. 
+ publishChangeRecord({ runId: run.id, envId: environment.id, tags: runTags, batchId }); } catch (e) { if (e instanceof MetadataTooLargeError) { logger.warn("[runMetadataUpdated] Failed to update metadata, too large", { diff --git a/apps/webapp/package.json b/apps/webapp/package.json index 162a9ede9a0..efebaf48207 100644 --- a/apps/webapp/package.json +++ b/apps/webapp/package.json @@ -163,7 +163,7 @@ "humanize-duration": "^3.27.3", "input-otp": "^1.4.2", "intl-parse-accept-language": "^1.0.0", - "ioredis": "^5.3.2", + "ioredis": "~5.6.0", "isbot": "^3.6.5", "jose": "^5.4.0", "json-stable-stringify": "^1.3.0", diff --git a/apps/webapp/test/realtime/boundedTtlCache.test.ts b/apps/webapp/test/realtime/boundedTtlCache.test.ts new file mode 100644 index 00000000000..a3fb0b1e425 --- /dev/null +++ b/apps/webapp/test/realtime/boundedTtlCache.test.ts @@ -0,0 +1,52 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { BoundedTtlCache } from "~/services/realtime/boundedTtlCache"; + +describe("BoundedTtlCache", () => { + afterEach(() => { + vi.useRealTimers(); + }); + + it("returns a live entry within its TTL", () => { + vi.useFakeTimers(); + const cache = new BoundedTtlCache(1_000, 100); + cache.set("k", "v"); + vi.advanceTimersByTime(500); + expect(cache.get("k")).toBe("v"); + expect(cache.size).toBe(1); + }); + + it("evicts an expired entry on read instead of letting it linger", () => { + vi.useFakeTimers(); + const cache = new BoundedTtlCache(1_000, 100); + cache.set("a", 1); + expect(cache.size).toBe(1); + + vi.advanceTimersByTime(1_001); + expect(cache.get("a")).toBeUndefined(); + // The previous bug left expired entries in the map until an at-capacity sweep; + // they must now be removed on read. 
+ expect(cache.size).toBe(0); + }); + + it("does not evict another entry when updating an existing key at capacity", () => { + const cache = new BoundedTtlCache(60_000, 2); + cache.set("a", 1); + cache.set("b", 2); + // Updating an existing key doesn't grow the map, so it must not drop "b". + cache.set("a", 11); + expect(cache.get("a")).toBe(11); + expect(cache.get("b")).toBe(2); + expect(cache.size).toBe(2); + }); + + it("drops the oldest entry when full of still-live entries", () => { + const cache = new BoundedTtlCache(60_000, 2); + cache.set("a", 1); + cache.set("b", 2); + cache.set("c", 3); // over capacity, none expired -> evict oldest insertion (a) + expect(cache.get("a")).toBeUndefined(); + expect(cache.get("b")).toBe(2); + expect(cache.get("c")).toBe(3); + expect(cache.size).toBe(2); + }); +}); diff --git a/apps/webapp/test/realtime/electricStreamProtocol.test.ts b/apps/webapp/test/realtime/electricStreamProtocol.test.ts new file mode 100644 index 00000000000..a48f4f9f8e8 --- /dev/null +++ b/apps/webapp/test/realtime/electricStreamProtocol.test.ts @@ -0,0 +1,304 @@ +import { SubscribeRunRawShape } from "@trigger.dev/core/v3/schemas"; +import { describe, expect, it } from "vitest"; +import { + buildElectricSchemaHeader, + buildRowsBody, + buildSnapshotBody, + buildUpdateBody, + buildUpToDateBody, + encodeOffset, + parseOffsetUpdatedAtMs, + type RealtimeRunRow, + rewriteBodyForLegacyApiVersion, + serializeRunRow, +} from "~/services/realtime/electricStreamProtocol.server"; + +function sampleRow(overrides: Partial = {}): RealtimeRunRow { + return { + id: "run_abc123", + taskIdentifier: "my-task", + createdAt: new Date("2026-06-06T10:00:00.000Z"), + updatedAt: new Date("2026-06-06T10:05:30.123Z"), + startedAt: new Date("2026-06-06T10:01:00.000Z"), + delayUntil: null, + queuedAt: new Date("2026-06-06T10:00:30.000Z"), + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_abc", + number: 42, + isTest: true, + status: "EXECUTING", + usageDurationMs: 
1234, + costInCents: 0.55, + baseCostInCents: 0.25, + ttl: "1h", + payload: '{"hello":"world"}', + payloadType: "application/json", + metadata: '{"step":1}', + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: ["user:123", "env:prod"], + error: null, + realtimeStreams: [], + ...overrides, + }; +} + +/** + * Faithful re-implementation of the @electric-sql/client value parser rules + * (defaultParser + pgArrayParser), so we can decode our wire `value` object the + * same way the deployed client would, then validate against the real SDK schema. + * Source: @electric-sql/client@1.0.14 src/parser.ts. + */ +function electricParse( + value: Record, + schema: Record +): Record { + const out: Record = {}; + for (const [key, raw] of Object.entries(value)) { + if (raw === null) { + out[key] = null; + continue; + } + const info = schema[key]; + if (!info) { + out[key] = raw; + continue; + } + if (info.dims && info.dims > 0) { + out[key] = parsePgTextArray(raw); + continue; + } + switch (info.type) { + case "bool": + out[key] = raw === "t" || raw === "true"; + break; + case "int8": + out[key] = BigInt(raw); + break; + case "int2": + case "int4": + case "float4": + case "float8": + out[key] = Number(raw); + break; + case "json": + case "jsonb": + out[key] = JSON.parse(raw); + break; + default: + out[key] = raw; // text/timestamp pass through as strings + } + } + return out; +} + +function parsePgTextArray(literal: string): string[] { + if (literal === "{}") { + return []; + } + const inner = literal.slice(1, -1); + const result: string[] = []; + let i = 0; + while (i < inner.length) { + if (inner[i] === '"') { + i++; + let s = ""; + while (i < inner.length && inner[i] !== '"') { + if (inner[i] === "\\") { + i++; + } + s += inner[i]; + i++; + } + result.push(s); + i++; // closing quote + if (inner[i] === ",") i++; + } else { + let s = ""; + while (i < inner.length && inner[i] !== ",") { + s += inner[i]; + i++; + } + result.push(s); + 
if (inner[i] === ",") i++; + } + } + return result; +} + +describe("electricStreamProtocol serializer", () => { + it("encodes each Postgres type the way the Electric client expects", () => { + const value = serializeRunRow(sampleRow()); + + // text: passed through as-is + expect(value.id).toBe("run_abc123"); + expect(value.status).toBe("EXECUTING"); + expect(value.payload).toBe('{"hello":"world"}'); + + // int/float: stringified + expect(value.number).toBe("42"); + expect(value.usageDurationMs).toBe("1234"); + expect(value.costInCents).toBe("0.55"); + + // bool: postgres "t"/"f" + expect(value.isTest).toBe("t"); + + // timestamp: ISO without trailing Z (the SDK appends Z before parsing) + expect(value.updatedAt).toBe("2026-06-06T10:05:30.123"); + expect(value.createdAt).toBe("2026-06-06T10:00:00.000"); + + // nullable timestamp: null stays null + expect(value.delayUntil).toBeNull(); + expect(value.completedAt).toBeNull(); + + // text[]: quoted pg array literal; empty realtimeStreams (@default([])) => {} + expect(value.runTags).toBe('{"user:123","env:prod"}'); + expect(value.realtimeStreams).toBe("{}"); + + // jsonb: null stays null + expect(value.error).toBeNull(); + }); + + it("encodes an empty no-default array column (runTags) as null, matching Electric", () => { + // runTags has no Postgres default, so an empty value is stored as SQL NULL and + // Electric emits `null` (not `{}`). realtimeStreams has @default([]), so its + // empty value is `{}`. Prisma hands us `[]` for both; we re-derive the wire form. 
+ const value = serializeRunRow(sampleRow({ runTags: [], realtimeStreams: [] })); + expect(value.runTags).toBeNull(); + expect(value.realtimeStreams).toBe("{}"); + }); + + it("encodes jsonb error as a JSON string", () => { + const value = serializeRunRow(sampleRow({ error: { type: "STRING_ERROR", raw: "boom" } })); + expect(value.error).toBe('{"type":"STRING_ERROR","raw":"boom"}'); + }); + + it("round-trips through the client parser into a valid SubscribeRunRawShape", () => { + const row = sampleRow({ error: { type: "STRING_ERROR", raw: "boom" } }); + const value = serializeRunRow(row); + const schema = JSON.parse(buildElectricSchemaHeader()); + + const decoded = electricParse(value, schema); + const parsed = SubscribeRunRawShape.parse(decoded); + + expect(parsed.id).toBe("run_abc123"); + expect(parsed.friendlyId).toBe("run_friendly_abc"); + expect(parsed.status).toBe("EXECUTING"); + expect(parsed.number).toBe(42); + expect(parsed.isTest).toBe(true); + expect(parsed.usageDurationMs).toBe(1234); + expect(parsed.costInCents).toBeCloseTo(0.55); + expect(parsed.runTags).toEqual(["user:123", "env:prod"]); + expect(parsed.realtimeStreams).toEqual([]); + // RawShapeDate appends "Z" and coerces to a Date equal to the source instant. + expect(parsed.createdAt.toISOString()).toBe("2026-06-06T10:00:00.000Z"); + expect(parsed.updatedAt.toISOString()).toBe("2026-06-06T10:05:30.123Z"); + expect(parsed.startedAt?.toISOString()).toBe("2026-06-06T10:01:00.000Z"); + expect(parsed.delayUntil ?? 
null).toBeNull(); + expect(parsed.error).toEqual({ type: "STRING_ERROR", raw: "boom" }); + }); + + it("honors skipColumns (but never the reserved columns)", () => { + const value = serializeRunRow(sampleRow(), ["payload", "output", "id", "status"]); + expect(value.payload).toBeUndefined(); + expect(value.output).toBeUndefined(); + // reserved columns can't be skipped + expect(value.id).toBe("run_abc123"); + expect(value.status).toBe("EXECUTING"); + + const schema = JSON.parse(buildElectricSchemaHeader(["payload"])); + expect(schema.payload).toBeUndefined(); + expect(schema.status).toBeDefined(); + }); +}); + +describe("electricStreamProtocol message bodies", () => { + it("emits insert + up-to-date for an initial snapshot", () => { + const messages = JSON.parse(buildSnapshotBody(sampleRow())); + expect(messages).toHaveLength(2); + expect(messages[0].headers.operation).toBe("insert"); + expect(messages[0].key).toBe('"public"."TaskRun"/"run_abc123"'); + expect(messages[0].value.status).toBe("EXECUTING"); + expect(messages[1].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date for an empty (missing) run snapshot", () => { + const messages = JSON.parse(buildSnapshotBody(null)); + expect(messages).toHaveLength(1); + expect(messages[0].headers.control).toBe("up-to-date"); + }); + + it("emits update + up-to-date for a live change", () => { + const messages = JSON.parse(buildUpdateBody(sampleRow())); + expect(messages[0].headers.operation).toBe("update"); + expect(messages[1].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date when nothing advanced", () => { + const messages = JSON.parse(buildUpToDateBody()); + expect(messages).toEqual([{ headers: { control: "up-to-date" } }]); + }); + + it("uses the same merge key across insert and update so the client merges by row", () => { + const insert = JSON.parse(buildSnapshotBody(sampleRow()))[0]; + const update = JSON.parse(buildUpdateBody(sampleRow()))[0]; + 
expect(insert.key).toBe(update.key); + }); +}); + +describe("electricStreamProtocol multi-row (tag-list) bodies", () => { + it("emits one change message per row with per-row operation, then up-to-date", () => { + const a = sampleRow({ id: "run_a" }); + const b = sampleRow({ id: "run_b", status: "QUEUED" }); + const messages = JSON.parse( + buildRowsBody([ + { row: a, operation: "insert" }, + { row: b, operation: "update" }, + ]) + ); + expect(messages).toHaveLength(3); + expect(messages[0].headers.operation).toBe("insert"); + expect(messages[0].key).toBe('"public"."TaskRun"/"run_a"'); + expect(messages[1].headers.operation).toBe("update"); + expect(messages[1].key).toBe('"public"."TaskRun"/"run_b"'); + expect(messages[1].value.status).toBe("QUEUED"); + expect(messages[2].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date for an empty change set", () => { + const messages = JSON.parse(buildRowsBody([])); + expect(messages).toEqual([{ headers: { control: "up-to-date" } }]); + }); + + it("honors skipColumns across all rows", () => { + const messages = JSON.parse( + buildRowsBody([{ row: sampleRow(), operation: "insert" }], ["payload"]) + ); + expect(messages[0].value.payload).toBeUndefined(); + expect(messages[0].value.status).toBe("EXECUTING"); + }); +}); + +describe("electricStreamProtocol tokens + legacy rewrite", () => { + it("encodes and parses the offset updatedAt segment", () => { + const offset = encodeOffset(1717667130123, 7); + expect(offset).toBe("1717667130123_7"); + expect(parseOffsetUpdatedAtMs(offset)).toBe(1717667130123); + }); + + it("treats the initial offset (-1) and garbage as zero", () => { + expect(parseOffsetUpdatedAtMs("-1")).toBe(0); + expect(parseOffsetUpdatedAtMs(null)).toBe(0); + expect(parseOffsetUpdatedAtMs("nonsense")).toBe(0); + }); + + it("rewrites DEQUEUED to EXECUTING for legacy API versions", () => { + const body = buildUpdateBody(sampleRow({ status: "DEQUEUED" })); + 
expect(body).toContain('"status":"DEQUEUED"'); + const rewritten = rewriteBodyForLegacyApiVersion(body); + expect(rewritten).not.toContain('"status":"DEQUEUED"'); + expect(rewritten).toContain('"status":"EXECUTING"'); + }); +}); diff --git a/apps/webapp/test/realtime/envChangeRouter.test.ts b/apps/webapp/test/realtime/envChangeRouter.test.ts new file mode 100644 index 00000000000..befe0356284 --- /dev/null +++ b/apps/webapp/test/realtime/envChangeRouter.test.ts @@ -0,0 +1,187 @@ +import { describe, expect, it, vi } from "vitest"; +import { + EnvChangeRouter, + type EnvChangeSource, + type RowHydrator, +} from "~/services/realtime/envChangeRouter.server"; +import { type ChangeRecord } from "~/services/realtime/runChangeNotifier.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; + +const FLOOR_MS = Date.UTC(2026, 5, 7, 12, 0, 0); + +function row( + id: string, + opts: { tags?: string[]; createdAtMs?: number; updatedAtMs?: number } = {} +): RealtimeRunRow { + return { + id, + runTags: opts.tags ?? [], + createdAt: new Date(opts.createdAtMs ?? FLOOR_MS + 1_000), + updatedAt: new Date(opts.updatedAtMs ?? FLOOR_MS + 5_000), + } as unknown as RealtimeRunRow; +} + +function record(runId: string, extra: Partial = {}): ChangeRecord { + return { v: 1, runId, envId: "env_1", ...extra }; +} + +/** A controllable EnvChangeSource: tests push batches to the env's listener. */ +function fakeSource() { + const listeners = new Map void>>(); + const source: EnvChangeSource = { + subscribeToEnv(envId, onBatch) { + let set = listeners.get(envId); + if (!set) { + set = new Set(); + listeners.set(envId, set); + } + set.add(onBatch); + return () => { + listeners.get(envId)?.delete(onBatch); + }; + }, + }; + return { + source, + push(envId: string, records: ChangeRecord[]) { + for (const l of listeners.get(envId) ?? []) l(records); + }, + isSubscribed(envId: string) { + return (listeners.get(envId)?.size ?? 
0) > 0; + }, + }; +} + +function makeRouter(rowsById: Map<string, RealtimeRunRow> = new Map()) { + const src = fakeSource(); + const hydrateSpy = vi.fn(async (_env, ids) => + ids.map((id) => rowsById.get(id)).filter((r): r is RealtimeRunRow => Boolean(r)) + ); + const router = new EnvChangeRouter({ source: src.source, hydrator: { hydrateByIds: hydrateSpy } }); + return { router, src, hydrateSpy }; +} + +describe("EnvChangeRouter", () => { + it("routes a tag match to the feed (hydrated + serialized) and ignores non-matches", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + const wait = reg.waitForMatch(undefined, 1_000); + + // A non-matching tag is dropped (no wake); a matching tag wakes with the hydrated row. + src.push("env_1", [record("rX", { tags: ["b"] }), record("r1", { tags: ["a"] })]); + + const result = await wait; + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + expect(result.rows[0].value.id).toBe("r1"); // serialized wire value + expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["r1"], []); + reg.close(); + }); + + it("batch-hydrates ONCE and shares the serialized value across feeds matching the same run", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const regs = [ + router.register("env_1", { kind: "tag", tags: ["a"] }, []), + router.register("env_1", { kind: "tag", tags: ["a"] }, []), + ]; + const waits = regs.map((r) => r.waitForMatch(undefined, 1_000)); + + src.push("env_1", [record("r1", { tags: ["a"] })]); + const results = await Promise.all(waits); + + // One hydrate for the whole tick (same column set), shared by both feeds... + expect(hydrateSpy).toHaveBeenCalledTimes(1); + // ...and the same serialized value object is reused (serialize-once). 
+ expect(results[0].rows[0].value).toBe(results[1].rows[0].value); + regs.forEach((r) => r.close()); + }); + + it("routes a run feed by exact runId", async () => { + const rows = new Map([["r1", row("r1")]]); + const { router, src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "run", runId: "r1" }, []); + const wait = reg.waitForMatch(undefined, 1_000); + src.push("env_1", [record("r2"), record("r1")]); + const result = await wait; + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + reg.close(); + }); + + it("routes a batch feed by batchId", async () => { + const rows = new Map([["r1", row("r1")]]); + const { router, src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "batch", batchId: "batch_1" }, []); + const wait = reg.waitForMatch(undefined, 1_000); + src.push("env_1", [ + record("rX", { batchId: "other" }), + record("r1", { batchId: "batch_1" }), + ]); + const result = await wait; + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + reg.close(); + }); + + it("drops a tag match created before the feed's createdAt floor", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"], createdAtMs: FLOOR_MS - 10_000 })]]); + const { router, src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"], createdAtFloorMs: FLOOR_MS }, []); + let settled = false; + const wait = reg.waitForMatch(undefined, 60).then((r) => { + settled = true; + return r; + }); + src.push("env_1", [record("r1", { tags: ["a"], createdAtMs: FLOOR_MS - 10_000 })]); + // Hydrated but out-of-window -> not woken; falls through to the timeout. + const result = await wait; + expect(settled).toBe(true); + expect(result.reason).toBe("timeout"); + reg.close(); + }); + + it("classifies a partial record (no tags) by hydrating and re-checking the row's tags", async () => { + // Partial record routes to all tag feeds as candidates; the authoritative row decides. 
+ const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src } = makeRouter(rows); + const match = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + const noMatch = router.register("env_1", { kind: "tag", tags: ["z"] }, []); + const matchWait = match.waitForMatch(undefined, 1_000); + let noMatchSettled = false; + const noMatchWait = noMatch.waitForMatch(undefined, 80).then((r) => { + noMatchSettled = true; + return r; + }); + + src.push("env_1", [record("r1", { tags: undefined })]); // partial: tags absent + + expect((await matchWait).rows.map((m) => m.row.id)).toEqual(["r1"]); + expect((await noMatchWait).reason).toBe("timeout"); // row tags ["a"] don't intersect ["z"] + expect(noMatchSettled).toBe(true); + match.close(); + noMatch.close(); + }); + + it("times out and aborts cleanly", async () => { + const { router, src } = makeRouter(); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + expect((await reg.waitForMatch(undefined, 30)).reason).toBe("timeout"); + + const controller = new AbortController(); + const wait = reg.waitForMatch(controller.signal, 5_000); + controller.abort(); + expect((await wait).reason).toBe("abort"); + reg.close(); + expect(src.isSubscribed("env_1")).toBe(false); // last feed left -> unsubscribed + }); + + it("only routes to feeds currently waiting (gaps between polls fall to the backstop)", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + // Not waiting yet: a push is dropped (no hydrate, no buffering). 
+ src.push("env_1", [record("r1", { tags: ["a"] })]); + expect(hydrateSpy).not.toHaveBeenCalled(); + reg.close(); + }); +}); diff --git a/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts b/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts new file mode 100644 index 00000000000..e0c51d57f52 --- /dev/null +++ b/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts @@ -0,0 +1,192 @@ +import { setTimeout as sleep } from "node:timers/promises"; +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NotifierRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/notifierRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { + EnvChangeRouter, + type EnvChangeSource, +} from "~/services/realtime/envChangeRouter.server"; +import { type ChangeRecord } from "~/services/realtime/runChangeNotifier.server"; +import { describe, expect, it, vi } from "vitest"; + +const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; + +// Fixed offset floor: a row's updatedAt above/below it produces a delta / empty diff. The +// createdAt window resolves to this same floor (large maximumCreatedAtFilterAgeMs below). +const FLOOR_MS = Date.UTC(2026, 5, 7, 12, 0, 0); + +function row( + id: string, + updatedAtMs: number, + opts: { createdAtMs?: number; tags?: string[] } = {} +): RealtimeRunRow { + return { + id, + runTags: opts.tags ?? ["t"], + createdAt: new Date(opts.createdAtMs ?? FLOOR_MS + 1_000), + updatedAt: new Date(updatedAtMs), + } as unknown as RealtimeRunRow; +} + +function rec(runId: string, extra: Partial<ChangeRecord> = {}): ChangeRecord { + return { v: 1, runId, envId: "env_1", ...extra }; +} + +/** A controllable EnvChangeSource the test pushes batches into. 
*/ +function fakeSource() { + const listeners = new Map<string, Set<(records: ChangeRecord[]) => void>>(); + const source: EnvChangeSource = { + subscribeToEnv(envId, onBatch) { + let set = listeners.get(envId); + if (!set) { + set = new Set(); + listeners.set(envId, set); + } + set.add(onBatch); + return () => listeners.get(envId)?.delete(onBatch); + }, + }; + return { + source, + push: (envId: string, records: ChangeRecord[]) => { + for (const l of listeners.get(envId) ?? []) l(records); + }, + isSubscribed: (envId: string) => (listeners.get(envId)?.size ?? 0) > 0, + }; +} + +function makeClient(overrides: Record<string, unknown> = {}) { + let rowsToReturn: RealtimeRunRow[] = []; + const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => + rowsToReturn.filter((r) => ids.includes(r.id)) + ); + const resolveSpy = vi.fn(async () => rowsToReturn.map((r) => r.id)); + const src = fakeSource(); + const router = new EnvChangeRouter({ source: src.source, hydrator: { hydrateByIds: hydrateSpy } }); + + const client = new NotifierRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, + runListResolver: { resolveMatchingRunIds: resolveSpy } as any, + router, + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + // Large so the recovered createdAt floor isn't clamped past FLOOR_MS. 
+ maximumCreatedAtFilterAgeMs: 100 * 365 * 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 0, + livePollTimeoutMs: 10_000, + ...overrides, + }); + + return { client, src, hydrateSpy, resolveSpy, setRows: (rows: RealtimeRunRow[]) => (rowsToReturn = rows) }; +} + +function liveRuns(client: NotifierRealtimeClient) { + return client.streamRuns( + `http://localhost:3030/realtime/v1/runs?offset=${FLOOR_MS}_1&live=true&handle=runs_${FLOOR_MS}_7`, + ENV, + { tags: ["t"] }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +async function whenWaiting(src: ReturnType<typeof fakeSource>) { + // Subscribed (feed registered) + a tick so waitForMatch has armed feed.resolve. + await vi.waitFor(() => expect(src.isSubscribed("env_1")).toBe(true)); + await sleep(15); +} + +async function bodyOf(res: Response) { + return JSON.parse(await res.text()) as Array<{ + headers?: { control?: string; operation?: string }; + value?: unknown; + }>; +} +const hasRowOp = (body: Awaited<ReturnType<typeof bodyOf>>) => + body.some((m) => m?.headers?.operation || (m && typeof m === "object" && "value" in m)); +const isUpToDate = (body: Awaited<ReturnType<typeof bodyOf>>) => + body.some((m) => m?.headers?.control === "up-to-date"); + +describe("NotifierRealtimeClient multi-run live path over the router", () => { + it("a matching change hydrates by id (no ClickHouse) and returns a delta", async () => { + const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient(); + setRows([row("run_1", FLOOR_MS + 5_000, { tags: ["t"] })]); + + const responsePromise = liveRuns(client); + await whenWaiting(src); + src.push("env_1", [rec("run_1", { tags: ["t", "x"] })]); + + const res = await responsePromise; + expect(res.status).toBe(200); + expect(hasRowOp(await bodyOf(res))).toBe(true); + expect(resolveSpy).not.toHaveBeenCalled(); // ClickHouse skipped + expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["run_1"], expect.anything()); + }); + + it("a change that doesn't match the filter never wakes the feed (no CH, no PG); a later match does", async () => { + const { 
client, src, hydrateSpy, resolveSpy, setRows } = makeClient(); + setRows([row("run_1", FLOOR_MS + 5_000, { tags: ["t"] })]); + + const responsePromise = liveRuns(client); + let settled = false; + void responsePromise.then(() => (settled = true)); + await whenWaiting(src); + + src.push("env_1", [rec("run_x", { tags: ["other"] })]); // doesn't intersect ["t"] + await sleep(50); + expect(settled).toBe(false); + expect(hydrateSpy).not.toHaveBeenCalled(); // router never routed it + expect(resolveSpy).not.toHaveBeenCalled(); + + src.push("env_1", [rec("run_1", { tags: ["t"] })]); + const res = await responsePromise; + expect(settled).toBe(true); + expect(hasRowOp(await bodyOf(res))).toBe(true); + }); + + it("a matching run created before the window floor is hydrated but dropped (keeps holding)", async () => { + // Generous backstop so the "still holding" assertion can't race a timeout in slow CI. + const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient({ livePollTimeoutMs: 1500 }); + setRows([row("run_1", FLOOR_MS + 5_000, { createdAtMs: FLOOR_MS - 10_000, tags: ["t"] })]); + + const responsePromise = liveRuns(client); + let settled = false; + void responsePromise.then(() => (settled = true)); + await whenWaiting(src); + src.push("env_1", [rec("run_1", { tags: ["t"] })]); + + await sleep(40); + expect(settled).toBe(false); // dropped by the createdAt floor -> held + expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["run_1"], expect.anything()); + expect(resolveSpy).not.toHaveBeenCalled(); + + await responsePromise; // drain via the backstop + }); + + it("the backstop timeout does a full ClickHouse resolve and returns up-to-date", async () => { + const { client, resolveSpy } = makeClient({ livePollTimeoutMs: 50 }); + const res = await liveRuns(client); // never pushed -> backstop fires + expect(res.status).toBe(200); + expect(isUpToDate(await bodyOf(res))).toBe(true); + expect(resolveSpy).toHaveBeenCalled(); + }); + + it("with holdOnEmpty=false, a 
matched-but-not-advanced change returns up-to-date without ClickHouse", async () => { + const { client, src, resolveSpy, setRows } = makeClient({ holdOnEmpty: false }); + // Matches the tag and is in-window, but updatedAt is at/below the offset floor -> no delta. + setRows([row("run_1", FLOOR_MS - 1_000, { tags: ["t"] })]); + + const responsePromise = liveRuns(client); + await whenWaiting(src); + src.push("env_1", [rec("run_1", { tags: ["t"] })]); + + const res = await responsePromise; + expect(res.status).toBe(200); + expect(isUpToDate(await bodyOf(res))).toBe(true); + expect(resolveSpy).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/webapp/test/realtime/notifierRealtimeClient.test.ts b/apps/webapp/test/realtime/notifierRealtimeClient.test.ts new file mode 100644 index 00000000000..5f7b96fc099 --- /dev/null +++ b/apps/webapp/test/realtime/notifierRealtimeClient.test.ts @@ -0,0 +1,108 @@ +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NotifierRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/notifierRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { EnvChangeRouter } from "~/services/realtime/envChangeRouter.server"; +import { describe, expect, it } from "vitest"; + +function sampleRow(): RealtimeRunRow { + return { + id: "run_1", + taskIdentifier: "t", + createdAt: new Date("2026-06-07T10:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:00:01.000Z"), + startedAt: null, + delayUntil: null, + queuedAt: null, + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_1", + number: 1, + isTest: false, + status: "EXECUTING", + usageDurationMs: 0, + costInCents: 0, + baseCostInCents: 0, + ttl: null, + payload: "{}", + payloadType: "application/json", + metadata: null, + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: [], + error: null, + realtimeStreams: [], + }; +} + +// Only the 
initial-snapshot path is exercised here, which touches the shared +// #buildResponse — enough to lock the response-header contract. +function makeClient(row: RealtimeRunRow | null) { + return new NotifierRealtimeClient({ + runReader: { + getRunById: async () => row, + hydrateByIds: async () => (row ? [row] : []), + } as any, + runListResolver: { resolveMatchingRunIds: async () => [] } as any, + // Snapshot path only; the router (over a no-op source) is never invoked here. + router: new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: async () => (row ? [row] : []) }, + }), + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + }); +} + +const ENV: RealtimeListEnvironment = { + id: "env_1", + organizationId: "org_1", + projectId: "proj_1", +}; + +describe("NotifierRealtimeClient response headers", () => { + it("exposes electric headers cross-origin so browser hooks can read them", async () => { + const client = makeClient(sampleRow()); + const res = await client.streamRun( + "http://localhost:3030/realtime/v1/runs/run_1?offset=-1", + ENV, + "run_1", + CURRENT_API_VERSION, + undefined, + "1.0.0-beta.1" // modern client => lowercase electric-* headers + ); + + // Without these the deployed @electric-sql/client throws MissingHeadersError + // (it can't read the electric-* headers across origins). This regressed once. + expect(res.headers.get("access-control-allow-origin")).toBe("*"); + expect(res.headers.get("access-control-expose-headers")).toBe("*"); + + // Initial (non-live) snapshot requires offset + handle + schema. 
+ expect(res.headers.get("electric-offset")).toBeTruthy(); + expect(res.headers.get("electric-handle")).toBeTruthy(); + expect(res.headers.get("electric-schema")).toBeTruthy(); + expect(res.headers.get("content-type")).toBe("application/json"); + }); + + it("renames headers for legacy (0.4.0) clients", async () => { + const client = makeClient(sampleRow()); + const res = await client.streamRun( + "http://localhost:3030/realtime/v1/runs/run_1?offset=-1", + ENV, + "run_1", + CURRENT_API_VERSION, + undefined, + undefined // no client version => legacy header names + ); + + expect(res.headers.get("electric-chunk-last-offset")).toBeTruthy(); + expect(res.headers.get("electric-shape-id")).toBeTruthy(); + expect(res.headers.get("electric-offset")).toBeNull(); + expect(res.headers.get("electric-handle")).toBeNull(); + expect(res.headers.get("access-control-expose-headers")).toBe("*"); + }); +}); diff --git a/apps/webapp/test/realtime/notifierRunSetCache.test.ts b/apps/webapp/test/realtime/notifierRunSetCache.test.ts new file mode 100644 index 00000000000..7a6449a9eb7 --- /dev/null +++ b/apps/webapp/test/realtime/notifierRunSetCache.test.ts @@ -0,0 +1,340 @@ +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NotifierRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/notifierRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { EnvChangeRouter } from "~/services/realtime/envChangeRouter.server"; +import { setTimeout as sleep } from "node:timers/promises"; +import { describe, expect, it, vi } from "vitest"; + +const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; + +function row(id: string): RealtimeRunRow { + // Only id/createdAt/updatedAt are read directly; the rest serialize to null. 
+ return { + id, + createdAt: new Date("2026-06-07T09:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:00:00.000Z"), + } as unknown as RealtimeRunRow; +} + +function makeClient(overrides: Record<string, unknown> = {}) { + const resolveSpy = vi.fn(async () => ["run_1", "run_2"]); + const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => ids.map(row)); + + const client = new NotifierRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, + runListResolver: { resolveMatchingRunIds: resolveSpy } as any, + // No-op source: live polls never get a router wake, so they fall through to the + // backstop full-resolve — which is what the live tests below assert on. + router: new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: hydrateSpy }, + }), + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 5_000, + ...overrides, + }); + + return { client, resolveSpy, hydrateSpy }; +} + +// streamBatch with offset=-1 takes the snapshot path, which calls the coalescing +// resolve+hydrate directly (no concurrency slot / subscription needed). +function snapshot(client: NotifierRealtimeClient, batchId: string, skipColumns?: string) { + const skip = skipColumns ? `&skipColumns=${skipColumns}` : ""; + return client.streamBatch( + `http://localhost:3030/realtime/v1/batches/${batchId}?offset=-1${skip}`, + ENV, + batchId, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +// Tag-list snapshot (offset=-1) — exercises the createdAt bucketing + cache key. 
+function snapshotTag(client: NotifierRealtimeClient, tags: string[]) { + return client.streamRuns( + "http://localhost:3030/realtime/v1/runs?offset=-1", + ENV, + { tags }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +describe("NotifierRealtimeClient run-set resolve coalescing + cache", () => { + it("coalesces concurrent same-filter resolves into one ClickHouse + Postgres query", async () => { + const { client, resolveSpy, hydrateSpy } = makeClient(); + let release!: (ids: string[]) => void; + const gate = new Promise<string[]>((resolve) => { + release = resolve; + }); + resolveSpy.mockReturnValueOnce(gate); + + const p1 = snapshot(client, "batch_1"); + const p2 = snapshot(client, "batch_1"); + release(["run_1"]); + await Promise.all([p1, p2]); + + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("serves a second same-filter request from the cache within the TTL", async () => { + const { client, resolveSpy, hydrateSpy } = makeClient(); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_1"); + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("does not share the cache across different filters", async () => { + const { client, resolveSpy } = makeClient(); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_2"); + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + + it("re-queries after the cache TTL expires", async () => { + vi.useFakeTimers({ toFake: ["Date"] }); + try { + const { client, resolveSpy } = makeClient({ runSetResolveCacheTtlMs: 1_000 }); + await snapshot(client, "batch_1"); + vi.advanceTimersByTime(1_001); + await snapshot(client, "batch_1"); + expect(resolveSpy).toHaveBeenCalledTimes(2); + } finally { + vi.useRealTimers(); + } + }); + + it("passes the client's skipColumns through to the hydrator (column projection)", async () => { + const { client, hydrateSpy } = makeClient(); + await snapshot(client, 
"batch_1", "payload,output"); + expect(hydrateSpy).toHaveBeenCalledWith("env_1", expect.any(Array), ["payload", "output"]); + }); + + it("reports resolve outcomes (miss then hit) to the metrics hook", async () => { + const results: string[] = []; + const { client } = makeClient({ onRunSetResolve: (r: string) => results.push(r) }); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_1"); + expect(results).toEqual(["miss", "hit"]); + }); + + it("mints a distinct batch handle per connection and echoes a client-provided one", async () => { + const { client } = makeClient(); + // Two subscribers to the SAME batch must never share a handle (the working-set + // cache is keyed by it; sharing lets one suppress the other's deltas forever). + const res1 = await snapshot(client, "batch_1"); + const res2 = await snapshot(client, "batch_1"); + const h1 = res1.headers.get("electric-handle"); + const h2 = res2.headers.get("electric-handle"); + expect(h1).toBeTruthy(); + expect(h1).not.toBe(h2); + + // Catch-up under an existing handle keeps it. + const res3 = await client.streamBatch( + `http://localhost:3030/realtime/v1/batches/batch_1?offset=123_1&handle=${h1}`, + ENV, + "batch_1", + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + expect(res3.headers.get("electric-handle")).toBe(h1); + }); +}); + +describe("NotifierRealtimeClient resolve admission gate (mass-reconnect stampede)", () => { + // A resolver that blocks each invocation until released, so we can watch how many run + // concurrently. Tracks peak concurrency and exposes a release-one-at-a-time drain. 
+ function gatedResolver() { + let active = 0; + let peak = 0; + const releases: Array<() => void> = []; + const resolve = vi.fn(async () => { + active++; + peak = Math.max(peak, active); + await new Promise<void>((r) => releases.push(r)); + active--; + return ["run_1"]; + }); + return { + resolve, + peak: () => peak, + releaseOne: () => releases.shift()?.(), + waiting: () => releases.length, + }; + } + + function makeGatedClient(resolveAdmissionLimit: number, resolver: ReturnType<typeof gatedResolver>) { + const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => ids.map(row)); + return new NotifierRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, + runListResolver: { resolveMatchingRunIds: resolver.resolve } as any, + router: new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: hydrateSpy }, + }), + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 0, // no cache -> every distinct filter is a fresh resolve + resolveAdmissionLimit, + }); + } + + it("throttles a distinct-filter stampede to the admission limit of concurrent CH resolves", async () => { + const resolver = gatedResolver(); + const client = makeGatedClient(2, resolver); + + // 5 distinct batchIds => 5 distinct filters => 5 fresh resolves, fired at once. + const polls = [0, 1, 2, 3, 4].map((i) => snapshot(client, `batch_${i}`)); + + // Only the limit (2) may run concurrently; the rest queue for a permit. + await vi.waitFor(() => expect(resolver.resolve).toHaveBeenCalledTimes(2)); + await sleep(20); + expect(resolver.resolve).toHaveBeenCalledTimes(2); // 3 still queued behind the gate + expect(resolver.peak()).toBe(2); + + // Drain: each release frees a permit, admitting exactly one queued resolve. 
+ while (resolver.waiting() > 0) { + resolver.releaseOne(); + await sleep(5); + } + await Promise.all(polls); + + expect(resolver.resolve).toHaveBeenCalledTimes(5); // all ran... + expect(resolver.peak()).toBe(2); // ...but never more than the limit at once + }); + + it("lets a same-filter burst through on a single permit (coalesces before the gate)", async () => { + const resolver = gatedResolver(); + const client = makeGatedClient(1, resolver); // limit 1 would deadlock if each took a permit + + // 5 identical filters fired at once -> single-flight collapses to one in-flight resolve. + const polls = [0, 1, 2, 3, 4].map(() => snapshot(client, "batch_same")); + await vi.waitFor(() => expect(resolver.resolve).toHaveBeenCalledTimes(1)); + await sleep(20); + + resolver.releaseOne(); + await Promise.all(polls); + expect(resolver.resolve).toHaveBeenCalledTimes(1); // one resolve, one permit, no queue + }); +}); + +describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { + it("floors the resolved createdAt lower bound to the bucket boundary", async () => { + // Fix the clock to a non-bucket-aligned instant so the assertion is deterministic. + vi.useFakeTimers({ toFake: ["Date"] }); + vi.setSystemTime(new Date("2026-06-07T10:00:30.500Z")); + try { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60_000 }); + await snapshotTag(client, ["critical"]); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + expect(passed.getTime() % 60_000).toBe(0); + } finally { + vi.useRealTimers(); + } + }); + + it("lets two same-tag feeds in the same bucket share one resolve", async () => { + // A large bucket guarantees both windows floor to the same boundary regardless of + // the sub-millisecond gap between the two calls. 
+ const { client, resolveSpy, hydrateSpy } = makeClient({ + runSetCreatedAtBucketMs: 60 * 60_000, + }); + await snapshotTag(client, ["critical"]); + await snapshotTag(client, ["critical"]); + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("does not share across different tags", async () => { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60 * 60_000 }); + await snapshotTag(client, ["critical"]); + await snapshotTag(client, ["debug"]); + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + + it("does not collide a comma-containing tag with two separate tags", async () => { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60 * 60_000 }); + await snapshotTag(client, ["a,b"]); // one tag "a,b" + await snapshotTag(client, ["a", "b"]); // two tags a OR b — a different filter + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + + it("keeps each feed's exact lower bound when bucketing is disabled (0)", async () => { + vi.useFakeTimers({ toFake: ["Date"] }); + vi.setSystemTime(new Date("2026-06-07T10:00:30.500Z")); + try { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 0 }); + await snapshotTag(client, ["critical"]); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + // Exact (now - 24h) lower bound, not floored to a 60s boundary. + expect(passed.getTime() % 60_000).not.toBe(0); + } finally { + vi.useRealTimers(); + } + }); +}); + +describe("NotifierRealtimeClient review fixes", () => { + // makeClient's router has a no-op source, so the live poll never gets a wake and falls + // through to its backstop timeout — the full ClickHouse resolve these tests assert on + // (createdAt clamp / concurrency limit). 
+ + it("clamps a stale/crafted handle's createdAt up to the max-age floor", async () => { + const maxAge = 24 * 60 * 60 * 1000; + const { client, resolveSpy } = makeClient({ + maximumCreatedAtFilterAgeMs: maxAge, + runSetCreatedAtBucketMs: 0, + livePollTimeoutMs: 50, + }); + const before = Date.now(); + // Handle encodes createdAt = 1ms epoch, far older than the 24h ceiling. + await client.streamRuns( + "http://localhost:3030/realtime/v1/runs?offset=123_1&live=true&handle=runs_1_7", + ENV, + { tags: ["t"] }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + // Clamped to ~now - maxAge, not the epoch value encoded in the handle. + expect(passed.getTime()).toBeGreaterThan(before - maxAge - 1_000); + }); + + it("enforces a concurrency limit of 0 instead of failing with a 500", async () => { + let limitCheckedWith: number | undefined; + const { client } = makeClient({ + cachedLimitProvider: { getCachedLimit: async () => 0 }, + limiter: { + incrementAndCheck: async (_env: string, _id: string, limit: number) => { + limitCheckedWith = limit; + return true; + }, + decrement: async () => {}, + }, + livePollTimeoutMs: 50, + }); + const res = await client.streamBatch( + "http://localhost:3030/realtime/v1/batches/batch_1?offset=123_1&live=true&handle=batch_batch_1_7_abc", + ENV, + "batch_1", + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + expect(res.status).toBe(200); + expect(limitCheckedWith).toBe(0); + }); +}); diff --git a/apps/webapp/test/realtime/runChangeNotifier.test.ts b/apps/webapp/test/realtime/runChangeNotifier.test.ts new file mode 100644 index 00000000000..96d7fd56a45 --- /dev/null +++ b/apps/webapp/test/realtime/runChangeNotifier.test.ts @@ -0,0 +1,172 @@ +import { redisTest } from "@internal/testcontainers"; +import { setTimeout as sleep } from "node:timers/promises"; +import { describe, expect, it, vi } from "vitest"; +import { + type ChangeRecord, + decodeChangeRecord, + 
encodeChangeRecord, + RunChangeNotifier, +} from "~/services/realtime/runChangeNotifier.server"; + +function toRedisOptions(redisOptions: { host?: string; port?: number; password?: string }) { + return { + host: redisOptions.host, + port: redisOptions.port, + password: redisOptions.password, + tlsDisabled: true, + clusterMode: false, + }; +} + +// Time for a SUBSCRIBE to register server-side before we publish. +const SUBSCRIBE_SETTLE_MS = 250; + +describe("RunChangeNotifier", () => { + redisTest( + "delivers a published change to an env subscriber", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const received: ChangeRecord[] = []; + const unsubscribe = notifier.subscribeToEnv("env_1", (records) => received.push(...records)); + expect(notifier.activeSubscriptionCount).toBe(1); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1", envId: "env_1", tags: ["a"], batchId: "batch_1" }); + + await vi.waitFor(() => expect(received.some((r) => r.runId === "run_1")).toBe(true), { + timeout: 5_000, + interval: 50, + }); + const got = received.find((r) => r.runId === "run_1")!; + expect(got.tags).toEqual(["a"]); + expect(got.batchId).toBe("batch_1"); + + unsubscribe(); + // Cleanup is deferred until Redis confirms UNSUBSCRIBE, so the count converges to 0. 
+ await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { + timeout: 5_000, + interval: 50, + }); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "does not deliver a change for a different env", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const received: ChangeRecord[] = []; + notifier.subscribeToEnv("env_a", (records) => received.push(...records)); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1", envId: "env_b", tags: [] }); // different env + await sleep(500); + + expect(received).toHaveLength(0); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "coalesces a burst of env publishes into far fewer batches than publishes (lossless)", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ + redis: toRedisOptions(redisOptions), + envWakeCoalesceWindowMs: 100, + }); + try { + let batches = 0; + const runIds = new Set(); + notifier.subscribeToEnv("env_burst", (records) => { + batches++; + for (const r of records) runIds.add(r.runId); + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + let pubs = 0; + const end = Date.now() + 1_000; + while (Date.now() < end) { + notifier.publish({ runId: `r${pubs++}`, envId: "env_burst", tags: [] }); + await sleep(5); + } + await sleep(300); + + expect(pubs).toBeGreaterThan(100); + expect(batches).toBeGreaterThanOrEqual(1); + // Leading-edge throttle: far fewer deliveries than publishes... + expect(batches).toBeLessThan(pubs / 4); + // ...but lossless — the batch accumulates every run that changed in the window. + expect(runIds.size).toBeGreaterThan(pubs / 2); + } finally { + await notifier.quit(); + } + } + ); + + // Sharded pub/sub (SSUBSCRIBE/SPUBLISH/smessage) wiring — validated end to end on a + // single node (Redis 7.2 accepts these and delivers same-node). 
Multi-shard ROUTING + // needs a real cluster (the cluster fixture covers that); this proves the command path. + redisTest( + "delivers via sharded pub/sub on the env channel", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ + redis: toRedisOptions(redisOptions), + shardedPubSub: true, + }); + try { + const received: ChangeRecord[] = []; + notifier.subscribeToEnv("env_sharded", (records) => received.push(...records)); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1", envId: "env_sharded", tags: ["a"] }); + + await vi.waitFor(() => expect(received.some((r) => r.runId === "run_1")).toBe(true), { + timeout: 5_000, + interval: 50, + }); + } finally { + await notifier.quit(); + } + } + ); + + describe("ChangeRecord codec", () => { + it("round-trips a full record (tags with a separator survive)", () => { + const encoded = encodeChangeRecord({ + v: 1, + runId: "run_1", + envId: "env_1", + tags: ["a", "b,c"], + batchId: "batch_1", + }); + expect(decodeChangeRecord(encoded)).toMatchObject({ + v: 1, + runId: "run_1", + envId: "env_1", + tags: ["a", "b,c"], + batchId: "batch_1", + }); + }); + + it("decodes a bare runId to a partial record (tags undefined)", () => { + // A bare/legacy frame: the consumer falls back to hydrate-to-classify. 
+ const decoded = decodeChangeRecord("run_3"); + expect(decoded.runId).toBe("run_3"); + expect(decoded.tags).toBeUndefined(); + }); + + it("falls back to a bare runId on an unparseable message", () => { + expect(decodeChangeRecord("{not json").runId).toBe("{not json"); + }); + }); +}); diff --git a/apps/webapp/test/realtime/runReaderProjection.test.ts b/apps/webapp/test/realtime/runReaderProjection.test.ts new file mode 100644 index 00000000000..07aebf92589 --- /dev/null +++ b/apps/webapp/test/realtime/runReaderProjection.test.ts @@ -0,0 +1,75 @@ +import { describe, expect, it, vi } from "vitest"; +import { buildHydratorSelect, RunHydrator } from "~/services/realtime/runReader.server"; + +describe("buildHydratorSelect", () => { + it("returns the full select when nothing is skipped", () => { + const select = buildHydratorSelect([]); + expect(select.id).toBe(true); + expect(select.payload).toBe(true); + expect(select.output).toBe(true); + expect(select.metadata).toBe(true); + expect(select.error).toBe(true); + }); + + it("keeps protocol-reserved columns even when asked to skip them", () => { + // Reserved columns are always emitted by the serializer, so hydration must keep + // them regardless of skipColumns or the output is null/incorrect. + const select = buildHydratorSelect([ + "status", + "taskIdentifier", + "createdAt", + "friendlyId", + "payload", + ]); + expect(select.status).toBe(true); + expect(select.taskIdentifier).toBe(true); + expect(select.createdAt).toBe(true); + expect(select.friendlyId).toBe(true); + // A non-reserved skipped column is still dropped. 
+ expect(select.payload).toBeUndefined(); + }); + + it("drops skipped columns but always keeps id + updatedAt", () => { + const select = buildHydratorSelect(["payload", "output", "metadata", "error"]); + expect(select.payload).toBeUndefined(); + expect(select.output).toBeUndefined(); + expect(select.metadata).toBeUndefined(); + expect(select.error).toBeUndefined(); + // Needed internally regardless of skipColumns (keys the row, drives the diff/offset). + expect(select.id).toBe(true); + expect(select.updatedAt).toBe(true); + // A non-skipped column survives. + expect(select.status).toBe(true); + }); +}); + +describe("RunHydrator.hydrateByIds column projection", () => { + function makeHydrator() { + let capturedSelect: Record<string, boolean> | undefined; + const replica = { + taskRun: { + findMany: vi.fn(async ({ select }: { select: Record<string, boolean> }) => { + capturedSelect = select; + return []; + }), + }, + } as any; + return { hydrator: new RunHydrator({ replica }), getSelect: () => capturedSelect }; + } + + it("projects the SELECT by skipColumns", async () => { + const { hydrator, getSelect } = makeHydrator(); + await hydrator.hydrateByIds("env_1", ["run_1"], ["payload", "output"]); + const select = getSelect()!; + expect(select.payload).toBeUndefined(); + expect(select.output).toBeUndefined(); + expect(select.id).toBe(true); + expect(select.updatedAt).toBe(true); + }); + + it("selects the full column set when no skipColumns are given", async () => { + const { hydrator, getSelect } = makeHydrator(); + await hydrator.hydrateByIds("env_1", ["run_1"]); + expect(getSelect()!.payload).toBe(true); + }); +}); diff --git a/apps/webapp/test/realtime/shadowCompare.test.ts b/apps/webapp/test/realtime/shadowCompare.test.ts new file mode 100644 index 00000000000..e6604a02cd6 --- /dev/null +++ b/apps/webapp/test/realtime/shadowCompare.test.ts @@ -0,0 +1,216 @@ +import { + type RealtimeRunRow, + serializeRunRow, +} from "~/services/realtime/electricStreamProtocol.server"; +import { type RunListFilter } 
from "~/services/realtime/runReader.server"; +import { RealtimeShadowComparator } from "~/services/realtime/shadowCompare.server"; +import { describe, expect, it } from "vitest"; + +function sampleRow(overrides: Partial<RealtimeRunRow> = {}): RealtimeRunRow { + return { + id: "run_a", + taskIdentifier: "my-task", + createdAt: new Date("2026-06-07T09:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:05:30.123Z"), + startedAt: null, + delayUntil: null, + queuedAt: null, + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_a", + number: 7, + isTest: true, + status: "EXECUTING", + usageDurationMs: 1234, + costInCents: 0.55, + baseCostInCents: 0.25, + ttl: "1h", + payload: '{"hello":"world"}', + payloadType: "application/json", + metadata: null, + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: ["a", "b"], + error: null, + realtimeStreams: [], + ...overrides, + }; +} + +const UP_TO_DATE = { headers: { control: "up-to-date" } }; + +function insert(value: Record<string, unknown>) { + return { key: `"public"."TaskRun"/"${value.id}"`, value, headers: { operation: "insert" } }; +} + +function makeComparator( + rowsById: Record<string, RealtimeRunRow>, + resolvedIds: string[] = [] +) { + return new RealtimeShadowComparator({ + runReader: { + getRunById: async (_env: string, id: string) => rowsById[id] ?? 
null, + hydrateByIds: async (_env: string, ids: string[]) => + ids.map((id) => rowsById[id]).filter((row): row is RealtimeRunRow => Boolean(row)), + } as any, + runListResolver: { resolveMatchingRunIds: async (_f: RunListFilter) => resolvedIds } as any, + }); +} + +describe("RealtimeShadowComparator serialization", () => { + it("counts a faithful re-serialization as a match", async () => { + const row = sampleRow(); + const body = JSON.stringify([insert(serializeRunRow(row)), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationMatched).toBe(1); + expect(out.serializationDiverged).toBe(0); + expect(out.serializationSkew).toBe(0); + expect(out.diffs).toEqual([]); + }); + + it("does not flag semantically-equivalent but differently-encoded values", async () => { + const row = sampleRow(); + // Electric encodes bool as "true" (notifier uses "t"), a number with a trailing + // zero, and a timestamp without millis — all equal after decoding. 
+ const value = { + ...serializeRunRow(row), + isTest: "true", + costInCents: "0.5500", + createdAt: "2026-06-07T09:00:00", + }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationMatched).toBe(1); + expect(out.serializationDiverged).toBe(0); + }); + + it("flags a genuine column divergence (same version)", async () => { + const row = sampleRow(); + const value = { ...serializeRunRow(row), payload: '{"hello":"TAMPERED"}' }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationDiverged).toBe(1); + expect(out.serializationMatched).toBe(0); + expect(out.diffs).toEqual([ + { runId: "run_a", column: "payload", electric: '{"hello":"TAMPERED"}', notifier: '{"hello":"world"}' }, + ]); + }); + + it("treats DEQUEUED/EXECUTING as equivalent (legacy status rewrite)", async () => { + const row = sampleRow({ status: "EXECUTING" }); + const value = { ...serializeRunRow(row), status: "DEQUEUED" }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationDiverged).toBe(0); + expect(out.serializationMatched).toBe(1); + }); + + it("records skew when the row advanced between emit and refetch", async () => { + const row = sampleRow(); + // Electric emitted an older version; the refetched row is newer. 
+ const value = { ...serializeRunRow(sampleRow({ updatedAt: new Date("2026-06-07T10:00:00.000Z") })) }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationSkew).toBe(1); + expect(out.serializationMatched).toBe(0); + expect(out.serializationDiverged).toBe(0); + }); +}); + +describe("RealtimeShadowComparator membership", () => { + const filter: RunListFilter = { + organizationId: "org_1", + projectId: "proj_1", + environmentId: "env_1", + tags: ["t"], + createdAtAfter: new Date("2026-06-06T00:00:00.000Z"), + limit: 1000, + }; + + function bodyFor(ids: string[]) { + const msgs = ids.map((id) => insert(serializeRunRow(sampleRow({ id })))); + return JSON.stringify([...msgs, UP_TO_DATE]); + } + + it("matches when Electric's set equals the notifier resolver's set", async () => { + const cmp = makeComparator( + { a: sampleRow({ id: "a" }), b: sampleRow({ id: "b" }) }, + ["a", "b"] + ); + const out = await cmp.compare({ + feed: "runs", + electricBody: bodyFor(["a", "b"]), + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + membershipFilter: filter, + }); + expect(out.membershipMatch).toBe(true); + expect(out.missingInNotifier).toEqual([]); + expect(out.extraInNotifier).toEqual([]); + }); + + it("reports rows missing from / extra in the notifier resolution", async () => { + const cmp = makeComparator( + { a: sampleRow({ id: "a" }), b: sampleRow({ id: "b" }) }, + ["a", "c"] // notifier missing b, has extra c + ); + const out = await cmp.compare({ + feed: "runs", + electricBody: bodyFor(["a", "b"]), + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + membershipFilter: filter, + }); + expect(out.membershipMatch).toBe(false); + expect(out.missingInNotifier).toEqual(["b"]); + 
expect(out.extraInNotifier).toEqual(["c"]); + }); +}); diff --git a/internal-packages/redis/package.json b/internal-packages/redis/package.json index 9c13bbf21b0..6c7d8aa2608 100644 --- a/internal-packages/redis/package.json +++ b/internal-packages/redis/package.json @@ -6,7 +6,7 @@ "types": "./src/index.ts", "type": "module", "dependencies": { - "ioredis": "^5.3.2", + "ioredis": "~5.6.0", "@trigger.dev/core": "workspace:*" }, "scripts": { diff --git a/internal-packages/run-engine/src/engine/eventBus.ts b/internal-packages/run-engine/src/engine/eventBus.ts index 2e4adeed4b1..bd29869d280 100644 --- a/internal-packages/run-engine/src/engine/eventBus.ts +++ b/internal-packages/run-engine/src/engine/eventBus.ts @@ -11,7 +11,14 @@ export type EventBusEvents = { runCreated: [ { time: Date; - runId: string; + run: { + id: string; + runTags: string[]; + batchId: string | null; + }; + environment: { + id: string; + }; }, ]; runEnqueuedAfterDelay: [ @@ -23,6 +30,8 @@ export type EventBusEvents = { queuedAt: Date; updatedAt: Date; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -44,6 +53,8 @@ export type EventBusEvents = { delayUntil: Date; updatedAt: Date; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -76,6 +87,8 @@ export type EventBusEvents = { maxDurationInSeconds?: number; maxAttempts?: number; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -96,6 +109,8 @@ export type EventBusEvents = { status: TaskRunStatus; updatedAt: Date; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id?: string; @@ -119,6 +134,8 @@ export type EventBusEvents = { attemptNumber: number; baseCostInCents: number; executedAt: Date | undefined; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -245,6 +262,8 @@ export type EventBusEvents = { createdAt: Date; error: TaskRunError; 
taskEventStore?: string; + runTags: string[]; + batchId: string | null; }; organization: { id: string; diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index 835ff90cc48..c3e0a5c75d0 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -1042,7 +1042,14 @@ export class RunEngine { this.eventBus.emit("runCreated", { time: new Date(), - runId: taskRun.id, + run: { + id: taskRun.id, + runTags: taskRun.runTags, + batchId: taskRun.batchId, + }, + environment: { + id: environment.id, + }, }); return taskRun; diff --git a/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts b/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts index 384384fd8c7..6c66591e288 100644 --- a/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts @@ -147,6 +147,8 @@ export class CheckpointSystem { status: run.status, updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.runtimeEnvironment.organizationId, @@ -308,6 +310,8 @@ export class CheckpointSystem { projectId: true, updatedAt: true, createdAt: true, + runTags: true, + batchId: true, }, }); @@ -326,6 +330,8 @@ export class CheckpointSystem { status: run.status, updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.organizationId ?? 
undefined, diff --git a/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts index 32ab98bad6c..10c965741cf 100644 --- a/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts @@ -79,6 +79,8 @@ export class DelayedRunSystem { delayUntil: delayUntil, updatedAt: updatedRun.updatedAt, createdAt: updatedRun.createdAt, + runTags: updatedRun.runTags, + batchId: updatedRun.batchId, }, organization: { id: snapshot.organizationId, @@ -192,6 +194,8 @@ export class DelayedRunSystem { queuedAt, updatedAt: updatedRun.updatedAt, createdAt: updatedRun.createdAt, + runTags: updatedRun.runTags, + batchId: updatedRun.batchId, }, organization: { id: run.runtimeEnvironment.organizationId, diff --git a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts index 3fe1ef072cf..7c811ebfdfc 100644 --- a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts @@ -490,6 +490,8 @@ export class DequeueSystem { maxAttempts: lockedTaskRun.maxAttempts ?? 
undefined, updatedAt: lockedTaskRun.updatedAt, createdAt: lockedTaskRun.createdAt, + runTags: lockedTaskRun.runTags, + batchId: lockedTaskRun.batchId, }, organization: { id: orgId, @@ -751,6 +753,8 @@ export class DequeueSystem { attemptNumber: true, updatedAt: true, createdAt: true, + runTags: true, + batchId: true, runtimeEnvironment: { select: { id: true, @@ -792,6 +796,8 @@ export class DequeueSystem { status: run.status, updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.runtimeEnvironment.project.organizationId, diff --git a/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts index 6d503012fbc..b46b857f02a 100644 --- a/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts @@ -163,6 +163,8 @@ export class PendingVersionSystem { status: "PENDING", updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: backgroundWorker.runtimeEnvironment.organizationId, diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts index 06c80f67f2c..02fd83a7a25 100644 --- a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts @@ -520,6 +520,8 @@ export class RunAttemptSystem { attemptNumber: nextAttemptNumber, baseCostInCents: updatedRun.baseCostInCents, executedAt: updatedRun.executedAt ?? 
undefined, + runTags: updatedRun.runTags, + batchId: updatedRun.batchId, }, organization: { id: updatedRun.runtimeEnvironment.organizationId, @@ -1052,6 +1054,8 @@ export class RunAttemptSystem { error: completion.error, createdAt: run.createdAt, taskEventStore: run.taskEventStore, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.runtimeEnvironment.organizationId, diff --git a/internal-packages/testcontainers/package.json b/internal-packages/testcontainers/package.json index 4ea83344c34..b3ab7ce5dc4 100644 --- a/internal-packages/testcontainers/package.json +++ b/internal-packages/testcontainers/package.json @@ -16,7 +16,7 @@ "@clickhouse/client": "^1.11.1", "@opentelemetry/api": "^1.9.1", "@trigger.dev/database": "workspace:*", - "ioredis": "^5.3.2" + "ioredis": "~5.6.0" }, "devDependencies": { "@testcontainers/postgresql": "^11.14.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 782b62cf7ff..39273b2976c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -228,8 +228,8 @@ importers: specifier: ^4.0.6 version: 4.0.6 ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 p-limit: specifier: ^6.2.0 version: 6.2.0 @@ -664,8 +664,8 @@ importers: specifier: ^1.0.0 version: 1.0.0 ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 isbot: specifier: ^3.6.5 version: 3.6.5 @@ -1256,8 +1256,8 @@ importers: specifier: workspace:* version: link:../../packages/core ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 internal-packages/replication: dependencies: @@ -1404,8 +1404,8 @@ importers: specifier: workspace:* version: link:../database ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 devDependencies: '@testcontainers/postgresql': specifier: ^11.14.0 @@ -11970,8 +11970,8 @@ packages: resolution: {integrity: sha512-YFMSV91JNBOSjw1cOfw2tup6hDP7mkz+2AUV7W1L1AM6ntgI75qC1ZeFpjPGMrWp+upmBRTX2fJWQ8c7jsUWpA==} engines: {node: 
'>=14'} - ioredis@5.3.2: - resolution: {integrity: sha512-1DKMMzlIHM02eBBVOFQ1+AolGjs6+xEcM4PDL7NqOS6szq7H9jSaEkIUH6/a5Hl241LzW6JLSiAbNvTQjUupUA==} + ioredis@5.6.1: + resolution: {integrity: sha512-UxC0Yv1Y4WRJiGQxQkP0hfdL0/5/6YvdfOOClRgJ0qppSarkhneSa6UvkMkms0AkdGimSH3Ikqm+6mkMmX7vGA==} engines: {node: '>=12.22.0'} ip-address@10.0.1: @@ -30048,11 +30048,11 @@ snapshots: intl-parse-accept-language@1.0.0: {} - ioredis@5.3.2: + ioredis@5.6.1: dependencies: '@ioredis/commands': 1.2.0 cluster-key-slot: 1.1.2 - debug: 4.3.7(supports-color@10.0.0) + debug: 4.4.3(supports-color@10.0.0) denque: 2.1.0 lodash.defaults: 4.2.0 lodash.isarguments: 3.1.0 @@ -33909,7 +33909,7 @@ snapshots: send@1.1.0(supports-color@10.0.0): dependencies: - debug: 4.3.6(supports-color@10.0.0) + debug: 4.4.3(supports-color@10.0.0) destroy: 1.2.0 encodeurl: 2.0.0 escape-html: 1.0.3