diff --git a/.github/workflows/manual-deploy.yml b/.github/workflows/manual-deploy.yml index 01bac788ad..ac312f3317 100644 --- a/.github/workflows/manual-deploy.yml +++ b/.github/workflows/manual-deploy.yml @@ -335,6 +335,35 @@ jobs: exit 1 fi + recycle-prerender: + name: Recycle prerender after host is live + # The prerender fleet deploys before the realm server (the manager, + # worker, and realm server all depend on it being up for boot indexing), + # so its tabs warm against the OLD host shell the realm server was still + # serving at that point. Once the realm server is up serving the new + # shell, re-deploy the prerender service so its tabs re-warm against it. + # The reusable deploy passes force-new-deployment, so this recycles fresh + # tasks even though the image is unchanged. + # + # Gate on `deploy-realm-server` (wait-for-service-stability: true → the + # new realm server is up and serving the new shell), NOT on + # `post-deploy-realm-server`: that job is a separate post-deploy hook + # (it POSTs the realm server's `/_post-deployment` endpoint) and can fail + # independently of the recycle's only real precondition — the new shell + # being served. Coupling to it would skip this recycle on an unrelated + # hook failure. + needs: [build-prerender, deploy-realm-server] + uses: cardstack/gh-actions/.github/workflows/ecs-deploy.yml@main + secrets: inherit + with: + container-name: "boxel-prerender-server" + environment: ${{ inputs.environment }} + cluster: ${{ inputs.environment }} + service-name: "boxel-prerender-server-${{ inputs.environment }}" + image: ${{ needs.build-prerender.outputs.image }} + timeout-minutes: 10 + wait-for-service-stability: true + apply-observability: # Push the observability/ package's dashboards/folders/data sources/alerts # into the production self-host Grafana as part of the deploy. The @@ -375,6 +404,7 @@ jobs: post-deploy-worker, deploy-realm-server, post-deploy-realm-server, + recycle-prerender, apply-observability, ] if: github.event_name == 'workflow_dispatch' && always() diff --git a/packages/realm-server/handlers/handle-post-deployment.ts b/packages/realm-server/handlers/handle-post-deployment.ts index 7be38ee876..a139307f69 100644 --- a/packages/realm-server/handlers/handle-post-deployment.ts +++ b/packages/realm-server/handlers/handle-post-deployment.ts @@ -20,6 +20,7 @@ export default function handlePostDeployment({ definitionLookup, queue, realmServerSecretSeed, + reportHostShell, }: CreateRoutesArgs): (ctxt: Koa.Context, next: Koa.Next) => Promise { return async function (ctxt: Koa.Context, _next: Koa.Next) { if (ctxt.request.headers.authorization !== realmServerSecretSeed) { @@ -27,6 +28,14 @@ export default function handlePostDeployment({ return; } + // This hook fires after the deploy reports the service stable, so the new + // host shell is live and load-balancer-routable. Re-report the host-shell + // token to the prerender manager from here so the fleet's recycle signal + // reflects the now-serving shell, closing the rolling-deploy window where + // the boot-time report could precede the new task receiving traffic. + // Fire-and-forget — best-effort, must not affect the hook's response. + void reportHostShell?.(); + await definitionLookup.clearAllDefinitions(); let boxelUiChangeCheckerResult = diff --git a/packages/realm-server/main.ts b/packages/realm-server/main.ts index b8db76c556..0a95099fde 100644 --- a/packages/realm-server/main.ts +++ b/packages/realm-server/main.ts @@ -363,6 +363,45 @@ const smokeTestHostApp = async () => { throw lastError ?? new Error('host app smoke test timed out'); }; +// Report the host-shell token this realm server is serving to the prerender +// manager. The manager echoes it on heartbeat responses so prerender servers +// recycle their browsers when it changes — i.e. when a deploy ships a new +// host bundle. Runs at boot (after the smoke test confirmed the shell is +// reachable): a deploy restarts this process, so a new bundle is reported +// here and picked up by the prerender fleet. Best-effort — a missing or +// unreachable manager must never block realm-server boot. +const reportHostShellToManager = async () => { + try { + let html = await getIndexHTML(); + let { createHash } = await import('crypto'); + let hash = createHash('md5').update(html).digest('hex').slice(0, 8); + // Report to the manager URL the realm server already uses (prerendererUrl); + // PRERENDER_MANAGER_URL is only set on the prerender-server tasks. + let managerURL = prerendererUrl.replace(/\/$/, ''); + let response = await fetch(`${managerURL}/host-shell`, { + method: 'POST', + headers: { + 'Content-Type': 'application/vnd.api+json', + Accept: 'application/vnd.api+json', + }, + body: JSON.stringify({ data: { attributes: { hash } } }), + }); + if (response.ok) { + console.log( + `Reported host shell token ${hash} to prerender manager at ${managerURL}`, + ); + } else { + console.warn( + `Prerender manager rejected host shell report: ${response.status}`, + ); + } + } catch (e: any) { + console.warn( + `Failed to report host shell token to prerender manager: ${e?.message ?? e}`, + ); + } +}; + (async () => { try { await smokeTestHostApp(); @@ -594,6 +633,7 @@ const smokeTestHostApp = async () => { ? getRegistrationSecret : undefined, prerenderer, + reportHostShell: reportHostShellToManager, }); let httpServer = server.listen(port); @@ -735,6 +775,16 @@ const smokeTestHostApp = async () => { // wait for first-request mount via reconciler.lookupOrMount(). await server.start(); + // Now that the HTTP listener is accepting traffic and serving the new host + // shell, tell the prerender manager which shell we're serving so the fleet + // recycles after a host redeploy. Reporting earlier (before the listener is + // live) races a rolling deploy: the manager could echo the new token while + // the load balancer still routes to the old task, so a prerender would + // recycle against the old shell, record the new token, and stop retrying. + // The post-deployment hook reports again once the service is fully stable. + // Fire-and-forget — a missing/unreachable manager must never affect serving. + void reportHostShellToManager(); + // Begin the reconciler's background poll loop (LISTEN realm_registry + // 30s safety poll). It picks up changes from peer instances (publish, // unpublish, delete) and reconciles them into local mounted state. diff --git a/packages/realm-server/prerender/manager-app.ts b/packages/realm-server/prerender/manager-app.ts index e3c2f41dcc..313b0f7ab4 100644 --- a/packages/realm-server/prerender/manager-app.ts +++ b/packages/realm-server/prerender/manager-app.ts @@ -8,6 +8,7 @@ import { import { format } from 'date-fns'; import { PRERENDER_JOB_ID_HEADER, + PRERENDER_HOST_SHELL_HASH_HEADER, PRERENDER_REQUEST_ID_HEADER, PRERENDER_SERVER_DRAINING_STATUS_CODE, PRERENDER_SERVER_STATUS_DRAINING, @@ -64,6 +65,11 @@ type Registry = { servers: Map; // key: serverUrl affinities: Map; // affinityKey (:) -> assigned serverUrls (deque semantics) lastAccessByAffinity: Map; + // Latest host-shell token reported by a realm server (POST /host-shell). + // Echoed to prerender servers on every heartbeat response so they recycle + // when it changes (host redeployed). Undefined until first reported; reset + // on manager restart, re-learned from the next realm-server boot report. + hostShellHash?: string; }; const log = logger('prerender-manager'); @@ -507,6 +513,11 @@ export function buildPrerenderManagerApp(options?: { warmedAffinities, affinityVacancy, }); + // Echo the current host-shell token so the server can recycle its + // browser when the host is redeployed (see PRERENDER_HOST_SHELL_HASH_HEADER). + if (registry.hostShellHash) { + ctxt.set(PRERENDER_HOST_SHELL_HASH_HEADER, registry.hostShellHash); + } ctxt.status = 204; ctxt.set('X-Prerender-Server-Id', url); } catch (e) { @@ -516,6 +527,56 @@ export function buildPrerenderManagerApp(options?: { } }); + // The realm server reports the host-shell token it is currently serving + // (POST at boot, after it has fetched the new shell). A change means the + // host was redeployed; prerender servers pick it up on their next heartbeat + // and recycle. Storing the latest token (rather than counting) keeps this + // robust across the manager's own restart in the deploy train — the next + // realm-server boot re-reports the current token. + router.post('/host-shell', async (ctxt) => { + try { + let req = await fetchRequestFromContext(ctxt); + let raw = await req.text(); + let requestBody: any = {}; + if (raw) { + try { + requestBody = JSON.parse(raw); + } catch (e) { + log.debug('Invalid JSON body on /host-shell; treating as empty:', e); + } + } + let hash = requestBody?.data?.attributes?.hash; + if (typeof hash !== 'string' || hash.trim().length === 0) { + ctxt.status = 400; + ctxt.body = { errors: [{ status: 400, message: 'hash is required' }] }; + return; + } + // Normalize and bound the token before storing it: it is echoed into a + // response header on every heartbeat, so a stray-whitespace variant would + // spuriously read as a change, and an oversized value would bloat every + // heartbeat response. The real token is a short hex digest, so anything + // longer is malformed — reject rather than silently truncate (a truncated + // token would never match and would recycle forever). + hash = hash.trim(); + if (hash.length > 64) { + ctxt.status = 400; + ctxt.body = { errors: [{ status: 400, message: 'hash too long' }] }; + return; + } + if (registry.hostShellHash !== hash) { + log.info( + `host shell token changed (${registry.hostShellHash ?? 'none'} -> ${hash}); prerender servers will recycle on next heartbeat`, + ); + registry.hostShellHash = hash; + } + ctxt.status = 204; + } catch (e) { + log.error('Error in /host-shell:', e); + ctxt.status = 500; + ctxt.body = { errors: [{ status: 500, message: 'host-shell error' }] }; + } + }); + // maintenance: clear affinity assignments and capacity tracking router.post('/prerender-maintenance/reset', async (ctxt) => { for (let [, info] of registry.servers) { diff --git a/packages/realm-server/prerender/prerender-app.ts b/packages/realm-server/prerender/prerender-app.ts index 2445d268e1..ec85d5492a 100644 --- a/packages/realm-server/prerender/prerender-app.ts +++ b/packages/realm-server/prerender/prerender-app.ts @@ -22,6 +22,7 @@ import { Prerenderer } from './index.ts'; import type { Timings } from './render-runner.ts'; import { resolvePrerenderManagerURL } from './config.ts'; import { + PRERENDER_HOST_SHELL_HASH_HEADER, PRERENDER_JOB_ID_HEADER, PRERENDER_REQUEST_ID_HEADER, PRERENDER_SERVER_DRAINING_STATUS_CODE, @@ -60,6 +61,30 @@ export function decorateRenderErrorDiagnostics( }; } +// Pure decision for the host-shell recycle reconcile. Given the token the +// manager last reported (`reported`, null when it doesn't know one yet) and +// the token this server warmed against (`warmed`, undefined before the first +// report), decide whether to recycle and what the baseline token becomes: +// - no report → keep the current baseline, don't recycle +// - first report seen → adopt it as the baseline, don't recycle (we just +// warmed against whatever shell is current) +// - same as baseline → no-op +// - differs from baseline → recycle and advance the baseline +// Exported for unit testing; the live caller layers the draining / in-flight +// guards and the async recycle on top. +export function decideHostShellRecycle( + reported: string | null, + warmed: string | undefined, +): { recycle: boolean; nextWarmed: string | undefined } { + if (!reported) { + return { recycle: false, nextWarmed: warmed }; + } + if (warmed === undefined || warmed === reported) { + return { recycle: false, nextWarmed: reported }; + } + return { recycle: true, nextWarmed: reported }; +} + export function buildPrerenderApp(options: { serverURL: string; maxPages?: number; @@ -1021,6 +1046,13 @@ export function createPrerenderHttpServer(options?: { let drainingResolved = false; let drainingDeferred = new Deferred(); let heartbeatTimer: NodeJS.Timeout | undefined; + // Host-shell token the standbys were last warmed against, learned from the + // manager's heartbeat responses (PRERENDER_HOST_SHELL_HASH_HEADER). When the + // manager reports a different token — the host was redeployed and the realm + // server is now serving a new shell — the browser is recycled so pages + // reload it. Undefined until the first heartbeat that carries a token. + let warmedHostShellHash: string | undefined; + let recyclingForHostChange = false; let isClosing = false; let fatalExitOnUncaught = options?.fatalExitOnUncaught ?? true; let serverURL = resolvePrerenderServerURL(options?.port); @@ -1079,7 +1111,7 @@ export function createPrerenderHttpServer(options?: { log.debug( `POST heartbeat to ${managerURL}/prerender-servers with body:\n${JSON.stringify(body, null, 2)}`, ); - await fetch(`${managerURL}/prerender-servers`, { + let response = await fetch(`${managerURL}/prerender-servers`, { method: 'POST', headers: { 'Content-Type': 'application/vnd.api+json', @@ -1088,13 +1120,54 @@ export function createPrerenderHttpServer(options?: { body: JSON.stringify(body), }).catch((e) => { log.debug('Prerender manager heartbeat request failed:', e); + return undefined; }); + if (response) { + reconcileHostShell( + response.headers.get(PRERENDER_HOST_SHELL_HASH_HEADER), + ); + } } catch (e) { // best-effort, but log for visibility log.debug('Error while attempting heartbeat with prerender manager:', e); } } + // Compare the manager's current host-shell token against the one we warmed + // against. A change means the host was redeployed, so recycle the browser + // (fire-and-forget; the heartbeat itself must not block on the restart). + function reconcileHostShell(hash: string | null) { + if (draining || recyclingForHostChange) { + return; + } + let { recycle, nextWarmed } = decideHostShellRecycle( + hash, + warmedHostShellHash, + ); + if (!recycle) { + // Either nothing reported, or we adopted a baseline / matched — record + // the (possibly newly-adopted) token and we're done. + warmedHostShellHash = nextWarmed; + return; + } + recyclingForHostChange = true; + log.info( + `host shell changed (${warmedHostShellHash} -> ${hash}); recycling prerender browser`, + ); + void prerenderer + .recycle() + .then(() => { + warmedHostShellHash = nextWarmed; + }) + .catch((e) => { + // Leave warmedHostShellHash unchanged so the next heartbeat retries. + log.error('Failed to recycle prerender browser on host change:', e); + }) + .finally(() => { + recyclingForHostChange = false; + }); + } + function startHeartbeatLoop() { if (heartbeatTimer) return; void sendHeartbeat(); diff --git a/packages/realm-server/prerender/prerender-constants.ts b/packages/realm-server/prerender/prerender-constants.ts index 7ec8005b19..31afbd97e9 100644 --- a/packages/realm-server/prerender/prerender-constants.ts +++ b/packages/realm-server/prerender/prerender-constants.ts @@ -2,6 +2,16 @@ export const PRERENDER_SERVER_STATUS_HEADER = 'X-Boxel-Prerender-Server-Status'; export const PRERENDER_SERVER_STATUS_DRAINING = 'draining'; export const PRERENDER_SERVER_DRAINING_STATUS_CODE = 410; +// Opaque token for the current host shell (the realm server's rewritten +// index.html). The realm server reports it to the manager at boot +// (POST /host-shell); the manager echoes the latest value on every +// heartbeat response via this header, and a prerender server recycles its +// browser when the value differs from the shell it last warmed against — +// i.e. the host was redeployed. The token only has to change when the host +// bundle changes; prerender servers treat it opaquely. +export const PRERENDER_HOST_SHELL_HASH_HEADER = + 'X-Boxel-Prerender-Host-Shell-Hash'; + // CS-10872: correlates one client-initiated prerender call across // remote-prerenderer → manager → prerender-server. The client assigns // the ID on the first request; the manager and prerender-server echo diff --git a/packages/realm-server/prerender/prerenderer.ts b/packages/realm-server/prerender/prerenderer.ts index 65059b09d0..2481b6974a 100644 --- a/packages/realm-server/prerender/prerenderer.ts +++ b/packages/realm-server/prerender/prerenderer.ts @@ -209,6 +209,17 @@ export class Prerenderer { await this.#pagePool.warmStandbys(); } + // Recycle the whole browser to pick up a redeployed host. Reuses the + // failure-recovery restart path (closeAll → restart Chrome → re-warm + // standbys); the full browser restart also clears Chrome's HTTP cache, so + // re-warmed pages reload the current host shell rather than a cached stale + // bundle. Coalesced via the same in-flight guard as #restartBrowser, so + // overlapping recycle signals collapse to one restart. + async recycle(): Promise { + log.info('Recycling prerender browser to pick up a redeployed host'); + await this.#restartBrowser(); + } + // Emit the `render cancelled` log line (format from CS-10872) // and, on a `rendering`-state cancel, tear down the affinity so // the next request gets a fresh tab rather than one whose diff --git a/packages/realm-server/routes.ts b/packages/realm-server/routes.ts index e319ee8f19..6b313e7637 100644 --- a/packages/realm-server/routes.ts +++ b/packages/realm-server/routes.ts @@ -110,6 +110,10 @@ export type CreateRoutesArgs = { }; assetsURL: URL; prerenderer?: Prerenderer; + // Reports the current host-shell token to the prerender manager. The + // post-deployment hook calls it so the fleet's recycle signal is refreshed + // once the new code is live and the service is stable. + reportHostShell?: () => Promise; searchCache: JobScopedSearchCache; }; diff --git a/packages/realm-server/server.ts b/packages/realm-server/server.ts index 787d9164d7..9dd4c551b3 100644 --- a/packages/realm-server/server.ts +++ b/packages/realm-server/server.ts @@ -894,6 +894,7 @@ export class RealmServer { } | undefined; private prerenderer: Prerenderer | undefined; + private reportHostShell: (() => Promise) | undefined; private reconciler: RealmRegistryReconciler; private searchCache: JobScopedSearchCache; private cachedApp: ReturnType | undefined; @@ -919,6 +920,7 @@ export class RealmServer { getRegistrationSecret, domainsForPublishedRealms, prerenderer, + reportHostShell, searchCache, }: { serverURL: URL; @@ -945,6 +947,10 @@ export class RealmServer { boxelSite?: string; }; prerenderer?: Prerenderer; + // Reports the current host-shell token to the prerender manager. main.ts + // wires this so the post-deployment hook can re-report once the service is + // stable (the boot-time report fires as soon as this server starts serving). + reportHostShell?: () => Promise; // Optional so test harnesses that construct a RealmServer directly get a // private cache for free. main.ts passes a shared instance so the // JobsFinishedListener can evict the same cache the handlers populate. @@ -989,6 +995,7 @@ export class RealmServer { this.realms = realms; this.reconciler = reconciler; this.prerenderer = prerenderer; + this.reportHostShell = reportHostShell; this.searchCache = searchCache ?? new JobScopedSearchCache(dbAdapter); } @@ -1091,6 +1098,7 @@ export class RealmServer { matrixAdminPassword: this.matrixAdminPassword, domainsForPublishedRealms: this.domainsForPublishedRealms, prerenderer: this.prerenderer, + reportHostShell: this.reportHostShell, reconciler: this.reconciler, searchCache: this.searchCache, }), diff --git a/packages/realm-server/tests/index.ts b/packages/realm-server/tests/index.ts index 8f1f93aeb2..5dfa9da64b 100644 --- a/packages/realm-server/tests/index.ts +++ b/packages/realm-server/tests/index.ts @@ -191,6 +191,7 @@ const ALL_TEST_FILES: string[] = [ './prerendering-test', './prerender-server-test', './prerender-manager-test', + './prerender-host-shell-recycle-test', './prerender-artifact-sink-test', './prerender-affinity-activity-test', './prerender-batch-ownership-test', diff --git a/packages/realm-server/tests/prerender-host-shell-recycle-test.ts b/packages/realm-server/tests/prerender-host-shell-recycle-test.ts new file mode 100644 index 0000000000..497fae4495 --- /dev/null +++ b/packages/realm-server/tests/prerender-host-shell-recycle-test.ts @@ -0,0 +1,43 @@ +import { module, test } from 'qunit'; +import { basename } from 'path'; +import { decideHostShellRecycle } from '../prerender/prerender-app.ts'; + +// Unit tests for the host-shell recycle decision a prerender server makes on +// every heartbeat: the manager echoes the current host-shell token, and the +// server recycles its browser when that token differs from the one it warmed +// against (the host was redeployed). See PRERENDER_HOST_SHELL_HASH_HEADER. +module(basename(__filename), function () { + module('decideHostShellRecycle', function () { + test('no token reported yet → no recycle, baseline unchanged', function (assert) { + assert.deepEqual(decideHostShellRecycle(null, undefined), { + recycle: false, + nextWarmed: undefined, + }); + assert.deepEqual(decideHostShellRecycle(null, 'aaa'), { + recycle: false, + nextWarmed: 'aaa', + }); + }); + + test('first token seen → adopt as baseline, no recycle', function (assert) { + assert.deepEqual(decideHostShellRecycle('aaa', undefined), { + recycle: false, + nextWarmed: 'aaa', + }); + }); + + test('token matches baseline → no-op', function (assert) { + assert.deepEqual(decideHostShellRecycle('aaa', 'aaa'), { + recycle: false, + nextWarmed: 'aaa', + }); + }); + + test('token differs from baseline → recycle and advance baseline', function (assert) { + assert.deepEqual(decideHostShellRecycle('bbb', 'aaa'), { + recycle: true, + nextWarmed: 'bbb', + }); + }); + }); +}); diff --git a/packages/realm-server/tests/prerender-manager-test.ts b/packages/realm-server/tests/prerender-manager-test.ts index abb6b5fb46..02a20af85f 100644 --- a/packages/realm-server/tests/prerender-manager-test.ts +++ b/packages/realm-server/tests/prerender-manager-test.ts @@ -8,6 +8,7 @@ import type { RealmHttpServer as Server } from '../server.ts'; import http, { createServer } from 'http'; import { buildPrerenderManagerApp } from '../prerender/manager-app.ts'; import { + PRERENDER_HOST_SHELL_HASH_HEADER, PRERENDER_SERVER_DRAINING_STATUS_CODE, PRERENDER_SERVER_STATUS_DRAINING, PRERENDER_SERVER_STATUS_HEADER, @@ -97,6 +98,59 @@ module(basename(__filename), function () { ); }); + test('reports the host shell token and echoes it on heartbeats', async function (assert) { + let { app } = buildPrerenderManagerApp(); + let request: SuperTest = supertest(app.callback()); + let headerKey = PRERENDER_HOST_SHELL_HASH_HEADER.toLowerCase(); + let heartbeat = () => + request.post('/prerender-servers').send({ + data: { + type: 'prerender-server', + attributes: { capacity: 2, url: serverUrlA }, + }, + }); + + // No token reported yet → heartbeat carries no host-shell header. + let first = await heartbeat(); + assert.strictEqual(first.status, 204, 'heartbeat accepted'); + assert.strictEqual( + first.headers[headerKey], + undefined, + 'no host-shell header before any report', + ); + + // Realm server reports a token. + let reportA = await request + .post('/host-shell') + .send({ data: { attributes: { hash: 'aaa111' } } }); + assert.strictEqual(reportA.status, 204, 'host-shell report accepted'); + + // Now heartbeats echo it. + let second = await heartbeat(); + assert.strictEqual( + second.headers[headerKey], + 'aaa111', + 'heartbeat echoes the reported host-shell token', + ); + + // A changed token is echoed; a repeat of the same token is a no-op. + await request + .post('/host-shell') + .send({ data: { attributes: { hash: 'bbb222' } } }); + let third = await heartbeat(); + assert.strictEqual( + third.headers[headerKey], + 'bbb222', + 'heartbeat echoes the updated host-shell token', + ); + + // A missing hash is rejected. + let bad = await request + .post('/host-shell') + .send({ data: { attributes: {} } }); + assert.strictEqual(bad.status, 400, 'host-shell report requires a hash'); + }); + test('health includes active servers with affinities and last used times', async function (assert) { process.env.PRERENDER_MULTIPLEX = '2'; let { app } = buildPrerenderManagerApp();