From c868b08754fadb874893fa764c304dc60f8e6ceb Mon Sep 17 00:00:00 2001 From: spypsy Date: Tue, 9 Jun 2026 20:02:25 +0300 Subject: [PATCH 1/9] chore(ci): notify slack on benchmark success (#23964) Fixes [A-1149](https://linear.app/aztec-labs/issue/A-1149/have-all-benchmarks-report-to-slack-on-success-or-failure) --- .github/workflows/nightly-bench-10tps.yml | 19 ++++++-- .github/workflows/nightly-spartan-bench.yml | 51 ++++++++++++++++++--- .github/workflows/weekly-proving-bench.yml | 17 ++++++- 3 files changed, 76 insertions(+), 11 deletions(-) diff --git a/.github/workflows/nightly-bench-10tps.yml b/.github/workflows/nightly-bench-10tps.yml index 3f0c096b6980..724f9a04388c 100644 --- a/.github/workflows/nightly-bench-10tps.yml +++ b/.github/workflows/nightly-bench-10tps.yml @@ -187,8 +187,8 @@ jobs: NO_SPOT: 1 run: ./.github/ci3.sh network-teardown bench-10tps bench-10tps - notify-failure: - if: ${{ always() && failure() && github.event_name != 'workflow_dispatch' && (github.event_name != 'schedule' || github.repository == 'AztecProtocol/aztec-packages') }} + notify: + if: (success() || failure()) && github.event_name != 'workflow_dispatch' && (github.event_name != 'schedule' || github.repository == 'AztecProtocol/aztec-packages') needs: - select-image - deploy-bench-10tps-network @@ -202,7 +202,20 @@ jobs: with: ref: ${{ needs.select-image.outputs.source_ref }} - - name: Notify Slack on failure + - name: Notify Slack on success + if: ${{ !contains(needs.*.result, 'failure') }} + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + run: | + export CI=1 + IMAGE="${{ needs.select-image.outputs.image_label || 'unknown' }}" + RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + ./ci3/slack_notify \ + "Nightly 10 TPS benchmark PASSED (image ${IMAGE}) :white_check_mark: <${RUN_URL}|View Run>" \ + "#alerts-next-scenario" + + - name: Notify Slack and dispatch ClaudeBox on failure + if: ${{ contains(needs.*.result, 'failure') }} env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} GITHUB_TOKEN: ${{ 
secrets.AZTEC_BOT_GITHUB_TOKEN }} diff --git a/.github/workflows/nightly-spartan-bench.yml b/.github/workflows/nightly-spartan-bench.yml index 166427975ee5..4bf486e76883 100644 --- a/.github/workflows/nightly-spartan-bench.yml +++ b/.github/workflows/nightly-spartan-bench.yml @@ -192,8 +192,8 @@ jobs: NO_SPOT: 1 run: ./.github/ci3.sh network-teardown tps-scenario nightly-bench - notify-bench-failure: - if: ${{ always() && failure() && github.event_name != 'workflow_dispatch' && (github.event_name != 'schedule' || github.repository == 'AztecProtocol/aztec-packages') }} + notify-bench: + if: (success() || failure()) && github.event_name != 'workflow_dispatch' && (github.event_name != 'schedule' || github.repository == 'AztecProtocol/aztec-packages') needs: - select-image - deploy-bench-network @@ -207,7 +207,20 @@ jobs: with: ref: ${{ needs.select-image.outputs.source_ref }} + - name: Notify Slack on success + if: ${{ !contains(needs.*.result, 'failure') }} + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + run: | + export CI=1 + TAG="${{ needs.select-image.outputs.nightly_tag || 'unknown' }}" + RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + ./ci3/slack_notify \ + "Nightly Spartan TPS benchmarks PASSED (nightly tag ${TAG}) :white_check_mark: <${RUN_URL}|View Run>" \ + "#alerts-next-scenario" + - name: Notify Slack and dispatch ClaudeBox on failure + if: ${{ contains(needs.*.result, 'failure') }} env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} GITHUB_TOKEN: ${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} @@ -346,8 +359,8 @@ jobs: NO_SPOT: 1 run: ./.github/ci3.sh network-teardown prove-n-tps-fake prove-n-tps-fake - notify-proving-failure: - if: ${{ always() && failure() && github.event_name != 'workflow_dispatch' && (github.event_name != 'schedule' || github.repository == 'AztecProtocol/aztec-packages') }} + notify-proving: + if: (success() || failure()) && github.event_name != 'workflow_dispatch' && (github.event_name != 'schedule' || github.repository == 'AztecProtocol/aztec-packages') 
needs: - select-image - deploy-proving-network @@ -361,7 +374,20 @@ jobs: with: ref: ${{ needs.select-image.outputs.source_ref }} + - name: Notify Slack on success + if: ${{ !contains(needs.*.result, 'failure') }} + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + run: | + export CI=1 + TAG="${{ needs.select-image.outputs.nightly_tag || 'unknown' }}" + RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + ./ci3/slack_notify \ + "Nightly proving benchmarks passed (nightly tag ${TAG}) :white_check_mark: <${RUN_URL}|View Run>" \ + "#alerts-next-scenario" + - name: Notify Slack and dispatch ClaudeBox on failure + if: ${{ contains(needs.*.result, 'failure') }} env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} GITHUB_TOKEN: ${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} @@ -500,8 +526,8 @@ jobs: NO_SPOT: 1 run: ./.github/ci3.sh network-teardown block-capacity nightly-block-capacity - notify-block-capacity-failure: - if: ${{ always() && failure() && github.event_name != 'workflow_dispatch' && (github.event_name != 'schedule' || github.repository == 'AztecProtocol/aztec-packages') }} + notify-block-capacity: + if: (success() || failure()) && github.event_name != 'workflow_dispatch' && (github.event_name != 'schedule' || github.repository == 'AztecProtocol/aztec-packages') needs: - select-image - deploy-block-capacity-network @@ -515,7 +541,20 @@ jobs: with: ref: ${{ needs.select-image.outputs.source_ref }} + - name: Notify Slack on success + if: ${{ !contains(needs.*.result, 'failure') }} + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + run: | + export CI=1 + TAG="${{ needs.select-image.outputs.nightly_tag || 'unknown' }}" + RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + ./ci3/slack_notify \ + "Nightly block capacity benchmarks passed (nightly tag ${TAG}) :white_check_mark: <${RUN_URL}|View Run>" \ + "#alerts-next-scenario" + - name: Notify Slack and dispatch ClaudeBox on failure + if: ${{ contains(needs.*.result, 'failure') }} env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} GITHUB_TOKEN: ${{ 
secrets.AZTEC_BOT_GITHUB_TOKEN }} diff --git a/.github/workflows/weekly-proving-bench.yml b/.github/workflows/weekly-proving-bench.yml index 0353eaa4d08c..0f453729ed1d 100644 --- a/.github/workflows/weekly-proving-bench.yml +++ b/.github/workflows/weekly-proving-bench.yml @@ -188,8 +188,8 @@ jobs: NO_SPOT: 1 run: ./.github/ci3.sh network-teardown prove-n-tps-real prove-n-tps-real - notify-failure: - if: ${{ always() && failure() && github.event_name != 'workflow_dispatch' && (github.event_name != 'schedule' || github.repository == 'AztecProtocol/aztec-packages') }} + notify: + if: (success() || failure()) && github.event_name != 'workflow_dispatch' && (github.event_name != 'schedule' || github.repository == 'AztecProtocol/aztec-packages') needs: - select-image - deploy-real-proving-network @@ -203,7 +203,20 @@ jobs: with: ref: ${{ needs.select-image.outputs.source_ref }} + - name: Notify Slack on success + if: ${{ !contains(needs.*.result, 'failure') }} + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + run: | + export CI=1 + TAG="${{ needs.select-image.outputs.nightly_tag || 'unknown' }}" + RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + ./ci3/slack_notify \ + "Weekly real proving benchmark passed (nightly tag ${TAG}) :white_check_mark: <${RUN_URL}|View Run>" \ + "#alerts-next-scenario" + - name: Notify Slack and dispatch ClaudeBox on failure + if: ${{ contains(needs.*.result, 'failure') }} env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} GITHUB_TOKEN: ${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} From 28b31916a0007b2cfa51d651126c4284a987a404 Mon Sep 17 00:00:00 2001 From: spypsy Date: Wed, 10 Jun 2026 16:44:37 +0300 Subject: [PATCH 2/9] fix: interrupt publisher send-at-slot sleep on sequencer stop (#23990) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Propagate `CheckpointProposalJob.interrupt()` to its `SequencerPublisher` so the publisher's `sendRequestsAt` slot-deadline sleep is cancelled on sequencer stop. 
- Check `interrupted` *before* sleeping in `sendRequestsAt`, since `InterruptibleSleep.interrupt()` only resolves sleeps already in flight. Completes #23930, which made the job's own polling waits interruptible but not the publisher's. In [this `e2e_ha_full.test.ts` flake](http://ci.aztec-labs.com/1c46f3d4a226073d) a node became proposer 165ms before teardown and took the no-broadcast votes path, leaving `pendingL1Submission = publisher.sendRequestsAt(targetSlot)` sleeping until the target slot — ~22 wall-clock minutes away under the warped test clock. `Sequencer.stop()` blocked on it ("Awaiting pending L1 payload submission"), node stops were abandoned, and Jest hung on leaked handles until CI killed the run. Nothing in the shutdown path interrupts the per-job `SequencerPublisher`: `publisherFactory.stopAll()` only interrupts the underlying `L1TxUtils`. ## Tests - `checkpoint_proposal_job.test.ts`: pending L1 submission blocked in the publisher resolves promptly on `job.interrupt()` (red without the fix). - `sequencer-publisher.test.ts`: `sendRequestsAt` returns immediately when interrupted before the sleep starts (red without the fix). 
--- .../src/composed/ha/e2e_ha_full.test.ts | 29 +++++++++++++--- .../src/publisher/sequencer-publisher.test.ts | 22 ++++++++++++ .../src/publisher/sequencer-publisher.ts | 3 ++ .../sequencer/checkpoint_proposal_job.test.ts | 34 +++++++++++++++++++ .../src/sequencer/checkpoint_proposal_job.ts | 3 +- 5 files changed, 85 insertions(+), 6 deletions(-) diff --git a/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.test.ts b/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.test.ts index d59e3cb189c9..49552eab25cf 100644 --- a/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.test.ts +++ b/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.test.ts @@ -304,24 +304,39 @@ describe('HA Full Setup', () => { }); afterAll(async () => { + const cleanupErrors: string[] = []; + + dateProvider?.reset(); + // Stop all HA peer nodes in parallel with a per-node deadline. A single stuck node can otherwise - // block the serial loop long enough to blow the jest hook timeout — e.g. a sequencer.stop() that - // awaits an L1 publish whose tx-timeout was computed on a test-warped clock and never fires. + // block the serial loop long enough to blow the jest hook timeout, so report the stuck node directly + // instead of letting the suite pass and later fail with Jest open handles. 
if (haNodeServices) { const STOP_DEADLINE_MS = 30_000; - await Promise.allSettled( + const stopResults = await Promise.allSettled( haNodeServices.map((service, i) => { logger.info(`Stopping HA peer node ${i}`); return Promise.race([ service.stop().catch(error => { - logger.error(`Failed to stop HA peer node ${i}: ${error}`); + const message = `Failed to stop HA peer node ${i}: ${error}`; + logger.error(message); + return message; }), sleep(STOP_DEADLINE_MS).then(() => { - logger.error(`HA peer node ${i} stop did not return within ${STOP_DEADLINE_MS}ms; abandoning`); + const message = `HA peer node ${i} stop did not return within ${STOP_DEADLINE_MS}ms; abandoning`; + logger.error(message); + return message; }), ]); }), ); + for (const result of stopResults) { + if (result.status === 'rejected') { + cleanupErrors.push(`Unexpected HA node stop error: ${result.reason}`); + } else if (result.value) { + cleanupErrors.push(result.value); + } + } } // Cleanup HA keystore temp directories @@ -350,6 +365,10 @@ describe('HA Full Setup', () => { // Cleanup bootstrap node and test infrastructure (this cleans up the shared data directory) await teardown(); + + if (cleanupErrors.length > 0) { + throw new Error(cleanupErrors.join('\n')); + } }); afterEach(async () => { diff --git a/yarn-project/sequencer-client/src/publisher/sequencer-publisher.test.ts b/yarn-project/sequencer-client/src/publisher/sequencer-publisher.test.ts index bf7b7f2fafb8..519fbdb6dfed 100644 --- a/yarn-project/sequencer-client/src/publisher/sequencer-publisher.test.ts +++ b/yarn-project/sequencer-client/src/publisher/sequencer-publisher.test.ts @@ -720,6 +720,28 @@ describe('SequencerPublisher', () => { expect((publisher as any).requests.length).toEqual(0); }); + it('does not sleep in sendRequestsAt if interrupted beforehand', async () => { + // A target slot far enough in the future that sendRequestsAt would sleep for ~1 hour + // (EmptyL1RollupConstants has slotDuration 1s and l1GenesisTime 0, so slot N 
starts at N seconds). + const targetSlot = SlotNumber(Math.ceil(Date.now() / 1000) + 3600); + publisher.interrupt(); + + let timeout: NodeJS.Timeout | undefined; + try { + const result = await Promise.race([ + publisher.sendRequestsAt(targetSlot), + new Promise<'timed-out'>(resolve => { + timeout = setTimeout(() => resolve('timed-out'), 1000); + }), + ]); + expect(result).toBeUndefined(); + } finally { + if (timeout) { + clearTimeout(timeout); + } + } + }); + it('does not send requests if no valid requests are found', async () => { publisher.addRequest({ action: 'propose', diff --git a/yarn-project/sequencer-client/src/publisher/sequencer-publisher.ts b/yarn-project/sequencer-client/src/publisher/sequencer-publisher.ts index 96754bfeea39..738e83d5d5b5 100644 --- a/yarn-project/sequencer-client/src/publisher/sequencer-publisher.ts +++ b/yarn-project/sequencer-client/src/publisher/sequencer-publisher.ts @@ -630,6 +630,9 @@ export class SequencerPublisher { // Aim to be in the mempool one L1 slot before the L2 slot starts, so we have a chance of // being picked up by the first L1 block of the L2 slot. 
const submitAfterMs = startOfTargetSlotMs - Number(this.ethereumSlotDuration) * 1000; + if (this.interrupted) { + return undefined; + } const sleepMs = submitAfterMs - this.dateProvider.now(); if (sleepMs > 0) { this.log.debug(`Sleeping ${sleepMs}ms before sending requests`, { diff --git a/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.test.ts b/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.test.ts index 74b38a1f35ce..1e05c9e4d5fd 100644 --- a/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.test.ts +++ b/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.test.ts @@ -14,6 +14,7 @@ import { TimeoutError } from '@aztec/foundation/error'; import { EthAddress } from '@aztec/foundation/eth-address'; import { Signature } from '@aztec/foundation/eth-signature'; import { createLogger } from '@aztec/foundation/log'; +import { promiseWithResolvers } from '@aztec/foundation/promise'; import { TestDateProvider } from '@aztec/foundation/timer'; import type { TypedEventEmitter } from '@aztec/foundation/types'; import { type P2P, P2PClientState } from '@aztec/p2p'; @@ -1724,6 +1725,39 @@ describe('CheckpointProposalJob', () => { } }); + it('interrupts a pending L1 submission sleeping in the publisher', async () => { + const { txs, block } = await setupTxsAndBlock(p2p, globalVariables, 1, chainId); + checkpointBuilder.seedBlocks([block], [txs]); + validatorClient.collectAttestations.mockResolvedValue(getAttestations(block)); + + // Simulate sendRequestsAt sleeping until the target slot: the promise only resolves once + // the publisher itself is interrupted. 
+ const sendDeferred = promiseWithResolvers(); + publisher.sendRequestsAt.mockReturnValue(sendDeferred.promise); + publisher.interrupt.mockImplementation(() => sendDeferred.resolve(undefined)); + + const checkpoint = await job.execute(); + expect(checkpoint).toBeDefined(); + + const pendingSubmission = job.awaitPendingSubmission().then(() => 'stopped' as const); + job.interrupt(); + + let timeout: NodeJS.Timeout | undefined; + try { + const result = await Promise.race([ + pendingSubmission, + new Promise<'timed-out'>(resolve => { + timeout = setTimeout(() => resolve('timed-out'), 1000); + }), + ]); + expect(result).toBe('stopped'); + } finally { + if (timeout) { + clearTimeout(timeout); + } + } + }); + it('aborts checkpoint when syncing proposed block to archiver fails', async () => { const { txs, block } = await setupTxsAndBlock(p2p, globalVariables, 1, chainId); checkpointBuilder.seedBlocks([block], [txs]); diff --git a/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.ts b/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.ts index a74f7d69e8b1..a4d733013363 100644 --- a/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.ts +++ b/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.ts @@ -173,10 +173,11 @@ export class CheckpointProposalJob implements Traceable { await this.pendingL1Submission; } - /** Interrupts job-owned waits so shutdown can finish. */ + /** Interrupts job-owned waits, including the publisher's send-at-slot sleep, so shutdown can finish. */ public interrupt(): void { this.interrupted = true; this.interruptibleSleep.interrupt(true); + this.publisher.interrupt(); } private async awaitInterruptibleSleep(ms: number): Promise { From afed52f1e4e65a0d60ecabbc809ca6d7c7e497f5 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 10 Jun 2026 20:32:27 +0300 Subject: [PATCH 3/9] chore: aztec-node render external secrets (#23997) Spartan RPC deployment prep. 
Stack: #23997 -> #23998 -> #23999 -> #24000 -> #24001 -> #24002 Fixes: A-1142, A-1134, A-1135, A-1136. --- spartan/aztec-node/templates/_pod-template.yaml | 14 ++++++++++++++ spartan/aztec-node/templates/extra-objects.yaml | 4 ++++ spartan/aztec-node/values.yaml | 5 +++++ 3 files changed, 23 insertions(+) create mode 100644 spartan/aztec-node/templates/extra-objects.yaml diff --git a/spartan/aztec-node/templates/_pod-template.yaml b/spartan/aztec-node/templates/_pod-template.yaml index 7c6bf0bbcf60..0b43402800aa 100644 --- a/spartan/aztec-node/templates/_pod-template.yaml +++ b/spartan/aztec-node/templates/_pod-template.yaml @@ -145,6 +145,20 @@ spec: - secretRef: name: {{ include "chart.fullname" . }}-env {{- end }} + {{- range .Values.node.envFrom.configMaps }} + - configMapRef: + name: {{ tpl .name $ | quote }} + {{- if hasKey . "optional" }} + optional: {{ .optional }} + {{- end }} + {{- end }} + {{- range .Values.node.envFrom.secrets }} + - secretRef: + name: {{ tpl .name $ | quote }} + {{- if hasKey . "optional" }} + optional: {{ .optional }} + {{- end }} + {{- end }} env: - name: POD_IP valueFrom: diff --git a/spartan/aztec-node/templates/extra-objects.yaml b/spartan/aztec-node/templates/extra-objects.yaml new file mode 100644 index 000000000000..b058f886db65 --- /dev/null +++ b/spartan/aztec-node/templates/extra-objects.yaml @@ -0,0 +1,4 @@ +{{- range .Values.extraObjects }} +--- +{{ tpl (toYaml .) $ }} +{{- end }} diff --git a/spartan/aztec-node/values.yaml b/spartan/aztec-node/values.yaml index c5a0ed7ce1b8..43bd3c883a26 100644 --- a/spartan/aztec-node/values.yaml +++ b/spartan/aztec-node/values.yaml @@ -88,6 +88,8 @@ node: envFrom: configMapEnabled: false secretEnabled: false + configMaps: [] + secrets: [] configMap: envEnabled: false @@ -251,3 +253,6 @@ serviceAccount: name: "" # -- Annotations for the service account annotations: {} + +# -- Additional Kubernetes objects to render with this chart. 
+extraObjects: [] From b87462ccbb334699ce3f7acd64efa7fb7bd39916 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 10 Jun 2026 20:33:15 +0300 Subject: [PATCH 4/9] chore: expose ROLLUP_VERSION (#23998) Exposes rollup version to Spartan deployments. Stack: #23997 -> #23998 -> #23999 -> #24000 -> #24001 -> #24002 Fixes: A-1142, A-1134, A-1135, A-1136. --- spartan/aztec-bot/values.yaml | 3 ++- spartan/aztec-node/templates/_pod-template.yaml | 2 +- spartan/aztec-node/values.yaml | 4 ++-- spartan/aztec-prover-stack/values.yaml | 3 ++- spartan/aztec-validator/values.yaml | 3 ++- spartan/terraform/deploy-aztec-infra/values/common.yaml | 3 ++- 6 files changed, 11 insertions(+), 7 deletions(-) diff --git a/spartan/aztec-bot/values.yaml b/spartan/aztec-bot/values.yaml index b44630c0a1c9..2161b124ba76 100644 --- a/spartan/aztec-bot/values.yaml +++ b/spartan/aztec-bot/values.yaml @@ -1,5 +1,6 @@ global: - aztecRollupVersion: "canonical" + aztecEnv: + ROLLUP_VERSION: "canonical" aztecNetwork: "" customAztecNetwork: enabled: false diff --git a/spartan/aztec-node/templates/_pod-template.yaml b/spartan/aztec-node/templates/_pod-template.yaml index 0b43402800aa..1cd1228e2aa8 100644 --- a/spartan/aztec-node/templates/_pod-template.yaml +++ b/spartan/aztec-node/templates/_pod-template.yaml @@ -339,7 +339,7 @@ spec: - name: PROVER_ID value: {{ .Values.node.coinbase | quote }} {{- end }} - {{- range $key, $value := .Values.node.env }} + {{- range $key, $value := mergeOverwrite (dict) .Values.global.aztecEnv .Values.node.env }} - name: {{ $key }} value: {{ $value | quote }} {{- end }} diff --git a/spartan/aztec-node/values.yaml b/spartan/aztec-node/values.yaml index 43bd3c883a26..f053da91ca8e 100644 --- a/spartan/aztec-node/values.yaml +++ b/spartan/aztec-node/values.yaml @@ -11,8 +11,8 @@ podManagementPolicy: Parallel # global config for all Aztec nodes in an instalation global: - # Which rollup contract we want to follow from the registry - aztecRollupVersion: "canonical" + # -- 
Environment variables shared by all Aztec nodes. Overridden by node.env. + aztecEnv: {} # -- Network name - this is a predefined network - alpha-testnet, devnet aztecNetwork: "" # -- Custom network - (not recommended) - Only for custom testnet usecases, (must have deployed your own protocol contracts first) diff --git a/spartan/aztec-prover-stack/values.yaml b/spartan/aztec-prover-stack/values.yaml index ae014973bcd7..17631052e3a8 100644 --- a/spartan/aztec-prover-stack/values.yaml +++ b/spartan/aztec-prover-stack/values.yaml @@ -1,5 +1,6 @@ global: - aztecRollupVersion: "canonical" + aztecEnv: + ROLLUP_VERSION: "canonical" aztecNetwork: "" customAztecNetwork: enabled: false diff --git a/spartan/aztec-validator/values.yaml b/spartan/aztec-validator/values.yaml index 46fa3ce2d784..40e1b2d04023 100644 --- a/spartan/aztec-validator/values.yaml +++ b/spartan/aztec-validator/values.yaml @@ -1,5 +1,6 @@ global: - aztecRollupVersion: "canonical" + aztecEnv: + ROLLUP_VERSION: "canonical" aztecNetwork: "" customAztecNetwork: enabled: false diff --git a/spartan/terraform/deploy-aztec-infra/values/common.yaml b/spartan/terraform/deploy-aztec-infra/values/common.yaml index 7c85c7d8ca2d..d74dd555fce1 100644 --- a/spartan/terraform/deploy-aztec-infra/values/common.yaml +++ b/spartan/terraform/deploy-aztec-infra/values/common.yaml @@ -1,2 +1,3 @@ global: - aztecRollupVersion: "canonical" + aztecEnv: + ROLLUP_VERSION: "canonical" From 87fe7e8d623059dc0cb32eed33eb7216f7b9b4fe Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 10 Jun 2026 20:34:20 +0300 Subject: [PATCH 5/9] chore: enable workload identity (#23999) Enables workload identity for Spartan GKE clusters. Stack: #23997 -> #23998 -> #23999 -> #24000 -> #24001 -> #24002 Fixes: A-1142, A-1134, A-1135, A-1136. 
--- spartan/terraform/gke-cluster/main.tf | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/spartan/terraform/gke-cluster/main.tf b/spartan/terraform/gke-cluster/main.tf index 790496b624d9..e23725762d9b 100644 --- a/spartan/terraform/gke-cluster/main.tf +++ b/spartan/terraform/gke-cluster/main.tf @@ -41,9 +41,10 @@ module "gke_cluster_private" { module "gke_cluster_public" { source = "./cluster" - cluster_name = "aztec-gke-public" - project = var.project - region = var.region - zone = var.zone - service_account = google_service_account.gke_sa.email + cluster_name = "aztec-gke-public" + project = var.project + region = var.region + zone = var.zone + service_account = google_service_account.gke_sa.email + enable_workload_identity = true } From 132b0c964de985cd48c2d704d9d37ad217f2911e Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 11 Jun 2026 11:55:57 +0300 Subject: [PATCH 6/9] feat: kong ingress (#24000) Adds Kong-based RPC gateway infrastructure. Stack: #23997 -> #23998 -> #23999 -> #24000 -> #24001 -> #24002 Fixes: A-1142, A-1134, A-1135, A-1136, A-1186 --- spartan/environments/bench-10tps.env | 1 - spartan/environments/block-capacity.env | 1 - spartan/environments/devnet.env | 10 +- spartan/environments/mainnet.env | 8 +- spartan/environments/next-net.env | 9 +- spartan/environments/prove-n-tps-fake.env | 1 - spartan/environments/prove-n-tps-real.env | 1 - spartan/environments/staging-internal.env | 99 ++ spartan/environments/staging.env | 5 + spartan/environments/testnet.env | 6 +- spartan/environments/tps-scenario.env | 1 - .../grafana/dashboards/aztec_kong.json | 1500 +++++++++++++++++ spartan/scripts/deploy_network.sh | 48 +- spartan/scripts/network_deploy.sh | 49 - spartan/terraform/deploy-aztec-infra/main.tf | 144 +- .../deploy-aztec-infra/values/common.yaml | 2 +- .../terraform/deploy-aztec-infra/variables.tf | 206 ++- spartan/terraform/deploy-kong-crds/README.md | 12 + spartan/terraform/deploy-kong-crds/main.tf | 63 + 
.../deploy-kong-crds/private.tfbackend | 2 + .../deploy-kong-crds/public.tfbackend | 2 + .../terraform/deploy-kong-crds/variables.tf | 23 + spartan/terraform/modules/rpc-gateway/main.tf | 812 +++++++++ .../terraform/modules/rpc-gateway/outputs.tf | 74 + .../modules/rpc-gateway/variables.tf | 346 ++++ 25 files changed, 3256 insertions(+), 169 deletions(-) create mode 100644 spartan/environments/staging-internal.env create mode 100644 spartan/metrics/grafana/dashboards/aztec_kong.json create mode 100644 spartan/terraform/deploy-kong-crds/README.md create mode 100644 spartan/terraform/deploy-kong-crds/main.tf create mode 100644 spartan/terraform/deploy-kong-crds/private.tfbackend create mode 100644 spartan/terraform/deploy-kong-crds/public.tfbackend create mode 100644 spartan/terraform/deploy-kong-crds/variables.tf create mode 100644 spartan/terraform/modules/rpc-gateway/main.tf create mode 100644 spartan/terraform/modules/rpc-gateway/outputs.tf create mode 100644 spartan/terraform/modules/rpc-gateway/variables.tf diff --git a/spartan/environments/bench-10tps.env b/spartan/environments/bench-10tps.env index 9e5cd27417fd..3e7899fe1916 100644 --- a/spartan/environments/bench-10tps.env +++ b/spartan/environments/bench-10tps.env @@ -44,7 +44,6 @@ SEQ_BUILD_CHECKPOINT_IF_EMPTY=true RPC_REPLICAS=1 RPC_RESOURCE_PROFILE="prod" -RPC_INGRESS_ENABLED=false FULL_NODE_REPLICAS=5 FULL_NODE_RESOURCE_PROFILE="prod" diff --git a/spartan/environments/block-capacity.env b/spartan/environments/block-capacity.env index 2893f9885d5c..94a43bc32365 100644 --- a/spartan/environments/block-capacity.env +++ b/spartan/environments/block-capacity.env @@ -29,7 +29,6 @@ VALIDATOR_RESOURCE_PROFILE="prod" REAL_VERIFIER=false RPC_REPLICAS=1 -RPC_INGRESS_ENABLED=false PROVER_AGENT_KEDA_ENABLED=true PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS=REPLACE_WITH_GCP_SECRET diff --git a/spartan/environments/devnet.env b/spartan/environments/devnet.env index 0741b4751f7e..c06e20f41ef7 100644 --- 
a/spartan/environments/devnet.env +++ b/spartan/environments/devnet.env @@ -5,6 +5,12 @@ RESOURCE_PROFILE=dev NETWORK="devnet" NAMESPACE=${NAMESPACE:-devnet} +RPC_GATEWAY_ENABLED=true +RPC_GATEWAY_KONG_OTEL_METRICS_GCP_SECRET_NAME=otel-collector-url +RPC_GATEWAY_HOSTS="[\"$NAMESPACE.rpc.aztec-labs.com\"]" +RPC_GATEWAY_API_KEY_SECRET_NAMES='[]' +RPC_GATEWAY_ALLOW_ANONYMOUS=true + # Compute mnemonic index offset from namespace to avoid nonce conflicts # between concurrent devnets sharing the same mnemonic on the same L1. # Namespace format: v-devnet- (e.g., v4-devnet-2) @@ -72,9 +78,5 @@ REAL_VERIFIER=false PROVER_RESOURCE_PROFILE="dev" DEBUG_FORCE_TX_PROOF_VERIFICATION=true -RPC_INGRESS_ENABLED=true -RPC_INGRESS_HOSTS="[\"$NAMESPACE.aztec-labs.com\"]" -RPC_INGRESS_STATIC_IP_NAME=$NAMESPACE-rpc-ip -RPC_INGRESS_SSL_CERT_NAMES="[\"$NAMESPACE-rpc-cert\"]" WS_NUM_HISTORIC_CHECKPOINTS=300 diff --git a/spartan/environments/mainnet.env b/spartan/environments/mainnet.env index fe27dd0dd6c2..7ad6e225311e 100644 --- a/spartan/environments/mainnet.env +++ b/spartan/environments/mainnet.env @@ -7,6 +7,8 @@ CLUSTER=aztec-gke-public RESOURCE_PROFILE=prod NAMESPACE=${NAMESPACE:-mainnet} +RPC_GATEWAY_ENABLED=false + CREATE_ROLLUP_CONTRACTS=false VERIFY_CONTRACTS=false DEPLOY_INTERNAL_BOOTNODE=false @@ -15,12 +17,6 @@ RPC_REPLICAS=1 PROVER_REPLICAS=0 PROVER_ENABLED=false -CREATE_RPC_INGRESS=true -CREATE_RPC_DNS=true -RPC_INGRESS_HOSTS='["mainnet.rpc.aztec-labs.com"]' -RPC_INGRESS_SESSION_AFFINITY=CLIENT_IP -RPC_INGRESS_LOG_SAMPLE_RATE=1.0 -RPC_CLOUD_ARMOR_POLICY_NAME=mainnet-rpc-policy FISHERMAN_REPLICAS=1 FISHERMAN_MNEMONIC_START_INDEX=1 diff --git a/spartan/environments/next-net.env b/spartan/environments/next-net.env index cac6310416e4..c40c07d268f4 100644 --- a/spartan/environments/next-net.env +++ b/spartan/environments/next-net.env @@ -75,10 +75,11 @@ CREATE_ROLLUP_CONTRACTS=${CREATE_ROLLUP_CONTRACTS:-false} DEBUG_P2P_INSTRUMENT_MESSAGES=true -RPC_INGRESS_ENABLED=true 
-RPC_INGRESS_HOSTS='["nextnet.aztec-labs.com"]' -RPC_INGRESS_STATIC_IP_NAME=nextnet-rpc-ip -RPC_INGRESS_SSL_CERT_NAMES='["nextnet-rpc-cert"]' +RPC_GATEWAY_ENABLED=true +RPC_GATEWAY_KONG_OTEL_METRICS_GCP_SECRET_NAME=otel-collector-url +RPC_GATEWAY_HOSTS='["nextnet.aztec-labs.com"]' +RPC_GATEWAY_API_KEY_SECRET_NAMES='[]' +RPC_GATEWAY_ALLOW_ANONYMOUS=true VALIDATOR_HA_REPLICAS=1 VALIDATOR_RESOURCE_PROFILE="prod" diff --git a/spartan/environments/prove-n-tps-fake.env b/spartan/environments/prove-n-tps-fake.env index 1808fb482c31..8e26ee558e99 100644 --- a/spartan/environments/prove-n-tps-fake.env +++ b/spartan/environments/prove-n-tps-fake.env @@ -30,7 +30,6 @@ VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX=5000 REAL_VERIFIER=false RPC_REPLICAS=1 -RPC_INGRESS_ENABLED=false PROVER_RESOURCE_PROFILE="dev-hi-tps" PROVER_PUBLISHER_MNEMONIC_START_INDEX=8000 diff --git a/spartan/environments/prove-n-tps-real.env b/spartan/environments/prove-n-tps-real.env index 0d4787597707..a4a592574051 100644 --- a/spartan/environments/prove-n-tps-real.env +++ b/spartan/environments/prove-n-tps-real.env @@ -30,7 +30,6 @@ VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX=5000 REAL_VERIFIER=true RPC_REPLICAS=1 -RPC_INGRESS_ENABLED=false PROVER_RESOURCE_PROFILE="prod-hi-tps" PROVER_PUBLISHER_MNEMONIC_START_INDEX=8000 diff --git a/spartan/environments/staging-internal.env b/spartan/environments/staging-internal.env new file mode 100644 index 000000000000..af8a7a0330ee --- /dev/null +++ b/spartan/environments/staging-internal.env @@ -0,0 +1,99 @@ +# Deployment +CREATE_ETH_DEVNET=false +NETWORK=staging +NAMESPACE=${NAMESPACE:-staging-internal} +GCP_REGION=us-west1-a +CLUSTER=aztec-gke-public +RESOURCE_PROFILE=prod + +# Ethereum +ETHEREUM_CHAIN_ID=11155111 +ETHEREUM_RPC_URLS=REPLACE_WITH_GCP_SECRET +ETHEREUM_CONSENSUS_HOST_URLS=REPLACE_WITH_GCP_SECRET +ETHEREUM_CONSENSUS_HOST_API_KEYS=REPLACE_WITH_GCP_SECRET +ETHEREUM_CONSENSUS_HOST_API_KEY_HEADERS=REPLACE_WITH_GCP_SECRET 
+FUNDING_PRIVATE_KEY=REPLACE_WITH_GCP_SECRET +LABS_INFRA_MNEMONIC=REPLACE_WITH_GCP_SECRET +ROLLUP_DEPLOYMENT_PRIVATE_KEY=REPLACE_WITH_GCP_SECRET +CREATE_ROLLUP_CONTRACTS=${CREATE_ROLLUP_CONTRACTS:-false} + +# Storage +SNAPSHOT_BUCKET_DIRECTORY=${SNAPSHOT_BUCKET_DIRECTORY:-staging-internal/snapshots} +SYNC_SNAPSHOT_URLS="https://aztec-labs-snapshots.com/${SNAPSHOT_BUCKET_DIRECTORY}" +TX_FILE_STORE_ENABLED=false +R2_ACCESS_KEY_ID=REPLACE_WITH_GCP_SECRET +R2_SECRET_ACCESS_KEY=REPLACE_WITH_GCP_SECRET + +# Network features +DEPLOY_INTERNAL_BOOTNODE=true +P2P_TX_POOL_DELETE_TXS_AFTER_REORG=true +TEST_ACCOUNTS=false +SPONSORED_FPC=false + +# Rollup +AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET=2 +AZTEC_LAG_IN_EPOCHS_FOR_RANDAO=2 +AZTEC_INBOX_LAG=2 +AZTEC_MANA_TARGET=75000000 +AZTEC_PROVING_COST_PER_MANA=12500000 + +# Sequencer +SEQ_MAX_L2_BLOCK_GAS=150000000 +SEQ_MIN_TX_PER_BLOCK=1 +SEQ_MAX_TX_PER_CHECKPOINT=7 # 0.1 TPS +SEQ_BUILD_CHECKPOINT_IF_EMPTY=true +SEQ_BLOCK_DURATION_MS=6000 +SEQ_L1_PUBLISHING_TIME_ALLOWANCE_IN_SLOT=36 +SEQ_ENABLE_PROPOSER_PIPELINING=true + +# Validators and RPC +VALIDATOR_REPLICAS=2 +VALIDATORS_PER_NODE=32 +VALIDATOR_PUBLISHERS_PER_REPLICA=4 +VALIDATOR_PUBLISHER_MNEMONIC_START_INDEX=5000 +VALIDATOR_RESOURCE_PROFILE="prod" + +RPC_REPLICAS=1 +RPC_GATEWAY_ENABLED=true +RPC_GATEWAY_KONG_OTEL_METRICS_GCP_SECRET_NAME=otel-collector-url +RPC_GATEWAY_HOSTS='["staging-internal.rpc.aztec-labs.com"]' +RPC_GATEWAY_API_KEY_SECRET_NAMES='["staging-rpc-internal-api-key"]' +RPC_GATEWAY_ALLOW_ANONYMOUS=false + +# Prover +PROVER_RESOURCE_PROFILE="prod" +PUBLISHERS_PER_PROVER=2 +PROVER_PUBLISHER_MNEMONIC_START_INDEX=8000 +# PROVER_FAILED_PROOF_STORE=gs://aztec-develop/staging/failed-proofs +# L1_TX_FAILED_STORE=gs://aztec-develop/staging/failed-l1-txs +PROVER_AGENT_KEDA_ENABLED=true +PROVER_AGENT_KEDA_PROMETHEUS_SERVER_ADDRESS=REPLACE_WITH_GCP_SECRET +PROVER_AGENT_KEDA_MIN_REPLICAS=0 +PROVER_AGENT_KEDA_MAX_REPLICAS=8 +PROVER_AGENT_KEDA_SCALING_BANDS='[ + { + queueSize = 0 
+ replicas = 4 + }, + { + queueSize = 100 + replicas = 8 + } +]' + +# Bots +BOT_DA_GAS_LIMIT=100000 +BOT_L2_GAS_LIMIT=6540000 +BOT_TRANSFERS_REPLICAS=1 +BOT_TRANSFERS_TX_INTERVAL_SECONDS=250 +BOT_TRANSFERS_FOLLOW_CHAIN=PROPOSED +BOT_SWAPS_REPLICAS=0 +BOT_SWAPS_FOLLOW_CHAIN=PROPOSED +BOT_SWAPS_TX_INTERVAL_SECONDS=350 +BOT_CROSS_CHAIN_REPLICAS=0 +BOT_CROSS_CHAIN_TX_INTERVAL_SECONDS=250 +BOT_CROSS_CHAIN_FOLLOW_CHAIN=PROPOSED + +# Observability +OTEL_COLLECTOR_ENDPOINT=REPLACE_WITH_GCP_SECRET +LOG_LEVEL="info" diff --git a/spartan/environments/staging.env b/spartan/environments/staging.env index 3193c558f6b6..213b70c50487 100644 --- a/spartan/environments/staging.env +++ b/spartan/environments/staging.env @@ -62,6 +62,11 @@ VALIDATOR_HA_REPLICAS=1 VALIDATOR_HA_REPLICA_COUNT=2 VALIDATOR_RESOURCE_PROFILE="prod" RPC_REPLICAS=1 +RPC_GATEWAY_ENABLED=true +RPC_GATEWAY_KONG_OTEL_METRICS_GCP_SECRET_NAME=otel-collector-url +RPC_GATEWAY_HOSTS='["staging.rpc.aztec-labs.com"]' +RPC_GATEWAY_API_KEY_SECRET_NAMES='[]' +RPC_GATEWAY_ALLOW_ANONYMOUS=true # Prover PROVER_RESOURCE_PROFILE="prod" diff --git a/spartan/environments/testnet.env b/spartan/environments/testnet.env index 6ed14f1eba4e..a3f959ace726 100644 --- a/spartan/environments/testnet.env +++ b/spartan/environments/testnet.env @@ -5,6 +5,8 @@ RESOURCE_PROFILE=prod NAMESPACE=${NAMESPACE:-testnet} NETWORK=testnet +RPC_GATEWAY_ENABLED=false + REAL_VERIFIER=true AZTEC_ENTRY_QUEUE_BOOTSTRAP_VALIDATOR_SET_SIZE=48 AZTEC_ENTRY_QUEUE_BOOTSTRAP_FLUSH_SIZE=48 @@ -68,10 +70,6 @@ BOT_SWAPS_REPLICAS=0 P2P_TX_POOL_DELETE_TXS_AFTER_REORG=true SEQ_MAX_TX_PER_CHECKPOINT=72 -RPC_INGRESS_ENABLED=true -RPC_INGRESS_HOSTS='["rpc.testnet.aztec-labs.com"]' -RPC_INGRESS_STATIC_IP_NAME=testnet-rpc-ip -RPC_INGRESS_SSL_CERT_NAMES='["testnet-rpc-cert"]' VALIDATOR_REPLICAS=2 diff --git a/spartan/environments/tps-scenario.env b/spartan/environments/tps-scenario.env index 6f512b6557bf..3ea25fffe0dd 100644 --- a/spartan/environments/tps-scenario.env +++ 
b/spartan/environments/tps-scenario.env @@ -44,7 +44,6 @@ VALIDATOR_RESOURCE_PROFILE="prod" REAL_VERIFIER=false RPC_REPLICAS=10 -RPC_INGRESS_ENABLED=false FULL_NODE_REPLICAS=500 FULL_NODE_RESOURCE_PROFILE="prod" diff --git a/spartan/metrics/grafana/dashboards/aztec_kong.json b/spartan/metrics/grafana/dashboards/aztec_kong.json new file mode 100644 index 000000000000..9885b2c28999 --- /dev/null +++ b/spartan/metrics/grafana/dashboards/aztec_kong.json @@ -0,0 +1,1500 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "Overview", + "type": "row", + "description": "High-level health summary for the selected Kong RPC gateway." 
+ }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "count(count by (k8s_namespace_name, node_id) (kong_node_info{k8s_namespace_name=~\"$namespace\"}))", + "instant": false, + "legendFormat": "nodes", + "range": true, + "refId": "A" + } + ], + "title": "Active nodes", + "type": "stat", + "description": "Number of Kong gateway nodes currently exporting metrics." 
+ }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 3, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 4, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "sum(rate(kong_http_requests_total{k8s_namespace_name=~\"$namespace\", route=~\"$route\", consumer=~\"$consumer\"}[$__rate_interval]))", + "instant": false, + "legendFormat": "requests", + "range": true, + "refId": "A" + } + ], + "title": "Request rate", + "type": "stat", + "description": "Total proxied request throughput for the selected routes and consumers." 
+ }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [], + "noValue": "0%", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.01 + }, + { + "color": "red", + "value": 0.05 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 9, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "((sum(rate(kong_http_requests_total{k8s_namespace_name=~\"$namespace\", route=~\"$route\", consumer=~\"$consumer\", code=~\"5..\"}[$__rate_interval])) or vector(0)) / clamp_min((sum(rate(kong_http_requests_total{k8s_namespace_name=~\"$namespace\", route=~\"$route\", consumer=~\"$consumer\"}[$__rate_interval])) or vector(0)), 0.000000001))", + "instant": false, + "legendFormat": "5xx", + "range": true, + "refId": "A" + } + ], + "title": "5xx rate", + "type": "stat", + "description": "Share of requests returning 5xx responses from Kong or upstream services." 
+ }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1000 + }, + { + "color": "red", + "value": 3000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 14, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(kong_request_latency_ms_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\"}[$__rate_interval]))) or histogram_quantile(0.95, sum by (le) (rate(kong_latency_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\", type=\"request\"}[$__rate_interval])))", + "instant": false, + "legendFormat": "p95", + "range": true, + "refId": "A" + } + ], + "title": "Request latency p95", + "type": "stat", + "description": "95th percentile end-to-end request latency through Kong." 
+ }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 19, + "y": 1 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "sum(kong_upstream_target_health{k8s_namespace_name=~\"$namespace\", state=~\"unhealthy|dns_error\"}) or vector(0)", + "instant": false, + "legendFormat": "targets", + "range": true, + "refId": "A" + } + ], + "title": "Unhealthy upstreams", + "type": "stat", + "description": "Count of upstream targets reporting unhealthy or DNS error states." + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 7, + "panels": [], + "title": "Traffic", + "type": "row", + "description": "Traffic volume, status codes, consumers, bandwidth, and connection load." 
+ }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps", + "decimals": 3 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "sum by (route, code) (rate(kong_http_requests_total{k8s_namespace_name=~\"$namespace\", route=~\"$route\", consumer=~\"$consumer\"}[$__rate_interval]))", + "instant": false, + "legendFormat": "{{route}} {{code}}", + "range": true, + "refId": "A" + } + ], + "title": "Request rate by route and status", + "type": "timeseries", + "description": "Per-route request throughput split by HTTP status code." 
+ }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps", + "decimals": 3 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "sum by (consumer, code) (rate(kong_http_requests_total{k8s_namespace_name=~\"$namespace\", route=~\"$route\", consumer!=\"\", consumer=~\"$consumer\"}[$__rate_interval])) or sum by (consumer, code) (label_replace(rate(kong_http_requests_total{k8s_namespace_name=~\"$namespace\", route=~\"$route\", consumer=\"\"}[$__rate_interval]), \"consumer\", \"unauthenticated\", \"__name__\", \".*\"))", + "instant": false, + "legendFormat": "{{consumer}} {{code}}", + "range": true, + "refId": "A" + } + ], + "title": "Request rate by consumer", + "type": "timeseries", + "description": 
"Per-consumer request throughput split by HTTP status code." + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "sum by (direction) (rate(kong_bandwidth_bytes_total{k8s_namespace_name=~\"$namespace\", route=~\"$route\", consumer=~\"$consumer\", direction=~\"ingress|egress\"}[$__rate_interval]))", + "instant": false, + "legendFormat": "{{direction}}", + "range": true, + "refId": "A" + } + ], + "title": "Bandwidth", + "type": "timeseries", + "description": "Ingress and egress byte throughput handled by Kong." 
+ }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "sum by (state) (kong_nginx_connections_total{k8s_namespace_name=~\"$namespace\", subsystem=\"http\", state=~\"active|reading|writing|waiting\"}) or sum by (state) (kong_nginx_http_current_connections{k8s_namespace_name=~\"$namespace\", state=~\"active|reading|writing|waiting\"})", + "instant": false, + "legendFormat": "{{state}}", + "range": true, + "refId": "A" + } + ], + "title": "HTTP connections", + "type": "timeseries", + "description": "Current Nginx HTTP connection states across Kong nodes." 
+ }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 12, + "panels": [], + "title": "Latency", + "type": "row", + "description": "Latency percentiles for total requests, Kong processing, and upstream service time." + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by (le, route) (rate(kong_request_latency_ms_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\"}[$__rate_interval]))) or histogram_quantile(0.50, sum by (le, route) (rate(kong_latency_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\", type=\"request\"}[$__rate_interval])))", + "instant": 
false, + "legendFormat": "{{route}} p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le, route) (rate(kong_request_latency_ms_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\"}[$__rate_interval]))) or histogram_quantile(0.95, sum by (le, route) (rate(kong_latency_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\", type=\"request\"}[$__rate_interval])))", + "instant": false, + "legendFormat": "{{route}} p95", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le, route) (rate(kong_request_latency_ms_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\"}[$__rate_interval]))) or histogram_quantile(0.99, sum by (le, route) (rate(kong_latency_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\", type=\"request\"}[$__rate_interval])))", + "instant": false, + "legendFormat": "{{route}} p99", + "range": true, + "refId": "C" + } + ], + "title": "Request latency by route", + "type": "timeseries", + "description": "Request latency p50, p95, and p99 grouped by route." 
+ }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(kong_kong_latency_ms_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\"}[$__rate_interval]))) or histogram_quantile(0.95, sum by (le) (rate(kong_latency_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\", type=\"kong\"}[$__rate_interval])))", + "instant": false, + "legendFormat": "kong p95", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) 
(rate(kong_upstream_latency_ms_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\"}[$__rate_interval]))) or histogram_quantile(0.95, sum by (le) (rate(kong_latency_bucket{k8s_namespace_name=~\"$namespace\", route=~\"$route\", type=\"upstream\"}[$__rate_interval])))", + "instant": false, + "legendFormat": "upstream p95", + "range": true, + "refId": "B" + } + ], + "title": "Kong vs upstream latency", + "type": "timeseries", + "description": "Compares Kong processing latency with upstream response latency." + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 15, + "panels": [], + "title": "Runtime", + "type": "row", + "description": "Kong runtime health, metric export errors, and memory usage." + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + 
"mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "rate(kong_nginx_metric_errors_total{k8s_namespace_name=~\"$namespace\"}[$__rate_interval])", + "instant": false, + "legendFormat": "metric errors", + "range": true, + "refId": "A" + } + ], + "title": "Prometheus metric errors", + "type": "timeseries", + "description": "Rate of errors emitted by the Kong Nginx Prometheus metrics exporter." + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "sum by (node_id) 
(kong_memory_workers_lua_vms_bytes{k8s_namespace_name=~\"$namespace\"})", + "instant": false, + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Worker Lua memory", + "type": "timeseries", + "description": "Lua VM memory allocated by Kong workers, grouped by node." + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${data_source}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 42 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "editorMode": "code", + "expr": "100 * sum by (shared_dict) (kong_memory_lua_shared_dict_bytes{k8s_namespace_name=~\"$namespace\"}) / sum by (shared_dict) (kong_memory_lua_shared_dict_total_bytes{k8s_namespace_name=~\"$namespace\"})", + 
"instant": false, + "legendFormat": "{{shared_dict}}", + "range": true, + "refId": "A" + } + ], + "title": "Shared dictionary usage", + "type": "timeseries", + "description": "Percent of allocated capacity used by each Kong shared dictionary." + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 42, + "tags": [ + "kong", + "rpc", + "metrics" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "includeAll": false, + "name": "data_source", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "definition": "label_values(kong_node_info,k8s_namespace_name)", + "includeAll": true, + "label": "Kong namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kong_node_info,k8s_namespace_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "definition": "label_values(kong_http_requests_total{k8s_namespace_name=~\"$namespace\"}, route)", + "includeAll": true, + "label": "Route", + "multi": true, + "name": "route", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kong_http_requests_total{k8s_namespace_name=~\"$namespace\"}, route)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${data_source}" + }, + "definition": 
"label_values(kong_http_requests_total{k8s_namespace_name=~\"$namespace\", route=~\"$route\"}, consumer)", + "includeAll": true, + "label": "Consumer", + "multi": true, + "name": "consumer", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kong_http_requests_total{k8s_namespace_name=~\"$namespace\", route=~\"$route\"}, consumer)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Kong RPC Gateway", + "uid": "kong-rpc-gateway", + "version": 1, + "weekStart": "" +} diff --git a/spartan/scripts/deploy_network.sh b/spartan/scripts/deploy_network.sh index 232d4d2c61aa..109f5bebedc3 100755 --- a/spartan/scripts/deploy_network.sh +++ b/spartan/scripts/deploy_network.sh @@ -29,6 +29,26 @@ tf_str() { fi } +append_optional_tfvar() { + local file="$1" + local name="$2" + local type="${3:-raw}" + + if [[ -z "${!name+x}" ]]; then + return + fi + + if [[ -z "${!name}" ]]; then + return + fi + + if [[ "${type}" == "string" ]]; then + printf '%s = %s\n' "${name}" "$(tf_str "${!name}")" >> "${file}" + else + printf '%s = %s\n' "${name}" "${!name}" >> "${file}" + fi +} + # We want to separate out these logs. 
export DENOISE=1 ######################## @@ -166,13 +186,6 @@ BOT_TRANSFERS_PXE_SYNC_CHAIN_TIP=${BOT_TRANSFERS_PXE_SYNC_CHAIN_TIP:-checkpointe BOT_SWAPS_PXE_SYNC_CHAIN_TIP=${BOT_SWAPS_PXE_SYNC_CHAIN_TIP:-checkpointed} BOT_CROSS_CHAIN_PXE_SYNC_CHAIN_TIP=${BOT_CROSS_CHAIN_PXE_SYNC_CHAIN_TIP:-checkpointed} -RPC_INGRESS_ENABLED=${RPC_INGRESS_ENABLED:-false} -RPC_INGRESS_HOSTS=${RPC_INGRESS_HOSTS:-[]} -RPC_INGRESS_STATIC_IP_NAME=${RPC_INGRESS_STATIC_IP_NAME:-} -RPC_INGRESS_SSL_CERT_NAMES=${RPC_INGRESS_SSL_CERT_NAMES:-[]} -RPC_CLOUD_ARMOR_POLICY_NAME=${RPC_CLOUD_ARMOR_POLICY_NAME:-} -RPC_INGRESS_SESSION_AFFINITY=${RPC_INGRESS_SESSION_AFFINITY:-} -RPC_INGRESS_LOG_SAMPLE_RATE=${RPC_INGRESS_LOG_SAMPLE_RATE:-null} RPC_REPLICAS=${RPC_REPLICAS:-1} FULL_NODE_REPLICAS=${FULL_NODE_REPLICAS:-0} FISHERMAN_MNEMONIC_START_INDEX=${FISHERMAN_MNEMONIC_START_INDEX:-1} @@ -643,13 +656,6 @@ PROVER_AGENT_KEDA_SCALING_BANDS = ${PROVER_AGENT_KEDA_SCALING_BANDS:-[]} PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS = ${PROVER_AGENT_KEDA_POLLING_INTERVAL_SECONDS:-30} PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS = ${PROVER_AGENT_KEDA_COOLDOWN_PERIOD_SECONDS:-300} -RPC_INGRESS_ENABLED = ${RPC_INGRESS_ENABLED} -RPC_INGRESS_HOSTS = ${RPC_INGRESS_HOSTS} -RPC_INGRESS_STATIC_IP_NAME = "${RPC_INGRESS_STATIC_IP_NAME}" -RPC_INGRESS_SSL_CERT_NAMES = ${RPC_INGRESS_SSL_CERT_NAMES} -RPC_CLOUD_ARMOR_POLICY_NAME = "${RPC_CLOUD_ARMOR_POLICY_NAME}" -RPC_INGRESS_SESSION_AFFINITY = "${RPC_INGRESS_SESSION_AFFINITY}" -RPC_INGRESS_LOG_SAMPLE_RATE = ${RPC_INGRESS_LOG_SAMPLE_RATE} RPC_REPLICAS = ${RPC_REPLICAS:-1} FISHERMAN_REPLICAS = ${FISHERMAN_REPLICAS} FISHERMAN_MNEMONIC = "${LABS_INFRA_MNEMONIC}" @@ -699,6 +705,20 @@ WAIT_FOR_PROVER_DEPLOY = ${WAIT_FOR_PROVER_DEPLOY:-null} ADMIN_API_KEY_HASH = "${ADMIN_API_KEY_HASH}" EOF +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_ENABLED +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_HOSTS +append_optional_tfvar 
"${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_API_KEY_SECRET_NAMES +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_ALLOW_ANONYMOUS +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_ANONYMOUS_RATE_LIMIT_MINUTE +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_API_KEY_HEADER_NAME string +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_CONSUMERS +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_CREATE_DNS +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_DNS_ZONE_NAME string +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_DNS_TTL +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_FRONTEND_STATIC_IP_NAME string +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_GCP_MANAGED_CERTIFICATE_NAME string +append_optional_tfvar "${DEPLOY_AZTEC_INFRA_DIR}/terraform.tfvars" RPC_GATEWAY_KONG_OTEL_METRICS_GCP_SECRET_NAME string + k8s_denoise "tf_run "${DEPLOY_AZTEC_INFRA_DIR}" "${DESTROY_AZTEC_INFRA}" "${CREATE_AZTEC_INFRA}"" STAGE_TIMINGS[aztec_infra]=$(($(date +%s) - AZTEC_INFRA_START)) log "Deployed aztec infra" diff --git a/spartan/scripts/network_deploy.sh b/spartan/scripts/network_deploy.sh index 8fa7fcaf3e12..cdbd74b4abc4 100755 --- a/spartan/scripts/network_deploy.sh +++ b/spartan/scripts/network_deploy.sh @@ -29,54 +29,5 @@ gcp_auth source_network_env "$env_file" -# Optional: provision per-network IP + managed cert (+ DNS record in the delegated -# rpc.aztec-labs.com zone) via the network-frontend terraform module. The module's -# outputs are exported as env vars that deploy_network.sh already consumes. 
-CREATE_RPC_INGRESS=${CREATE_RPC_INGRESS:-false} -CREATE_RPC_DNS=${CREATE_RPC_DNS:-false} - -if [[ "$CREATE_RPC_DNS" == "true" && "$CREATE_RPC_INGRESS" != "true" ]]; then - echo "CREATE_RPC_DNS=true requires CREATE_RPC_INGRESS=true" >&2 - exit 1 -fi - -if [[ "$CREATE_RPC_INGRESS" == "true" ]]; then - if [[ -z "${NAMESPACE:-}" ]]; then - echo "CREATE_RPC_INGRESS=true requires NAMESPACE to be set" >&2 - exit 1 - fi - # RPC_INGRESS_HOSTS is a JSON array of one-or-more hostnames, e.g. '["mainnet.rpc.aztec-labs.com"]'. - if ! echo "${RPC_INGRESS_HOSTS:-}" | jq -e 'type == "array" and length > 0 and all(.[]; type == "string")' >/dev/null 2>&1; then - echo "CREATE_RPC_INGRESS=true requires RPC_INGRESS_HOSTS to be a non-empty JSON array of hostnames, e.g. '[\"mainnet.rpc.aztec-labs.com\"]'" >&2 - exit 1 - fi - - frontend_dir="$spartan/terraform/network-frontend" - echo "Applying network-frontend for $NAMESPACE ($RPC_INGRESS_HOSTS)..." - terraform -chdir="$frontend_dir" init -reconfigure \ - -backend-config="bucket=aztec-terraform" \ - -backend-config="prefix=terraform/state/network-frontend/$NAMESPACE" - - tf_vars=( - -var "NAMESPACE=$NAMESPACE" - -var "RPC_HOSTNAMES=$RPC_INGRESS_HOSTS" - ) - if [[ "$CREATE_RPC_DNS" == "true" ]]; then - tf_vars+=( - -var "CREATE_DNS=true" - -var "DNS_ZONE_NAME=rpc-aztec-labs-com" - ) - fi - - terraform -chdir="$frontend_dir" apply -auto-approve "${tf_vars[@]}" - - export RPC_INGRESS_ENABLED=true - export RPC_INGRESS_STATIC_IP_NAME=$(terraform -chdir="$frontend_dir" output -raw ip_name) - export RPC_INGRESS_SSL_CERT_NAMES="[\"$(terraform -chdir="$frontend_dir" output -raw cert_name)\"]" - export RPC_INGRESS_HOSTS=$(terraform -chdir="$frontend_dir" output -json hostnames) - - echo "network-frontend: ip=$RPC_INGRESS_STATIC_IP_NAME cert=$RPC_INGRESS_SSL_CERT_NAMES hosts=$RPC_INGRESS_HOSTS" -fi - $scripts_dir/deploy_network.sh echo "Deployed network" diff --git a/spartan/terraform/deploy-aztec-infra/main.tf 
b/spartan/terraform/deploy-aztec-infra/main.tf index 28d3a694e74e..0cf16008c7c0 100644 --- a/spartan/terraform/deploy-aztec-infra/main.tf +++ b/spartan/terraform/deploy-aztec-infra/main.tf @@ -18,6 +18,10 @@ terraform { source = "hashicorp/kubernetes" version = "~> 2.38.0" } + google = { + source = "hashicorp/google" + version = "~> 5.0" + } } } @@ -35,6 +39,11 @@ provider "helm" { } } +provider "google" { + project = var.GCP_PROJECT_ID + region = var.GCP_REGION +} + module "web3signer" { # Only deploy web3signer if we have validators or provers that need to publish to L1 count = tonumber(var.VALIDATOR_REPLICAS) > 0 ? 1 : 0 @@ -104,6 +113,27 @@ locals { # Detect local kind context (e.g., "kind-kind") to gate Service types is_kind = can(regex("^kind", var.K8S_CLUSTER_CONTEXT)) + rpc_gateway_simple_consumers = { + for secret_name in var.RPC_GATEWAY_API_KEY_SECRET_NAMES : secret_name => { + username = secret_name + gcp_secret_manager_secret_name = secret_name + rate_limit_minute = 0 + } + } + + rpc_gateway_consumers = merge(local.rpc_gateway_simple_consumers, var.RPC_GATEWAY_CONSUMERS) + + rpc_gateway_routes = { + canonical = { + hosts = var.RPC_GATEWAY_HOSTS + route_namespace = var.NAMESPACE + upstream_service_name = "${var.RELEASE_PREFIX}-rpc-aztec-node" + upstream_service_port = 8080 + auth_mode = var.RPC_GATEWAY_ALLOW_ANONYMOUS ? "keyed_with_anonymous" : "keyed_only" + anonymous_rate_limit_minute = var.RPC_GATEWAY_ANONYMOUS_RATE_LIMIT_MINUTE + } + } + internal_boot_node_url = var.DEPLOY_INTERNAL_BOOTNODE ? "http://${var.RELEASE_PREFIX}-p2p-bootstrap-node.${var.NAMESPACE}.svc.cluster.local:8080" : "" internal_rpc_url = "http://${var.RELEASE_PREFIX}-rpc-aztec-node.${var.NAMESPACE}.svc.cluster.local:8080" @@ -437,30 +467,7 @@ locals { "rpc.yaml", "rpc-resources-${var.RPC_RESOURCE_PROFILE}.yaml" ] - inline_values = concat(var.RPC_INGRESS_ENABLED ? 
[yamlencode({ - service = { - p2p = { publicIP = var.P2P_PUBLIC_IP } - rpc = { - annotations = { - "cloud.google.com/neg" = jsonencode({ ingress = true }) - "cloud.google.com/backend-config" = jsonencode({ - default = "${var.RELEASE_PREFIX}-rpc-ingress-backend" - }) - } - } - } - ingress = { - rpc = { - hosts = var.RPC_INGRESS_HOSTS - annotations = { - "kubernetes.io/ingress.class" = "gce" - "kubernetes.io/ingress.global-static-ip-name" = var.RPC_INGRESS_STATIC_IP_NAME - "ingress.gcp.kubernetes.io/pre-shared-cert" = join(",", var.RPC_INGRESS_SSL_CERT_NAMES) - "kubernetes.io/ingress.allow-http" = "false" - } - } - } - })] : [yamlencode({ + inline_values = [yamlencode({ service = { p2p = { publicIP = var.P2P_PUBLIC_IP } rpc = { @@ -468,7 +475,7 @@ locals { type = local.is_kind ? "ClusterIP" : "LoadBalancer" } } - })]) + })] custom_settings = merge({ "replicaCount" = var.RPC_REPLICAS @@ -479,7 +486,6 @@ locals { # Ensure the JSON-RPC server binds the same port the probe checks "node.proverRealProofs" = var.PROVER_REAL_PROOFS - "ingress.rpc.enabled" = var.RPC_INGRESS_ENABLED "node.env.AWS_ACCESS_KEY_ID" = var.R2_ACCESS_KEY_ID "node.env.AWS_SECRET_ACCESS_KEY" = var.R2_SECRET_ACCESS_KEY "node.env.P2P_TX_POOL_DELETE_TXS_AFTER_REORG" = var.P2P_TX_POOL_DELETE_TXS_AFTER_REORG @@ -724,45 +730,53 @@ resource "helm_release" "releases" { } } -resource "kubernetes_manifest" "rpc_ingress_backend" { - count = var.RPC_INGRESS_ENABLED ? 1 : 0 - provider = kubernetes.gke-cluster +module "rpc_gateway" { + count = var.RPC_GATEWAY_ENABLED ? 1 : 0 - manifest = { - apiVersion = "cloud.google.com/v1" - kind = "BackendConfig" - metadata = { - name = "${var.RELEASE_PREFIX}-rpc-ingress-backend" - namespace = var.NAMESPACE - } - spec = merge( - { - healthCheck = { - checkIntervalSec = 15 - timeoutSec = 5 - healthyThreshold = 2 - unhealthyThreshold = 2 - type = "HTTP" - port = 8080 - requestPath = "/status" - } - }, - var.RPC_CLOUD_ARMOR_POLICY_NAME != "" ? 
{ - securityPolicy = { - name = var.RPC_CLOUD_ARMOR_POLICY_NAME - } - } : {}, - var.RPC_INGRESS_SESSION_AFFINITY != "" ? { - sessionAffinity = { - affinityType = var.RPC_INGRESS_SESSION_AFFINITY - } - } : {}, - var.RPC_INGRESS_LOG_SAMPLE_RATE != null ? { - logging = { - enable = true - sampleRate = var.RPC_INGRESS_LOG_SAMPLE_RATE - } - } : {} - ) + source = "../modules/rpc-gateway" + + providers = { + helm = helm.gke-cluster + kubernetes = kubernetes.gke-cluster + google = google } + + RELEASE_PREFIX = var.RELEASE_PREFIX + CONSUMER_NAMESPACE = var.NAMESPACE + + KONG_NAMESPACE = var.RPC_GATEWAY_KONG_NAMESPACE != "" ? var.RPC_GATEWAY_KONG_NAMESPACE : var.NAMESPACE + KONG_HELM_RELEASE_NAME = var.RPC_GATEWAY_KONG_HELM_RELEASE_NAME + KONG_HELM_CHART_VERSION = var.RPC_GATEWAY_KONG_HELM_CHART_VERSION + KONG_INGRESS_CLASS = var.RPC_GATEWAY_KONG_INGRESS_CLASS + KONG_PROXY_SERVICE_TYPE = var.RPC_GATEWAY_KONG_PROXY_SERVICE_TYPE + KONG_PROXY_SERVICE_ANNOTATIONS = var.RPC_GATEWAY_KONG_PROXY_SERVICE_ANNOTATIONS + KONG_EXTRA_HELM_VALUES = var.RPC_GATEWAY_KONG_EXTRA_HELM_VALUES + KONG_SERVICE_MONITOR_ENABLED = var.RPC_GATEWAY_KONG_SERVICE_MONITOR_ENABLED + KONG_METRICS_SERVICE_ENABLED = var.RPC_GATEWAY_KONG_METRICS_SERVICE_ENABLED + KONG_METRICS_SERVICE_NAME = var.RPC_GATEWAY_KONG_METRICS_SERVICE_NAME + KONG_METRICS_SERVICE_TYPE = var.RPC_GATEWAY_KONG_METRICS_SERVICE_TYPE + KONG_METRICS_SERVICE_ANNOTATIONS = var.RPC_GATEWAY_KONG_METRICS_SERVICE_ANNOTATIONS + KONG_METRICS_SERVICE_LOAD_BALANCER_IP = var.RPC_GATEWAY_KONG_METRICS_SERVICE_LOAD_BALANCER_IP + KONG_METRICS_SERVICE_LOAD_BALANCER_SOURCE_RANGES = var.RPC_GATEWAY_KONG_METRICS_SERVICE_LOAD_BALANCER_SOURCE_RANGES + KONG_METRICS_SERVICE_EXTERNAL_TRAFFIC_POLICY = var.RPC_GATEWAY_KONG_METRICS_SERVICE_EXTERNAL_TRAFFIC_POLICY + KONG_OTEL_METRICS_GCP_SECRET_NAME = var.RPC_GATEWAY_KONG_OTEL_METRICS_GCP_SECRET_NAME + + API_KEY_HEADER_NAME = var.RPC_GATEWAY_API_KEY_HEADER_NAME + ROUTES = local.rpc_gateway_routes + CONSUMERS = 
local.rpc_gateway_consumers + EXTERNAL_SECRET_STORE_NAME = var.RPC_GATEWAY_EXTERNAL_SECRET_STORE_NAME + EXTERNAL_SECRET_STORE_KIND = var.RPC_GATEWAY_EXTERNAL_SECRET_STORE_KIND + EXTERNAL_SECRET_REFRESH_INTERVAL = var.RPC_GATEWAY_EXTERNAL_SECRET_REFRESH_INTERVAL + + CREATE_DNS = var.RPC_GATEWAY_CREATE_DNS + DNS_ZONE_NAME = var.RPC_GATEWAY_DNS_ZONE_NAME + DNS_TTL = var.RPC_GATEWAY_DNS_TTL + FRONTEND_ENABLED = var.RPC_GATEWAY_FRONTEND_ENABLED + FRONTEND_STATIC_IP_ENABLED = var.RPC_GATEWAY_FRONTEND_STATIC_IP_ENABLED + FRONTEND_STATIC_IP_NAME = var.RPC_GATEWAY_FRONTEND_STATIC_IP_NAME + FRONTEND_ALLOW_HTTP = var.RPC_GATEWAY_FRONTEND_ALLOW_HTTP + GCP_MANAGED_CERTIFICATE_ENABLED = var.RPC_GATEWAY_GCP_MANAGED_CERTIFICATE_ENABLED + GCP_MANAGED_CERTIFICATE_NAME = var.RPC_GATEWAY_GCP_MANAGED_CERTIFICATE_NAME + + depends_on = [helm_release.releases] } diff --git a/spartan/terraform/deploy-aztec-infra/values/common.yaml b/spartan/terraform/deploy-aztec-infra/values/common.yaml index d74dd555fce1..b26ecb6a1599 100644 --- a/spartan/terraform/deploy-aztec-infra/values/common.yaml +++ b/spartan/terraform/deploy-aztec-infra/values/common.yaml @@ -1,3 +1,3 @@ global: aztecEnv: - ROLLUP_VERSION: "canonical" + ROLLUP_VERSION: "" diff --git a/spartan/terraform/deploy-aztec-infra/variables.tf b/spartan/terraform/deploy-aztec-infra/variables.tf index b2f72274507a..8805cbb7338e 100644 --- a/spartan/terraform/deploy-aztec-infra/variables.tf +++ b/spartan/terraform/deploy-aztec-infra/variables.tf @@ -697,50 +697,222 @@ variable "BOT_L2_GAS_LIMIT" { default = "" } -# RPC ingress configuration (GKE-specific) -variable "RPC_INGRESS_ENABLED" { - description = "Enable GKE ingress for RPC nodes" +# RPC gateway configuration (Kong-backed, optional) +variable "RPC_GATEWAY_ENABLED" { + description = "Enable the Kong RPC gateway for the utility RPC service. When false, no Kong/frontend/DNS resources are created." 
type = bool default = false } -variable "RPC_INGRESS_HOSTS" { - description = "Hostnames for RPC ingress" +variable "RPC_GATEWAY_HOSTS" { + description = "Hostnames served by the RPC gateway. Required when RPC_GATEWAY_ENABLED=true." type = list(string) default = [] } -variable "RPC_INGRESS_STATIC_IP_NAME" { - description = "Name of the GCP static IP resource for the ingress" +variable "RPC_GATEWAY_API_KEY_SECRET_NAMES" { + description = "GCP Secret Manager secret names containing API keys allowed by the RPC gateway. Raw key values must not go here." + type = list(string) + default = [] +} + +variable "RPC_GATEWAY_ALLOW_ANONYMOUS" { + description = "Whether the RPC gateway allows requests without a valid API key. Missing and invalid keys both use the anonymous consumer." + type = bool + default = false +} + +variable "RPC_GATEWAY_ANONYMOUS_RATE_LIMIT_MINUTE" { + description = "Per-client-IP anonymous request limit per minute when RPC_GATEWAY_ALLOW_ANONYMOUS=true. Kong local policy makes this per Kong pod." + type = number + default = 300 +} + +variable "RPC_GATEWAY_API_KEY_HEADER_NAME" { + description = "Header checked by Kong key-auth." + type = string + default = "x-aztec-api-key" +} + +variable "RPC_GATEWAY_CONSUMERS" { + description = "Kong consumers keyed by team name. Each value must use exactly one credential source." + type = map(object({ + username = string + gcp_secret_manager_secret_name = string + rate_limit_minute = number + })) + default = {} +} + +variable "RPC_GATEWAY_KONG_NAMESPACE" { + description = "Optional namespace for the Kong Helm release. Defaults to NAMESPACE." + type = string + default = "" +} + +variable "RPC_GATEWAY_KONG_HELM_RELEASE_NAME" { + description = "Optional Helm release name for Kong. Defaults to RELEASE_PREFIX-rpc-kong." + type = string + default = "" +} + +variable "RPC_GATEWAY_KONG_HELM_CHART_VERSION" { + description = "Kong ingress Helm chart version." 
+ type = string + default = "0.24.0" +} + +variable "RPC_GATEWAY_KONG_INGRESS_CLASS" { + description = "Optional ingress class watched by Kong. Defaults to RELEASE_PREFIX-rpc-kong." type = string default = "" } -variable "RPC_INGRESS_SSL_CERT_NAMES" { - description = "Names of the GCP managed SSL certificates for the ingress" +variable "RPC_GATEWAY_KONG_PROXY_SERVICE_TYPE" { + description = "Kong proxy Service type. With frontend enabled this should normally stay ClusterIP plus NEG annotation." + type = string + default = "ClusterIP" +} + +variable "RPC_GATEWAY_KONG_PROXY_SERVICE_ANNOTATIONS" { + description = "Annotations applied to the Kong proxy Service." + type = map(string) + default = {} +} + +variable "RPC_GATEWAY_KONG_EXTRA_HELM_VALUES" { + description = "Additional YAML values passed to the Kong Helm chart." type = list(string) default = [] } -variable "RPC_CLOUD_ARMOR_POLICY_NAME" { - description = "Name of a Cloud Armor security policy to attach to the RPC ingress BackendConfig. Leave empty to disable." +variable "RPC_GATEWAY_KONG_SERVICE_MONITOR_ENABLED" { + description = "Whether Kong should create a ServiceMonitor." + type = bool + default = false +} + +variable "RPC_GATEWAY_KONG_METRICS_SERVICE_ENABLED" { + description = "Whether to expose Kong's /metrics endpoint through a dedicated Service." + type = bool + default = false +} + +variable "RPC_GATEWAY_KONG_METRICS_SERVICE_NAME" { + description = "Optional Kong metrics Service name. Defaults to RELEASE_PREFIX-kong-metrics." type = string default = "" } -variable "RPC_INGRESS_SESSION_AFFINITY" { - description = "Session affinity type for the RPC BackendConfig. One of NONE, CLIENT_IP, GENERATED_COOKIE. Leave empty for no affinity (GCE default)." +variable "RPC_GATEWAY_KONG_METRICS_SERVICE_TYPE" { + description = "Kong metrics Service type." 
+ type = string + default = "ClusterIP" +} + +variable "RPC_GATEWAY_KONG_METRICS_SERVICE_ANNOTATIONS" { + description = "Annotations applied to the Kong metrics Service." + type = map(string) + default = {} +} + +variable "RPC_GATEWAY_KONG_METRICS_SERVICE_LOAD_BALANCER_IP" { + description = "Optional static IP assigned to the Kong metrics LoadBalancer Service." type = string default = "" } -variable "RPC_INGRESS_LOG_SAMPLE_RATE" { - description = "LB access-log sample rate for the RPC BackendConfig (0.0-1.0). When set, logs include the Cloud Armor matched rule priority. Leave null to disable logging (GCE default)." - type = number - nullable = true +variable "RPC_GATEWAY_KONG_METRICS_SERVICE_LOAD_BALANCER_SOURCE_RANGES" { + description = "Optional source CIDRs allowed to reach the Kong metrics Service." + type = list(string) + default = [] +} + +variable "RPC_GATEWAY_KONG_METRICS_SERVICE_EXTERNAL_TRAFFIC_POLICY" { + description = "External traffic policy for the Kong metrics Service." + type = string default = null } +variable "RPC_GATEWAY_KONG_OTEL_METRICS_GCP_SECRET_NAME" { + description = "GCP Secret Manager secret name containing the central OTLP/HTTP collector endpoint. When empty, no local Kong metrics collector is deployed." + type = string + default = "" +} + + +variable "RPC_GATEWAY_EXTERNAL_SECRET_STORE_NAME" { + description = "ExternalSecrets SecretStore or ClusterSecretStore name for RPC gateway consumer keys." + type = string + default = "gcp-secret-store" +} + +variable "RPC_GATEWAY_EXTERNAL_SECRET_STORE_KIND" { + description = "ExternalSecrets store kind for RPC gateway consumer keys." + type = string + default = "ClusterSecretStore" +} + +variable "RPC_GATEWAY_EXTERNAL_SECRET_REFRESH_INTERVAL" { + description = "ExternalSecret refresh interval for RPC gateway consumer keys." + type = string + default = "1m" +} + +variable "RPC_GATEWAY_CREATE_DNS" { + description = "Whether to create A records for RPC_GATEWAY_HOSTS." 
+ type = bool + default = true +} + +variable "RPC_GATEWAY_DNS_ZONE_NAME" { + description = "Cloud DNS managed zone name for RPC gateway hosts." + type = string + default = "rpc-aztec-labs-com" +} + +variable "RPC_GATEWAY_DNS_TTL" { + description = "TTL for RPC gateway DNS A records." + type = number + default = 300 +} + +variable "RPC_GATEWAY_FRONTEND_ENABLED" { + description = "Whether to create a GKE frontend Ingress in front of Kong." + type = bool + default = true +} + +variable "RPC_GATEWAY_FRONTEND_STATIC_IP_ENABLED" { + description = "Whether to allocate a global static IP for the RPC gateway frontend." + type = bool + default = true +} + +variable "RPC_GATEWAY_FRONTEND_STATIC_IP_NAME" { + description = "Optional global static IP name for the RPC gateway frontend. Defaults to RELEASE_PREFIX-rpc-frontend." + type = string + default = "" +} + +variable "RPC_GATEWAY_FRONTEND_ALLOW_HTTP" { + description = "Whether the RPC gateway frontend should allow HTTP in addition to HTTPS." + type = bool + default = false +} + +variable "RPC_GATEWAY_GCP_MANAGED_CERTIFICATE_ENABLED" { + description = "Whether to create a GKE ManagedCertificate for RPC_GATEWAY_HOSTS." + type = bool + default = true +} + +variable "RPC_GATEWAY_GCP_MANAGED_CERTIFICATE_NAME" { + description = "Optional GKE ManagedCertificate name for RPC gateway hosts. Defaults to RELEASE_PREFIX-rpc-cert." + type = string + default = "" +} + variable "PROVER_FAILED_PROOF_STORE" { description = "Optional GCS/URI to store failed proofs from the prover" type = string diff --git a/spartan/terraform/deploy-kong-crds/README.md b/spartan/terraform/deploy-kong-crds/README.md new file mode 100644 index 000000000000..930ffbf63022 --- /dev/null +++ b/spartan/terraform/deploy-kong-crds/README.md @@ -0,0 +1,12 @@ +# Kong CRDs + +One-time Kong CRD bootstrap for clusters that run Kong Ingress Controller. 
+ +This installs the `kong/kong` chart as a CRD-only release: Kong Gateway and the ingress controller are disabled, Helm's special `crds/` path is skipped, and only the chart's managed Kong CRD templates are rendered. The release adopts existing unowned Kong CRDs, and the RPC gateway module never renders CRDs, so cluster-scoped CRD ownership stays here. + +```bash +terraform -chdir=terraform/deploy-kong-crds init -backend-config=public.tfbackend +terraform -chdir=terraform/deploy-kong-crds apply -var-file=public.tfvars.example +``` + +Use `private.tfbackend` and `private.tfvars.example` for the private cluster. diff --git a/spartan/terraform/deploy-kong-crds/main.tf b/spartan/terraform/deploy-kong-crds/main.tf new file mode 100644 index 000000000000..c0e4461a314a --- /dev/null +++ b/spartan/terraform/deploy-kong-crds/main.tf @@ -0,0 +1,63 @@ +terraform { + backend "gcs" {} + + required_providers { + helm = { + source = "hashicorp/helm" + version = "~> 3.1.2" + } + } +} + +provider "helm" { + kubernetes = { + config_path = "~/.kube/config" + config_context = var.GKE_CLUSTER_CONTEXT + } +} + +resource "helm_release" "kong_crds" { + name = var.KONG_CRD_HELM_RELEASE_NAME + repository = "https://charts.konghq.com" + chart = "kong" + version = var.KONG_CRD_HELM_CHART_VERSION + namespace = var.KONG_CRD_NAMESPACE + create_namespace = true + upgrade_install = true + skip_crds = true + take_ownership = true + wait = true + timeout = 300 + + values = [ + yamlencode({ + deployment = { + kong = { + enabled = false + } + } + ingressController = { + enabled = false + installCRDs = true + } + admin = { + enabled = false + } + proxy = { + enabled = false + } + udpProxy = { + enabled = false + } + cluster = { + enabled = false + } + postgresql = { + enabled = false + } + test = { + enabled = false + } + }) + ] +} diff --git a/spartan/terraform/deploy-kong-crds/private.tfbackend b/spartan/terraform/deploy-kong-crds/private.tfbackend new file mode 100644 index 
000000000000..9466505c837c --- /dev/null +++ b/spartan/terraform/deploy-kong-crds/private.tfbackend @@ -0,0 +1,2 @@ +bucket = "aztec-terraform" +prefix = "terraform/state/deploy-kong-crds/private" diff --git a/spartan/terraform/deploy-kong-crds/public.tfbackend b/spartan/terraform/deploy-kong-crds/public.tfbackend new file mode 100644 index 000000000000..5cde1ed1f50a --- /dev/null +++ b/spartan/terraform/deploy-kong-crds/public.tfbackend @@ -0,0 +1,2 @@ +bucket = "aztec-terraform" +prefix = "terraform/state/deploy-kong-crds/public" diff --git a/spartan/terraform/deploy-kong-crds/variables.tf b/spartan/terraform/deploy-kong-crds/variables.tf new file mode 100644 index 000000000000..1bbbc549d98d --- /dev/null +++ b/spartan/terraform/deploy-kong-crds/variables.tf @@ -0,0 +1,23 @@ +variable "GKE_CLUSTER_CONTEXT" { + description = "Kubernetes context for the GKE cluster that should receive Kong CRDs." + type = string + default = "gke_testnet-440309_us-west1-a_aztec-gke-public" +} + +variable "KONG_CRD_NAMESPACE" { + description = "Namespace used only to own the Kong CRD Helm release metadata." + type = string + default = "kong-crds" +} + +variable "KONG_CRD_HELM_RELEASE_NAME" { + description = "Helm release name for the Kong CRD-only release." + type = string + default = "kong-crds" +} + +variable "KONG_CRD_HELM_CHART_VERSION" { + description = "kong/kong chart version used to install CRDs. 3.2.0 matches kong/ingress 0.24.0." + type = string + default = "3.2.0" +} diff --git a/spartan/terraform/modules/rpc-gateway/main.tf b/spartan/terraform/modules/rpc-gateway/main.tf new file mode 100644 index 000000000000..471286ce2792 --- /dev/null +++ b/spartan/terraform/modules/rpc-gateway/main.tf @@ -0,0 +1,812 @@ +terraform { + required_providers { + helm = { + source = "hashicorp/helm" + } + kubernetes = { + source = "hashicorp/kubernetes" + } + google = { + source = "hashicorp/google" + } + } +} + +locals { + kong_namespace = var.KONG_NAMESPACE != "" ? 
var.KONG_NAMESPACE : "${var.RELEASE_PREFIX}-rpc-kong" + kong_helm_release_name = var.KONG_HELM_RELEASE_NAME != "" ? var.KONG_HELM_RELEASE_NAME : "${var.RELEASE_PREFIX}-rpc-kong" + kong_ingress_class = var.KONG_INGRESS_CLASS != "" ? var.KONG_INGRESS_CLASS : "${var.RELEASE_PREFIX}-rpc-kong" + + sticky_session_policy_name = var.STICKY_SESSION_POLICY_NAME != "" ? var.STICKY_SESSION_POLICY_NAME : "${var.RELEASE_PREFIX}-rpc-sticky-sessions" + frontend_static_ip_name = var.FRONTEND_STATIC_IP_NAME != "" ? var.FRONTEND_STATIC_IP_NAME : "${var.RELEASE_PREFIX}-rpc-frontend" + frontend_service_name = var.FRONTEND_SERVICE_NAME != "" ? var.FRONTEND_SERVICE_NAME : "${local.kong_helm_release_name}-gateway-proxy" + managed_certificate_name = var.GCP_MANAGED_CERTIFICATE_NAME != "" ? var.GCP_MANAGED_CERTIFICATE_NAME : "${var.RELEASE_PREFIX}-rpc-cert" + frontend_backend_config_name = "${var.RELEASE_PREFIX}-rpc-kong-backend" + frontend_hosts = toset(flatten([for _, route in var.ROUTES : route.hosts])) + frontend_load_balancer_ip = var.FRONTEND_ENABLED && var.FRONTEND_STATIC_IP_ENABLED ? try(google_compute_global_address.frontend[0].address, "") : "" + + kong_proxy_service_annotations = merge( + var.FRONTEND_ENABLED ? { + "cloud.google.com/neg" = jsonencode({ ingress = true }) + "cloud.google.com/backend-config" = jsonencode({ ports = { tostring(var.FRONTEND_SERVICE_PORT) = local.frontend_backend_config_name } }) + } : {}, + var.KONG_PROXY_SERVICE_ANNOTATIONS + ) + + routes_with_anonymous = { + for name, route in var.ROUTES : name => route + if route.auth_mode == "keyed_with_anonymous" + } + + route_plugin_names = { + for name, route in var.ROUTES : + name => join(",", compact([ + "${var.RELEASE_PREFIX}-${name}-rpc-key-auth", + "${var.RELEASE_PREFIX}-${name}-rpc-prometheus" + ])) + } + + metrics_service_enabled = var.KONG_METRICS_SERVICE_ENABLED || var.KONG_OTEL_METRICS_GCP_SECRET_NAME != "" + metrics_service_name = var.KONG_METRICS_SERVICE_NAME != "" ? 
var.KONG_METRICS_SERVICE_NAME : "${var.RELEASE_PREFIX}-kong-metrics" + metrics_service_selector = length(var.KONG_METRICS_SERVICE_SELECTOR) > 0 ? var.KONG_METRICS_SERVICE_SELECTOR : { + "app.kubernetes.io/name" = "gateway" + "app.kubernetes.io/component" = "app" + "app.kubernetes.io/instance" = local.kong_helm_release_name + } + + otel_collector_name = "${var.RELEASE_PREFIX}-rpc-kong-otel-collector" + otel_collector_config_name = "${local.otel_collector_name}-config" + otel_collector_secret_name = local.otel_collector_name + + consumer_credential_secret_names = { + for name, _ in var.CONSUMERS : + name => "${var.RELEASE_PREFIX}-${name}-rpc-key-auth" + } + + consumers_with_rate_limit = { + for name, consumer in var.CONSUMERS : + name => consumer + if consumer.rate_limit_minute > 0 + } +} + +resource "helm_release" "kong" { + count = var.INSTALL_KONG ? 1 : 0 + + name = local.kong_helm_release_name + repository = "https://charts.konghq.com" + chart = "ingress" + version = var.KONG_HELM_CHART_VERSION + namespace = local.kong_namespace + create_namespace = true + upgrade_install = true + skip_crds = true + wait = true + timeout = 600 + + values = concat([ + yamlencode({ + gateway = { + env = { + database = "off" + } + proxy = { + type = var.KONG_PROXY_SERVICE_TYPE + annotations = local.kong_proxy_service_annotations + loadBalancerIP = var.KONG_PROXY_SERVICE_LOAD_BALANCER_IP + loadBalancerSourceRanges = var.KONG_PROXY_SERVICE_LOAD_BALANCER_SOURCE_RANGES + } + serviceMonitor = { + enabled = var.KONG_SERVICE_MONITOR_ENABLED + } + } + controller = { + ingressController = { + ingressClass = local.kong_ingress_class + installCRDs = false + } + } + }) + ], var.KONG_EXTRA_HELM_VALUES) +} + +resource "google_compute_global_address" "frontend" { + count = var.FRONTEND_ENABLED && var.FRONTEND_STATIC_IP_ENABLED ? 
1 : 0 + + name = local.frontend_static_ip_name + description = "Global static IP for ${var.RELEASE_PREFIX} RPC frontend Ingress" +} + +resource "kubernetes_manifest" "managed_certificate" { + count = var.FRONTEND_ENABLED && var.GCP_MANAGED_CERTIFICATE_ENABLED ? 1 : 0 + + manifest = { + apiVersion = "networking.gke.io/v1" + kind = "ManagedCertificate" + metadata = { + name = local.managed_certificate_name + namespace = local.kong_namespace + } + spec = { + domains = sort(tolist(local.frontend_hosts)) + } + } + + depends_on = [helm_release.kong] +} + +resource "kubernetes_manifest" "frontend_backend_config" { + count = var.FRONTEND_ENABLED ? 1 : 0 + + manifest = { + apiVersion = "cloud.google.com/v1" + kind = "BackendConfig" + metadata = { + name = local.frontend_backend_config_name + namespace = local.kong_namespace + } + spec = { + healthCheck = { + type = "HTTP" + requestPath = "/status" + port = 8100 + } + } + } + + depends_on = [helm_release.kong] +} + +resource "kubernetes_manifest" "frontend_ingress" { + count = var.FRONTEND_ENABLED ? 1 : 0 + + manifest = { + apiVersion = "networking.k8s.io/v1" + kind = "Ingress" + metadata = { + name = "${var.RELEASE_PREFIX}-rpc-frontend" + namespace = local.kong_namespace + annotations = merge( + { + "kubernetes.io/ingress.class" = var.FRONTEND_INGRESS_CLASS + "kubernetes.io/ingress.global-static-ip-name" = local.frontend_static_ip_name + "kubernetes.io/ingress.allow-http" = tostring(var.FRONTEND_ALLOW_HTTP) + }, + var.GCP_MANAGED_CERTIFICATE_ENABLED ? 
{ + "networking.gke.io/managed-certificates" = local.managed_certificate_name + } : {} + ) + } + spec = { + ingressClassName = var.FRONTEND_INGRESS_CLASS + rules = [ + for host in sort(tolist(local.frontend_hosts)) : { + host = host + http = { + paths = [ + { + path = "/" + pathType = "Prefix" + backend = { + service = { + name = local.frontend_service_name + port = { + number = var.FRONTEND_SERVICE_PORT + } + } + } + } + ] + } + } + ] + } + } + + lifecycle { + precondition { + condition = !var.FRONTEND_STATIC_IP_ENABLED || local.frontend_load_balancer_ip != "" + error_message = "Frontend DNS requires FRONTEND_STATIC_IP_ENABLED=true or an explicit represented frontend IP path." + } + } + + depends_on = [ + helm_release.kong, + kubernetes_manifest.managed_certificate, + kubernetes_manifest.frontend_backend_config, + ] +} + +resource "google_dns_record_set" "rpc" { + for_each = var.CREATE_DNS ? local.frontend_hosts : toset([]) + managed_zone = var.DNS_ZONE_NAME + name = "${each.value}." + type = "A" + ttl = var.DNS_TTL + rrdatas = [local.frontend_load_balancer_ip] + + lifecycle { + precondition { + condition = local.frontend_load_balancer_ip != "" + error_message = "DNS records require FRONTEND_ENABLED=true and FRONTEND_STATIC_IP_ENABLED=true." + } + } + + depends_on = [kubernetes_manifest.frontend_ingress] +} + +resource "kubernetes_manifest" "key_auth_plugin" { + for_each = var.ROUTES + + manifest = { + apiVersion = "configuration.konghq.com/v1" + kind = "KongPlugin" + metadata = { + name = "${var.RELEASE_PREFIX}-${each.key}-rpc-key-auth" + namespace = each.value.route_namespace + annotations = { + "kubernetes.io/ingress.class" = local.kong_ingress_class + } + } + plugin = "key-auth" + config = merge( + { + key_names = [var.API_KEY_HEADER_NAME] + hide_credentials = true + key_in_body = false + key_in_header = true + key_in_query = false + }, + each.value.auth_mode == "keyed_with_anonymous" ? 
{ + anonymous = "${var.RELEASE_PREFIX}-${each.key}-anonymous" + } : {} + ) + } + + depends_on = [helm_release.kong, kubernetes_manifest.anonymous_consumer] +} + +resource "kubernetes_manifest" "prometheus_plugin" { + for_each = var.ROUTES + + manifest = { + apiVersion = "configuration.konghq.com/v1" + kind = "KongPlugin" + metadata = { + name = "${var.RELEASE_PREFIX}-${each.key}-rpc-prometheus" + namespace = each.value.route_namespace + annotations = { + "kubernetes.io/ingress.class" = local.kong_ingress_class + } + } + plugin = "prometheus" + config = { + per_consumer = true + status_code_metrics = true + latency_metrics = true + bandwidth_metrics = true + upstream_health_metrics = true + } + } + + depends_on = [helm_release.kong] +} + +resource "kubernetes_manifest" "otel_collector_external_secret" { + count = var.KONG_OTEL_METRICS_GCP_SECRET_NAME != "" ? 1 : 0 + + manifest = { + apiVersion = "external-secrets.io/v1" + kind = "ExternalSecret" + metadata = { + name = local.otel_collector_secret_name + namespace = local.kong_namespace + } + spec = { + refreshInterval = var.EXTERNAL_SECRET_REFRESH_INTERVAL + secretStoreRef = { + name = var.EXTERNAL_SECRET_STORE_NAME + kind = var.EXTERNAL_SECRET_STORE_KIND + } + target = { + name = local.otel_collector_secret_name + creationPolicy = "Owner" + template = { + engineVersion = "v2" + type = "Opaque" + data = { + endpoint = "{{ .url | trimSuffix \"/v1/metrics\" | trimSuffix \"/\" }}" + } + } + } + data = [ + { + secretKey = "url" + remoteRef = { + key = var.KONG_OTEL_METRICS_GCP_SECRET_NAME + } + } + ] + } + } + + wait { + condition { + type = "Ready" + status = "True" + } + } + + depends_on = [helm_release.kong] +} + +resource "kubernetes_manifest" "otel_collector_config" { + count = var.KONG_OTEL_METRICS_GCP_SECRET_NAME != "" ? 
1 : 0 + + manifest = { + apiVersion = "v1" + kind = "ConfigMap" + metadata = { + name = local.otel_collector_config_name + namespace = local.kong_namespace + } + data = { + "collector.yaml" = yamlencode({ + extensions = { + health_check = { + endpoint = "0.0.0.0:13133" + } + } + receivers = { + prometheus = { + config = { + scrape_configs = [ + { + job_name = "kong" + scrape_interval = "${var.KONG_OTEL_METRICS_PUSH_INTERVAL_SECONDS}s" + metrics_path = "/metrics" + static_configs = [ + { + targets = ["${local.metrics_service_name}.${local.kong_namespace}.svc.cluster.local:${var.KONG_METRICS_SERVICE_PORT}"] + labels = { + component = "kong" + network = var.RELEASE_PREFIX + } + } + ] + } + ] + } + } + } + processors = { + resource = { + attributes = [ + { + action = "upsert" + key = "service.name" + value = "${var.RELEASE_PREFIX}-rpc-kong" + }, + { + action = "upsert" + key = "service.namespace" + value = local.kong_namespace + }, + { + action = "upsert" + key = "k8s.namespace.name" + value = local.kong_namespace + }, + { + action = "upsert" + key = "network" + value = var.RELEASE_PREFIX + }, + { + action = "upsert" + key = "aztec.component" + value = "kong" + } + ] + } + batch = {} + } + exporters = { + otlphttp = { + endpoint = "$${env:OTEL_EXPORTER_OTLP_ENDPOINT}" + compression = "gzip" + } + } + service = { + extensions = ["health_check"] + pipelines = { + metrics = { + receivers = ["prometheus"] + processors = ["resource", "batch"] + exporters = ["otlphttp"] + } + } + } + }) + } + } + + depends_on = [helm_release.kong] +} + +resource "kubernetes_manifest" "otel_collector_deployment" { + count = var.KONG_OTEL_METRICS_GCP_SECRET_NAME != "" ? 
1 : 0 + + manifest = { + apiVersion = "apps/v1" + kind = "Deployment" + metadata = { + name = local.otel_collector_name + namespace = local.kong_namespace + labels = { + "app.kubernetes.io/name" = "kong-otel-collector" + "app.kubernetes.io/instance" = local.kong_helm_release_name + "app.kubernetes.io/component" = "metrics" + } + } + spec = { + replicas = var.KONG_OTEL_METRICS_COLLECTOR_REPLICAS + selector = { + matchLabels = { + "app.kubernetes.io/name" = "kong-otel-collector" + "app.kubernetes.io/instance" = local.kong_helm_release_name + "app.kubernetes.io/component" = "metrics" + } + } + template = { + metadata = { + labels = { + "app.kubernetes.io/name" = "kong-otel-collector" + "app.kubernetes.io/instance" = local.kong_helm_release_name + "app.kubernetes.io/component" = "metrics" + } + } + spec = { + containers = [ + { + name = "otel-collector" + image = var.KONG_OTEL_METRICS_COLLECTOR_IMAGE + args = ["--config=/conf/collector.yaml"] + env = [ + { + name = "OTEL_EXPORTER_OTLP_ENDPOINT" + valueFrom = { + secretKeyRef = { + name = local.otel_collector_secret_name + key = "endpoint" + } + } + } + ] + ports = [ + { + name = "health" + containerPort = 13133 + protocol = "TCP" + } + ] + readinessProbe = { + httpGet = { + path = "/" + port = "health" + } + initialDelaySeconds = 5 + periodSeconds = 10 + } + livenessProbe = { + httpGet = { + path = "/" + port = "health" + } + initialDelaySeconds = 15 + periodSeconds = 20 + } + resources = var.KONG_OTEL_METRICS_COLLECTOR_RESOURCES + volumeMounts = [ + { + name = "config" + mountPath = "/conf" + readOnly = true + } + ] + } + ] + volumes = [ + { + name = "config" + configMap = { + name = local.otel_collector_config_name + } + } + ] + } + } + } + } + + depends_on = [ + kubernetes_manifest.otel_collector_external_secret, + kubernetes_manifest.otel_collector_config, + kubernetes_service_v1.metrics, + ] +} + +resource "kubernetes_manifest" "sticky_session_policy" { + for_each = var.STICKY_SESSIONS_ENABLED ? 
toset(distinct([for _, route in var.ROUTES : route.route_namespace])) : toset([]) + + manifest = { + apiVersion = "configuration.konghq.com/v1beta1" + kind = "KongUpstreamPolicy" + metadata = { + name = local.sticky_session_policy_name + namespace = each.value + annotations = { + "kubernetes.io/ingress.class" = local.kong_ingress_class + } + } + spec = { + algorithm = "sticky-sessions" + hashOn = { + input = "none" + } + stickySessions = { + cookie = var.STICKY_SESSION_COOKIE_NAME + cookiePath = var.STICKY_SESSION_COOKIE_PATH + } + } + } + + depends_on = [helm_release.kong] +} + +resource "kubernetes_manifest" "consumer_rate_limit_plugin" { + for_each = local.consumers_with_rate_limit + + manifest = { + apiVersion = "configuration.konghq.com/v1" + kind = "KongPlugin" + metadata = { + name = "${var.RELEASE_PREFIX}-${each.key}-rpc-rate-limit" + namespace = var.CONSUMER_NAMESPACE + annotations = { + "kubernetes.io/ingress.class" = local.kong_ingress_class + } + } + plugin = "rate-limiting" + config = { + minute = each.value.rate_limit_minute + policy = "local" + limit_by = "consumer" + fault_tolerant = true + } + } + + depends_on = [helm_release.kong] +} + +resource "kubernetes_manifest" "consumer_key_external_secret" { + for_each = var.CONSUMERS + + manifest = { + apiVersion = "external-secrets.io/v1" + kind = "ExternalSecret" + metadata = { + name = "${var.RELEASE_PREFIX}-${each.key}-rpc-key-auth" + namespace = var.CONSUMER_NAMESPACE + } + spec = { + refreshInterval = var.EXTERNAL_SECRET_REFRESH_INTERVAL + secretStoreRef = { + name = var.EXTERNAL_SECRET_STORE_NAME + kind = var.EXTERNAL_SECRET_STORE_KIND + } + target = { + name = local.consumer_credential_secret_names[each.key] + creationPolicy = "Owner" + template = { + metadata = { + labels = { + "konghq.com/credential" = "key-auth" + } + } + type = "Opaque" + data = { + key = "{{ .api_key }}" + } + } + } + data = [ + { + secretKey = "api_key" + remoteRef = { + key = each.value.gcp_secret_manager_secret_name + } + 
} + ] + } + } + + wait { + condition { + type = "Ready" + status = "True" + } + } + + depends_on = [helm_release.kong] +} + +resource "kubernetes_manifest" "consumer" { + for_each = var.CONSUMERS + + manifest = { + apiVersion = "configuration.konghq.com/v1" + kind = "KongConsumer" + metadata = merge( + { + name = "${var.RELEASE_PREFIX}-${each.key}" + namespace = var.CONSUMER_NAMESPACE + annotations = { + "kubernetes.io/ingress.class" = local.kong_ingress_class + } + }, + each.value.rate_limit_minute > 0 ? { + annotations = { + "kubernetes.io/ingress.class" = local.kong_ingress_class + "konghq.com/plugins" = "${var.RELEASE_PREFIX}-${each.key}-rpc-rate-limit" + } + } : {} + ) + username = each.value.username != "" ? each.value.username : each.key + credentials = [local.consumer_credential_secret_names[each.key]] + } + + depends_on = [ + kubernetes_manifest.consumer_key_external_secret, + kubernetes_manifest.consumer_rate_limit_plugin, + ] +} + +resource "kubernetes_manifest" "anonymous_rate_limit_plugin" { + for_each = local.routes_with_anonymous + + manifest = { + apiVersion = "configuration.konghq.com/v1" + kind = "KongPlugin" + metadata = { + name = "${var.RELEASE_PREFIX}-${each.key}-anonymous-rpc-rate-limit" + namespace = each.value.route_namespace + annotations = { + "kubernetes.io/ingress.class" = local.kong_ingress_class + } + } + plugin = "rate-limiting" + config = { + minute = each.value.anonymous_rate_limit_minute + policy = "local" + limit_by = "ip" + fault_tolerant = true + } + } + + depends_on = [helm_release.kong] +} + +resource "kubernetes_manifest" "anonymous_consumer" { + for_each = local.routes_with_anonymous + + manifest = { + apiVersion = "configuration.konghq.com/v1" + kind = "KongConsumer" + metadata = { + name = "${var.RELEASE_PREFIX}-${each.key}-anonymous" + namespace = each.value.route_namespace + annotations = { + "kubernetes.io/ingress.class" = local.kong_ingress_class + "konghq.com/plugins" = 
"${var.RELEASE_PREFIX}-${each.key}-anonymous-rpc-rate-limit" + } + } + username = "${var.RELEASE_PREFIX}-${each.key}-anonymous" + } + + depends_on = [kubernetes_manifest.anonymous_rate_limit_plugin] +} + +resource "kubernetes_service_v1" "metrics" { + count = local.metrics_service_enabled ? 1 : 0 + + metadata { + name = local.metrics_service_name + namespace = local.kong_namespace + annotations = var.KONG_METRICS_SERVICE_ANNOTATIONS + labels = { + "app.kubernetes.io/name" = "kong-metrics" + "app.kubernetes.io/instance" = local.kong_helm_release_name + "app.kubernetes.io/component" = "metrics" + } + } + + spec { + type = var.KONG_METRICS_SERVICE_TYPE + load_balancer_ip = var.KONG_METRICS_SERVICE_LOAD_BALANCER_IP != "" ? var.KONG_METRICS_SERVICE_LOAD_BALANCER_IP : null + load_balancer_source_ranges = var.KONG_METRICS_SERVICE_LOAD_BALANCER_SOURCE_RANGES + external_traffic_policy = var.KONG_METRICS_SERVICE_EXTERNAL_TRAFFIC_POLICY + + port { + name = "status" + port = var.KONG_METRICS_SERVICE_PORT + target_port = "status" + protocol = "TCP" + } + + selector = local.metrics_service_selector + } + + depends_on = [helm_release.kong] +} + +resource "kubernetes_manifest" "rpc_route" { + for_each = var.ROUTES + + manifest = { + apiVersion = "networking.k8s.io/v1" + kind = "Ingress" + metadata = { + name = "${var.RELEASE_PREFIX}-${each.key}-rpc" + namespace = each.value.route_namespace + annotations = merge( + { + "kubernetes.io/ingress.class" = local.kong_ingress_class + "konghq.com/plugins" = local.route_plugin_names[each.key] + "konghq.com/strip-path" = "false" + }, + var.ROUTE_ANNOTATIONS + ) + } + spec = merge( + { + ingressClassName = local.kong_ingress_class + rules = [ + for host in each.value.hosts : { + host = host + http = { + paths = [ + { + path = "/" + pathType = "Prefix" + backend = { + service = { + name = each.value.upstream_service_name + port = { + number = each.value.upstream_service_port + } + } + } + } + ] + } + } + ] + }, + var.TLS_ENABLED ? 
{ + tls = [ + { + hosts = each.value.hosts + secretName = var.TLS_SECRET_NAME + } + ] + } : {} + ) + } + + depends_on = [ + kubernetes_manifest.key_auth_plugin, + kubernetes_manifest.prometheus_plugin, + kubernetes_manifest.sticky_session_policy, + ] +} diff --git a/spartan/terraform/modules/rpc-gateway/outputs.tf b/spartan/terraform/modules/rpc-gateway/outputs.tf new file mode 100644 index 000000000000..43bc03c952e4 --- /dev/null +++ b/spartan/terraform/modules/rpc-gateway/outputs.tf @@ -0,0 +1,74 @@ +output "route_names" { + description = "Kong-managed Ingress names keyed by RPC route." + value = { for name, route in kubernetes_manifest.rpc_route : name => route.manifest.metadata.name } +} + +output "consumer_names" { + description = "KongConsumer resource names keyed by configured consumer." + value = { for name, consumer in kubernetes_manifest.consumer : name => consumer.manifest.metadata.name } +} + +output "anonymous_consumer_names" { + description = "Anonymous KongConsumer resource names keyed by route." + value = { for name, consumer in kubernetes_manifest.anonymous_consumer : name => consumer.manifest.metadata.name } +} + +output "consumer_credential_secret_names" { + description = "Kubernetes Secret names referenced by KongConsumer credentials." + value = local.consumer_credential_secret_names +} + +output "key_auth_plugin_names" { + description = "KongPlugin names for key authentication, keyed by route." + value = { for name, plugin in kubernetes_manifest.key_auth_plugin : name => plugin.manifest.metadata.name } +} + +output "prometheus_plugin_names" { + description = "KongPlugin names for per-consumer Prometheus metrics, keyed by route." + value = { for name, plugin in kubernetes_manifest.prometheus_plugin : name => plugin.manifest.metadata.name } +} + +output "kong_namespace" { + description = "Namespace containing the Kong Helm release." 
+ value = local.kong_namespace +} + +output "sticky_session_policy_name" { + description = "KongUpstreamPolicy name for sticky sessions, or null when disabled." + value = var.STICKY_SESSIONS_ENABLED ? local.sticky_session_policy_name : null +} + +output "metrics_service_name" { + description = "Kong metrics Service name, or null when disabled." + value = local.metrics_service_enabled ? kubernetes_service_v1.metrics[0].metadata[0].name : null +} + +output "metrics_service_namespace" { + description = "Kong metrics Service namespace, or null when disabled." + value = local.metrics_service_enabled ? kubernetes_service_v1.metrics[0].metadata[0].namespace : null +} + +output "metrics_service_port" { + description = "Kong metrics Service port, or null when disabled." + value = local.metrics_service_enabled ? var.KONG_METRICS_SERVICE_PORT : null +} + +output "metrics_service_load_balancer_ingress" { + description = "Kong metrics Service load balancer ingress status, or an empty list when disabled/not assigned yet." + value = local.metrics_service_enabled ? try(kubernetes_service_v1.metrics[0].status[0].load_balancer[0].ingress, []) : [] +} + +output "otel_collector_deployment_name" { + description = "Local OTel collector Deployment name for Kong metrics, or null when disabled." + value = var.KONG_OTEL_METRICS_GCP_SECRET_NAME != "" ? local.otel_collector_name : null +} + +output "frontend_load_balancer_ip" { + description = "Global static IP assigned to the public GKE frontend Ingress." + value = local.frontend_load_balancer_ip +} + +output "gcp_managed_certificate_name" { + description = "GKE ManagedCertificate resource name for RPC hosts, or null when the frontend or the managed certificate is disabled." + value = var.FRONTEND_ENABLED && var.GCP_MANAGED_CERTIFICATE_ENABLED ? 
local.managed_certificate_name : null +} diff --git a/spartan/terraform/modules/rpc-gateway/variables.tf b/spartan/terraform/modules/rpc-gateway/variables.tf new file mode 100644 index 000000000000..4ea4838c8e9c --- /dev/null +++ b/spartan/terraform/modules/rpc-gateway/variables.tf @@ -0,0 +1,346 @@ +variable "RELEASE_PREFIX" { + description = "Prefix used for generated Kubernetes and GCP resources." + type = string +} + +variable "CONSUMER_NAMESPACE" { + description = "Namespace for named KongConsumer resources and their credential secrets." + type = string +} + +variable "INSTALL_KONG" { + description = "Whether this module should install Kong Gateway and Kong Ingress Controller." + type = bool + default = true +} + +variable "KONG_NAMESPACE" { + description = "Namespace for the Kong Helm release. Defaults to RELEASE_PREFIX-rpc-kong when empty." + type = string + default = "" +} + +variable "KONG_HELM_RELEASE_NAME" { + description = "Helm release name for Kong. Defaults to RELEASE_PREFIX-rpc-kong when empty." + type = string + default = "" +} + +variable "KONG_HELM_CHART_VERSION" { + description = "Kong ingress Helm chart version." + type = string + default = "0.24.0" +} + +variable "KONG_INGRESS_CLASS" { + description = "Ingress class watched by Kong Ingress Controller. Defaults to RELEASE_PREFIX-rpc-kong when empty." + type = string + default = "" +} + +variable "KONG_PROXY_SERVICE_TYPE" { + description = "Kong proxy Kubernetes Service type." + type = string + default = "ClusterIP" +} + +variable "KONG_PROXY_SERVICE_ANNOTATIONS" { + description = "Annotations applied to the Kong proxy Service by the Helm chart." + type = map(string) + default = {} +} + +variable "KONG_PROXY_SERVICE_LOAD_BALANCER_IP" { + description = "Optional static IP assigned to the Kong proxy LoadBalancer Service." 
+ type = string + default = "" +} + +variable "KONG_PROXY_SERVICE_LOAD_BALANCER_SOURCE_RANGES" { + description = "Optional source CIDRs allowed to reach the Kong proxy LoadBalancer Service." + type = list(string) + default = [] +} + +variable "KONG_EXTRA_HELM_VALUES" { + description = "Additional YAML values passed to the Kong Helm chart." + type = list(string) + default = [] +} + +variable "KONG_SERVICE_MONITOR_ENABLED" { + description = "Whether the Kong Helm chart should create a ServiceMonitor for Prometheus Operator." + type = bool + default = false +} + +variable "KONG_METRICS_SERVICE_ENABLED" { + description = "Whether to expose Kong's status /metrics endpoint through a Kubernetes Service. The service is also created automatically when local OTel collection is enabled." + type = bool + default = false +} + +variable "KONG_METRICS_SERVICE_NAME" { + description = "Optional name for the Kong metrics Service. Defaults to RELEASE_PREFIX-kong-metrics." + type = string + default = "" +} + +variable "KONG_METRICS_SERVICE_TYPE" { + description = "Kong metrics Service type. ClusterIP is enough for the local OTel collector path." + type = string + default = "ClusterIP" +} + +variable "KONG_METRICS_SERVICE_PORT" { + description = "Service port for Kong's status /metrics endpoint." + type = number + default = 8100 +} + +variable "KONG_METRICS_SERVICE_ANNOTATIONS" { + description = "Annotations applied to the Kong metrics Service." + type = map(string) + default = {} +} + +variable "KONG_METRICS_SERVICE_LOAD_BALANCER_IP" { + description = "Optional static IP assigned to the Kong metrics LoadBalancer Service." + type = string + default = "" +} + +variable "KONG_METRICS_SERVICE_LOAD_BALANCER_SOURCE_RANGES" { + description = "Optional source CIDRs allowed to reach the Kong metrics Service." + type = list(string) + default = [] +} + +variable "KONG_METRICS_SERVICE_EXTERNAL_TRAFFIC_POLICY" { + description = "External traffic policy for the Kong metrics Service. 
Leave null to use the Kubernetes default." + type = string + default = null +} + +variable "KONG_METRICS_SERVICE_SELECTOR" { + description = "Optional selector for Kong Gateway pods. Defaults to the kong/ingress gateway pod labels." + type = map(string) + default = {} +} + +variable "KONG_OTEL_METRICS_GCP_SECRET_NAME" { + description = "GCP Secret Manager secret name containing the central OTLP/HTTP collector endpoint. When empty, no local Kong metrics collector is deployed." + type = string + default = "" +} + +variable "KONG_OTEL_METRICS_PUSH_INTERVAL_SECONDS" { + description = "How often the local OTel collector scrapes Kong metrics before exporting to the central collector." + type = number + default = 15 +} + +variable "KONG_OTEL_METRICS_COLLECTOR_IMAGE" { + description = "Container image for the local OTel collector that scrapes Kong metrics." + type = string + default = "otel/opentelemetry-collector-contrib:0.154.0" +} + +variable "KONG_OTEL_METRICS_COLLECTOR_REPLICAS" { + description = "Replica count for the local Kong metrics OTel collector." + type = number + default = 1 +} + +variable "KONG_OTEL_METRICS_COLLECTOR_RESOURCES" { + description = "Resource requests and limits for the local Kong metrics OTel collector." + type = object({ + requests = map(string) + limits = map(string) + }) + default = { + requests = { + cpu = "50m" + memory = "128Mi" + } + limits = { + cpu = "200m" + memory = "256Mi" + } + } +} + +variable "STICKY_SESSIONS_ENABLED" { + description = "Whether to create KongUpstreamPolicy resources for RPC backend pods." + type = bool + default = false +} + +variable "STICKY_SESSION_POLICY_NAME" { + description = "Optional KongUpstreamPolicy name. Defaults to RELEASE_PREFIX-rpc-sticky-sessions." + type = string + default = "" +} + +variable "STICKY_SESSION_COOKIE_NAME" { + description = "Cookie name used by Kong sticky-sessions upstream balancing." 
+ type = string + default = "aztec_rpc_backend" +} + +variable "STICKY_SESSION_COOKIE_PATH" { + description = "Cookie path used by Kong sticky-sessions upstream balancing." + type = string + default = "/" +} + +variable "API_KEY_HEADER_NAME" { + description = "Header checked by Kong's key-auth plugin." + type = string + default = "x-aztec-api-key" +} + +variable "TLS_ENABLED" { + description = "Whether Kong-managed RPC Ingresses should include TLS configuration." + type = bool + default = false +} + +variable "TLS_SECRET_NAME" { + description = "TLS Secret used by Kong-managed RPC Ingresses when TLS_ENABLED=true." + type = string + default = "" +} + +variable "ROUTES" { + description = "RPC routes keyed by rollup alias. Route Ingresses are created in route_namespace." + type = map(object({ + hosts = list(string) + route_namespace = string + upstream_service_name = string + upstream_service_port = number + auth_mode = string + anonymous_rate_limit_minute = number + })) + + validation { + condition = alltrue([ + for _, route in var.ROUTES : + contains(["keyed_only", "keyed_with_anonymous"], route.auth_mode) + ]) + error_message = "ROUTES auth_mode must be keyed_only or keyed_with_anonymous." + } + + validation { + condition = alltrue([ + for _, route in var.ROUTES : length(route.hosts) > 0 + ]) + error_message = "Every RPC gateway route must define at least one host." + } +} + +variable "ROUTE_ANNOTATIONS" { + description = "Additional annotations applied to every Kong-managed RPC Ingress." + type = map(string) + default = {} +} + +variable "CONSUMERS" { + description = "Kong consumers keyed by team name. Use one credential source per consumer." + type = map(object({ + username = string + gcp_secret_manager_secret_name = string + rate_limit_minute = number + })) + default = {} +} + +variable "EXTERNAL_SECRET_STORE_NAME" { + description = "ExternalSecrets SecretStore or ClusterSecretStore name." 
+ type = string + default = "gcp-secret-store" +} + +variable "EXTERNAL_SECRET_STORE_KIND" { + description = "ExternalSecrets store kind." + type = string + default = "ClusterSecretStore" +} + +variable "EXTERNAL_SECRET_REFRESH_INTERVAL" { + description = "ExternalSecret refresh interval." + type = string + default = "1m" +} + +variable "CREATE_DNS" { + description = "Whether to create A records for RPC hosts in DNS_ZONE_NAME." + type = bool + default = true +} + +variable "DNS_ZONE_NAME" { + description = "Cloud DNS managed zone name for RPC hosts." + type = string + default = "rpc-aztec-labs-com" +} + +variable "DNS_TTL" { + description = "TTL for RPC DNS A records." + type = number + default = 300 +} + +variable "FRONTEND_ENABLED" { + description = "Whether to create a GKE Ingress in front of Kong for public HTTP(S) traffic." + type = bool + default = true +} + +variable "FRONTEND_INGRESS_CLASS" { + description = "Ingress class used by the public frontend Ingress. Use gce for GKE external HTTP(S) Load Balancing." + type = string + default = "gce" +} + +variable "FRONTEND_STATIC_IP_ENABLED" { + description = "Whether to allocate a global static IP for the public frontend Ingress when FRONTEND_STATIC_IP_NAME is empty." + type = bool + default = true +} + +variable "FRONTEND_STATIC_IP_NAME" { + description = "Optional global static IP name for the public frontend Ingress. Defaults to RELEASE_PREFIX-rpc-frontend." + type = string + default = "" +} + +variable "FRONTEND_SERVICE_NAME" { + description = "Optional Kong proxy Service name used as the GKE Ingress backend. Defaults to KONG_HELM_RELEASE_NAME-gateway-proxy." + type = string + default = "" +} + +variable "FRONTEND_SERVICE_PORT" { + description = "Kong proxy Service port used as the GKE Ingress backend." + type = number + default = 80 +} + +variable "FRONTEND_ALLOW_HTTP" { + description = "Whether the public GKE Ingress should allow HTTP in addition to HTTPS." 
+ type = bool + default = false +} + +variable "GCP_MANAGED_CERTIFICATE_ENABLED" { + description = "Whether to create a GKE ManagedCertificate for RPC hosts." + type = bool + default = true +} + +variable "GCP_MANAGED_CERTIFICATE_NAME" { + description = "Optional GKE ManagedCertificate resource name. Defaults to RELEASE_PREFIX-rpc-cert." + type = string + default = "" +} From 6b058254556b566a3c2348894c3e85c556c558c8 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 11 Jun 2026 12:18:10 +0300 Subject: [PATCH 7/9] feat: add version RPC deployments (#24001) Adds versioned RPC Terraform deployment wiring. Stack: #23997 -> #23998 -> #23999 -> #24000 -> #24001 -> #24002 Fixes: A-1142, A-1134, A-1135, A-1136. --- .github/workflows/deploy-rpc.yml | 238 +++++++++++++++++ spartan/terraform/deploy-rpc/README.md | 21 ++ .../deploy-rpc/environments/mainnet/main.tf | 90 +++++++ .../environments/mainnet/outputs.tf | 29 ++ .../environments/mainnet/variables.tf | 28 ++ .../deploy-rpc/environments/testnet/main.tf | 90 +++++++ .../environments/testnet/outputs.tf | 29 ++ .../environments/testnet/variables.tf | 28 ++ .../deploy-rpc/modules/environment/main.tf | 89 +++++++ .../deploy-rpc/modules/environment/outputs.tf | 42 +++ .../modules/environment/variables.tf | 51 ++++ .../terraform/deploy-rpc/modules/rpc/main.tf | 247 ++++++++++++++++++ .../deploy-rpc/modules/rpc/outputs.tf | 34 +++ .../modules/rpc/values/prod-res.yaml | 23 ++ .../deploy-rpc/modules/rpc/values/prod.yaml | 33 +++ .../deploy-rpc/modules/rpc/variables.tf | 60 +++++ 16 files changed, 1132 insertions(+) create mode 100644 .github/workflows/deploy-rpc.yml create mode 100644 spartan/terraform/deploy-rpc/README.md create mode 100644 spartan/terraform/deploy-rpc/environments/mainnet/main.tf create mode 100644 spartan/terraform/deploy-rpc/environments/mainnet/outputs.tf create mode 100644 spartan/terraform/deploy-rpc/environments/mainnet/variables.tf create mode 100644 spartan/terraform/deploy-rpc/environments/testnet/main.tf 
create mode 100644 spartan/terraform/deploy-rpc/environments/testnet/outputs.tf create mode 100644 spartan/terraform/deploy-rpc/environments/testnet/variables.tf create mode 100644 spartan/terraform/deploy-rpc/modules/environment/main.tf create mode 100644 spartan/terraform/deploy-rpc/modules/environment/outputs.tf create mode 100644 spartan/terraform/deploy-rpc/modules/environment/variables.tf create mode 100644 spartan/terraform/deploy-rpc/modules/rpc/main.tf create mode 100644 spartan/terraform/deploy-rpc/modules/rpc/outputs.tf create mode 100644 spartan/terraform/deploy-rpc/modules/rpc/values/prod-res.yaml create mode 100644 spartan/terraform/deploy-rpc/modules/rpc/values/prod.yaml create mode 100644 spartan/terraform/deploy-rpc/modules/rpc/variables.tf diff --git a/.github/workflows/deploy-rpc.yml b/.github/workflows/deploy-rpc.yml new file mode 100644 index 000000000000..3173f6a3e041 --- /dev/null +++ b/.github/workflows/deploy-rpc.yml @@ -0,0 +1,238 @@ +name: Deploy RPC + +on: + workflow_call: + inputs: + rpc_environment: + description: "RPC environment to deploy: testnet or mainnet." + required: true + type: string + v4_aztec_docker_image: + description: "Full Aztec Docker image for the v4 RPC, for example aztecprotocol/aztec:4.3.1." + required: true + type: string + canonical_aztec_docker_image: + description: "Full Aztec Docker image for canonical RPC. Accepted now, used when the canonical RPC block is enabled." + required: false + type: string + default: "" + ref: + description: "Git ref to checkout. Defaults to the caller ref." + required: false + type: string + gcp_project_id: + description: "GCP project id for the RPC deployment." + required: false + type: string + default: "testnet-440309" + gcp_region: + description: "GCP region passed to Terraform." + required: false + type: string + default: "us-west1" + cluster: + description: "GKE cluster name." 
+ required: false + type: string + default: "aztec-gke-public" + cluster_location: + description: "GKE cluster location used by gcloud." + required: false + type: string + default: "us-west1-a" + k8s_cluster_context: + description: "Kubernetes context override. Defaults to gke_{gcp_project_id}_{cluster_location}_{cluster}." + required: false + type: string + default: "" + respect_tf_lock: + description: "Whether Terraform should respect state locking." + required: false + type: boolean + default: true + secrets: + GCP_SA_KEY: + description: "GCP service account key for Terraform, GCS state, and GKE access." + required: true + workflow_dispatch: + inputs: + rpc_environment: + description: "RPC environment to deploy." + required: true + type: choice + options: + - testnet + - mainnet + v4_aztec_docker_image: + description: "Full Aztec Docker image for the v4 RPC, for example aztecprotocol/aztec:4.3.1." + required: true + type: string + canonical_aztec_docker_image: + description: "Full Aztec Docker image for canonical RPC. Accepted now, used when the canonical RPC block is enabled." + required: false + type: string + ref: + description: "Git ref to checkout. Leave empty to use the current ref." + required: false + type: string + gcp_project_id: + description: "GCP project id for the RPC deployment." + required: false + type: string + default: "testnet-440309" + gcp_region: + description: "GCP region passed to Terraform." + required: false + type: string + default: "us-west1" + cluster: + description: "GKE cluster name." + required: false + type: string + default: "aztec-gke-public" + cluster_location: + description: "GKE cluster location used by gcloud." + required: false + type: string + default: "us-west1-a" + k8s_cluster_context: + description: "Kubernetes context override. Defaults to gke_{gcp_project_id}_{cluster_location}_{cluster}." + required: false + type: string + respect_tf_lock: + description: "Whether Terraform should respect state locking." 
+ required: false + type: boolean + default: true + +permissions: + contents: read + +concurrency: + group: deploy-rpc-${{ inputs.rpc_environment }} + cancel-in-progress: false + +jobs: + deploy-rpc: + runs-on: ubuntu-latest + env: + GCP_PROJECT_ID: ${{ inputs.gcp_project_id }} + GCP_REGION: ${{ inputs.gcp_region }} + CLUSTER_NAME: ${{ inputs.cluster }} + CLUSTER_LOCATION: ${{ inputs.cluster_location }} + steps: + - name: Determine checkout ref + id: checkout-ref + env: + REQUESTED_REF: ${{ inputs.ref }} + run: | + if [[ -n "$REQUESTED_REF" ]]; then + echo "ref=$REQUESTED_REF" >> "$GITHUB_OUTPUT" + else + echo "ref=$GITHUB_REF" >> "$GITHUB_OUTPUT" # runner-provided env var; no expression is expanded into the script + fi + + - name: Checkout + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1 + with: + ref: ${{ steps.checkout-ref.outputs.ref }} + fetch-depth: 0 + persist-credentials: false + + - name: Resolve deployment inputs + env: + RPC_ENVIRONMENT: ${{ inputs.rpc_environment }} + V4_AZTEC_DOCKER_IMAGE: ${{ inputs.v4_aztec_docker_image }} + CANONICAL_AZTEC_DOCKER_IMAGE: ${{ inputs.canonical_aztec_docker_image }} + K8S_CLUSTER_CONTEXT: ${{ inputs.k8s_cluster_context }} + run: | + if [[ "$RPC_ENVIRONMENT" != "testnet" && "$RPC_ENVIRONMENT" != "mainnet" ]]; then + echo "Error: rpc_environment must be testnet or mainnet, got '$RPC_ENVIRONMENT'" + exit 1 + fi + + if [[ ! "$V4_AZTEC_DOCKER_IMAGE" =~ ^.+:.+$ ]]; then + echo "Error: v4_aztec_docker_image must be in repository:tag form" + exit 1 + fi + + if [[ -n "$CANONICAL_AZTEC_DOCKER_IMAGE" && ! 
"$CANONICAL_AZTEC_DOCKER_IMAGE" =~ ^.+:.+$ ]]; then + echo "Error: canonical_aztec_docker_image must be empty or in repository:tag form" + exit 1 + fi + + resolved_context="$K8S_CLUSTER_CONTEXT" + if [[ -z "$resolved_context" ]]; then + resolved_context="gke_${GCP_PROJECT_ID}_${CLUSTER_LOCATION}_${CLUSTER_NAME}" + fi + + namespace="${RPC_ENVIRONMENT}-rpc" + terraform_dir="spartan/terraform/deploy-rpc/environments/${RPC_ENVIRONMENT}" + state_path="${CLUSTER_NAME}/${namespace}/deploy-rpc" + + if [[ ! -d "$terraform_dir" ]]; then + echo "Error: Terraform environment not found: $terraform_dir" + exit 1 + fi + + { + echo "RPC_ENVIRONMENT=$RPC_ENVIRONMENT" + echo "NAMESPACE=$namespace" + echo "TERRAFORM_DIR=$terraform_dir" + echo "STATE_PATH=$state_path" + echo "K8S_CLUSTER_CONTEXT=$resolved_context" + echo "TF_VAR_GCP_PROJECT_ID=$GCP_PROJECT_ID" + echo "TF_VAR_GCP_REGION=$GCP_REGION" + echo "TF_VAR_K8S_CLUSTER_CONTEXT=$resolved_context" + echo "TF_VAR_V4_AZTEC_DOCKER_IMAGE=$V4_AZTEC_DOCKER_IMAGE" + echo "TF_VAR_CANONICAL_AZTEC_DOCKER_IMAGE=$CANONICAL_AZTEC_DOCKER_IMAGE" + } >> "$GITHUB_ENV" + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 # v3.0.0 + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db # v3.0.1 + with: + install_components: gke-gcloud-auth-plugin + + - name: Configure kubectl + run: | + gcloud container clusters get-credentials "$CLUSTER_NAME" \ + --region "$CLUSTER_LOCATION" \ + --project "$GCP_PROJECT_ID" + kubectl config use-context "$K8S_CLUSTER_CONTEXT" + + - name: Setup Terraform + uses: hashicorp/setup-terraform@dfe3c3f87815947d99a8997f908cb6525fc44e9e # v4.0.1 + with: + terraform_version: "1.7.5" + terraform_wrapper: false + + - name: Configure Terraform backend + run: spartan/scripts/override_terraform_backend.sh "$TERRAFORM_DIR" "$CLUSTER_NAME" "$STATE_PATH" + + - 
name: Terraform Init + run: terraform -chdir="$TERRAFORM_DIR" init -reconfigure + + - name: Terraform Plan + run: terraform -chdir="$TERRAFORM_DIR" plan -out=tfplan -lock=${{ inputs.respect_tf_lock }} + + - name: Terraform Apply + run: terraform -chdir="$TERRAFORM_DIR" apply -auto-approve -lock=${{ inputs.respect_tf_lock }} tfplan + + - name: Write summary + if: always() + run: | + { + echo "## RPC deployment" + echo + echo "| Field | Value |" + echo "|---|---|" + echo "| Environment | \`${RPC_ENVIRONMENT:-${{ inputs.rpc_environment }}}\` |" + echo "| Namespace | \`${NAMESPACE:-unknown}\` |" + echo "| v4 image | \`${TF_VAR_V4_AZTEC_DOCKER_IMAGE:-unknown}\` |" + echo "| Terraform state | \`${STATE_PATH:-unknown}\` |" + } >> "$GITHUB_STEP_SUMMARY" diff --git a/spartan/terraform/deploy-rpc/README.md b/spartan/terraform/deploy-rpc/README.md new file mode 100644 index 000000000000..4c7a201c2d0b --- /dev/null +++ b/spartan/terraform/deploy-rpc/README.md @@ -0,0 +1,21 @@ +# RPC Deployment + +Terraform for standalone public RPC deployments. + +Shared modules: + +- `modules/environment`: this module defines an environment. Creates RPC and API Gateway +- `modules/rpc`: Aztec RPC deployment +- `../modules/rpc-gateway`: Kong API Gateway + +Environments. This is what you want to `terraform apply` +- `environments/testnet`: testnet RPC +- `environments/mainnet`: mainnet RPC following canonical & v4 + +Set the Aztec images in each environment with `V4_AZTEC_DOCKER_IMAGE` and `CANONICAL_AZTEC_DOCKER_IMAGE`. The canonical RPC block is currently commented out, but it already references the canonical image variable for when that route is enabled. Each RPC entry passes its image directly to the node module. + +GitHub Actions can deploy these environments through `.github/workflows/deploy-rpc.yml`. Call it with `rpc_environment` set to `testnet` or `mainnet`, and `v4_aztec_docker_image` set to the image to deploy. 
+ +RPC node environment is configured through each RPC entry's single `env` map. Common values such as `NETWORK`, `L1_CHAIN_ID`, and `RPC_MAX_BODY_SIZE` live in the environment-level `local.env`; rollup-specific values such as `ROLLUP_VERSION` are merged per RPC. + +API key consumers are Terraform inputs, but API key values are not. For each `CONSUMERS` entry, provide `gcp_secret_manager_secret_name`. Set `ALLOW_ANONYMOUS = true` on the environment module to allow anonymous usage, with `ANONYMOUS_RATE_LIMIT_MINUTE` controlling the rate limit. diff --git a/spartan/terraform/deploy-rpc/environments/mainnet/main.tf b/spartan/terraform/deploy-rpc/environments/mainnet/main.tf new file mode 100644 index 000000000000..8d1d59aeefee --- /dev/null +++ b/spartan/terraform/deploy-rpc/environments/mainnet/main.tf @@ -0,0 +1,90 @@ +terraform { + backend "gcs" { + bucket = "aztec-terraform" + prefix = "aztec-gke-public/mainnet-rpc/deploy-rpc/terraform.tfstate" + } + + required_providers { + helm = { + source = "hashicorp/helm" + version = "~> 3.1.2" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 3.1.0" + } + google = { + source = "hashicorp/google" + version = "~> 5.0" + } + } +} + +provider "kubernetes" { + alias = "gke-cluster" + config_path = "~/.kube/config" + config_context = var.K8S_CLUSTER_CONTEXT +} + +provider "helm" { + alias = "gke-cluster" + kubernetes = { + config_path = "~/.kube/config" + config_context = var.K8S_CLUSTER_CONTEXT + } +} + +provider "google" { + project = var.GCP_PROJECT_ID + region = var.GCP_REGION +} + +locals { + l1_secret_names = { + l1_rpc_secret_name = "mainnet-rpc-urls" + l1_consensus_host_urls_secret_name = "mainnet-consensus-host-urls" + l1_consensus_host_api_keys_secret_name = "mainnet-consensus-host-api-keys" + l1_consensus_host_api_key_headers_secret_name = "mainnet-consensus-host-api-key-headers" + } + + env = { + NETWORK = "mainnet" + L1_CHAIN_ID = "1" + RPC_MAX_BODY_SIZE = "10mb" + } + + rpcs = { + # TODO enable
canonical RPC once canonical routes are ready + # canonical = merge(local.l1_secret_names, { + # aztec_docker_image = var.CANONICAL_AZTEC_DOCKER_IMAGE + # hosts = ["mainnet-new.rpc.aztec-labs.com"] + # storage_size = "8Gi" + # env = merge(local.env, { + # ROLLUP_VERSION = "" + # }) + # }) + v4 = merge(local.l1_secret_names, { + aztec_docker_image = var.V4_AZTEC_DOCKER_IMAGE + hosts = ["v4.mainnet.rpc.aztec-labs.com"] + storage_size = "8Gi" + env = merge(local.env, { + ROLLUP_VERSION = "2934756905" + }) + }) + } +} + +module "environment" { + source = "../../modules/environment" + + providers = { + helm = helm.gke-cluster + kubernetes = kubernetes.gke-cluster + google = google + } + + NAMESPACE = "mainnet-rpc" + RELEASE_PREFIX = "mainnet" + RPCS = local.rpcs + ALLOW_ANONYMOUS = false +} diff --git a/spartan/terraform/deploy-rpc/environments/mainnet/outputs.tf b/spartan/terraform/deploy-rpc/environments/mainnet/outputs.tf new file mode 100644 index 000000000000..054225480bf4 --- /dev/null +++ b/spartan/terraform/deploy-rpc/environments/mainnet/outputs.tf @@ -0,0 +1,29 @@ +output "rpc_services" { + description = "RPC Service names and ports keyed by alias." + value = module.environment.rpc_services +} + +output "kong_routes" { + description = "Kong route names keyed by alias." + value = module.environment.kong_routes +} + +output "kong_sticky_session_policy_name" { + description = "Kong sticky session policy name, or null when disabled." + value = module.environment.kong_sticky_session_policy_name +} + +output "kong_metrics_service" { + description = "Kong metrics Service details for Prometheus scraping." + value = module.environment.kong_metrics_service +} + +output "frontend_load_balancer_ip" { + description = "Global static IP assigned to the public GKE frontend Ingress." + value = module.environment.frontend_load_balancer_ip +} + +output "gcp_managed_certificate_name" { + description = "GKE ManagedCertificate resource name for RPC hosts." 
+ value = module.environment.gcp_managed_certificate_name +} diff --git a/spartan/terraform/deploy-rpc/environments/mainnet/variables.tf b/spartan/terraform/deploy-rpc/environments/mainnet/variables.tf new file mode 100644 index 000000000000..2bc7869c3ad2 --- /dev/null +++ b/spartan/terraform/deploy-rpc/environments/mainnet/variables.tf @@ -0,0 +1,28 @@ +variable "GCP_PROJECT_ID" { + description = "GCP project id for regional RPC infrastructure." + type = string + default = "testnet-440309" +} + +variable "GCP_REGION" { + description = "GCP region for regional RPC infrastructure." + type = string + default = "us-west1" +} + +variable "K8S_CLUSTER_CONTEXT" { + description = "Kubernetes context for the GKE cluster." + type = string + nullable = false +} + +variable "V4_AZTEC_DOCKER_IMAGE" { + description = "Aztec Docker image to deploy for the v4 RPC." + type = string +} + +variable "CANONICAL_AZTEC_DOCKER_IMAGE" { + description = "Aztec Docker image to deploy for the canonical RPC once that route is enabled." 
+ type = string + default = "" +} diff --git a/spartan/terraform/deploy-rpc/environments/testnet/main.tf b/spartan/terraform/deploy-rpc/environments/testnet/main.tf new file mode 100644 index 000000000000..2fcbcf4c270a --- /dev/null +++ b/spartan/terraform/deploy-rpc/environments/testnet/main.tf @@ -0,0 +1,90 @@ +terraform { + backend "gcs" { + bucket = "aztec-terraform" + prefix = "aztec-gke-public/testnet-rpc/deploy-rpc/terraform.tfstate" + } + + required_providers { + helm = { + source = "hashicorp/helm" + version = "~> 3.1.2" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 3.1.0" + } + google = { + source = "hashicorp/google" + version = "~> 5.0" + } + } +} + +provider "kubernetes" { + alias = "gke-cluster" + config_path = "~/.kube/config" + config_context = var.K8S_CLUSTER_CONTEXT +} + +provider "helm" { + alias = "gke-cluster" + kubernetes = { + config_path = "~/.kube/config" + config_context = var.K8S_CLUSTER_CONTEXT + } +} + +provider "google" { + project = var.GCP_PROJECT_ID + region = var.GCP_REGION +} + +locals { + l1_secret_names = { + l1_rpc_secret_name = "sepolia-rpc-urls" + l1_consensus_host_urls_secret_name = "sepolia-consensus-host-urls" + l1_consensus_host_api_keys_secret_name = "sepolia-consensus-host-api-keys" + l1_consensus_host_api_key_headers_secret_name = "sepolia-consensus-host-api-key-headers" + } + + env = { + NETWORK = "testnet" + L1_CHAIN_ID = "11155111" + RPC_MAX_BODY_SIZE = "10mb" + } + + rpcs = { + # TODO enable canonical RPC once testnet upgrades + # canonical = merge(local.l1_secret_names, { + # aztec_docker_image = var.CANONICAL_AZTEC_DOCKER_IMAGE + # hosts = ["testnet-new.rpc.aztec-labs.com"] + # storage_size = "8Gi" + # env = merge(local.env, { + # ROLLUP_VERSION = "" + # }) + # }) + v4 = merge(local.l1_secret_names, { + aztec_docker_image = var.V4_AZTEC_DOCKER_IMAGE + hosts = ["v4.testnet.rpc.aztec-labs.com"] + storage_size = "8Gi" + env = merge(local.env, { + ROLLUP_VERSION = "4127419662" + }) + }) + } 
+} + +module "environment" { + source = "../../modules/environment" + + providers = { + helm = helm.gke-cluster + kubernetes = kubernetes.gke-cluster + google = google + } + + NAMESPACE = "testnet-rpc" + RELEASE_PREFIX = "testnet" + RPCS = local.rpcs + ALLOW_ANONYMOUS = true +} diff --git a/spartan/terraform/deploy-rpc/environments/testnet/outputs.tf b/spartan/terraform/deploy-rpc/environments/testnet/outputs.tf new file mode 100644 index 000000000000..054225480bf4 --- /dev/null +++ b/spartan/terraform/deploy-rpc/environments/testnet/outputs.tf @@ -0,0 +1,29 @@ +output "rpc_services" { + description = "RPC Service names and ports keyed by alias." + value = module.environment.rpc_services +} + +output "kong_routes" { + description = "Kong route names keyed by alias." + value = module.environment.kong_routes +} + +output "kong_sticky_session_policy_name" { + description = "Kong sticky session policy name, or null when disabled." + value = module.environment.kong_sticky_session_policy_name +} + +output "kong_metrics_service" { + description = "Kong metrics Service details for Prometheus scraping." + value = module.environment.kong_metrics_service +} + +output "frontend_load_balancer_ip" { + description = "Global static IP assigned to the public GKE frontend Ingress." + value = module.environment.frontend_load_balancer_ip +} + +output "gcp_managed_certificate_name" { + description = "GKE ManagedCertificate resource name for RPC hosts." + value = module.environment.gcp_managed_certificate_name +} diff --git a/spartan/terraform/deploy-rpc/environments/testnet/variables.tf b/spartan/terraform/deploy-rpc/environments/testnet/variables.tf new file mode 100644 index 000000000000..44e1121300ee --- /dev/null +++ b/spartan/terraform/deploy-rpc/environments/testnet/variables.tf @@ -0,0 +1,28 @@ +variable "GCP_PROJECT_ID" { + description = "GCP project id for regional RPC infrastructure." 
+ type = string + default = "testnet-440309" +} + +variable "GCP_REGION" { + description = "GCP region for regional RPC infrastructure." + type = string + default = "us-west1" +} + +variable "K8S_CLUSTER_CONTEXT" { + description = "Kubernetes context for the GKE cluster." + type = string + default = "gke_testnet-440309_us-west1-a_aztec-gke-public" +} + +variable "V4_AZTEC_DOCKER_IMAGE" { + description = "Aztec Docker image to deploy for the v4 RPC." + type = string +} + +variable "CANONICAL_AZTEC_DOCKER_IMAGE" { + description = "Aztec Docker image to deploy for the canonical RPC once that route is enabled." + type = string + default = "" +} diff --git a/spartan/terraform/deploy-rpc/modules/environment/main.tf b/spartan/terraform/deploy-rpc/modules/environment/main.tf new file mode 100644 index 000000000000..d37ff6329b2e --- /dev/null +++ b/spartan/terraform/deploy-rpc/modules/environment/main.tf @@ -0,0 +1,89 @@ +terraform { + required_providers { + helm = { + source = "hashicorp/helm" + } + kubernetes = { + source = "hashicorp/kubernetes" + } + google = { + source = "hashicorp/google" + } + } +} + +locals { + # route requests from the same client to the same RPC node in order to have a consistent view of the chain + sticky_policy_name = "${var.RELEASE_PREFIX}-rpc-sticky-sessions" + + routed_rpcs = { + for name, rpc in var.RPCS : name => rpc + if length(rpc.hosts) > 0 + } + + rpc_routes = { + for name, rpc in local.routed_rpcs : name => { + hosts = rpc.hosts + route_namespace = var.NAMESPACE + upstream_service_name = module.rpc[name].service_name + upstream_service_port = module.rpc[name].service_port + auth_mode = var.ALLOW_ANONYMOUS ?
"keyed_with_anonymous" : "keyed_only" + anonymous_rate_limit_minute = var.ANONYMOUS_RATE_LIMIT_MINUTE + } + } + +} + +resource "kubernetes_namespace_v1" "rpc" { + metadata { + name = var.NAMESPACE + } +} + +module "rpc" { + for_each = var.RPCS + + source = "../rpc" + + providers = { + helm = helm + kubernetes = kubernetes + } + + NAMESPACE = var.NAMESPACE + RELEASE_NAME = "${var.RELEASE_PREFIX}-rpc-${each.key}" + RELEASE_PREFIX = var.RELEASE_PREFIX + + AZTEC_DOCKER_IMAGE = each.value.aztec_docker_image + ENV = each.value.env + L1_RPC_SECRET_NAME = each.value.l1_rpc_secret_name + L1_CONSENSUS_HOST_URLS_SECRET_NAME = each.value.l1_consensus_host_urls_secret_name + L1_CONSENSUS_HOST_API_KEYS_SECRET_NAME = each.value.l1_consensus_host_api_keys_secret_name + L1_CONSENSUS_HOST_API_KEY_HEADERS_SECRET_NAME = each.value.l1_consensus_host_api_key_headers_secret_name + STORAGE_SIZE = each.value.storage_size + OTEL_COLLECTOR_ENDPOINT_GCP_SECRET_NAME = var.OTEL_COLLECTOR_ENDPOINT_GCP_SECRET_NAME + + depends_on = [kubernetes_namespace_v1.rpc] +} + +module "rpc_gateway" { + source = "../../../modules/rpc-gateway" + + providers = { + helm = helm + kubernetes = kubernetes + google = google + } + + RELEASE_PREFIX = var.RELEASE_PREFIX + CONSUMER_NAMESPACE = var.NAMESPACE + + STICKY_SESSIONS_ENABLED = true + STICKY_SESSION_POLICY_NAME = local.sticky_policy_name + + ROUTES = local.rpc_routes + CONSUMERS = var.CONSUMERS + KONG_OTEL_METRICS_GCP_SECRET_NAME = var.OTEL_COLLECTOR_ENDPOINT_GCP_SECRET_NAME + + depends_on = [module.rpc] +} diff --git a/spartan/terraform/deploy-rpc/modules/environment/outputs.tf b/spartan/terraform/deploy-rpc/modules/environment/outputs.tf new file mode 100644 index 000000000000..c07d474af210 --- /dev/null +++ b/spartan/terraform/deploy-rpc/modules/environment/outputs.tf @@ -0,0 +1,42 @@ +output "rpc_services" { + description = "RPC Service names and ports keyed by alias." 
+ value = { + for name, rpc in module.rpc : name => { + namespace = rpc.namespace + service = rpc.service_name + port = rpc.service_port + hpa = rpc.hpa_name + } + } +} + +output "kong_routes" { + description = "Kong route names keyed by alias." + value = module.rpc_gateway.route_names +} + +output "kong_sticky_session_policy_name" { + description = "Kong sticky session policy name." + value = module.rpc_gateway.sticky_session_policy_name +} + +output "kong_metrics_service" { + description = "Kong metrics Service details for Prometheus scraping." + value = { + namespace = module.rpc_gateway.metrics_service_namespace + service = module.rpc_gateway.metrics_service_name + port = module.rpc_gateway.metrics_service_port + ingress = module.rpc_gateway.metrics_service_load_balancer_ingress + otel_collector = module.rpc_gateway.otel_collector_deployment_name + } +} + +output "frontend_load_balancer_ip" { + description = "Global static IP assigned to the public GKE frontend Ingress." + value = module.rpc_gateway.frontend_load_balancer_ip +} + +output "gcp_managed_certificate_name" { + description = "GKE ManagedCertificate resource name for RPC hosts." + value = module.rpc_gateway.gcp_managed_certificate_name +} diff --git a/spartan/terraform/deploy-rpc/modules/environment/variables.tf b/spartan/terraform/deploy-rpc/modules/environment/variables.tf new file mode 100644 index 000000000000..3714f60ea6ec --- /dev/null +++ b/spartan/terraform/deploy-rpc/modules/environment/variables.tf @@ -0,0 +1,51 @@ +variable "NAMESPACE" { + description = "Namespace for RPC workloads and Kong routes." + type = string +} + +variable "RELEASE_PREFIX" { + description = "Prefix for generated release and Kubernetes resource names." + type = string +} + +variable "RPCS" { + description = "RPC instances keyed by public route alias." 
+ type = map(object({ + aztec_docker_image = string + l1_rpc_secret_name = string + l1_consensus_host_urls_secret_name = string + l1_consensus_host_api_keys_secret_name = string + l1_consensus_host_api_key_headers_secret_name = string + hosts = list(string) + storage_size = string + env = map(string) + })) +} + +variable "ALLOW_ANONYMOUS" { + description = "Whether the RPC gateway allows requests without a valid API key. Missing and invalid keys both use the anonymous consumer." + type = bool + default = false +} + +variable "ANONYMOUS_RATE_LIMIT_MINUTE" { + description = "Per-client-IP anonymous request limit per minute when ALLOW_ANONYMOUS=true. Kong local policy makes this per Kong pod." + type = number + default = 300 +} + +variable "OTEL_COLLECTOR_ENDPOINT_GCP_SECRET_NAME" { + description = "GCP Secret Manager secret containing the OpenTelemetry collector base URL." + type = string + default = "otel-collector-url" +} + +variable "CONSUMERS" { + description = "Kong consumers keyed by team name. Configured consumers can use every keyed RPC route in the environment." 
+ type = map(object({ + username = string + gcp_secret_manager_secret_name = string + rate_limit_minute = number + })) + default = {} +} diff --git a/spartan/terraform/deploy-rpc/modules/rpc/main.tf b/spartan/terraform/deploy-rpc/modules/rpc/main.tf new file mode 100644 index 000000000000..564fa859e9b1 --- /dev/null +++ b/spartan/terraform/deploy-rpc/modules/rpc/main.tf @@ -0,0 +1,247 @@ +terraform { + required_providers { + helm = { + source = "hashicorp/helm" + } + kubernetes = { + source = "hashicorp/kubernetes" + } + } +} + +locals { + aztec_image_parts = split(":", var.AZTEC_DOCKER_IMAGE) + aztec_image_repository = join(":", slice(local.aztec_image_parts, 0, length(local.aztec_image_parts) - 1)) + aztec_image_tag = local.aztec_image_parts[length(local.aztec_image_parts) - 1] + + workload_name = "${var.RELEASE_NAME}-aztec-node" + l1_secret_name = "${var.RELEASE_NAME}-l1" + otel_secret_name = "${var.RELEASE_NAME}-otel" + sticky_policy_name = "${var.RELEASE_PREFIX}-rpc-sticky-sessions" +} + +resource "helm_release" "rpc" { + name = var.RELEASE_NAME + chart = "${path.module}/../../../../aztec-node" + namespace = var.NAMESPACE + create_namespace = false + upgrade_install = true + force_update = true + recreate_pods = true + reuse_values = false + timeout = 600 + wait = true + wait_for_jobs = true + take_ownership = true + + values = [ + file("${path.module}/values/prod.yaml"), + file("${path.module}/values/prod-res.yaml"), + yamlencode({ + fullnameOverride = local.workload_name + replicaCount = 1 + extraObjects = concat( + [ + { + apiVersion = "external-secrets.io/v1" + kind = "ExternalSecret" + metadata = { + name = local.l1_secret_name + namespace = var.NAMESPACE + } + spec = { + refreshInterval = "1m" + secretStoreRef = { + name = "gcp-secret-store" + kind = "ClusterSecretStore" + } + target = { + name = local.l1_secret_name + creationPolicy = "Owner" + template = { + engineVersion = "v2" + type = "Opaque" + data = { + ETHEREUM_HOSTS = "{{`{{ .ethereumHostsJson 
| fromJson | join \",\" }}`}}" + L1_CONSENSUS_HOST_URLS = "{{`{{ .consensusHostUrlsJson | fromJson | join \",\" }}`}}" + L1_CONSENSUS_HOST_API_KEYS = "{{`{{ .consensusHostApiKeysJson | fromJson | join \",\" }}`}}" + L1_CONSENSUS_HOST_API_KEY_HEADERS = "{{`{{ .consensusHostApiKeyHeadersJson | fromJson | join \",\" }}`}}" + } + } + } + data = [ + { + secretKey = "ethereumHostsJson" + remoteRef = { + key = var.L1_RPC_SECRET_NAME + } + }, + { + secretKey = "consensusHostUrlsJson" + remoteRef = { + key = var.L1_CONSENSUS_HOST_URLS_SECRET_NAME + } + }, + { + secretKey = "consensusHostApiKeysJson" + remoteRef = { + key = var.L1_CONSENSUS_HOST_API_KEYS_SECRET_NAME + } + }, + { + secretKey = "consensusHostApiKeyHeadersJson" + remoteRef = { + key = var.L1_CONSENSUS_HOST_API_KEY_HEADERS_SECRET_NAME + } + } + ] + } + } + ], + var.OTEL_COLLECTOR_ENDPOINT_GCP_SECRET_NAME != "" ? [ + { + apiVersion = "external-secrets.io/v1" + kind = "ExternalSecret" + metadata = { + name = local.otel_secret_name + namespace = var.NAMESPACE + } + spec = { + refreshInterval = "1m" + secretStoreRef = { + name = "gcp-secret-store" + kind = "ClusterSecretStore" + } + target = { + name = local.otel_secret_name + creationPolicy = "Owner" + template = { + engineVersion = "v2" + type = "Opaque" + data = { + OTEL_EXPORTER_OTLP_METRICS_ENDPOINT = "{{`{{ .otelCollectorEndpoint | trimSuffix \"/\" }}`}}/v1/metrics" + OTEL_EXPORTER_OTLP_TRACES_ENDPOINT = "{{`{{ .otelCollectorEndpoint | trimSuffix \"/\" }}`}}/v1/traces" + } + } + } + data = [ + { + secretKey = "otelCollectorEndpoint" + remoteRef = { + key = var.OTEL_COLLECTOR_ENDPOINT_GCP_SECRET_NAME + } + } + ] + } + } + ] : [] + ) + + global = { + aztecImage = { + repository = local.aztec_image_repository + tag = local.aztec_image_tag + pullPolicy = "Always" + } + aztecEnv = var.ENV + useGcloudLogging = true + otelCollectorEndpoint = "" + } + + node = { + logLevel = "info" + env = { + OTEL_SERVICE_NAME = var.RELEASE_NAME + } + envFrom = { + secrets = concat( 
+ [{ name = local.l1_secret_name }], + var.OTEL_COLLECTOR_ENDPOINT_GCP_SECRET_NAME != "" ? [{ name = local.otel_secret_name }] : [] + ) + } + proverRealProofs = true + updateStrategy = { + type = "RollingUpdate" + } + } + + persistence = { + enabled = true + } + + statefulSet = { + enabled = true + volumeClaimTemplates = [ + { + metadata = { + name = "data" + } + spec = { + accessModes = ["ReadWriteOnce"] + resources = { + requests = { + storage = var.STORAGE_SIZE + } + } + } + } + ] + } + + service = { + rpc = { + enabled = true + port = 8080 + type = "ClusterIP" + annotations = { + "konghq.com/upstream-policy" = local.sticky_policy_name + } + } + admin = { + enabled = false + } + p2p = { + enabled = true + nodePortEnabled = false + publicIP = true + port = 40400 + announcePort = 40400 + } + } + }) + ] +} + +resource "kubernetes_manifest" "hpa" { + manifest = { + apiVersion = "autoscaling/v2" + kind = "HorizontalPodAutoscaler" + metadata = { + name = "${local.workload_name}-hpa" + namespace = var.NAMESPACE + } + spec = { + scaleTargetRef = { + apiVersion = "apps/v1" + kind = "StatefulSet" + name = local.workload_name + } + minReplicas = 1 + maxReplicas = 4 + metrics = [ + { + type = "Resource" + resource = { + name = "cpu" + target = { + type = "Utilization" + averageUtilization = 70 + } + } + } + ] + } + } + + depends_on = [helm_release.rpc] +} diff --git a/spartan/terraform/deploy-rpc/modules/rpc/outputs.tf b/spartan/terraform/deploy-rpc/modules/rpc/outputs.tf new file mode 100644 index 000000000000..14bfa21be6b9 --- /dev/null +++ b/spartan/terraform/deploy-rpc/modules/rpc/outputs.tf @@ -0,0 +1,34 @@ +output "release_name" { + description = "Helm release name." + value = helm_release.rpc.name +} + +output "workload_name" { + description = "Kubernetes StatefulSet name." + value = local.workload_name +} + +output "service_name" { + description = "RPC Kubernetes Service name." 
+ value = local.workload_name +} + +output "service_port" { + description = "RPC Kubernetes Service port." + value = 8080 +} + +output "namespace" { + description = "Kubernetes namespace containing the RPC workload." + value = var.NAMESPACE +} + +output "l1_secret_name" { + description = "Kubernetes Secret name populated by ExternalSecrets for L1 env vars." + value = local.l1_secret_name +} + +output "hpa_name" { + description = "HorizontalPodAutoscaler name." + value = kubernetes_manifest.hpa.manifest.metadata.name +} diff --git a/spartan/terraform/deploy-rpc/modules/rpc/values/prod-res.yaml b/spartan/terraform/deploy-rpc/modules/rpc/values/prod-res.yaml new file mode 100644 index 000000000000..5a79879f0299 --- /dev/null +++ b/spartan/terraform/deploy-rpc/modules/rpc/values/prod-res.yaml @@ -0,0 +1,23 @@ +nodeSelector: + local-ssd: "false" + node-type: "network" + +affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: cores + operator: In + values: + - "2" + +node: + resources: + requests: + cpu: "0.5" + memory: "2Gi" + limits: + cpu: "1.9" + memory: "6Gi" diff --git a/spartan/terraform/deploy-rpc/modules/rpc/values/prod.yaml b/spartan/terraform/deploy-rpc/modules/rpc/values/prod.yaml new file mode 100644 index 000000000000..ac855a0bc3bd --- /dev/null +++ b/spartan/terraform/deploy-rpc/modules/rpc/values/prod.yaml @@ -0,0 +1,33 @@ +nodeType: "rpc-node" + +node: + preStartScript: | + if [ -n "${BOOT_NODE_HOST:-}" ]; then + until curl --silent --head --fail "${BOOT_NODE_HOST}/status" > /dev/null; do + echo "Waiting for boot node..." + sleep 1 + done + echo "Boot node is ready!" 
+ + export BOOTSTRAP_NODES=$(curl -X POST -H "content-type: application/json" --data '{"method": "bootstrap_getEncodedEnr"}' $BOOT_NODE_HOST | jq -r .result) + fi + startCmd: + - --node + +persistence: + enabled: true + +statefulSet: + enabled: true + +service: + rpc: + enabled: true + type: ClusterIP + p2p: + enabled: true + nodePortEnabled: false + admin: + enabled: false + headless: + enabled: false diff --git a/spartan/terraform/deploy-rpc/modules/rpc/variables.tf b/spartan/terraform/deploy-rpc/modules/rpc/variables.tf new file mode 100644 index 000000000000..2b3c92b3a0ef --- /dev/null +++ b/spartan/terraform/deploy-rpc/modules/rpc/variables.tf @@ -0,0 +1,60 @@ +variable "NAMESPACE" { + description = "Kubernetes namespace to deploy the RPC workload into." + type = string +} + +variable "RELEASE_NAME" { + description = "Helm release name for this RPC instance." + type = string +} + +variable "RELEASE_PREFIX" { + description = "Prefix used for generated RPC gateway resources." + type = string +} + +variable "AZTEC_DOCKER_IMAGE" { + description = "Aztec Docker image in repository:tag form." + type = string + + validation { + condition = can(regex("^.+:.+$", var.AZTEC_DOCKER_IMAGE)) + error_message = "AZTEC_DOCKER_IMAGE must be in repository:tag form, for example aztecprotocol/aztec:latest." + } +} + +variable "ENV" { + description = "Environment variables for the RPC node." + type = map(string) +} + +variable "L1_RPC_SECRET_NAME" { + description = "GCP Secret Manager secret containing the JSON array of L1 execution RPC URLs." + type = string +} + +variable "L1_CONSENSUS_HOST_URLS_SECRET_NAME" { + description = "GCP Secret Manager secret containing the JSON array of L1 consensus host URLs." + type = string +} + +variable "L1_CONSENSUS_HOST_API_KEYS_SECRET_NAME" { + description = "GCP Secret Manager secret containing the JSON array of L1 consensus host API keys." 
+ type = string +} + +variable "L1_CONSENSUS_HOST_API_KEY_HEADERS_SECRET_NAME" { + description = "GCP Secret Manager secret containing the JSON array of L1 consensus host API key headers." + type = string +} + +variable "STORAGE_SIZE" { + description = "Persistent volume size per RPC pod." + type = string +} + +variable "OTEL_COLLECTOR_ENDPOINT_GCP_SECRET_NAME" { + description = "GCP Secret Manager secret containing the OpenTelemetry collector base URL." + type = string + default = "otel-collector-url" +} From f83bb601b938cabf250ff3408a1921d4ea7f6e66 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 11 Jun 2026 12:19:03 +0300 Subject: [PATCH 8/9] feat: add readiness gate (#24002) Adds RPC readiness gating for Spartan deployments. Stack: #23997 -> #23998 -> #23999 -> #24000 -> #24001 -> #24002 Fixes: A-1142, A-1134, A-1135, A-1136. --- spartan/aztec-bot/values.yaml | 2 ++ spartan/aztec-node/README.md | 8 +++++ .../aztec-node/templates/_pod-template.yaml | 15 ++++++++++ spartan/aztec-node/values.yaml | 15 ++++++++++ spartan/aztec-prover-stack/values.yaml | 4 +++ .../values/p2p-bootstrap.yaml | 2 ++ .../terraform/deploy-rpc/modules/rpc/main.tf | 29 +++++++++++++++++++ 7 files changed, 75 insertions(+) diff --git a/spartan/aztec-bot/values.yaml b/spartan/aztec-bot/values.yaml index 2161b124ba76..43c9ce8bda45 100644 --- a/spartan/aztec-bot/values.yaml +++ b/spartan/aztec-bot/values.yaml @@ -96,6 +96,8 @@ bot: startCmd: - --bot + readinessProbe: + enabled: false hostNetwork: false diff --git a/spartan/aztec-node/README.md b/spartan/aztec-node/README.md index a8592b795505..4c656b531938 100644 --- a/spartan/aztec-node/README.md +++ b/spartan/aztec-node/README.md @@ -159,6 +159,14 @@ service: | node.nodeJsOptions | ["--no-warnings", "--max-old-space-size=4096"] | Node.js options | | node.startupProbe.periodSeconds | 30 | Period seconds for startup probe | | node.startupProbe.failureThreshold | 3 | Failure threshold for startup probe | +| node.readinessProbe.enabled | true | 
Enable readiness probe rendering | +| node.readinessProbe.exec | sync status check | Exec readiness probe config | +| node.readinessProbe.httpGet | null | HTTP readiness probe config | +| node.readinessProbe.periodSeconds | 15 | Period seconds for readiness probe | +| node.readinessProbe.timeoutSeconds | 2 | Timeout seconds for readiness probe | +| node.readinessProbe.initialDelaySeconds | 0 | Initial delay before readiness probes | +| node.readinessProbe.failureThreshold | 1 | Failure threshold for readiness probe | +| node.readinessProbe.successThreshold | 1 | Success threshold for readiness probe | | persistence.enabled | false | Enable persistence (uses emptyDir when disabled) | | persistence.existingClaim | null | Use an existing PVC | | persistence.accessModes | ["ReadWriteOnce"] | Access modes for persistence | diff --git a/spartan/aztec-node/templates/_pod-template.yaml b/spartan/aztec-node/templates/_pod-template.yaml index 1cd1228e2aa8..6837e0f5bba5 100644 --- a/spartan/aztec-node/templates/_pod-template.yaml +++ b/spartan/aztec-node/templates/_pod-template.yaml @@ -119,6 +119,21 @@ spec: timeoutSeconds: {{ .Values.node.startupProbe.timeoutSeconds }} initialDelaySeconds: {{ .Values.node.startupProbe.initialDelaySeconds }} failureThreshold: {{ .Values.node.startupProbe.failureThreshold }} + {{- if .Values.node.readinessProbe.enabled }} + readinessProbe: + {{- if .Values.node.readinessProbe.exec }} + exec: +{{ toYaml .Values.node.readinessProbe.exec | indent 10 }} + {{- else if .Values.node.readinessProbe.httpGet }} + httpGet: +{{ toYaml .Values.node.readinessProbe.httpGet | indent 10 }} + {{- end }} + periodSeconds: {{ .Values.node.readinessProbe.periodSeconds }} + timeoutSeconds: {{ .Values.node.readinessProbe.timeoutSeconds }} + initialDelaySeconds: {{ .Values.node.readinessProbe.initialDelaySeconds }} + failureThreshold: {{ .Values.node.readinessProbe.failureThreshold }} + successThreshold: {{ .Values.node.readinessProbe.successThreshold }} + {{- end }} 
volumeMounts: - name: shared mountPath: /shared diff --git a/spartan/aztec-node/values.yaml b/spartan/aztec-node/values.yaml index f053da91ca8e..345e9b2f3340 100644 --- a/spartan/aztec-node/values.yaml +++ b/spartan/aztec-node/values.yaml @@ -138,6 +138,21 @@ node: # 20 minutes default but this might not be enough if the node has to download a lot of blocks. failureThreshold: 40 + readinessProbe: + enabled: true + exec: + command: + - /bin/sh + - -ec + - | + curl -sf -H 'content-type: application/json' --data '{"jsonrpc":"2.0","id":1,"method":"node_getWorldStateSyncStatus","params":[]}' "http://127.0.0.1:${AZTEC_PORT:-8080}" | jq -e '.result.treesAreSynched == true' > /dev/null + httpGet: null + periodSeconds: 15 + timeoutSeconds: 2 + initialDelaySeconds: 0 + failureThreshold: 1 + successThreshold: 1 + resources: {} proverRealProofs: true diff --git a/spartan/aztec-prover-stack/values.yaml b/spartan/aztec-prover-stack/values.yaml index 17631052e3a8..ed4d3a4bfde5 100644 --- a/spartan/aztec-prover-stack/values.yaml +++ b/spartan/aztec-prover-stack/values.yaml @@ -69,6 +69,8 @@ broker: RPC_MAX_BODY_SIZE: "50mb" startCmd: - --prover-broker + readinessProbe: + enabled: false hostNetwork: false @@ -111,6 +113,8 @@ agent: source /scripts/wait-for-broker.sh startCmd: - --prover-agent + readinessProbe: + enabled: false service: p2p: diff --git a/spartan/terraform/deploy-aztec-infra/values/p2p-bootstrap.yaml b/spartan/terraform/deploy-aztec-infra/values/p2p-bootstrap.yaml index f4b438d6d67a..8e10cefb8268 100644 --- a/spartan/terraform/deploy-aztec-infra/values/p2p-bootstrap.yaml +++ b/spartan/terraform/deploy-aztec-infra/values/p2p-bootstrap.yaml @@ -12,6 +12,8 @@ node: startCmd: - --p2p-bootstrap + readinessProbe: + enabled: false service: p2p: diff --git a/spartan/terraform/deploy-rpc/modules/rpc/main.tf b/spartan/terraform/deploy-rpc/modules/rpc/main.tf index 564fa859e9b1..c94fc14b1fd6 100644 --- a/spartan/terraform/deploy-rpc/modules/rpc/main.tf +++ 
b/spartan/terraform/deploy-rpc/modules/rpc/main.tf @@ -228,6 +228,35 @@ resource "kubernetes_manifest" "hpa" { } minReplicas = 1 maxReplicas = 4 + behavior = { + scaleUp = { + stabilizationWindowSeconds = 600 + selectPolicy = "Max" + policies = [ + { + type = "Pods" + value = 4 + periodSeconds = 15 + }, + { + type = "Percent" + value = 100 + periodSeconds = 15 + } + ] + } + scaleDown = { + stabilizationWindowSeconds = 300 + selectPolicy = "Max" + policies = [ + { + type = "Percent" + value = 100 + periodSeconds = 15 + } + ] + } + } metrics = [ { type = "Resource" From b2d6925b8c13aafb066b95ae5c465491c25dc520 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 11 Jun 2026 13:11:58 +0300 Subject: [PATCH 9/9] chore: update IAM roles (#24011) . --- spartan/terraform/gke-cluster/iam.tf | 45 +++++++++++++++++++++------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/spartan/terraform/gke-cluster/iam.tf b/spartan/terraform/gke-cluster/iam.tf index 10663f891ed2..e7bc9cb99843 100644 --- a/spartan/terraform/gke-cluster/iam.tf +++ b/spartan/terraform/gke-cluster/iam.tf @@ -1,3 +1,18 @@ +locals { + network_deployer_roles = toset([ + "roles/container.admin", + "roles/storage.admin", + "roles/secretmanager.admin", + "roles/compute.loadBalancerAdmin", + "roles/dns.admin" + ]) + + ci_observer_roles = toset([ + "roles/logging.viewer", + "roles/monitoring.viewer" + ]) +} + # Create the service account resource "google_service_account" "gke_sa" { account_id = "aztec-gke-nodes-sa" @@ -29,23 +44,31 @@ resource "google_service_account" "helm_sa" { # Add IAM roles to the Helm service account resource "google_project_iam_member" "helm_sa_roles" { - for_each = toset([ - "roles/container.admin", - "roles/storage.admin", - "roles/secretmanager.admin", - "roles/compute.loadBalancerAdmin", - "roles/dns.admin" - ]) - project = var.project - role = each.key - member = "serviceAccount:${google_service_account.helm_sa.email}" + for_each = local.network_deployer_roles + project = 
var.project + role = each.key + member = "serviceAccount:${google_service_account.helm_sa.email}" } # Create a service account for CI resource "google_service_account" "ci" { account_id = var.ci_service_account_id display_name = "CI Service Account" - description = "Service account for CI jobs that publish Docker images" + description = "Service account for CI jobs that publish internal artifacts and deploy networks" +} + +resource "google_project_iam_member" "ci_network_deployer_roles" { + for_each = local.network_deployer_roles + project = var.project + role = each.key + member = "serviceAccount:${google_service_account.ci.email}" +} + +resource "google_project_iam_member" "ci_observer_roles" { + for_each = local.ci_observer_roles + project = var.project + role = each.key + member = "serviceAccount:${google_service_account.ci.email}" } resource "google_service_account" "npm_registry_reader" {