diff --git a/.test_patterns.yml b/.test_patterns.yml index b740808ec72b..498fc75e22ad 100644 --- a/.test_patterns.yml +++ b/.test_patterns.yml @@ -371,7 +371,7 @@ tests: owners: - *palla - - regex: "yarn-project/end-to-end/scripts/run_test.sh ha src/composed/ha/e2e_ha_full.test.ts" + - regex: "yarn-project/end-to-end/scripts/run_test.sh ha src/composed/ha/e2e_ha_full.parallel.test.ts" owners: - *spyros diff --git a/yarn-project/end-to-end/bootstrap.sh b/yarn-project/end-to-end/bootstrap.sh index 18fd910e9372..3c9806c4985c 100755 --- a/yarn-project/end-to-end/bootstrap.sh +++ b/yarn-project/end-to-end/bootstrap.sh @@ -96,7 +96,13 @@ function test_cmds { ) for test in "${tests[@]}"; do # We must set ONLY_TERM_PARENT=1 to allow the script to fully control cleanup process. - echo "$hash:ONLY_TERM_PARENT=1:TIMEOUT=30m $run_test_script ha $test" + if [[ "$test" == *.parallel.test.ts ]]; then + while IFS= read -r test_name; do + echo "$hash:ONLY_TERM_PARENT=1:TIMEOUT=30m $run_test_script ha $test \"$test_name\"" + done < <(extract_test_names "$test") + else + echo "$hash:ONLY_TERM_PARENT=1:TIMEOUT=30m $run_test_script ha $test" + fi done #echo "$hash:ONLY_TERM_PARENT=1 $run_test_script simple src/e2e_multi_validator/e2e_multi_validator_node.test.ts" diff --git a/yarn-project/end-to-end/scripts/ha/docker-compose.yml b/yarn-project/end-to-end/scripts/ha/docker-compose.yml index eb8ecad5d320..cb700f840159 100644 --- a/yarn-project/end-to-end/scripts/ha/docker-compose.yml +++ b/yarn-project/end-to-end/scripts/ha/docker-compose.yml @@ -29,12 +29,6 @@ services: volumes: - web3signer_keys:/keys - anvil: - image: aztecprotocol/build:3.0 - cpus: 1 - mem_limit: 2G - entrypoint: 'anvil --silent -p 8545 --host 0.0.0.0 --chain-id 31337' - end-to-end: image: aztecprotocol/build:3.0 cpus: 4 @@ -51,7 +45,8 @@ services: environment: JEST_CACHE_DIR: /tmp-jest LOG_LEVEL: ${LOG_LEVEL:-verbose} - ETHEREUM_HOSTS: http://anvil:8545 + TEST: ${TEST:-./src/composed/ha/e2e_ha_full.parallel.test.ts} + TEST_NAME: ${TEST_NAME:-} L1_CHAIN_ID: 31337 DATABASE_URL: postgresql://aztec:aztec@postgres:5432/aztec_ha_test WEB3_SIGNER_URL: http://web3signer:9000 @@ -70,10 +65,6 @@ services: while ! nc -z web3signer 9000; do sleep 1; done; echo "Web3Signer is ready" - # Wait for anvil to be ready - while ! nc -z anvil 8545; do sleep 1; done; - echo "Anvil is ready" - # Run database migrations echo "Running database migrations..." cd /root/aztec-packages/yarn-project/aztec @@ -84,7 +75,7 @@ services: cd /root/aztec-packages/yarn-project/end-to-end # Run the test - setsid ./scripts/test_simple.sh ${TEST:-./src/composed/ha/e2e_ha_sequencer.test.ts} & + setsid ./scripts/test_simple.sh "$${TEST}" "$${TEST_NAME}" & pid=$$! pgid=$$(($$(ps -o pgid= -p $$pid))) trap "kill -SIGTERM -$$pgid" SIGTERM @@ -96,8 +87,6 @@ services: condition: service_healthy web3signer: condition: service_started - anvil: - condition: service_started volumes: postgres_data: diff --git a/yarn-project/end-to-end/scripts/run_test.sh b/yarn-project/end-to-end/scripts/run_test.sh index 7f5ca7b8219a..c5a06472e5e5 100755 --- a/yarn-project/end-to-end/scripts/run_test.sh +++ b/yarn-project/end-to-end/scripts/run_test.sh @@ -25,7 +25,10 @@ case "$type" in TEST=$test exec run_compose_test $test end-to-end $PWD/web3signer ;; "ha") - # Remove volumes on cleanup for HA tests to ensure clean database state on retries - TEST=$test REMOVE_COMPOSE_VOLUMES=1 exec run_compose_test $test end-to-end $PWD/ha + # Remove volumes on cleanup for HA tests to ensure clean database state on retries. + # NAME_POSTFIX namespaces the compose project per test so parallel per-test jobs don't collide. + # Compose project names must be lowercase alphanumerics, hyphens, and underscores. + postfix=$(echo "$test_name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/_/g') + TEST=$test TEST_NAME=$test_name NAME_POSTFIX=${postfix:+_$postfix} REMOVE_COMPOSE_VOLUMES=1 exec run_compose_test $test end-to-end $PWD/ha ;; esac diff --git a/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.test.ts b/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts similarity index 88% rename from yarn-project/end-to-end/src/composed/ha/e2e_ha_full.test.ts rename to yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts index 6c3a52616b8d..87c7617c5844 100644 --- a/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.test.ts +++ b/yarn-project/end-to-end/src/composed/ha/e2e_ha_full.parallel.test.ts @@ -15,7 +15,6 @@ import type { Logger } from '@aztec/aztec.js/log'; import { type AztecNode, waitForTx } from '@aztec/aztec.js/node'; import { GovernanceProposerContract } from '@aztec/ethereum/contracts'; import type { DeployAztecL1ContractsReturnType } from '@aztec/ethereum/deploy-aztec-l1-contracts'; -import type { EthCheatCodes } from '@aztec/ethereum/test'; import { BlockNumber, CheckpointNumber, SlotNumber } from '@aztec/foundation/branded-types'; import { Buffer32 } from '@aztec/foundation/buffer'; import { SecretValue } from '@aztec/foundation/config'; @@ -70,16 +69,6 @@ const NODE_COUNT = 5; const VALIDATOR_COUNT = 4; const COMMITTEE_SIZE = 4; -type SyncImmediateBlockSource = { - syncImmediate: () => Promise; -}; - -function hasSyncImmediate(value: unknown): value is SyncImmediateBlockSource { - return ( - typeof value === 'object' && value !== null && 'syncImmediate' in value && typeof value.syncImmediate === 'function' - ); -} - async function getHardcodedAccountData(secret: Fr, salt: Fr): Promise { const contract = new SchnorrHardcodedKeyAccountContract(); const address = await getAccountContractAddress(contract, secret, salt); @@ -120,36 +109,7 @@ async function waitForTriggerTx(node: AztecNode, txHash: TxHash): Promise { - const latestBlock = await node.getBlockData('latest'); - if (!latestBlock) { - throw new Error('Could not load latest block for HA trigger tx'); - } - - const nextBlockTimestamp = latestBlock.header.globalVariables.timestamp + BigInt(aztecSlotDuration); - dateProvider.setTime(Number(nextBlockTimestamp) * 1000); -} - -async function sendTriggerTx( - wallet: TestWallet, - node: AztecNode, - testContract: TestContract, - from: AztecAddress, - syncL1Data: () => Promise, - alignTimeToNextBlockSlot: () => Promise, -): Promise { - await alignTimeToNextBlockSlot(); - const txHash = await submitTriggerTx(wallet, testContract, from); - await syncL1Data(); - return await waitForTriggerTx(node, txHash); -} - -// TODO: re-enable once HA block building is reconciled with the always-enforced timetable (#23821). -describe.skip('HA Full Setup', () => { +describe('HA Full Setup', () => { jest.setTimeout(20 * 60 * 1000); // 20 minutes let logger: Logger; @@ -158,7 +118,6 @@ describe.skip('HA Full Setup', () => { let testContract: TestContract; let aztecNode: AztecNode; let config: AztecNodeConfig; - let ethCheatCodes: EthCheatCodes; let teardown: () => Promise = async () => {}; let dateProvider: TestDateProvider; let genesis: GenesisData | undefined; @@ -200,27 +159,10 @@ describe.skip('HA Full Setup', () => { logger.info('All HA peer sequencers started'); }; - const syncHAL1Data = async () => { - const l1BlocksPerSyncNudge = Math.ceil((config.aztecSlotDuration * 2) / config.ethereumSlotDuration); - await ethCheatCodes.mine(l1BlocksPerSyncNudge); - await Promise.all( - haNodeServices.map(async service => { - try { - const blockSource = service.getBlockSource(); - if (hasSyncImmediate(blockSource)) { - await blockSource.syncImmediate(); - } - } catch (error) { - logger.debug('Skipping HA L1 sync nudge for stopped node', { - error: error instanceof Error ? error.message : String(error), - }); - } - }), - ); - }; - - const alignDateProviderToNextBlockSlot = async () => { - await setDateProviderToNextBlockSlot(aztecNode, dateProvider, config.aztecSlotDuration); + const sendTriggerTx = async (): Promise => { + await startHASequencers(); + const txHash = await submitTriggerTx(wallet, testContract, ownerAddress); + return await waitForTriggerTx(aztecNode, txHash); }; const stopHANode = async (nodeIndex: number) => { @@ -282,39 +224,39 @@ describe.skip('HA Full Setup', () => { const initialValidators = createInitialValidatorsFromPrivateKeys(attesterPrivateKeys); const hardcodedAccountData = await getHardcodedAccountData(Fr.random(), Fr.random()); - ({ teardown, logger, wallet, aztecNode, config, ethCheatCodes, dateProvider, deployL1ContractsValues, genesis } = - await setup( - 0, - { - ...PIPELINING_SETUP_OPTS, - initialFundedAccounts: [hardcodedAccountData], - initialValidators, - sequencerPublisherPrivateKeys: [new SecretValue(publisherPrivateKeys[0])], - aztecTargetCommitteeSize: COMMITTEE_SIZE, - // The full HA docker/Web3Signer stack can still be joining and syncing after the shared - // 12s pipelining preset's 2.5s start window has closed. Keep real sequencing, but give - // HA validators enough time to pass the enforced build-start gate in CI. - aztecSlotDuration: 16, - // This suite validates HA coordination on tx-bearing checkpoints. Requiring one tx avoids a startup empty - // checkpoint from occupying the shared HA publisher while the trigger tx is still being prepared. - minTxsPerBlock: 1, - archiverPollingIntervalMS: 200, - sequencerPollingIntervalMS: 200, - worldStateBlockCheckIntervalMS: 200, - blockCheckIntervalMS: 200, - startProverNode: true, - // The bootstrap node is only an RPC/P2P anchor. HA validators are the first block producers in this suite. - disableValidator: true, - skipAccountDeployment: true, - // Enable P2P for transaction gossip - p2pEnabled: true, - // Enable slashing for testing governance + slashing vote coordination - slasherEnabled: true, - slashingRoundSizeInEpochs: 1, // 32 slots (1 epoch) - slashingQuorum: 17, // >50% of 32 slots for tally quorum, - }, - { syncChainTip: 'proven' }, - )); + ({ teardown, logger, wallet, aztecNode, config, dateProvider, deployL1ContractsValues, genesis } = await setup( + 0, + { + ...PIPELINING_SETUP_OPTS, + automineL1Setup: true, + initialFundedAccounts: [hardcodedAccountData], + initialValidators, + sequencerPublisherPrivateKeys: [new SecretValue(publisherPrivateKeys[0])], + aztecTargetCommitteeSize: COMMITTEE_SIZE, + // The full HA docker/Web3Signer stack can still be joining and syncing after the shared + // 12s pipelining preset's 2.5s start window has closed. Keep real sequencing, but give + // HA validators enough time to pass the enforced build-start gate in CI. + aztecSlotDuration: 16, + // This suite validates HA coordination on tx-bearing checkpoints. Requiring one tx avoids a startup empty + // checkpoint from occupying the shared HA publisher while the trigger tx is still being prepared. + minTxsPerBlock: 1, + archiverPollingIntervalMS: 200, + sequencerPollingIntervalMS: 200, + worldStateBlockCheckIntervalMS: 200, + blockCheckIntervalMS: 200, + startProverNode: true, + // The bootstrap node is only an RPC/P2P anchor. HA validators are the first block producers in this suite. + disableValidator: true, + skipAccountDeployment: true, + // Enable P2P for transaction gossip + p2pEnabled: true, + // Enable slashing for testing governance + slashing vote coordination + slasherEnabled: true, + slashingRoundSizeInEpochs: 1, // 32 slots (1 epoch) + slashingQuorum: 17, // >50% of 32 slots for tally quorum, + }, + { syncChainTip: 'proven' }, + )); ownerAddress = await registerHardcodedAccount(wallet, hardcodedAccountData); testContract = await registerTestContract(wallet); @@ -435,27 +377,34 @@ describe.skip('HA Full Setup', () => { }); afterAll(async () => { - dateProvider?.reset(); - - // Stop all HA peer nodes in parallel with a per-node deadline. A single stuck node can otherwise - // block the serial loop long enough to blow the jest hook timeout — e.g. a sequencer.stop() that - // awaits an L1 publish whose tx-timeout was computed on a test-warped clock and never fires. + // Stop all sequencers before tearing down the nodes: a sequencer stop awaits its in-flight + // iteration, which can spend tens of seconds finishing a vote or checkpoint publish on L1. + // Stops must be awaited fully — jest runs without forceExit, so a node abandoned mid-stop + // outlives the test environment and keeps the worker process alive until the CI job timeout. + // The dateProvider reset must wait until nodes are stopped: it rewinds the shared clock from + // chain time to wall time (minutes apart after the automine deploy burst), and any publisher + // deadline armed against the rewound clock would block shutdown until wall time catches up. if (haNodeServices) { - const STOP_DEADLINE_MS = 30_000; await Promise.allSettled( - haNodeServices.map((_, i) => { - return Promise.race([ - stopHANode(i).catch(error => { - logger.error(`Failed to stop HA peer node ${i}: ${error}`); - }), - sleep(STOP_DEADLINE_MS).then(() => { - logger.error(`HA peer node ${i} stop did not return within ${STOP_DEADLINE_MS}ms; abandoning`); - }), - ]); + haNodeServices.map(async (service, i) => { + try { + await service.getSequencer()?.stop(); + } catch (error) { + logger.error(`Failed to stop sequencer of HA peer node ${i}: ${error}`); + } }), ); + await Promise.allSettled( + haNodeServices.map((_, i) => + stopHANode(i).catch(error => { + logger.error(`Failed to stop HA peer node ${i}: ${error}`); + }), + ), + ); } + dateProvider?.reset(); + // Cleanup HA keystore temp directories if (haKeystoreDirs) { for (let i = 0; i < haKeystoreDirs.length; i++) { @@ -504,12 +453,7 @@ describe.skip('HA Full Setup', () => { // so HA validators are the first block producers exercised by this suite. logger.info(`Sending trigger tx from ${ownerAddress}`); const txHash = await submitTriggerTx(wallet, testContract, ownerAddress); - // HA nodes cold-start with their archivers synced through the previous L2 slot. Move the - // test clock back one slot before starting their sequencers so the first HA proposal builds - // the next slot their local sync gate permits, instead of immediately chasing a future slot. - dateProvider.setTime(dateProvider.now() - config.aztecSlotDuration * 1000); await startHASequencers(); - await syncHAL1Data(); const receipt = await waitForTriggerTx(aztecNode, txHash); expect(receipt.blockNumber).toBeDefined(); @@ -625,14 +569,7 @@ describe.skip('HA Full Setup', () => { // Send a transaction to trigger block building which will also trigger voting logger.info('Sending transaction to trigger block building...'); - const receipt = await sendTriggerTx( - wallet, - aztecNode, - testContract, - ownerAddress, - syncHAL1Data, - alignDateProviderToNextBlockSlot, - ); + const receipt = await sendTriggerTx(); expect(receipt.blockNumber).toBeDefined(); logger.info(`Transaction mined in block ${receipt.blockNumber}`); @@ -815,14 +752,7 @@ describe.skip('HA Full Setup', () => { verifyNodeAttesters(i, i < 3 ? groupB : groupA, i < 3 ? 'group B (swapped)' : 'group A (swapped)'); } - const receipt = await sendTriggerTx( - wallet, - aztecNode, - testContract, - ownerAddress, - syncHAL1Data, - alignDateProviderToNextBlockSlot, - ); + const receipt = await sendTriggerTx(); expect(receipt.blockNumber).toBeDefined(); const [block] = await aztecNode.getBlocks(receipt.blockNumber!, 1, { includeL1PublishInfo: true, @@ -865,14 +795,7 @@ describe.skip('HA Full Setup', () => { logger.info(`\n=== Producing block ${i + 1}/${blockCount} ===`); logger.info(`Active nodes: ${haNodeServices.length - killedNodes.length}/${NODE_COUNT}`); - const receipt = await sendTriggerTx( - wallet, - aztecNode, - testContract, - ownerAddress, - syncHAL1Data, - alignDateProviderToNextBlockSlot, - ); + const receipt = await sendTriggerTx(); expect(receipt.blockNumber).toBeDefined(); @@ -1026,7 +949,6 @@ describe.skip('HA Full Setup', () => { ); expect(equivocationOffenses).toEqual([]); - dateProvider.reset(); await Promise.all(haNodeServices.map((_, nodeIndex) => stopHANode(nodeIndex))); }); @@ -1106,7 +1028,7 @@ describe.skip('HA Full Setup', () => { } }); - it('should not delete recent duties when node clock is ahead (using cleanupOldDuties)', async () => { + it('should not delete recent duties via cleanupOldDuties when node clock is ahead', async () => { const spDb = new PostgresSlashingProtectionDatabase(mainPool); // Ensure clean slate for this test @@ -1168,7 +1090,7 @@ describe.skip('HA Full Setup', () => { expect(result.rows.length).toBe(1); }); - it('should delete old duties based on DB time, not node time (using cleanupOldDuties)', async () => { + it('should delete old duties via cleanupOldDuties based on DB time, not node time', async () => { const spDb = new PostgresSlashingProtectionDatabase(mainPool); // Ensure clean slate for this test @@ -1237,7 +1159,7 @@ describe.skip('HA Full Setup', () => { expect(result.rows.length).toBe(0); }); - it('should not delete recent stuck duties when node clock is ahead (using cleanupOwnStuckDuties)', async () => { + it('should not delete recent stuck duties via cleanupOwnStuckDuties when node clock is ahead', async () => { const spDb = new PostgresSlashingProtectionDatabase(mainPool); // Create a signing duty (stuck, not completed) using our actual method diff --git a/yarn-project/sequencer-client/src/publisher/sequencer-publisher.test.ts b/yarn-project/sequencer-client/src/publisher/sequencer-publisher.test.ts index bf7b7f2fafb8..519fbdb6dfed 100644 --- a/yarn-project/sequencer-client/src/publisher/sequencer-publisher.test.ts +++ b/yarn-project/sequencer-client/src/publisher/sequencer-publisher.test.ts @@ -720,6 +720,28 @@ describe('SequencerPublisher', () => { expect((publisher as any).requests.length).toEqual(0); }); + it('does not sleep in sendRequestsAt if interrupted beforehand', async () => { + // A target slot far enough in the future that sendRequestsAt would sleep for ~1 hour + // (EmptyL1RollupConstants has slotDuration 1s and l1GenesisTime 0, so slot N starts at N seconds). + const targetSlot = SlotNumber(Math.ceil(Date.now() / 1000) + 3600); + publisher.interrupt(); + + let timeout: NodeJS.Timeout | undefined; + try { + const result = await Promise.race([ + publisher.sendRequestsAt(targetSlot), + new Promise<'timed-out'>(resolve => { + timeout = setTimeout(() => resolve('timed-out'), 1000); + }), + ]); + expect(result).toBeUndefined(); + } finally { + if (timeout) { + clearTimeout(timeout); + } + } + }); + it('does not send requests if no valid requests are found', async () => { publisher.addRequest({ action: 'propose', diff --git a/yarn-project/sequencer-client/src/publisher/sequencer-publisher.ts b/yarn-project/sequencer-client/src/publisher/sequencer-publisher.ts index 96754bfeea39..738e83d5d5b5 100644 --- a/yarn-project/sequencer-client/src/publisher/sequencer-publisher.ts +++ b/yarn-project/sequencer-client/src/publisher/sequencer-publisher.ts @@ -630,6 +630,9 @@ export class SequencerPublisher { // Aim to be in the mempool one L1 slot before the L2 slot starts, so we have a chance of // being picked up by the first L1 block of the L2 slot. const submitAfterMs = startOfTargetSlotMs - Number(this.ethereumSlotDuration) * 1000; + if (this.interrupted) { + return undefined; + } const sleepMs = submitAfterMs - this.dateProvider.now(); if (sleepMs > 0) { this.log.debug(`Sleeping ${sleepMs}ms before sending requests`, { diff --git a/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.test.ts b/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.test.ts index 176e6d4ea83a..f1da2c37dcee 100644 --- a/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.test.ts +++ b/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.test.ts @@ -14,6 +14,7 @@ import { TimeoutError } from '@aztec/foundation/error'; import { EthAddress } from '@aztec/foundation/eth-address'; import { Signature } from '@aztec/foundation/eth-signature'; import { createLogger } from '@aztec/foundation/log'; +import { promiseWithResolvers } from '@aztec/foundation/promise'; import { TestDateProvider } from '@aztec/foundation/timer'; import type { TypedEventEmitter } from '@aztec/foundation/types'; import { type P2P, P2PClientState } from '@aztec/p2p'; @@ -1735,6 +1736,39 @@ describe('CheckpointProposalJob', () => { } }); + it('interrupts a pending L1 submission sleeping in the publisher', async () => { + const { txs, block } = await setupTxsAndBlock(p2p, globalVariables, 1, chainId); + checkpointBuilder.seedBlocks([block], [txs]); + validatorClient.collectAttestations.mockResolvedValue(getAttestations(block)); + + // Simulate sendRequestsAt sleeping until the target slot: the promise only resolves once + // the publisher itself is interrupted. + const sendDeferred = promiseWithResolvers(); + publisher.sendRequestsAt.mockReturnValue(sendDeferred.promise); + publisher.interrupt.mockImplementation(() => sendDeferred.resolve(undefined)); + + const checkpoint = await job.execute(); + expect(checkpoint).toBeDefined(); + + const pendingSubmission = job.awaitPendingSubmission().then(() => 'stopped' as const); + job.interrupt(); + + let timeout: NodeJS.Timeout | undefined; + try { + const result = await Promise.race([ + pendingSubmission, + new Promise<'timed-out'>(resolve => { + timeout = setTimeout(() => resolve('timed-out'), 1000); + }), + ]); + expect(result).toBe('stopped'); + } finally { + if (timeout) { + clearTimeout(timeout); + } + } + }); + it('aborts checkpoint when syncing proposed block to archiver fails', async () => { const { txs, block } = await setupTxsAndBlock(p2p, globalVariables, 1, chainId); checkpointBuilder.seedBlocks([block], [txs]); diff --git a/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.ts b/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.ts index eb9f24087950..613d5080749a 100644 --- a/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.ts +++ b/yarn-project/sequencer-client/src/sequencer/checkpoint_proposal_job.ts @@ -189,10 +189,11 @@ export class CheckpointProposalJob implements Traceable { await this.pendingL1Submission; } - /** Interrupts job-owned waits so shutdown can finish. */ + /** Interrupts job-owned waits, including the publisher's send-at-slot sleep, so shutdown can finish. */ public interrupt(): void { this.interrupted = true; this.interruptibleSleep.interrupt(true); + this.publisher.interrupt(); } private async awaitInterruptibleSleep(ms: number): Promise {