From f0d63a34ad8744ce623032c6a4ef4fa505d98b8b Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Mon, 18 May 2026 13:41:06 -0700 Subject: [PATCH 01/23] resetOffsetsAndBackfill using bounded stream supervisor --- docs/api-reference/supervisor-api.md | 95 +++++++ .../supervisor/RabbitStreamSupervisor.java | 6 +- .../kafka/supervisor/KafkaSupervisor.java | 10 +- .../kinesis/supervisor/KinesisSupervisor.java | 4 +- .../supervisor/SupervisorManager.java | 117 ++++++++ .../supervisor/SupervisorResource.java | 39 +++ .../supervisor/SeekableStreamSupervisor.java | 8 +- .../supervisor/SupervisorManagerTest.java | 261 ++++++++++++++++++ .../supervisor/SupervisorResourceTest.java | 70 +++++ .../SeekableStreamSupervisorStateTest.java | 8 +- .../SeekableStreamSupervisorTestBase.java | 4 +- 11 files changed, 602 insertions(+), 20 deletions(-) diff --git a/docs/api-reference/supervisor-api.md b/docs/api-reference/supervisor-api.md index d321af143020..73a365e2ff91 100644 --- a/docs/api-reference/supervisor-api.md +++ b/docs/api-reference/supervisor-api.md @@ -3539,6 +3539,101 @@ when the supervisor's tasks restart, they resume reading from `{"0": 100, "1": 1 ``` +### Reset offsets and start a backfill supervisor + +Resets the supervisor to the latest available stream offsets and starts a new bounded backfill supervisor to ingest the data in the skipped range. + +This endpoint is useful when a supervisor has fallen behind and you want to catch it up to the latest offsets without losing the skipped data. The main supervisor resumes ingesting from the latest offsets, while the backfill supervisor processes the range from the previously checkpointed offsets up to the latest offsets at the time of the reset. + +The following requirements must be met before calling this endpoint: + +- The supervisor must be a `SeekableStreamSupervisor`. +- The supervisor's `useEarliestSequenceNumber` property must be `false`. 
+- The supervisor context must have `useConcurrentLocks` set to `true` to allow the backfill supervisor's tasks to write concurrently with the main supervisor's tasks. +- The supervisor must be in a `RUNNING` state so that it can query the latest offsets from the stream. + +The backfill supervisor has the same configuration as the source supervisor except for its ID, which takes the form `{supervisorId}_backfill_{randomSuffix}`, and its `boundedStreamConfig`, which is set to the skipped offset range. If `backfillTaskCount` is specified, it overrides the `taskCount` for the backfill supervisor only. + +#### URL + +`POST` `/druid/indexer/v1/supervisor/{supervisorId}/resetOffsetsAndBackfill` + +#### Query parameters + +| Parameter | Type | Description | Default | +|---------|---------|---------|---------| +| `backfillTaskCount` | Integer | Number of parallel tasks for the backfill supervisor. If not specified, inherits `taskCount` from the source supervisor. | None | + +#### Responses + + + + + + +*Successfully reset and started backfill supervisor* + + + + + +*Supervisor does not meet requirements (wrong type, `useEarliestSequenceNumber` is true, `useConcurrentLocks` not enabled, or supervisor not RUNNING)* + + + + + +*Invalid supervisor ID* + + + + + +*Failed to retrieve stream offsets or serialize the backfill spec* + + + + +--- + +#### Sample request + +The following example resets a supervisor named `social_media` and starts a backfill supervisor with 2 tasks. + + + + + + +```shell +curl --request POST "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/supervisor/social_media/resetOffsetsAndBackfill?backfillTaskCount=2" +``` + + + + + +```HTTP +POST /druid/indexer/v1/supervisor/social_media/resetOffsetsAndBackfill?backfillTaskCount=2 HTTP/1.1 +Host: http://ROUTER_IP:ROUTER_PORT +``` + + + + +#### Sample response + +
<details> + <summary>View the response</summary> + + ```json +{ + "id": "social_media", + "backfillSupervisorId": "social_media_backfill_abcdefgh" +} + ``` +</details>
+ ### Terminate a supervisor Terminates a supervisor and its associated indexing tasks, triggering the publishing of their segments. When you terminate a supervisor, Druid places a tombstone marker in the metadata store to prevent reloading on restart. diff --git a/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisor.java b/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisor.java index 6099105b3374..04973a5272fd 100644 --- a/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisor.java +++ b/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisor.java @@ -322,7 +322,7 @@ protected Map getTimeLagPerPartition(Map currentOffs } @Override - protected RabbitStreamDataSourceMetadata createDataSourceMetaDataForReset(String topic, Map map) + public RabbitStreamDataSourceMetadata createDataSourceMetaDataForReset(String topic, Map map) { return new RabbitStreamDataSourceMetadata(new SeekableStreamEndSequenceNumbers<>(topic, map)); } @@ -408,7 +408,7 @@ public LagStats computeLagStats() } @Override - protected void updatePartitionLagFromStream() + public void updatePartitionLagFromStream() { getRecordSupplierLock().lock(); @@ -435,7 +435,7 @@ protected void updatePartitionLagFromStream() } @Override - protected Map getLatestSequencesFromStream() + public Map getLatestSequencesFromStream() { return latestSequenceFromStream != null ? 
latestSequenceFromStream : new HashMap<>(); } diff --git a/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisor.java b/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisor.java index 727eb52db272..5863284cc2d9 100644 --- a/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisor.java +++ b/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisor.java @@ -356,7 +356,7 @@ protected Map getTimeLagPerPartition(Map map) + public KafkaDataSourceMetadata createDataSourceMetaDataForReset(String topic, Map map) { return new KafkaDataSourceMetadata(new SeekableStreamEndSequenceNumbers<>(topic, map)); } @@ -548,7 +548,7 @@ private Map getTimestampPerPartitionAtCurrentOffset(S *

*/ @Override - protected void updatePartitionLagFromStream() + public void updatePartitionLagFromStream() { if (getIoConfig().isEmitTimeLagMetrics()) { updatePartitionTimeAndRecordLagFromStream(); @@ -597,7 +597,7 @@ private void updateOffsetSnapshot( } @Override - protected Map getLatestSequencesFromStream() + public Map getLatestSequencesFromStream() { return offsetSnapshotRef.get().getLatestOffsetsFromStream(); } @@ -630,7 +630,7 @@ protected boolean isMultiTopic() * Gets the offsets as stored in the metadata store. The map returned will only contain * offsets from topic partitions that match the current supervisor config stream. This * override is needed because in the case of multi-topic, a user could have updated the supervisor - * config from single topic to mult-topic, where the new multi-topic pattern regex matches the + * config from single topic to multi-topic, where the new multi-topic pattern regex matches the * old config single topic. Without this override, the previously stored metadata for the single * topic would be deemed as different from the currently configure stream, and not be included in * the offset map returned. This implementation handles these cases appropriately. @@ -640,7 +640,7 @@ protected boolean isMultiTopic() * updated to single topic or multi-topic depending on the supervisor config, as needed. 
*/ @Override - protected Map getOffsetsFromMetadataStorage() + public Map getOffsetsFromMetadataStorage() { final DataSourceMetadata dataSourceMetadata = retrieveDataSourceMetadata(); if (checkSourceMetadataMatch(dataSourceMetadata)) { diff --git a/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisor.java b/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisor.java index 0f91fc0965db..3f1f4034f3ce 100644 --- a/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisor.java +++ b/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisor.java @@ -321,7 +321,7 @@ protected Map getTimeLagPerPartition(Map currentOf } @Override - protected SeekableStreamDataSourceMetadata createDataSourceMetaDataForReset( + public SeekableStreamDataSourceMetadata createDataSourceMetaDataForReset( String stream, Map map ) @@ -336,7 +336,7 @@ protected OrderedSequenceNumber makeSequenceNumber(String seq, boolean i } @Override - protected void updatePartitionLagFromStream() + public void updatePartitionLagFromStream() { KinesisRecordSupplier supplier = (KinesisRecordSupplier) recordSupplier; // this recordSupplier method is thread safe, so does not need to acquire the recordSupplierLock diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index 52f3cba7fc11..de2102a27229 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -21,11 +21,14 @@ import com.fasterxml.jackson.core.JsonProcessingException; import 
com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.base.Optional; import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; import com.google.common.util.concurrent.ListenableFuture; import com.google.inject.Inject; import org.apache.druid.common.guava.FutureUtils; +import org.apache.druid.common.utils.IdUtils; import org.apache.druid.error.DruidException; import org.apache.druid.error.InvalidInput; import org.apache.druid.error.NotFound; @@ -35,8 +38,11 @@ import org.apache.druid.indexing.overlord.DataSourceMetadata; import org.apache.druid.indexing.overlord.supervisor.autoscaler.SupervisorTaskAutoScaler; import org.apache.druid.indexing.seekablestream.SeekableStreamDataSourceMetadata; +import org.apache.druid.indexing.seekablestream.supervisor.BoundedStreamConfig; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorSpec; +import org.apache.druid.java.util.common.IAE; +import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.Pair; import org.apache.druid.java.util.common.lifecycle.LifecycleStart; import org.apache.druid.java.util.common.lifecycle.LifecycleStop; @@ -393,6 +399,117 @@ public boolean resetSupervisor(String id, @Nullable DataSourceMetadata resetData return true; } + /** + * Resets a supervisor to the latest stream offsets and starts a bounded backfill supervisor to + * process the skipped range from the previously checkpointed offsets up to the latest offsets. 
+ * + * @param id supervisor ID + * @param backfillTaskCount number of tasks for the backfill supervisor, or null to inherit from the source spec + * @return map with {@code "id"} (the original supervisor ID) and {@code "backfillSupervisorId"} + * @throws IllegalArgumentException if the supervisor is not a {@link SeekableStreamSupervisor}, + * if {@code useEarliestSequenceNumber} is true, + * if {@code useConcurrentLocks} is not set to true in the supervisor context, + * or if the supervisor is not in a RUNNING state + * @throws IllegalStateException if the latest or checkpointed offsets cannot be retrieved, + * or if the backfill spec cannot be serialized + */ + public Map resetSupervisorAndBackfill(String id, @Nullable Integer backfillTaskCount) + { + Preconditions.checkState(started, "SupervisorManager not started"); + Preconditions.checkNotNull(id, "id"); + + Pair supervisorPair = supervisors.get(id); + if (!(supervisorPair.lhs instanceof SeekableStreamSupervisor)) { + throw new IAE("Supervisor[%s] is not a SeekableStreamSupervisor", id); + } + SeekableStreamSupervisor streamSupervisor = (SeekableStreamSupervisor) supervisorPair.lhs; + SeekableStreamSupervisorSpec streamSpec = (SeekableStreamSupervisorSpec) supervisorPair.rhs; + + // Verify useEarliestOffset is false + if (streamSupervisor.getIoConfig().isUseEarliestSequenceNumber()) { + throw new IAE("Reset with skipped offsets is not supported when useEarliestOffset is true."); + } + + // Verify useConcurrentLocks is enabled + final Map context = streamSpec.getContext(); + if (context == null || !Boolean.TRUE.equals(context.get("useConcurrentLocks"))) { + throw new IAE( + "Backfill tasks require 'useConcurrentLocks' to be set to true in the supervisor context to allow concurrent writes with the main supervisor tasks" + ); + } + + // We need an active recordSupplier to query the latest offsets from the stream + if (supervisorPair.lhs.getState() != SupervisorStateManager.BasicState.RUNNING) { + throw new 
IAE("Supervisor[%s] must be in a RUNNING state to perform a reset and backfill", id); + } + + log.info("Capturing latest offsets from stream for supervisor[%s]", id); + streamSupervisor.updatePartitionLagFromStream(); + Map endOffsets = streamSupervisor.getLatestSequencesFromStream(); + + log.info("Capturing checkpointed offsets for supervisor[%s]", id); + Map startOffsets = streamSupervisor.getOffsetsFromMetadataStorage(); + + // Validate that we successfully retrieved offsets + if (endOffsets == null || endOffsets.isEmpty()) { + throw new ISE("Skipping reset: Failed to get latest offsets from stream for supervisor[%s]", id); + } + if (startOffsets == null || startOffsets.isEmpty()) { + throw new ISE("Skipping reset: Failed to get checkpointed offsets for supervisor[%s]", id); + } + + log.info("Resetting supervisor[%s] metadata to latest offsets", id); + DataSourceMetadata resetMetadata = streamSupervisor.createDataSourceMetaDataForReset( + streamSupervisor.getIoConfig().getStream(), + endOffsets + ); + + streamSupervisor.resetOffsets(resetMetadata); + + // Reset autoscaler if present + SupervisorTaskAutoScaler autoscaler = autoscalers.get(id); + if (autoscaler != null) { + autoscaler.reset(); + } + + String backfillSupervisorId = IdUtils.getRandomIdWithPrefix(id + "_backfill"); + + try { + Map normalizedStartOffsets = jsonMapper.readValue(jsonMapper.writeValueAsString(startOffsets), Map.class); + Map normalizedEndOffsets = jsonMapper.readValue(jsonMapper.writeValueAsString(endOffsets), Map.class); + BoundedStreamConfig boundedStreamConfig = new BoundedStreamConfig(normalizedStartOffsets, normalizedEndOffsets); + SupervisorSpec backfillSpec = createBackfillSpec(streamSpec, backfillSupervisorId, boundedStreamConfig, backfillTaskCount); + createOrUpdateAndStartSupervisor(backfillSpec); + } + catch (JsonProcessingException e) { + throw new ISE(e, "Failed to create backfill supervisor spec for supervisor[%s]", id); + } + + log.info("Started backfill supervisor[%s] for 
supervisor[%s]", backfillSupervisorId, id); + + return ImmutableMap.of( + "id", id, + "backfillSupervisorId", backfillSupervisorId + ); + } + + SupervisorSpec createBackfillSpec( + SeekableStreamSupervisorSpec sourceSpec, + String backfillSupervisorId, + BoundedStreamConfig boundedStreamConfig, + @Nullable Integer backfillTaskCount + ) throws JsonProcessingException + { + ObjectNode specNode = jsonMapper.valueToTree(sourceSpec); + specNode.put("id", backfillSupervisorId); + ObjectNode ioConfigNode = (ObjectNode) specNode.path("spec").path("ioConfig"); + ioConfigNode.set("boundedStreamConfig", jsonMapper.valueToTree(boundedStreamConfig)); + if (backfillTaskCount != null) { + ioConfigNode.put("taskCount", backfillTaskCount); + } + return jsonMapper.treeToValue(specNode, SupervisorSpec.class); + } + public boolean checkPointDataSourceMetadata( String supervisorId, int taskGroupId, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java index aff9edf19af9..254c60739925 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java @@ -640,6 +640,45 @@ private Response handleResetRequest( ); } + @POST + @Path("/{id}/resetOffsetsAndBackfill") + @Produces(MediaType.APPLICATION_JSON) + @ResourceFilters(SupervisorResourceFilter.class) + public Response resetOffsetsAndBackfill( + @PathParam("id") final String id, + @QueryParam("backfillTaskCount") @Nullable final Integer backfillTaskCount + ) + { + return handleResetAndBackfill(id, backfillTaskCount); + } + + private Response handleResetAndBackfill(final String id, @Nullable final Integer backfillTaskCount) + { + return asLeaderWithSupervisorManager( + manager -> { + if (!manager.getSupervisorIds().contains(id)) 
{ + return Response.status(Response.Status.NOT_FOUND) + .entity(ImmutableMap.of("error", StringUtils.format("[%s] does not exist", id))) + .build(); + } + try { + Map result = manager.resetSupervisorAndBackfill(id, backfillTaskCount); + return Response.ok(result).build(); + } + catch (IllegalArgumentException e) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(ImmutableMap.of("error", e.getMessage())) + .build(); + } + catch (Exception e) { + return Response.serverError() + .entity(ImmutableMap.of("error", e.getMessage())) + .build(); + } + } + ); + } + private Response asLeaderWithSupervisorManager(Function f) { Optional supervisorManager = taskMaster.getSupervisorManager(); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java index 91b4244c0bf3..fdf7563873d4 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java @@ -3274,7 +3274,7 @@ private boolean updatePartitionDataFromStream() /** * gets mapping of partitions in stream to their latest offsets. 
*/ - protected Map getLatestSequencesFromStream() + public Map getLatestSequencesFromStream() { return new HashMap<>(); } @@ -4552,7 +4552,7 @@ private OrderedSequenceNumber getOffsetFromStorageForPartiti } } - protected Map getOffsetsFromMetadataStorage() + public Map getOffsetsFromMetadataStorage() { final DataSourceMetadata dataSourceMetadata = retrieveDataSourceMetadata(); if (dataSourceMetadata instanceof SeekableStreamDataSourceMetadata @@ -4939,7 +4939,7 @@ private void updateCurrentOffsets() throws InterruptedException, ExecutionExcept coalesceAndAwait(futures); } - protected abstract void updatePartitionLagFromStream(); + public abstract void updatePartitionLagFromStream(); /** * Gets 'lag' of currently processed offset behind latest offset as a measure of difference between offsets. @@ -5196,7 +5196,7 @@ protected abstract List sequence * @return specific instance of datasource metadata */ - protected abstract SeekableStreamDataSourceMetadata createDataSourceMetaDataForReset( + public abstract SeekableStreamDataSourceMetadata createDataSourceMetaDataForReset( String stream, Map map ); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java index 525444e23dea..5be400cdc086 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java @@ -19,13 +19,19 @@ package org.apache.druid.indexing.overlord.supervisor; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonTypeName; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.jsontype.NamedType; import com.google.common.base.Optional; import 
com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.util.concurrent.SettableFuture; import org.apache.druid.data.input.impl.ByteEntity; +import org.apache.druid.data.input.impl.DimensionsSpec; +import org.apache.druid.data.input.impl.TimestampSpec; import org.apache.druid.error.DruidException; import org.apache.druid.error.DruidExceptionMatcher; import org.apache.druid.error.InvalidInput; @@ -35,7 +41,11 @@ import org.apache.druid.indexing.overlord.ObjectMetadata; import org.apache.druid.indexing.seekablestream.SeekableStreamStartSequenceNumbers; import org.apache.druid.indexing.seekablestream.TestSeekableStreamDataSourceMetadata; +import org.apache.druid.indexing.seekablestream.supervisor.BoundedStreamConfig; +import org.apache.druid.indexing.seekablestream.supervisor.LagAggregator; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisor; +import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorIOConfig; +import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorIngestionSpec; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorSpec; import org.apache.druid.jackson.DefaultObjectMapper; import org.apache.druid.java.util.common.DateTimes; @@ -43,6 +53,7 @@ import org.apache.druid.java.util.common.Pair; import org.apache.druid.metadata.MetadataSupervisorManager; import org.apache.druid.metadata.PendingSegmentRecord; +import org.apache.druid.segment.indexing.DataSchema; import org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec; import org.apache.druid.server.metrics.SupervisorStatsProvider; import org.apache.druid.timeline.partition.NumberedShardSpec; @@ -1068,6 +1079,170 @@ public void test_isAnotherTaskGroupPublishingToPartitions() ); } + @Test + public void testResetSupervisorAndBackfill() throws Exception + { + 
EasyMock.expect(metadataSupervisorManager.getLatest()).andReturn(ImmutableMap.of()); + replayAll(); + manager.start(); + + final ConcurrentHashMap> supervisorsMap = getSupervisorsMap(); + final SeekableStreamSupervisorSpec streamSpec = EasyMock.createNiceMock(SeekableStreamSupervisorSpec.class); + final SeekableStreamSupervisor streamSupervisor = EasyMock.createNiceMock(SeekableStreamSupervisor.class); + final SeekableStreamSupervisorIOConfig ioConfig = EasyMock.createNiceMock(SeekableStreamSupervisorIOConfig.class); + + // non-SeekableStream supervisor → IAE + // Use a concrete anonymous Supervisor (not a mock) to reliably fail instanceof SeekableStreamSupervisor + final Supervisor nonStreamSupervisor = new Supervisor() + { + @Override + public void start() + { + } + + @Override + public void stop(boolean stopGracefully) + { + } + + @Override + public SupervisorReport getStatus() + { + return null; + } + + @Override + public SupervisorStateManager.State getState() + { + return null; + } + + @Override + public void reset(DataSourceMetadata dataSourceMetadata) + { + } + }; + supervisorsMap.put("id1", Pair.of(nonStreamSupervisor, streamSpec)); + Assert.assertThrows( + IllegalArgumentException.class, + () -> manager.resetSupervisorAndBackfill("id1", null) + ); + + // useEarliestSequenceNumber=true → IAE + supervisorsMap.put("id1", Pair.of(streamSupervisor, streamSpec)); + EasyMock.expect(streamSupervisor.getIoConfig()).andReturn(ioConfig).anyTimes(); + EasyMock.expect(ioConfig.isUseEarliestSequenceNumber()).andReturn(true).once(); + EasyMock.replay(streamSupervisor, streamSpec, ioConfig); + Assert.assertThrows( + IllegalArgumentException.class, + () -> manager.resetSupervisorAndBackfill("id1", null) + ); + EasyMock.reset(streamSupervisor, streamSpec, ioConfig); + + // useConcurrentLocks not set → IAE + EasyMock.expect(streamSupervisor.getIoConfig()).andReturn(ioConfig).anyTimes(); + EasyMock.expect(ioConfig.isUseEarliestSequenceNumber()).andReturn(false).once(); + 
EasyMock.expect(streamSpec.getContext()).andReturn(null).once(); + EasyMock.replay(streamSupervisor, streamSpec, ioConfig); + Assert.assertThrows( + IllegalArgumentException.class, + () -> manager.resetSupervisorAndBackfill("id1", null) + ); + EasyMock.reset(streamSupervisor, streamSpec, ioConfig); + + // supervisor not RUNNING → IAE + EasyMock.expect(streamSupervisor.getIoConfig()).andReturn(ioConfig).anyTimes(); + EasyMock.expect(ioConfig.isUseEarliestSequenceNumber()).andReturn(false).once(); + EasyMock.expect(streamSpec.getContext()).andReturn(ImmutableMap.of("useConcurrentLocks", true)).once(); + EasyMock.expect(streamSupervisor.getState()).andReturn(SupervisorStateManager.BasicState.SUSPENDED).once(); + EasyMock.replay(streamSupervisor, streamSpec, ioConfig); + Assert.assertThrows( + IllegalArgumentException.class, + () -> manager.resetSupervisorAndBackfill("id1", null) + ); + EasyMock.reset(streamSupervisor, streamSpec, ioConfig); + + // empty latest offsets → ISE + EasyMock.expect(streamSupervisor.getIoConfig()).andReturn(ioConfig).anyTimes(); + EasyMock.expect(ioConfig.isUseEarliestSequenceNumber()).andReturn(false).once(); + EasyMock.expect(streamSpec.getContext()).andReturn(ImmutableMap.of("useConcurrentLocks", true)).once(); + EasyMock.expect(streamSupervisor.getState()).andReturn(SupervisorStateManager.BasicState.RUNNING).once(); + streamSupervisor.updatePartitionLagFromStream(); + EasyMock.expectLastCall().once(); + EasyMock.expect(streamSupervisor.getLatestSequencesFromStream()).andReturn(ImmutableMap.of()).once(); + EasyMock.replay(streamSupervisor, streamSpec, ioConfig); + Assert.assertThrows( + IllegalStateException.class, + () -> manager.resetSupervisorAndBackfill("id1", null) + ); + EasyMock.reset(streamSupervisor, streamSpec, ioConfig); + + // empty start offsets from metadata → ISE + EasyMock.expect(streamSupervisor.getIoConfig()).andReturn(ioConfig).anyTimes(); + EasyMock.expect(ioConfig.isUseEarliestSequenceNumber()).andReturn(false).once(); 
+ EasyMock.expect(streamSpec.getContext()).andReturn(ImmutableMap.of("useConcurrentLocks", true)).once(); + EasyMock.expect(streamSupervisor.getState()).andReturn(SupervisorStateManager.BasicState.RUNNING).once(); + streamSupervisor.updatePartitionLagFromStream(); + EasyMock.expectLastCall().once(); + EasyMock.expect(streamSupervisor.getLatestSequencesFromStream()).andReturn(ImmutableMap.of("0", 100L)).once(); + EasyMock.expect(streamSupervisor.getOffsetsFromMetadataStorage()).andReturn(ImmutableMap.of()).once(); + EasyMock.replay(streamSupervisor, streamSpec, ioConfig); + Assert.assertThrows( + IllegalStateException.class, + () -> manager.resetSupervisorAndBackfill("id1", null) + ); + + verifyAll(); + } + + @Test + public void testCreateBackfillSpec() throws Exception + { + final ObjectMapper localMapper = new DefaultObjectMapper(); + localMapper.registerSubtypes( + new NamedType(TestBackfillSupervisorSpec.class, "testBackfill"), + new NamedType(TestBackfillSupervisorSpec.IngestionSpec.class, "testBackfillIngestionSpec"), + new NamedType(TestBackfillSupervisorSpec.IOConfig.class, "testBackfillIOConfig") + ); + + final SupervisorManager localManager = new SupervisorManager(localMapper, metadataSupervisorManager); + + final TestBackfillSupervisorSpec.IOConfig ioConfig = new TestBackfillSupervisorSpec.IOConfig("test-stream", null, null); + final TestBackfillSupervisorSpec.IngestionSpec ingestionSpec = new TestBackfillSupervisorSpec.IngestionSpec(ioConfig); + final SeekableStreamSupervisorSpec sourceSpec = new TestBackfillSupervisorSpec("original-id", ingestionSpec); + + final BoundedStreamConfig boundedStreamConfig = new BoundedStreamConfig( + ImmutableMap.of("0", 100L), + ImmutableMap.of("0", 200L) + ); + + // Without overriding taskCount + final SupervisorSpec backfillSpec = localManager.createBackfillSpec( + sourceSpec, + "backfill-id", + boundedStreamConfig, + null + ); + Assert.assertEquals("backfill-id", backfillSpec.getId()); + final TestBackfillSupervisorSpec 
backfillCast = (TestBackfillSupervisorSpec) backfillSpec; + final BoundedStreamConfig actualConfig = backfillCast.getIoConfig().getBoundedStreamConfig(); + Assert.assertNotNull(actualConfig); + Assert.assertEquals(ImmutableMap.of("0", 100L), actualConfig.getStartSequenceNumbers()); + Assert.assertEquals(ImmutableMap.of("0", 200L), actualConfig.getEndSequenceNumbers()); + Assert.assertEquals(1, backfillCast.getIoConfig().getTaskCount()); + + // With overriding taskCount + final SupervisorSpec backfillSpecWithCount = localManager.createBackfillSpec( + sourceSpec, + "backfill-id-2", + boundedStreamConfig, + 5 + ); + Assert.assertEquals("backfill-id-2", backfillSpecWithCount.getId()); + final TestBackfillSupervisorSpec backfillWithCount = (TestBackfillSupervisorSpec) backfillSpecWithCount; + Assert.assertEquals(5, backfillWithCount.getIoConfig().getTaskCount()); + } + private static class TestSupervisorSpec implements SupervisorSpec { private final String id; @@ -1137,4 +1312,90 @@ public List getDataSources() return Collections.singletonList(id); } } + + @JsonTypeName("testBackfill") + private static class TestBackfillSupervisorSpec extends SeekableStreamSupervisorSpec + { + @JsonCreator + TestBackfillSupervisorSpec( + @JsonProperty("id") String id, + @JsonProperty("spec") IngestionSpec ingestionSpec + ) + { + super( + id, + ingestionSpec, + ImmutableMap.of("useConcurrentLocks", true), + false, + null, null, null, null, + MAPPER, + null, null, null, null + ); + } + + @Override + public Supervisor createSupervisor() + { + return null; + } + + @Override + public String getType() + { + return "testBackfill"; + } + + @Override + public String getSource() + { + return "test-stream"; + } + + @Override + protected SeekableStreamSupervisorSpec toggleSuspend(boolean suspend) + { + return this; + } + + @Override + public SeekableStreamSupervisorIOConfig getIoConfig() + { + return getSpec().getIOConfig(); + } + + @JsonTypeName("testBackfillIngestionSpec") + static class 
IngestionSpec extends SeekableStreamSupervisorIngestionSpec + { + @JsonCreator + IngestionSpec( + @JsonProperty("ioConfig") IOConfig ioConfig + ) + { + super( + new DataSchema( + "testDS", + new TimestampSpec("time", "auto", null), + new DimensionsSpec(Collections.emptyList()), + null, null, null, null, null + ), + ioConfig, + null + ); + } + } + + @JsonTypeName("testBackfillIOConfig") + static class IOConfig extends SeekableStreamSupervisorIOConfig + { + @JsonCreator + IOConfig( + @JsonProperty("stream") String stream, + @JsonProperty("taskCount") Integer taskCount, + @JsonProperty("boundedStreamConfig") BoundedStreamConfig boundedStreamConfig + ) + { + super(stream, null, 1, taskCount, null, null, null, false, null, null, null, null, LagAggregator.DEFAULT, null, null, null, null, boundedStreamConfig); + } + } + } } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java index 4ccf4659994f..f6d7bd8234f4 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java @@ -1379,6 +1379,76 @@ public void testResetOffsets() verifyAll(); } + @Test + public void testResetOffsetsAndBackfill() + { + // 200 - success + EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.of(supervisorManager)); + EasyMock.expect(supervisorManager.getSupervisorIds()).andReturn(ImmutableSet.of("my-id")); + EasyMock.expect(supervisorManager.resetSupervisorAndBackfill("my-id", null)) + .andReturn(ImmutableMap.of("id", "my-id", "backfillSupervisorId", "my-id_backfill_abcdefgh")); + replayAll(); + + Response response = supervisorResource.resetOffsetsAndBackfill("my-id", null); + Assert.assertEquals(200, response.getStatus()); + Assert.assertEquals( + 
ImmutableMap.of("id", "my-id", "backfillSupervisorId", "my-id_backfill_abcdefgh"), + response.getEntity() + ); + verifyAll(); + resetAll(); + + // 404 - supervisor does not exist + EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.of(supervisorManager)); + EasyMock.expect(supervisorManager.getSupervisorIds()).andReturn(ImmutableSet.of()); + replayAll(); + + response = supervisorResource.resetOffsetsAndBackfill("my-id", null); + Assert.assertEquals(404, response.getStatus()); + verifyAll(); + resetAll(); + + // 400 - IAE (e.g. supervisor not running) + EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.of(supervisorManager)); + EasyMock.expect(supervisorManager.getSupervisorIds()).andReturn(ImmutableSet.of("my-id")); + EasyMock.expect(supervisorManager.resetSupervisorAndBackfill("my-id", null)) + .andThrow(new IllegalArgumentException("Supervisor[my-id] must be in a RUNNING state")); + replayAll(); + + response = supervisorResource.resetOffsetsAndBackfill("my-id", null); + Assert.assertEquals(400, response.getStatus()); + Assert.assertEquals( + ImmutableMap.of("error", "Supervisor[my-id] must be in a RUNNING state"), + response.getEntity() + ); + verifyAll(); + resetAll(); + + // 500 - ISE (e.g. 
failed to retrieve offsets) + EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.of(supervisorManager)); + EasyMock.expect(supervisorManager.getSupervisorIds()).andReturn(ImmutableSet.of("my-id")); + EasyMock.expect(supervisorManager.resetSupervisorAndBackfill("my-id", null)) + .andThrow(new IllegalStateException("Failed to get latest offsets from stream")); + replayAll(); + + response = supervisorResource.resetOffsetsAndBackfill("my-id", null); + Assert.assertEquals(500, response.getStatus()); + Assert.assertEquals( + ImmutableMap.of("error", "Failed to get latest offsets from stream"), + response.getEntity() + ); + verifyAll(); + resetAll(); + + // 503 - no supervisor manager (not leader) + EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.absent()); + replayAll(); + + response = supervisorResource.resetOffsetsAndBackfill("my-id", null); + Assert.assertEquals(503, response.getStatus()); + verifyAll(); + } + @Test public void testNoopSupervisorSpecSerde() throws Exception { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java index eff5d1acd980..e19d68cb2b3f 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java @@ -3059,7 +3059,7 @@ public String toString() final TestSeekableStreamSupervisor supervisor = new TestSeekableStreamSupervisor() { @Override - protected SeekableStreamDataSourceMetadata createDataSourceMetaDataForReset( + public SeekableStreamDataSourceMetadata createDataSourceMetaDataForReset( String stream, Map map ) @@ -3284,7 +3284,7 @@ protected String baseTaskName() } @Override - protected void 
updatePartitionLagFromStream() + public void updatePartitionLagFromStream() { // do nothing } @@ -3381,7 +3381,7 @@ protected boolean doesTaskMatchSupervisor(Task task) } @Override - protected SeekableStreamDataSourceMetadata createDataSourceMetaDataForReset( + public SeekableStreamDataSourceMetadata createDataSourceMetaDataForReset( String stream, Map map ) @@ -3521,7 +3521,7 @@ public LagStats computeLagStats() } @Override - protected Map getLatestSequencesFromStream() + public Map getLatestSequencesFromStream() { return streamOffsets; } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorTestBase.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorTestBase.java index 4eefaed9bd99..0488670e1e48 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorTestBase.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorTestBase.java @@ -124,7 +124,7 @@ protected String baseTaskName() } @Override - protected void updatePartitionLagFromStream() + public void updatePartitionLagFromStream() { // do nothing } @@ -205,7 +205,7 @@ protected boolean doesTaskMatchSupervisor(Task task) } @Override - protected SeekableStreamDataSourceMetadata createDataSourceMetaDataForReset( + public SeekableStreamDataSourceMetadata createDataSourceMetaDataForReset( String stream, Map map ) From 750037ddaaa6d898b7be4fd19556fc93149853f0 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Mon, 18 May 2026 14:00:46 -0700 Subject: [PATCH 02/23] Reject non-positive backfillTaskCount --- .../supervisor/SupervisorResource.java | 5 ++++ .../supervisor/SupervisorResourceTest.java | 24 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java 
b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java index 254c60739925..af8c7adc7664 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java @@ -654,6 +654,11 @@ public Response resetOffsetsAndBackfill( private Response handleResetAndBackfill(final String id, @Nullable final Integer backfillTaskCount) { + if (backfillTaskCount != null && backfillTaskCount < 1) { + return Response.status(Response.Status.BAD_REQUEST) + .entity(ImmutableMap.of("error", "backfillTaskCount must be a positive integer")) + .build(); + } return asLeaderWithSupervisorManager( manager -> { if (!manager.getSupervisorIds().contains(id)) { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java index f6d7bd8234f4..a4251fccf3be 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java @@ -1440,6 +1440,30 @@ public void testResetOffsetsAndBackfill() verifyAll(); resetAll(); + // 400 - invalid backfillTaskCount (zero) + replayAll(); + + response = supervisorResource.resetOffsetsAndBackfill("my-id", 0); + Assert.assertEquals(400, response.getStatus()); + Assert.assertEquals( + ImmutableMap.of("error", "backfillTaskCount must be a positive integer"), + response.getEntity() + ); + verifyAll(); + resetAll(); + + // 400 - invalid backfillTaskCount (negative) + replayAll(); + + response = supervisorResource.resetOffsetsAndBackfill("my-id", -1); + Assert.assertEquals(400, response.getStatus()); + Assert.assertEquals( + ImmutableMap.of("error", 
"backfillTaskCount must be a positive integer"), + response.getEntity() + ); + verifyAll(); + resetAll(); + // 503 - no supervisor manager (not leader) EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.absent()); replayAll(); From 99531350ae9be6275a4264bd16ca8ea8e7957bf1 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Tue, 19 May 2026 09:56:00 -0700 Subject: [PATCH 03/23] Reset supervisor after backfill Supervisor has already been started --- .../supervisor/SupervisorManager.java | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index de2102a27229..5ca80559c896 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -458,20 +458,6 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ throw new ISE("Skipping reset: Failed to get checkpointed offsets for supervisor[%s]", id); } - log.info("Resetting supervisor[%s] metadata to latest offsets", id); - DataSourceMetadata resetMetadata = streamSupervisor.createDataSourceMetaDataForReset( - streamSupervisor.getIoConfig().getStream(), - endOffsets - ); - - streamSupervisor.resetOffsets(resetMetadata); - - // Reset autoscaler if present - SupervisorTaskAutoScaler autoscaler = autoscalers.get(id); - if (autoscaler != null) { - autoscaler.reset(); - } - String backfillSupervisorId = IdUtils.getRandomIdWithPrefix(id + "_backfill"); try { @@ -487,6 +473,20 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ log.info("Started backfill supervisor[%s] for supervisor[%s]", backfillSupervisorId, id); + log.info("Resetting supervisor[%s] metadata to latest offsets", id); + 
DataSourceMetadata resetMetadata = streamSupervisor.createDataSourceMetaDataForReset( + streamSupervisor.getIoConfig().getStream(), + endOffsets + ); + + streamSupervisor.resetOffsets(resetMetadata); + + // Reset autoscaler if present + SupervisorTaskAutoScaler autoscaler = autoscalers.get(id); + if (autoscaler != null) { + autoscaler.reset(); + } + return ImmutableMap.of( "id", id, "backfillSupervisorId", backfillSupervisorId From a5f169bb44c428ab6362309029ec9be540b5e788 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Tue, 19 May 2026 10:16:24 -0700 Subject: [PATCH 04/23] Add helper method specHasConcurrentLocks --- .../supervisor/SupervisorManager.java | 58 +++++++++---------- .../supervisor/SupervisorManagerTest.java | 37 +++++++++++- 2 files changed, 64 insertions(+), 31 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index 5ca80559c896..4ca373fa2c42 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -135,33 +135,8 @@ public Optional getActiveSupervisorIdForDatasourceWithAppendLock(String final Supervisor supervisor = entry.getValue().lhs; final SupervisorSpec supervisorSpec = entry.getValue().rhs; - boolean hasAppendLock = Tasks.DEFAULT_USE_CONCURRENT_LOCKS; - if (supervisorSpec instanceof SeekableStreamSupervisorSpec) { - SeekableStreamSupervisorSpec seekableStreamSupervisorSpec = (SeekableStreamSupervisorSpec) supervisorSpec; - Map context = seekableStreamSupervisorSpec.getContext(); - if (context != null) { - Boolean useConcurrentLocks = QueryContexts.getAsBoolean( - Tasks.USE_CONCURRENT_LOCKS, - context.get(Tasks.USE_CONCURRENT_LOCKS) - ); - if (useConcurrentLocks == null) { - TaskLockType taskLockType = 
QueryContexts.getAsEnum( - Tasks.TASK_LOCK_TYPE, - context.get(Tasks.TASK_LOCK_TYPE), - TaskLockType.class - ); - if (taskLockType == null) { - hasAppendLock = Tasks.DEFAULT_USE_CONCURRENT_LOCKS; - } else if (taskLockType == TaskLockType.APPEND) { - hasAppendLock = true; - } else { - hasAppendLock = false; - } - } else { - hasAppendLock = useConcurrentLocks; - } - } - } + boolean hasAppendLock = supervisorSpec instanceof SeekableStreamSupervisorSpec + && specHasConcurrentLocks((SeekableStreamSupervisorSpec) supervisorSpec); if (supervisor instanceof SeekableStreamSupervisor && !supervisorSpec.isSuspended() @@ -430,9 +405,7 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ throw new IAE("Reset with skipped offsets is not supported when useEarliestOffset is true."); } - // Verify useConcurrentLocks is enabled - final Map context = streamSpec.getContext(); - if (context == null || !Boolean.TRUE.equals(context.get("useConcurrentLocks"))) { + if (!specHasConcurrentLocks(streamSpec)) { throw new IAE( "Backfill tasks require 'useConcurrentLocks' to be set to true in the supervisor context to allow concurrent writes with the main supervisor tasks" ); @@ -748,4 +721,29 @@ private SupervisorSpec getSpec(String id) return supervisor == null ? null : supervisor.rhs; } } + + /** + * Returns true if the spec's context enables concurrent (append) locks, accepting both + * {@code useConcurrentLocks: true} (or any truthy string) and {@code taskLockType: APPEND}. 
+ */ + private static boolean specHasConcurrentLocks(SeekableStreamSupervisorSpec spec) + { + Map context = spec.getContext(); + if (context == null) { + return false; + } + Boolean useConcurrentLocks = QueryContexts.getAsBoolean( + Tasks.USE_CONCURRENT_LOCKS, + context.get(Tasks.USE_CONCURRENT_LOCKS) + ); + if (useConcurrentLocks != null) { + return useConcurrentLocks; + } + TaskLockType taskLockType = QueryContexts.getAsEnum( + Tasks.TASK_LOCK_TYPE, + context.get(Tasks.TASK_LOCK_TYPE), + TaskLockType.class + ); + return taskLockType == TaskLockType.APPEND; + } } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java index 5be400cdc086..5815952ea626 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java @@ -1139,7 +1139,7 @@ public void reset(DataSourceMetadata dataSourceMetadata) ); EasyMock.reset(streamSupervisor, streamSpec, ioConfig); - // useConcurrentLocks not set → IAE + // useConcurrentLocks not set (null context) → IAE EasyMock.expect(streamSupervisor.getIoConfig()).andReturn(ioConfig).anyTimes(); EasyMock.expect(ioConfig.isUseEarliestSequenceNumber()).andReturn(false).once(); EasyMock.expect(streamSpec.getContext()).andReturn(null).once(); @@ -1150,6 +1150,41 @@ public void reset(DataSourceMetadata dataSourceMetadata) ); EasyMock.reset(streamSupervisor, streamSpec, ioConfig); + // useConcurrentLocks=false → IAE + EasyMock.expect(streamSupervisor.getIoConfig()).andReturn(ioConfig).anyTimes(); + EasyMock.expect(ioConfig.isUseEarliestSequenceNumber()).andReturn(false).once(); + EasyMock.expect(streamSpec.getContext()).andReturn(ImmutableMap.of("useConcurrentLocks", false)).once(); + EasyMock.replay(streamSupervisor, 
streamSpec, ioConfig); + Assert.assertThrows( + IllegalArgumentException.class, + () -> manager.resetSupervisorAndBackfill("id1", null) + ); + EasyMock.reset(streamSupervisor, streamSpec, ioConfig); + + // useConcurrentLocks="true" (string) → accepted, fails at next guard (not RUNNING) + EasyMock.expect(streamSupervisor.getIoConfig()).andReturn(ioConfig).anyTimes(); + EasyMock.expect(ioConfig.isUseEarliestSequenceNumber()).andReturn(false).once(); + EasyMock.expect(streamSpec.getContext()).andReturn(ImmutableMap.of("useConcurrentLocks", "true")).once(); + EasyMock.expect(streamSupervisor.getState()).andReturn(SupervisorStateManager.BasicState.SUSPENDED).once(); + EasyMock.replay(streamSupervisor, streamSpec, ioConfig); + Assert.assertThrows( + IllegalArgumentException.class, + () -> manager.resetSupervisorAndBackfill("id1", null) + ); + EasyMock.reset(streamSupervisor, streamSpec, ioConfig); + + // taskLockType=APPEND → accepted, fails at next guard (not RUNNING) + EasyMock.expect(streamSupervisor.getIoConfig()).andReturn(ioConfig).anyTimes(); + EasyMock.expect(ioConfig.isUseEarliestSequenceNumber()).andReturn(false).once(); + EasyMock.expect(streamSpec.getContext()).andReturn(ImmutableMap.of("taskLockType", "APPEND")).once(); + EasyMock.expect(streamSupervisor.getState()).andReturn(SupervisorStateManager.BasicState.SUSPENDED).once(); + EasyMock.replay(streamSupervisor, streamSpec, ioConfig); + Assert.assertThrows( + IllegalArgumentException.class, + () -> manager.resetSupervisorAndBackfill("id1", null) + ); + EasyMock.reset(streamSupervisor, streamSpec, ioConfig); + // supervisor not RUNNING → IAE EasyMock.expect(streamSupervisor.getIoConfig()).andReturn(ioConfig).anyTimes(); EasyMock.expect(ioConfig.isUseEarliestSequenceNumber()).andReturn(false).once(); From e9992603581bbb3a0203f76de52cf5355cc6fca9 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Wed, 20 May 2026 21:27:52 -0700 Subject: [PATCH 05/23] Fix doc reference --- docs/api-reference/supervisor-api.md | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api-reference/supervisor-api.md b/docs/api-reference/supervisor-api.md index 73a365e2ff91..840666265aeb 100644 --- a/docs/api-reference/supervisor-api.md +++ b/docs/api-reference/supervisor-api.md @@ -3547,7 +3547,7 @@ This endpoint is useful when a supervisor has fallen behind and you want to catc The following requirements must be met before calling this endpoint: -- The supervisor must be a `SeekableStreamSupervisor`. +- The supervisor must be a [streaming supervisor](../ingestion/supervisor.md). - The supervisor's `useEarliestSequenceNumber` property must be `false`. - The supervisor context must have `useConcurrentLocks` set to `true` to allow the backfill supervisor's tasks to write concurrently with the main supervisor's tasks. - The supervisor must be in a `RUNNING` state so that it can query the latest offsets from the stream. From eec7d88df06d57776b57ecd02a00c66e907060c1 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Wed, 20 May 2026 21:51:50 -0700 Subject: [PATCH 06/23] Move validations into helper function --- .../supervisor/SupervisorManager.java | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index 4ca373fa2c42..b76b586e18db 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -394,28 +394,11 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ Preconditions.checkNotNull(id, "id"); Pair supervisorPair = supervisors.get(id); - if (!(supervisorPair.lhs instanceof SeekableStreamSupervisor)) { - throw new IAE("Supervisor[%s] is not a SeekableStreamSupervisor", id); - } + 
validateResetAndBackfill(id, supervisorPair); + SeekableStreamSupervisor streamSupervisor = (SeekableStreamSupervisor) supervisorPair.lhs; SeekableStreamSupervisorSpec streamSpec = (SeekableStreamSupervisorSpec) supervisorPair.rhs; - // Verify useEarliestOffset is false - if (streamSupervisor.getIoConfig().isUseEarliestSequenceNumber()) { - throw new IAE("Reset with skipped offsets is not supported when useEarliestOffset is true."); - } - - if (!specHasConcurrentLocks(streamSpec)) { - throw new IAE( - "Backfill tasks require 'useConcurrentLocks' to be set to true in the supervisor context to allow concurrent writes with the main supervisor tasks" - ); - } - - // We need an active recordSupplier to query the latest offsets from the stream - if (supervisorPair.lhs.getState() != SupervisorStateManager.BasicState.RUNNING) { - throw new IAE("Supervisor[%s] must be in a RUNNING state to perform a reset and backfill", id); - } - log.info("Capturing latest offsets from stream for supervisor[%s]", id); streamSupervisor.updatePartitionLagFromStream(); Map endOffsets = streamSupervisor.getLatestSequencesFromStream(); @@ -423,7 +406,6 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ log.info("Capturing checkpointed offsets for supervisor[%s]", id); Map startOffsets = streamSupervisor.getOffsetsFromMetadataStorage(); - // Validate that we successfully retrieved offsets if (endOffsets == null || endOffsets.isEmpty()) { throw new ISE("Skipping reset: Failed to get latest offsets from stream for supervisor[%s]", id); } @@ -466,6 +448,29 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ ); } + private void validateResetAndBackfill(String id, Pair supervisorPair) + { + if (!(supervisorPair.lhs instanceof SeekableStreamSupervisor)) { + throw new IAE("Supervisor[%s] is not a streaming supervisor", id); + } + SeekableStreamSupervisor streamSupervisor = (SeekableStreamSupervisor) supervisorPair.lhs; + SeekableStreamSupervisorSpec streamSpec = 
(SeekableStreamSupervisorSpec) supervisorPair.rhs; + + if (streamSupervisor.getIoConfig().isUseEarliestSequenceNumber()) { + throw new IAE("Reset with skipped offsets is not supported when useEarliestOffset is true."); + } + + if (!specHasConcurrentLocks(streamSpec)) { + throw new IAE( + "Backfill tasks require 'useConcurrentLocks' to be set to true in the supervisor context to allow concurrent writes with the main supervisor tasks" + ); + } + + if (supervisorPair.lhs.getState() != SupervisorStateManager.BasicState.RUNNING) { + throw new IAE("Supervisor[%s] must be in a RUNNING state to perform a reset and backfill", id); + } + } + SupervisorSpec createBackfillSpec( SeekableStreamSupervisorSpec sourceSpec, String backfillSupervisorId, From 6fdb183f3ce66dca9eb90f862bf2a9ac74fb93b4 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 21 May 2026 12:53:54 -0700 Subject: [PATCH 07/23] Add embedded-test for resetSupervisorAndBackfill --- .../indexing/KafkaBoundedSupervisorTest.java | 44 +++++++++++++++++++ .../MSQWorkerTaskLauncherRetryTest.java | 6 +++ .../rpc/indexing/NoopOverlordClient.java | 6 +++ .../druid/rpc/indexing/OverlordClient.java | 9 ++++ .../rpc/indexing/OverlordClientImpl.java | 17 +++++++ .../testing/embedded/EmbeddedClusterApis.java | 10 +++++ 6 files changed, 92 insertions(+) diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java index 7e22d85d9cab..c895d4a1b20c 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java @@ -292,6 +292,50 @@ public void test_boundedSupervisor_doesNotSilentlyCompleteWhenStaleOffsetExceeds Assertions.assertEquals("UNHEALTHY_SUPERVISOR", status2.getState(), "Supervisor state should be 
UNHEALTHY_SUPERVISOR"); } + @Test + public void test_resetSupervisorAndBackfill() + { + final String topic = IdUtils.getRandomId(); + kafkaServer.createTopicWithPartitions(topic, 2); + + // Create a streaming supervisor with concurrent locks (required for backfill) + final KafkaSupervisorSpec supervisor = createKafkaSupervisor(kafkaServer) + .withContext(Map.of("useConcurrentLocks", true)) + .withIoConfig(io -> io + .withKafkaInputFormat(new JsonInputFormat(null, null, null, null, null)) + .withUseEarliestSequenceNumber(false) + ) + .build(dataSource, topic); + + cluster.callApi().postSupervisor(supervisor); + + // Publish batch 1 and wait for the supervisor to checkpoint those offsets + final int batch1 = publish1kRecords(topic, false); + waitUntilPublishedRecordsAreIngested(batch1); + + // Publish batch 2 — this is the gap the backfill supervisor will cover + final int batch2 = publish1kRecords(topic, false); + + // Reset the main supervisor and spin up a backfill supervisor for the gap + final Map result = cluster.callApi().resetSupervisorAndBackfill(supervisor.getId()); + final String backfillSupervisorId = (String) result.get("backfillSupervisorId"); + + // Wait for the backfill to finish + waitForSupervisorToComplete(backfillSupervisorId); + + // Verify all data (batch 1 + gap) was ingested + verifyRowCount(batch1 + batch2); + + // Main supervisor should still be running + final SupervisorStatus mainStatus = cluster.callApi().getSupervisorStatus(supervisor.getId()); + Assertions.assertEquals("RUNNING", mainStatus.getState()); + Assertions.assertTrue(mainStatus.isHealthy()); + + final SupervisorStatus backfillStatus = cluster.callApi().getSupervisorStatus(backfillSupervisorId); + Assertions.assertEquals("COMPLETED", backfillStatus.getState()); + Assertions.assertTrue(backfillStatus.isHealthy()); + } + private void waitForSupervisorToComplete(String supervisorId) { overlord.latchableEmitter().waitForEvent( diff --git 
a/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncherRetryTest.java b/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncherRetryTest.java index 0ca643f109f7..257aafa69388 100644 --- a/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncherRetryTest.java +++ b/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncherRetryTest.java @@ -322,6 +322,12 @@ public ListenableFuture> terminateSupervisor(String supervis throw new UOE("Not implemented"); } + @Override + public ListenableFuture> resetSupervisorAndBackfill(String supervisorId) + { + throw new UOE("Not implemented"); + } + @Override public ListenableFuture> supervisorStatuses() { diff --git a/server/src/main/java/org/apache/druid/rpc/indexing/NoopOverlordClient.java b/server/src/main/java/org/apache/druid/rpc/indexing/NoopOverlordClient.java index 81fccf19f131..d14474491fc5 100644 --- a/server/src/main/java/org/apache/druid/rpc/indexing/NoopOverlordClient.java +++ b/server/src/main/java/org/apache/druid/rpc/indexing/NoopOverlordClient.java @@ -114,6 +114,12 @@ public ListenableFuture> terminateSupervisor(String supervis throw new UnsupportedOperationException(); } + @Override + public ListenableFuture> resetSupervisorAndBackfill(String supervisorId) + { + throw new UnsupportedOperationException(); + } + @Override public ListenableFuture> supervisorStatuses() { diff --git a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java index c4d348997779..91d1dfa4d96c 100644 --- a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java +++ b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java @@ -197,6 +197,15 @@ ListenableFuture> taskStatuses( */ ListenableFuture> terminateSupervisor(String supervisorId); + /** + * Resets a supervisor to the latest stream offsets and starts a bounded 
backfill supervisor. + *

+ API: {@code POST /druid/indexer/v1/supervisor/{supervisorId}/resetOffsetsAndBackfill} + * + * @return Map containing "id" and "backfillSupervisorId" + */ + ListenableFuture> resetSupervisorAndBackfill(String supervisorId); + /** * Returns all current supervisor statuses. */ diff --git a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java index 0499a62f090a..cc052119e297 100644 --- a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java +++ b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java @@ -265,6 +265,23 @@ public ListenableFuture> terminateSupervisor(String supervis ); } + @Override + public ListenableFuture> resetSupervisorAndBackfill(String supervisorId) + { + final String path = StringUtils.format( + "/druid/indexer/v1/supervisor/%s/resetOffsetsAndBackfill", + StringUtils.urlEncode(supervisorId) + ); + + return FutureUtils.transform( + client.asyncRequest( + new RequestBuilder(HttpMethod.POST, path), + new BytesFullResponseHandler() + ), + holder -> JacksonUtils.readValue(jsonMapper, holder.getContent(), new TypeReference<>() {}) + ); + } + @Override public ListenableFuture> supervisorStatuses() { diff --git a/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java b/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java index 257533aecbd0..02ff65de67f9 100644 --- a/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java +++ b/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java @@ -430,6 +430,16 @@ public String postSupervisor(SupervisorSpec supervisor) return onLeaderOverlord(o -> o.postSupervisor(supervisor)).get("id"); } + /** + * Resets a supervisor to the latest stream offsets and starts a bounded backfill supervisor.
+ * + * @return Map containing "id" and "backfillSupervisorId" + */ + public Map resetSupervisorAndBackfill(String supervisorId) + { + return onLeaderOverlord(o -> o.resetSupervisorAndBackfill(supervisorId)); + } + /** * Fetches the current status of the given supervisor ID. */ From d40aee211b0a9c9d9d0eb2f4944ca8c48543e72e Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 21 May 2026 16:50:45 -0700 Subject: [PATCH 08/23] Remove flaky waitUntilPublishedRecordsAreIngested --- .../testing/embedded/indexing/KafkaBoundedSupervisorTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java index c895d4a1b20c..0ece037581fd 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java @@ -311,7 +311,6 @@ public void test_resetSupervisorAndBackfill() // Publish batch 1 and wait for the supervisor to checkpoint those offsets final int batch1 = publish1kRecords(topic, false); - waitUntilPublishedRecordsAreIngested(batch1); // Publish batch 2 — this is the gap the backfill supervisor will cover final int batch2 = publish1kRecords(topic, false); From dc6920fae21390191d60206bfcd49ecc71c9101e Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 21 May 2026 16:53:24 -0700 Subject: [PATCH 09/23] Update KafkaBoundedSupervisorTest.java --- .../embedded/indexing/KafkaBoundedSupervisorTest.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java index 0ece037581fd..5549f46fc5c6 100644 --- 
a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java @@ -309,11 +309,7 @@ public void test_resetSupervisorAndBackfill() cluster.callApi().postSupervisor(supervisor); - // Publish batch 1 and wait for the supervisor to checkpoint those offsets - final int batch1 = publish1kRecords(topic, false); - - // Publish batch 2 — this is the gap the backfill supervisor will cover - final int batch2 = publish1kRecords(topic, false); + final int recordCount = publish1kRecords(topic, false); // Reset the main supervisor and spin up a backfill supervisor for the gap final Map result = cluster.callApi().resetSupervisorAndBackfill(supervisor.getId()); @@ -322,8 +318,7 @@ public void test_resetSupervisorAndBackfill() // Wait for the backfill to finish waitForSupervisorToComplete(backfillSupervisorId); - // Verify all data (batch 1 + gap) was ingested - verifyRowCount(batch1 + batch2); + verifyRowCount(recordCount); // Main supervisor should still be running final SupervisorStatus mainStatus = cluster.callApi().getSupervisorStatus(supervisor.getId()); From 946347a4a08ca1bcf72a92e0da2fdde7ce26a8f3 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 21 May 2026 17:53:53 -0700 Subject: [PATCH 10/23] Wait for supervisor to be RUNNING --- .../indexing/KafkaBoundedSupervisorTest.java | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java index 5549f46fc5c6..4384fc3efea5 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java 
@@ -298,7 +298,7 @@ public void test_resetSupervisorAndBackfill() final String topic = IdUtils.getRandomId(); kafkaServer.createTopicWithPartitions(topic, 2); - // Create a streaming supervisor with concurrent locks (required for backfill) + // Create a streaming supervisor with concurrent locks and withUseEarliestSequenceNumber=false final KafkaSupervisorSpec supervisor = createKafkaSupervisor(kafkaServer) .withContext(Map.of("useConcurrentLocks", true)) .withIoConfig(io -> io @@ -309,7 +309,11 @@ public void test_resetSupervisorAndBackfill() cluster.callApi().postSupervisor(supervisor); - final int recordCount = publish1kRecords(topic, false); + waitForSupervisorDetailedState(supervisor.getId(), "RUNNING"); + + final int batch1 = publish1kRecords(topic, false); + waitUntilPublishedRecordsAreIngested(batch1); + publish1kRecords(topic, false); // Reset the main supervisor and spin up a backfill supervisor for the gap final Map result = cluster.callApi().resetSupervisorAndBackfill(supervisor.getId()); @@ -318,8 +322,6 @@ public void test_resetSupervisorAndBackfill() // Wait for the backfill to finish waitForSupervisorToComplete(backfillSupervisorId); - verifyRowCount(recordCount); - // Main supervisor should still be running final SupervisorStatus mainStatus = cluster.callApi().getSupervisorStatus(supervisor.getId()); Assertions.assertEquals("RUNNING", mainStatus.getState()); @@ -339,6 +341,15 @@ private void waitForSupervisorToComplete(String supervisorId) ); } + private void waitForSupervisorDetailedState(String supervisorId, String detailedState) + { + overlord.latchableEmitter().waitForEvent( + event -> event.hasMetricName("supervisor/count") + .hasDimension(DruidMetrics.SUPERVISOR_ID, supervisorId) + .hasDimension("detailedState", detailedState) + ); + } + private void waitForSupervisorToBeUnhealthy(String supervisorId) { overlord.latchableEmitter().waitForEvent( From 89b5fec25e3a7dc88441d6e995564723557f2312 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Fri, 
22 May 2026 14:51:32 -0700 Subject: [PATCH 11/23] Use checkpointed offset if > requested reset offset to prevent duplicate ingestion --- .../supervisor/SupervisorManager.java | 2 +- .../supervisor/SeekableStreamSupervisor.java | 63 +++++++- .../SeekableStreamSupervisorStateTest.java | 150 +++++++++++++++++- 3 files changed, 206 insertions(+), 9 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index b76b586e18db..d835e60fab3c 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -434,7 +434,7 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ endOffsets ); - streamSupervisor.resetOffsets(resetMetadata); + streamSupervisor.resetOffsetsForwardOnly(resetMetadata); // Reset autoscaler if present SupervisorTaskAutoScaler autoscaler = autoscalers.get(id); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java index fdf7563873d4..fca9ae8343fc 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java @@ -818,19 +818,22 @@ public String getType() private class ResetOffsetsNotice implements Notice { final DataSourceMetadata dataSourceMetadata; + final boolean forwardOnly; private static final String TYPE = "reset_offsets_notice"; ResetOffsetsNotice( - final DataSourceMetadata dataSourceMetadata + final DataSourceMetadata dataSourceMetadata, + final 
boolean forwardOnly ) { this.dataSourceMetadata = dataSourceMetadata; + this.forwardOnly = forwardOnly; } @Override public void handle() { - resetOffsetsInternal(dataSourceMetadata); + resetOffsetsInternal(dataSourceMetadata, forwardOnly); } @Override @@ -1340,6 +1343,25 @@ public void reset(@Nullable final DataSourceMetadata dataSourceMetadata) */ @Override public void resetOffsets(@Nonnull DataSourceMetadata resetDataSourceMetadata) + { + validateResetOffsetsMetadata(resetDataSourceMetadata); + log.info("Posting ResetOffsetsNotice with reset dataSource metadata[%s]", resetDataSourceMetadata); + addNotice(new ResetOffsetsNotice(resetDataSourceMetadata, false)); + } + + /** + * Like {@link #resetOffsets}, but never rolls a partition backward: if the stored offset is already ahead of + * the requested reset offset for a partition, the stored value is kept. Used by resetSupervisorAndBackfill to + * ensure the main supervisor cannot regress even if a task checkpoints between offset capture and this call. 
+ */ + public void resetOffsetsForwardOnly(@Nonnull DataSourceMetadata resetDataSourceMetadata) + { + validateResetOffsetsMetadata(resetDataSourceMetadata); + log.info("Posting ResetOffsetsNotice (forwardOnly) with reset dataSource metadata[%s]", resetDataSourceMetadata); + addNotice(new ResetOffsetsNotice(resetDataSourceMetadata, true)); + } + + private void validateResetOffsetsMetadata(@Nonnull DataSourceMetadata resetDataSourceMetadata) { if (resetDataSourceMetadata == null) { throw InvalidInput.exception("Reset dataSourceMetadata is required for resetOffsets."); @@ -1374,8 +1396,6 @@ public void resetOffsets(@Nonnull DataSourceMetadata resetDataSourceMetadata) ioConfig.getStream() ); } - log.info("Posting ResetOffsetsNotice with reset dataSource metadata[%s]", resetDataSourceMetadata); - addNotice(new ResetOffsetsNotice(resetDataSourceMetadata)); } public void registerNewVersionOfPendingSegment( @@ -2147,7 +2167,7 @@ public void resetInternal(DataSourceMetadata dataSourceMetadata) * the metadata storage. Once the offsets are reset, any active tasks serving the partition offsets will be restarted. * @param dataSourceMetadata Required reset data source metadata. Assumed that the metadata is validated. 
*/ - public void resetOffsetsInternal(@Nonnull final DataSourceMetadata dataSourceMetadata) + public void resetOffsetsInternal(@Nonnull final DataSourceMetadata dataSourceMetadata, final boolean forwardOnly) { log.info("Reset offsets for supervisor[%s] for dataSource[%s] with metadata[%s]", supervisorId, dataSource, dataSourceMetadata); @@ -2158,7 +2178,7 @@ public void resetOffsetsInternal(@Nonnull final DataSourceMetadata dataSourceMet final boolean metadataUpdateSuccess; final DataSourceMetadata metadata = indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(supervisorId); if (metadata == null) { - log.info("Checkpointed metadata in null for supervisor[%s] for dataSource[%s] - inserting metadata[%s]", supervisorId, dataSource, resetMetadata); + log.info("Checkpointed metadata is null for supervisor[%s] for dataSource[%s] - inserting metadata[%s]", supervisorId, dataSource, resetMetadata); metadataUpdateSuccess = indexerMetadataStorageCoordinator.insertDataSourceMetadata(supervisorId, resetMetadata); } else { if (!checkSourceMetadataMatch(metadata)) { @@ -2170,7 +2190,36 @@ public void resetOffsetsInternal(@Nonnull final DataSourceMetadata dataSourceMet @SuppressWarnings("unchecked") final SeekableStreamDataSourceMetadata currentMetadata = (SeekableStreamDataSourceMetadata) metadata; - final DataSourceMetadata newMetadata = currentMetadata.plus(resetMetadata); + final DataSourceMetadata newMetadata; + if (forwardOnly) { + // For each partition in resetMetadata, keep max(current, reset) so the main supervisor never goes backward. 
+ final Map currentMap = + currentMetadata.getSeekableStreamSequenceNumbers().getPartitionSequenceNumberMap(); + final Map resetMap = + resetMetadata.getSeekableStreamSequenceNumbers().getPartitionSequenceNumberMap(); + final Map forwardMap = new HashMap<>(resetMap); + forwardMap.forEach((partition, resetOffset) -> { + final SequenceOffsetType currentOffset = currentMap.get(partition); + if (currentOffset != null && isOffsetAtOrBeyond(currentOffset, resetOffset)) { + log.info( + "Keeping current offset[%s] for partition[%s] of supervisor[%s] as it is ahead of requested reset offset[%s]", + currentOffset, + partition, + supervisorId, + resetOffset + ); + forwardMap.put(partition, currentOffset); + } + }); + final SeekableStreamDataSourceMetadata forwardMetadata = + createDataSourceMetaDataForReset( + resetMetadata.getSeekableStreamSequenceNumbers().getStream(), + forwardMap + ); + newMetadata = currentMetadata.plus(forwardMetadata); + } else { + newMetadata = currentMetadata.plus(resetMetadata); + } log.info("Current checkpointed metadata[%s], new metadata[%s] for supervisor[%s] for dataSource[%s]", currentMetadata, newMetadata, supervisorId, dataSource); try { metadataUpdateSuccess = indexerMetadataStorageCoordinator.resetDataSourceMetadata(supervisorId, newMetadata); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java index e19d68cb2b3f..f6ae9de91287 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java @@ -2554,6 +2554,154 @@ public void testSupervisorResetInvalidStream() ); } + @Test + public void testResetOffsetsForwardOnlyKeepsAheadPartitions() throws 
InterruptedException, IOException + { + // Partition "1" has checkpointed ahead of the reset offset; forwardOnly must keep the higher value. + final ImmutableMap checkpointOffsets = ImmutableMap.of("0", "5", "1", "200", "2", "100"); + final ImmutableMap resetOffsets = ImmutableMap.of("0", "10", "1", "8", "2", "150"); + // "1" keeps 200 (checkpoint > reset); "0" advances to 10; "2" advances to 150. + final ImmutableMap expectedOffsets = ImmutableMap.of("0", "10", "1", "200", "2", "150"); + + EasyMock.expect(spec.isSuspended()).andReturn(false); + EasyMock.reset(indexerMetadataStorageCoordinator); + EasyMock.expect(indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(SUPERVISOR_ID)).andReturn( + new TestSeekableStreamDataSourceMetadata( + new SeekableStreamEndSequenceNumbers<>(STREAM, checkpointOffsets) + ) + ); + EasyMock.expect(indexerMetadataStorageCoordinator.resetDataSourceMetadata( + SUPERVISOR_ID, + new TestSeekableStreamDataSourceMetadata(new SeekableStreamEndSequenceNumbers<>(STREAM, expectedOffsets)) + )).andReturn(true); + taskQueue.shutdown("task1", "DataSourceMetadata is updated while reset offsets is called"); + EasyMock.expectLastCall(); + taskQueue.shutdown("task2", "DataSourceMetadata is updated while reset offsets is called"); + EasyMock.expectLastCall(); + taskQueue.shutdown("task3", "DataSourceMetadata is updated while reset offsets is called"); + EasyMock.expectLastCall(); + + replayAll(); + + final TestSeekableStreamSupervisor supervisor = new TestSeekableStreamSupervisor(); + supervisor.getIoConfig().setTaskCount(3); + supervisor.start(); + supervisor.addTaskGroupToActivelyReadingTaskGroup( + supervisor.getTaskGroupIdForPartition("0"), + ImmutableMap.of("0", "5"), + null, null, ImmutableSet.of("task1"), ImmutableSet.of(), null + ); + supervisor.addTaskGroupToActivelyReadingTaskGroup( + supervisor.getTaskGroupIdForPartition("1"), + ImmutableMap.of("1", "200"), + null, null, ImmutableSet.of("task2"), ImmutableSet.of(), null + ); + 
supervisor.addTaskGroupToActivelyReadingTaskGroup( + supervisor.getTaskGroupIdForPartition("2"), + ImmutableMap.of("2", "100"), + null, null, ImmutableSet.of("task3"), ImmutableSet.of(), null + ); + + final DataSourceMetadata resetMetadata = new TestSeekableStreamDataSourceMetadata( + new SeekableStreamEndSequenceNumbers<>(STREAM, resetOffsets) + ); + + supervisor.resetOffsetsForwardOnly(resetMetadata); + + validateSupervisorStateAfterResetOffsets(supervisor, resetOffsets, 0); + } + + @Test + public void testResetOffsetsForwardOnlyAllBehind() throws InterruptedException, IOException + { + // All checkpointed offsets are behind the reset offsets; behavior is identical to a normal reset. + final ImmutableMap checkpointOffsets = ImmutableMap.of("0", "5", "1", "6"); + final ImmutableMap resetOffsets = ImmutableMap.of("0", "100", "1", "200"); + + EasyMock.expect(spec.isSuspended()).andReturn(false); + EasyMock.reset(indexerMetadataStorageCoordinator); + EasyMock.expect(indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(SUPERVISOR_ID)).andReturn( + new TestSeekableStreamDataSourceMetadata( + new SeekableStreamEndSequenceNumbers<>(STREAM, checkpointOffsets) + ) + ); + EasyMock.expect(indexerMetadataStorageCoordinator.resetDataSourceMetadata( + SUPERVISOR_ID, + new TestSeekableStreamDataSourceMetadata(new SeekableStreamEndSequenceNumbers<>(STREAM, resetOffsets)) + )).andReturn(true); + taskQueue.shutdown("task1", "DataSourceMetadata is updated while reset offsets is called"); + EasyMock.expectLastCall(); + taskQueue.shutdown("task2", "DataSourceMetadata is updated while reset offsets is called"); + EasyMock.expectLastCall(); + + replayAll(); + + final TestSeekableStreamSupervisor supervisor = new TestSeekableStreamSupervisor(); + supervisor.getIoConfig().setTaskCount(2); + supervisor.start(); + supervisor.addTaskGroupToActivelyReadingTaskGroup( + supervisor.getTaskGroupIdForPartition("0"), + ImmutableMap.of("0", "5"), + null, null, ImmutableSet.of("task1"), 
ImmutableSet.of(), null + ); + supervisor.addTaskGroupToActivelyReadingTaskGroup( + supervisor.getTaskGroupIdForPartition("1"), + ImmutableMap.of("1", "6"), + null, null, ImmutableSet.of("task2"), ImmutableSet.of(), null + ); + + final DataSourceMetadata resetMetadata = new TestSeekableStreamDataSourceMetadata( + new SeekableStreamEndSequenceNumbers<>(STREAM, resetOffsets) + ); + + supervisor.resetOffsetsForwardOnly(resetMetadata); + + validateSupervisorStateAfterResetOffsets(supervisor, resetOffsets, 0); + } + + @Test + public void testResetOffsetsForwardOnlyNoExistingCheckpoint() throws InterruptedException + { + // No existing metadata in storage; forwardOnly inserts the reset offsets as-is. + final ImmutableMap resetOffsets = ImmutableMap.of("0", "50", "1", "75"); + + EasyMock.expect(spec.isSuspended()).andReturn(false); + EasyMock.reset(indexerMetadataStorageCoordinator); + EasyMock.expect(indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(SUPERVISOR_ID)).andReturn(null); + EasyMock.expect(indexerMetadataStorageCoordinator.insertDataSourceMetadata( + SUPERVISOR_ID, + new TestSeekableStreamDataSourceMetadata(new SeekableStreamEndSequenceNumbers<>(STREAM, resetOffsets)) + )).andReturn(true); + taskQueue.shutdown("task1", "DataSourceMetadata is updated while reset offsets is called"); + EasyMock.expectLastCall(); + taskQueue.shutdown("task2", "DataSourceMetadata is updated while reset offsets is called"); + EasyMock.expectLastCall(); + + replayAll(); + + final TestSeekableStreamSupervisor supervisor = new TestSeekableStreamSupervisor(); + supervisor.getIoConfig().setTaskCount(2); + supervisor.start(); + supervisor.addTaskGroupToActivelyReadingTaskGroup( + supervisor.getTaskGroupIdForPartition("0"), + ImmutableMap.of("0", "10"), + null, null, ImmutableSet.of("task1"), ImmutableSet.of(), null + ); + supervisor.addTaskGroupToActivelyReadingTaskGroup( + supervisor.getTaskGroupIdForPartition("1"), + ImmutableMap.of("1", "20"), + null, null, 
ImmutableSet.of("task2"), ImmutableSet.of(), null + ); + + final DataSourceMetadata resetMetadata = new TestSeekableStreamDataSourceMetadata( + new SeekableStreamEndSequenceNumbers<>(STREAM, resetOffsets) + ); + + supervisor.resetOffsetsForwardOnly(resetMetadata); + + validateSupervisorStateAfterResetOffsets(supervisor, resetOffsets, 0); + } + @Test public void testStaleOffsetsNegativeLagNotEmitted() throws Exception { @@ -3386,7 +3534,7 @@ public SeekableStreamDataSourceMetadata createDataSourceMetaData Map map ) { - return null; + return new TestSeekableStreamDataSourceMetadata(new SeekableStreamEndSequenceNumbers<>(stream, map)); } @Override From 551e887fa597fe8ae33c4ca6c0b1c7735cebc513 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Fri, 22 May 2026 17:43:40 -0700 Subject: [PATCH 12/23] Update KafkaBoundedSupervisorTest.java --- .../embedded/indexing/KafkaBoundedSupervisorTest.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java index 4384fc3efea5..ce65e6263529 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java @@ -311,11 +311,12 @@ public void test_resetSupervisorAndBackfill() waitForSupervisorDetailedState(supervisor.getId(), "RUNNING"); - final int batch1 = publish1kRecords(topic, false); - waitUntilPublishedRecordsAreIngested(batch1); - publish1kRecords(topic, false); + final int totalRecords = publish1kRecords(topic, false); + waitUntilPublishedRecordsAreIngested(totalRecords); - // Reset the main supervisor and spin up a backfill supervisor for the gap + // Reset the main supervisor and spin up a backfill supervisor. 
+ // Since all records are already ingested before the call, the backfill + // supervisor will complete immediately without ingesting anything. final Map result = cluster.callApi().resetSupervisorAndBackfill(supervisor.getId()); final String backfillSupervisorId = (String) result.get("backfillSupervisorId"); From c503e217b201a3026a9a96b77e165f601a777c41 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Tue, 26 May 2026 09:11:30 -0700 Subject: [PATCH 13/23] Revert "Use checkpointed offset if > requested reset offset to prevent duplicate ingestion" resetOffsetsForwardOnly does not fully close the race it targets (the write is still unconditional) and the duplicate scenario it addresses is narrower than the overlap case, which cannot be solved without suspending the main supervisor. Accepting the limitation and documenting it is preferable to the added complexity. This reverts commit 89b5fec25e3a7dc88441d6e995564723557f2312. --- .../supervisor/SupervisorManager.java | 2 +- .../supervisor/SeekableStreamSupervisor.java | 63 +------- .../SeekableStreamSupervisorStateTest.java | 150 +----------------- 3 files changed, 9 insertions(+), 206 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index d835e60fab3c..b76b586e18db 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -434,7 +434,7 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ endOffsets ); - streamSupervisor.resetOffsetsForwardOnly(resetMetadata); + streamSupervisor.resetOffsets(resetMetadata); // Reset autoscaler if present SupervisorTaskAutoScaler autoscaler = autoscalers.get(id); diff --git 
a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java index fca9ae8343fc..fdf7563873d4 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java @@ -818,22 +818,19 @@ public String getType() private class ResetOffsetsNotice implements Notice { final DataSourceMetadata dataSourceMetadata; - final boolean forwardOnly; private static final String TYPE = "reset_offsets_notice"; ResetOffsetsNotice( - final DataSourceMetadata dataSourceMetadata, - final boolean forwardOnly + final DataSourceMetadata dataSourceMetadata ) { this.dataSourceMetadata = dataSourceMetadata; - this.forwardOnly = forwardOnly; } @Override public void handle() { - resetOffsetsInternal(dataSourceMetadata, forwardOnly); + resetOffsetsInternal(dataSourceMetadata); } @Override @@ -1343,25 +1340,6 @@ public void reset(@Nullable final DataSourceMetadata dataSourceMetadata) */ @Override public void resetOffsets(@Nonnull DataSourceMetadata resetDataSourceMetadata) - { - validateResetOffsetsMetadata(resetDataSourceMetadata); - log.info("Posting ResetOffsetsNotice with reset dataSource metadata[%s]", resetDataSourceMetadata); - addNotice(new ResetOffsetsNotice(resetDataSourceMetadata, false)); - } - - /** - * Like {@link #resetOffsets}, but never rolls a partition backward: if the stored offset is already ahead of - * the requested reset offset for a partition, the stored value is kept. Used by resetSupervisorAndBackfill to - * ensure the main supervisor cannot regress even if a task checkpoints between offset capture and this call. 
- */ - public void resetOffsetsForwardOnly(@Nonnull DataSourceMetadata resetDataSourceMetadata) - { - validateResetOffsetsMetadata(resetDataSourceMetadata); - log.info("Posting ResetOffsetsNotice (forwardOnly) with reset dataSource metadata[%s]", resetDataSourceMetadata); - addNotice(new ResetOffsetsNotice(resetDataSourceMetadata, true)); - } - - private void validateResetOffsetsMetadata(@Nonnull DataSourceMetadata resetDataSourceMetadata) { if (resetDataSourceMetadata == null) { throw InvalidInput.exception("Reset dataSourceMetadata is required for resetOffsets."); @@ -1396,6 +1374,8 @@ private void validateResetOffsetsMetadata(@Nonnull DataSourceMetadata resetDataS ioConfig.getStream() ); } + log.info("Posting ResetOffsetsNotice with reset dataSource metadata[%s]", resetDataSourceMetadata); + addNotice(new ResetOffsetsNotice(resetDataSourceMetadata)); } public void registerNewVersionOfPendingSegment( @@ -2167,7 +2147,7 @@ public void resetInternal(DataSourceMetadata dataSourceMetadata) * the metadata storage. Once the offsets are reset, any active tasks serving the partition offsets will be restarted. * @param dataSourceMetadata Required reset data source metadata. Assumed that the metadata is validated. 
*/ - public void resetOffsetsInternal(@Nonnull final DataSourceMetadata dataSourceMetadata, final boolean forwardOnly) + public void resetOffsetsInternal(@Nonnull final DataSourceMetadata dataSourceMetadata) { log.info("Reset offsets for supervisor[%s] for dataSource[%s] with metadata[%s]", supervisorId, dataSource, dataSourceMetadata); @@ -2178,7 +2158,7 @@ public void resetOffsetsInternal(@Nonnull final DataSourceMetadata dataSourceMet final boolean metadataUpdateSuccess; final DataSourceMetadata metadata = indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(supervisorId); if (metadata == null) { - log.info("Checkpointed metadata is null for supervisor[%s] for dataSource[%s] - inserting metadata[%s]", supervisorId, dataSource, resetMetadata); + log.info("Checkpointed metadata in null for supervisor[%s] for dataSource[%s] - inserting metadata[%s]", supervisorId, dataSource, resetMetadata); metadataUpdateSuccess = indexerMetadataStorageCoordinator.insertDataSourceMetadata(supervisorId, resetMetadata); } else { if (!checkSourceMetadataMatch(metadata)) { @@ -2190,36 +2170,7 @@ public void resetOffsetsInternal(@Nonnull final DataSourceMetadata dataSourceMet @SuppressWarnings("unchecked") final SeekableStreamDataSourceMetadata currentMetadata = (SeekableStreamDataSourceMetadata) metadata; - final DataSourceMetadata newMetadata; - if (forwardOnly) { - // For each partition in resetMetadata, keep max(current, reset) so the main supervisor never goes backward. 
- final Map currentMap = - currentMetadata.getSeekableStreamSequenceNumbers().getPartitionSequenceNumberMap(); - final Map resetMap = - resetMetadata.getSeekableStreamSequenceNumbers().getPartitionSequenceNumberMap(); - final Map forwardMap = new HashMap<>(resetMap); - forwardMap.forEach((partition, resetOffset) -> { - final SequenceOffsetType currentOffset = currentMap.get(partition); - if (currentOffset != null && isOffsetAtOrBeyond(currentOffset, resetOffset)) { - log.info( - "Keeping current offset[%s] for partition[%s] of supervisor[%s] as it is ahead of requested reset offset[%s]", - currentOffset, - partition, - supervisorId, - resetOffset - ); - forwardMap.put(partition, currentOffset); - } - }); - final SeekableStreamDataSourceMetadata forwardMetadata = - createDataSourceMetaDataForReset( - resetMetadata.getSeekableStreamSequenceNumbers().getStream(), - forwardMap - ); - newMetadata = currentMetadata.plus(forwardMetadata); - } else { - newMetadata = currentMetadata.plus(resetMetadata); - } + final DataSourceMetadata newMetadata = currentMetadata.plus(resetMetadata); log.info("Current checkpointed metadata[%s], new metadata[%s] for supervisor[%s] for dataSource[%s]", currentMetadata, newMetadata, supervisorId, dataSource); try { metadataUpdateSuccess = indexerMetadataStorageCoordinator.resetDataSourceMetadata(supervisorId, newMetadata); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java index f6ae9de91287..e19d68cb2b3f 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorStateTest.java @@ -2554,154 +2554,6 @@ public void testSupervisorResetInvalidStream() ); } - 
@Test - public void testResetOffsetsForwardOnlyKeepsAheadPartitions() throws InterruptedException, IOException - { - // Partition "1" has checkpointed ahead of the reset offset; forwardOnly must keep the higher value. - final ImmutableMap checkpointOffsets = ImmutableMap.of("0", "5", "1", "200", "2", "100"); - final ImmutableMap resetOffsets = ImmutableMap.of("0", "10", "1", "8", "2", "150"); - // "1" keeps 200 (checkpoint > reset); "0" advances to 10; "2" advances to 150. - final ImmutableMap expectedOffsets = ImmutableMap.of("0", "10", "1", "200", "2", "150"); - - EasyMock.expect(spec.isSuspended()).andReturn(false); - EasyMock.reset(indexerMetadataStorageCoordinator); - EasyMock.expect(indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(SUPERVISOR_ID)).andReturn( - new TestSeekableStreamDataSourceMetadata( - new SeekableStreamEndSequenceNumbers<>(STREAM, checkpointOffsets) - ) - ); - EasyMock.expect(indexerMetadataStorageCoordinator.resetDataSourceMetadata( - SUPERVISOR_ID, - new TestSeekableStreamDataSourceMetadata(new SeekableStreamEndSequenceNumbers<>(STREAM, expectedOffsets)) - )).andReturn(true); - taskQueue.shutdown("task1", "DataSourceMetadata is updated while reset offsets is called"); - EasyMock.expectLastCall(); - taskQueue.shutdown("task2", "DataSourceMetadata is updated while reset offsets is called"); - EasyMock.expectLastCall(); - taskQueue.shutdown("task3", "DataSourceMetadata is updated while reset offsets is called"); - EasyMock.expectLastCall(); - - replayAll(); - - final TestSeekableStreamSupervisor supervisor = new TestSeekableStreamSupervisor(); - supervisor.getIoConfig().setTaskCount(3); - supervisor.start(); - supervisor.addTaskGroupToActivelyReadingTaskGroup( - supervisor.getTaskGroupIdForPartition("0"), - ImmutableMap.of("0", "5"), - null, null, ImmutableSet.of("task1"), ImmutableSet.of(), null - ); - supervisor.addTaskGroupToActivelyReadingTaskGroup( - supervisor.getTaskGroupIdForPartition("1"), - ImmutableMap.of("1", "200"), - 
null, null, ImmutableSet.of("task2"), ImmutableSet.of(), null - ); - supervisor.addTaskGroupToActivelyReadingTaskGroup( - supervisor.getTaskGroupIdForPartition("2"), - ImmutableMap.of("2", "100"), - null, null, ImmutableSet.of("task3"), ImmutableSet.of(), null - ); - - final DataSourceMetadata resetMetadata = new TestSeekableStreamDataSourceMetadata( - new SeekableStreamEndSequenceNumbers<>(STREAM, resetOffsets) - ); - - supervisor.resetOffsetsForwardOnly(resetMetadata); - - validateSupervisorStateAfterResetOffsets(supervisor, resetOffsets, 0); - } - - @Test - public void testResetOffsetsForwardOnlyAllBehind() throws InterruptedException, IOException - { - // All checkpointed offsets are behind the reset offsets; behavior is identical to a normal reset. - final ImmutableMap checkpointOffsets = ImmutableMap.of("0", "5", "1", "6"); - final ImmutableMap resetOffsets = ImmutableMap.of("0", "100", "1", "200"); - - EasyMock.expect(spec.isSuspended()).andReturn(false); - EasyMock.reset(indexerMetadataStorageCoordinator); - EasyMock.expect(indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(SUPERVISOR_ID)).andReturn( - new TestSeekableStreamDataSourceMetadata( - new SeekableStreamEndSequenceNumbers<>(STREAM, checkpointOffsets) - ) - ); - EasyMock.expect(indexerMetadataStorageCoordinator.resetDataSourceMetadata( - SUPERVISOR_ID, - new TestSeekableStreamDataSourceMetadata(new SeekableStreamEndSequenceNumbers<>(STREAM, resetOffsets)) - )).andReturn(true); - taskQueue.shutdown("task1", "DataSourceMetadata is updated while reset offsets is called"); - EasyMock.expectLastCall(); - taskQueue.shutdown("task2", "DataSourceMetadata is updated while reset offsets is called"); - EasyMock.expectLastCall(); - - replayAll(); - - final TestSeekableStreamSupervisor supervisor = new TestSeekableStreamSupervisor(); - supervisor.getIoConfig().setTaskCount(2); - supervisor.start(); - supervisor.addTaskGroupToActivelyReadingTaskGroup( - supervisor.getTaskGroupIdForPartition("0"), - 
ImmutableMap.of("0", "5"), - null, null, ImmutableSet.of("task1"), ImmutableSet.of(), null - ); - supervisor.addTaskGroupToActivelyReadingTaskGroup( - supervisor.getTaskGroupIdForPartition("1"), - ImmutableMap.of("1", "6"), - null, null, ImmutableSet.of("task2"), ImmutableSet.of(), null - ); - - final DataSourceMetadata resetMetadata = new TestSeekableStreamDataSourceMetadata( - new SeekableStreamEndSequenceNumbers<>(STREAM, resetOffsets) - ); - - supervisor.resetOffsetsForwardOnly(resetMetadata); - - validateSupervisorStateAfterResetOffsets(supervisor, resetOffsets, 0); - } - - @Test - public void testResetOffsetsForwardOnlyNoExistingCheckpoint() throws InterruptedException - { - // No existing metadata in storage; forwardOnly inserts the reset offsets as-is. - final ImmutableMap resetOffsets = ImmutableMap.of("0", "50", "1", "75"); - - EasyMock.expect(spec.isSuspended()).andReturn(false); - EasyMock.reset(indexerMetadataStorageCoordinator); - EasyMock.expect(indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(SUPERVISOR_ID)).andReturn(null); - EasyMock.expect(indexerMetadataStorageCoordinator.insertDataSourceMetadata( - SUPERVISOR_ID, - new TestSeekableStreamDataSourceMetadata(new SeekableStreamEndSequenceNumbers<>(STREAM, resetOffsets)) - )).andReturn(true); - taskQueue.shutdown("task1", "DataSourceMetadata is updated while reset offsets is called"); - EasyMock.expectLastCall(); - taskQueue.shutdown("task2", "DataSourceMetadata is updated while reset offsets is called"); - EasyMock.expectLastCall(); - - replayAll(); - - final TestSeekableStreamSupervisor supervisor = new TestSeekableStreamSupervisor(); - supervisor.getIoConfig().setTaskCount(2); - supervisor.start(); - supervisor.addTaskGroupToActivelyReadingTaskGroup( - supervisor.getTaskGroupIdForPartition("0"), - ImmutableMap.of("0", "10"), - null, null, ImmutableSet.of("task1"), ImmutableSet.of(), null - ); - supervisor.addTaskGroupToActivelyReadingTaskGroup( - 
supervisor.getTaskGroupIdForPartition("1"), - ImmutableMap.of("1", "20"), - null, null, ImmutableSet.of("task2"), ImmutableSet.of(), null - ); - - final DataSourceMetadata resetMetadata = new TestSeekableStreamDataSourceMetadata( - new SeekableStreamEndSequenceNumbers<>(STREAM, resetOffsets) - ); - - supervisor.resetOffsetsForwardOnly(resetMetadata); - - validateSupervisorStateAfterResetOffsets(supervisor, resetOffsets, 0); - } - @Test public void testStaleOffsetsNegativeLagNotEmitted() throws Exception { @@ -3534,7 +3386,7 @@ public SeekableStreamDataSourceMetadata createDataSourceMetaData Map map ) { - return new TestSeekableStreamDataSourceMetadata(new SeekableStreamEndSequenceNumbers<>(stream, map)); + return null; } @Override From 6c10fca41e3523509dd55f4af7c1a0575e5adc76 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Tue, 26 May 2026 09:23:24 -0700 Subject: [PATCH 14/23] Doc update - duplication notice and Kinesis callout --- docs/api-reference/supervisor-api.md | 10 +++++++++- .../supervisor/SeekableStreamSupervisor.java | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/api-reference/supervisor-api.md b/docs/api-reference/supervisor-api.md index 840666265aeb..44f41e796d91 100644 --- a/docs/api-reference/supervisor-api.md +++ b/docs/api-reference/supervisor-api.md @@ -3541,16 +3541,24 @@ when the supervisor's tasks restart, they resume reading from `{"0": 100, "1": 1 ### Reset offsets and start a backfill supervisor +This endpoint is supported for Apache Kafka and RabbitMQ Stream supervisors. Amazon Kinesis is not supported yet. + Resets the supervisor to the latest available stream offsets and starts a new bounded backfill supervisor to ingest the data in the skipped range. This endpoint is useful when a supervisor has fallen behind and you want to catch it up to the latest offsets without losing the skipped data. 
The main supervisor resumes ingesting from the latest offsets, while the backfill supervisor processes the range from the previously checkpointed offsets up to the latest offsets at the time of the reset. +**Duplicate ingestion notice:** The main supervisor is not quiesced before the reset. This means duplicate data can occur in two ways: +- **Backfill overlap:** Any tasks that were in-flight at the time of the reset may publish segments covering part of the backfill range before being shut down. +- **Reset race:** If a task checkpoint is written to the metadata store between when this endpoint captures the current offsets and when it applies the reset, that checkpoint can be overwritten, causing the main supervisor to re-ingest already-processed data. + +Both windows are narrow in practice, but cannot be fully eliminated without manually suspending the main supervisor before calling this endpoint and waiting for all pending tasks to complete. + The following requirements must be met before calling this endpoint: - The supervisor must be a [streaming supervisor](../ingestion/supervisor.md). - The supervisor's `useEarliestSequenceNumber` property must be `false`. - The supervisor context must have `useConcurrentLocks` set to `true` to allow the backfill supervisor's tasks to write concurrently with the main supervisor's tasks. -- The supervisor must be in a `RUNNING` state so that it can query the latest offsets from the stream. +- The supervisor must be in a `RUNNING` state. The backfill supervisor has the same configuration as the source supervisor except for its ID, which takes the form `{supervisorId}_backfill_{randomSuffix}`, and its `boundedStreamConfig`, which is set to the skipped offset range. If `backfillTaskCount` is specified, it overrides the `taskCount` for the backfill supervisor only. 
diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java index fdf7563873d4..a3e4dc1cd764 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisor.java @@ -2158,7 +2158,7 @@ public void resetOffsetsInternal(@Nonnull final DataSourceMetadata dataSourceMet final boolean metadataUpdateSuccess; final DataSourceMetadata metadata = indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(supervisorId); if (metadata == null) { - log.info("Checkpointed metadata in null for supervisor[%s] for dataSource[%s] - inserting metadata[%s]", supervisorId, dataSource, resetMetadata); + log.info("Checkpointed metadata is null for supervisor[%s] for dataSource[%s] - inserting metadata[%s]", supervisorId, dataSource, resetMetadata); metadataUpdateSuccess = indexerMetadataStorageCoordinator.insertDataSourceMetadata(supervisorId, resetMetadata); } else { if (!checkSourceMetadataMatch(metadata)) { From 38986881a38e5cc8021fdcdad1d8103a2101d6b5 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 28 May 2026 11:21:38 -0700 Subject: [PATCH 15/23] Rename endpoint from resetOffsetsAndBackfill to resetToLatestAndBackfill --- docs/api-reference/supervisor-api.md | 6 +++--- .../overlord/supervisor/SupervisorResource.java | 4 ++-- .../supervisor/SupervisorResourceTest.java | 14 +++++++------- .../apache/druid/rpc/indexing/OverlordClient.java | 2 +- .../druid/rpc/indexing/OverlordClientImpl.java | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/api-reference/supervisor-api.md b/docs/api-reference/supervisor-api.md index 44f41e796d91..dfddf31719f7 100644 --- a/docs/api-reference/supervisor-api.md +++ 
b/docs/api-reference/supervisor-api.md @@ -3564,7 +3564,7 @@ The backfill supervisor has the same configuration as the source supervisor exce #### URL -`POST` `/druid/indexer/v1/supervisor/{supervisorId}/resetOffsetsAndBackfill` +`POST` `/druid/indexer/v1/supervisor/{supervisorId}/resetToLatestAndBackfill` #### Query parameters @@ -3614,7 +3614,7 @@ The following example resets a supervisor named `social_media` and starts a back ```shell -curl --request POST "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/supervisor/social_media/resetOffsetsAndBackfill?backfillTaskCount=2" +curl --request POST "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/supervisor/social_media/resetToLatestAndBackfill?backfillTaskCount=2" ``` @@ -3622,7 +3622,7 @@ curl --request POST "http://ROUTER_IP:ROUTER_PORT/druid/indexer/v1/supervisor/so ```HTTP -POST /druid/indexer/v1/supervisor/social_media/resetOffsetsAndBackfill?backfillTaskCount=2 HTTP/1.1 +POST /druid/indexer/v1/supervisor/social_media/resetToLatestAndBackfill?backfillTaskCount=2 HTTP/1.1 Host: http://ROUTER_IP:ROUTER_PORT ``` diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java index af8c7adc7664..6116733c1950 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java @@ -641,10 +641,10 @@ private Response handleResetRequest( } @POST - @Path("/{id}/resetOffsetsAndBackfill") + @Path("/{id}/resetToLatestAndBackfill") @Produces(MediaType.APPLICATION_JSON) @ResourceFilters(SupervisorResourceFilter.class) - public Response resetOffsetsAndBackfill( + public Response resetToLatestAndBackfill( @PathParam("id") final String id, @QueryParam("backfillTaskCount") @Nullable final Integer backfillTaskCount ) diff --git 
a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java index a4251fccf3be..00972d7c08fd 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java @@ -1389,7 +1389,7 @@ public void testResetOffsetsAndBackfill() .andReturn(ImmutableMap.of("id", "my-id", "backfillSupervisorId", "my-id_backfill_abcdefgh")); replayAll(); - Response response = supervisorResource.resetOffsetsAndBackfill("my-id", null); + Response response = supervisorResource.resetToLatestAndBackfill("my-id", null); Assert.assertEquals(200, response.getStatus()); Assert.assertEquals( ImmutableMap.of("id", "my-id", "backfillSupervisorId", "my-id_backfill_abcdefgh"), @@ -1403,7 +1403,7 @@ public void testResetOffsetsAndBackfill() EasyMock.expect(supervisorManager.getSupervisorIds()).andReturn(ImmutableSet.of()); replayAll(); - response = supervisorResource.resetOffsetsAndBackfill("my-id", null); + response = supervisorResource.resetToLatestAndBackfill("my-id", null); Assert.assertEquals(404, response.getStatus()); verifyAll(); resetAll(); @@ -1415,7 +1415,7 @@ public void testResetOffsetsAndBackfill() .andThrow(new IllegalArgumentException("Supervisor[my-id] must be in a RUNNING state")); replayAll(); - response = supervisorResource.resetOffsetsAndBackfill("my-id", null); + response = supervisorResource.resetToLatestAndBackfill("my-id", null); Assert.assertEquals(400, response.getStatus()); Assert.assertEquals( ImmutableMap.of("error", "Supervisor[my-id] must be in a RUNNING state"), @@ -1431,7 +1431,7 @@ public void testResetOffsetsAndBackfill() .andThrow(new IllegalStateException("Failed to get latest offsets from stream")); replayAll(); - response = 
supervisorResource.resetOffsetsAndBackfill("my-id", null); + response = supervisorResource.resetToLatestAndBackfill("my-id", null); Assert.assertEquals(500, response.getStatus()); Assert.assertEquals( ImmutableMap.of("error", "Failed to get latest offsets from stream"), @@ -1443,7 +1443,7 @@ public void testResetOffsetsAndBackfill() // 400 - invalid backfillTaskCount (zero) replayAll(); - response = supervisorResource.resetOffsetsAndBackfill("my-id", 0); + response = supervisorResource.resetToLatestAndBackfill("my-id", 0); Assert.assertEquals(400, response.getStatus()); Assert.assertEquals( ImmutableMap.of("error", "backfillTaskCount must be a positive integer"), @@ -1455,7 +1455,7 @@ public void testResetOffsetsAndBackfill() // 400 - invalid backfillTaskCount (negative) replayAll(); - response = supervisorResource.resetOffsetsAndBackfill("my-id", -1); + response = supervisorResource.resetToLatestAndBackfill("my-id", -1); Assert.assertEquals(400, response.getStatus()); Assert.assertEquals( ImmutableMap.of("error", "backfillTaskCount must be a positive integer"), @@ -1468,7 +1468,7 @@ public void testResetOffsetsAndBackfill() EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.absent()); replayAll(); - response = supervisorResource.resetOffsetsAndBackfill("my-id", null); + response = supervisorResource.resetToLatestAndBackfill("my-id", null); Assert.assertEquals(503, response.getStatus()); verifyAll(); } diff --git a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java index 91d1dfa4d96c..d8bcd10550bb 100644 --- a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java +++ b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java @@ -200,7 +200,7 @@ ListenableFuture> taskStatuses( /** * Resets a supervisor to the latest stream offsets and starts a bounded backfill supervisor. *
<p>
- * API: {@code POST /druid/indexer/v1/supervisor//resetOffsetsAndBackfill} + * API: {@code POST /druid/indexer/v1/supervisor//resetToLatestAndBackfill} * * @return Map containing "id" and "backfillSupervisorId" */ diff --git a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java index cc052119e297..a24fdc9c6f67 100644 --- a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java +++ b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java @@ -269,7 +269,7 @@ public ListenableFuture> terminateSupervisor(String supervis public ListenableFuture> resetSupervisorAndBackfill(String supervisorId) { final String path = StringUtils.format( - "/druid/indexer/v1/supervisor/%s/resetOffsetsAndBackfill", + "/druid/indexer/v1/supervisor/%s/resetToLatestAndBackfill", StringUtils.urlEncode(supervisorId) ); From a13169fd6a5f3773293e7d235bb839c62d15dc84 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 28 May 2026 11:22:58 -0700 Subject: [PATCH 16/23] Update test name to reflect new endpoint --- .../indexing/overlord/supervisor/SupervisorResourceTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java index 00972d7c08fd..4c1ff299d98e 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java @@ -1380,7 +1380,7 @@ public void testResetOffsets() } @Test - public void testResetOffsetsAndBackfill() + public void testResetToLatestAndBackfill() { // 200 - success EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.of(supervisorManager)); 
From 5860750ac8578b8c0597cbe8eaeeafcd16b62373 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 28 May 2026 16:48:40 -0700 Subject: [PATCH 17/23] Address clean up from review comments --- .../supervisor/SupervisorManager.java | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index b76b586e18db..40a23955665c 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -394,11 +394,16 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ Preconditions.checkNotNull(id, "id"); Pair supervisorPair = supervisors.get(id); - validateResetAndBackfill(id, supervisorPair); + + if (!(supervisorPair.lhs instanceof SeekableStreamSupervisor)) { + throw new IAE("Supervisor[%s] is not a streaming supervisor", id); + } SeekableStreamSupervisor streamSupervisor = (SeekableStreamSupervisor) supervisorPair.lhs; SeekableStreamSupervisorSpec streamSpec = (SeekableStreamSupervisorSpec) supervisorPair.rhs; + validateResetAndBackfill(id, streamSupervisor, streamSpec); + log.info("Capturing latest offsets from stream for supervisor[%s]", id); streamSupervisor.updatePartitionLagFromStream(); Map endOffsets = streamSupervisor.getLatestSequencesFromStream(); @@ -448,14 +453,12 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ ); } - private void validateResetAndBackfill(String id, Pair supervisorPair) + private void validateResetAndBackfill( + String id, + SeekableStreamSupervisor streamSupervisor, + SeekableStreamSupervisorSpec streamSpec + ) { - if (!(supervisorPair.lhs instanceof SeekableStreamSupervisor)) { - throw new IAE("Supervisor[%s] is not a streaming 
supervisor", id); - } - SeekableStreamSupervisor streamSupervisor = (SeekableStreamSupervisor) supervisorPair.lhs; - SeekableStreamSupervisorSpec streamSpec = (SeekableStreamSupervisorSpec) supervisorPair.rhs; - if (streamSupervisor.getIoConfig().isUseEarliestSequenceNumber()) { throw new IAE("Reset with skipped offsets is not supported when useEarliestOffset is true."); } @@ -466,7 +469,7 @@ private void validateResetAndBackfill(String id, Pair context = spec.getContext(); if (context == null) { - return false; + return Tasks.DEFAULT_USE_CONCURRENT_LOCKS; } Boolean useConcurrentLocks = QueryContexts.getAsBoolean( Tasks.USE_CONCURRENT_LOCKS, From 26c10f004812f501cbdaa13ae7dbc0c7826c1916 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 28 May 2026 16:51:30 -0700 Subject: [PATCH 18/23] Log out start/end offsets --- .../indexing/overlord/supervisor/SupervisorManager.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index 40a23955665c..37502d1c09d5 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -431,7 +431,13 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ throw new ISE(e, "Failed to create backfill supervisor spec for supervisor[%s]", id); } - log.info("Started backfill supervisor[%s] for supervisor[%s]", backfillSupervisorId, id); + log.info( + "Started backfill supervisor[%s] for supervisor[%s] with startOffsets[%s] and endOffsets[%s]", + backfillSupervisorId, + id, + startOffsets, + endOffsets + ); log.info("Resetting supervisor[%s] metadata to latest offsets", id); DataSourceMetadata resetMetadata = 
streamSupervisor.createDataSourceMetaDataForReset( From 384cc6e762a4b504c4b592b6f65150086670810d Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 28 May 2026 18:30:27 -0700 Subject: [PATCH 19/23] Add abstract createBackfillSpec --- .../RabbitStreamSupervisorSpec.java | 50 +++++++++++++++++ .../kafka/supervisor/KafkaSupervisorSpec.java | 54 +++++++++++++++++++ .../supervisor/KinesisSupervisorSpec.java | 54 +++++++++++++++++++ .../supervisor/SupervisorManager.java | 22 +------- .../SeekableStreamSupervisorSpec.java | 6 +++ .../supervisor/SupervisorManagerTest.java | 40 ++++++-------- .../supervisor/SupervisorResourceTest.java | 11 ++++ .../SeekableStreamSupervisorSpecTest.java | 10 ++++ .../SeekableStreamSupervisorTestBase.java | 10 ++++ 9 files changed, 214 insertions(+), 43 deletions(-) diff --git a/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorSpec.java b/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorSpec.java index 4a445f6f1c11..899498d4b9b5 100644 --- a/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorSpec.java +++ b/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorSpec.java @@ -30,6 +30,7 @@ import org.apache.druid.indexing.overlord.supervisor.Supervisor; import org.apache.druid.indexing.overlord.supervisor.SupervisorStateManagerConfig; import org.apache.druid.indexing.rabbitstream.RabbitStreamIndexTaskClientFactory; +import org.apache.druid.indexing.seekablestream.supervisor.BoundedStreamConfig; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorSpec; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import 
org.apache.druid.java.util.metrics.DruidMonitorSchedulerConfig; @@ -155,6 +156,55 @@ protected RabbitStreamSupervisorSpec toggleSuspend(boolean suspend) supervisorStateManagerConfig); } + @Override + public RabbitStreamSupervisorSpec createBackfillSpec( + String backfillId, + BoundedStreamConfig boundedStreamConfig, + @Nullable Integer taskCount + ) + { + RabbitStreamSupervisorIOConfig ioConfig = getIoConfig(); + RabbitStreamSupervisorIOConfig backfillIoConfig = new RabbitStreamSupervisorIOConfig( + ioConfig.getStream(), + ioConfig.getUri(), + ioConfig.getInputFormat(), + ioConfig.getReplicas(), + taskCount != null ? taskCount : ioConfig.getTaskCount(), + ioConfig.getTaskDuration().toPeriod(), + ioConfig.getConsumerProperties(), + ioConfig.getAutoScalerConfig(), + ioConfig.getPollTimeout(), + ioConfig.getStartDelay().toPeriod(), + ioConfig.getPeriod().toPeriod(), + ioConfig.getCompletionTimeout().toPeriod(), + ioConfig.isUseEarliestSequenceNumber(), + ioConfig.getLateMessageRejectionPeriod().isPresent() ? ioConfig.getLateMessageRejectionPeriod().get().toPeriod() : null, + ioConfig.getEarlyMessageRejectionPeriod().isPresent() ? ioConfig.getEarlyMessageRejectionPeriod().get().toPeriod() : null, + ioConfig.getLateMessageRejectionStartDateTime().isPresent() ? 
ioConfig.getLateMessageRejectionStartDateTime().get() : null, + ioConfig.getStopTaskCount(), + ioConfig.getServerPriorityToReplicas(), + boundedStreamConfig + ); + return new RabbitStreamSupervisorSpec( + backfillId, + null, + getDataSchema(), + getTuningConfig(), + backfillIoConfig, + getContext(), + isSuspended(), + taskStorage, + taskMaster, + indexerMetadataStorageCoordinator, + (RabbitStreamIndexTaskClientFactory) indexTaskClientFactory, + mapper, + emitter, + monitorSchedulerConfig, + rowIngestionMetersFactory, + supervisorStateManagerConfig + ); + } + @Override public String toString() { diff --git a/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpec.java b/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpec.java index b607ade1acfe..d00f387b53ec 100644 --- a/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpec.java +++ b/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpec.java @@ -36,6 +36,7 @@ import org.apache.druid.indexing.overlord.supervisor.Supervisor; import org.apache.druid.indexing.overlord.supervisor.SupervisorSpec; import org.apache.druid.indexing.overlord.supervisor.SupervisorStateManagerConfig; +import org.apache.druid.indexing.seekablestream.supervisor.BoundedStreamConfig; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorSpec; import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.emitter.service.ServiceEmitter; @@ -173,6 +174,59 @@ protected KafkaSupervisorSpec toggleSuspend(boolean suspend) ); } + @Override + public KafkaSupervisorSpec createBackfillSpec( + String backfillId, + BoundedStreamConfig boundedStreamConfig, + @Nullable Integer taskCount + ) + { + KafkaSupervisorIOConfig ioConfig = getIoConfig(); + 
KafkaSupervisorIOConfig backfillIoConfig = new KafkaSupervisorIOConfig( + ioConfig.getTopic(), + ioConfig.getTopicPattern(), + ioConfig.getInputFormat(), + ioConfig.getReplicas(), + taskCount != null ? taskCount : ioConfig.getTaskCount(), + ioConfig.getTaskDuration().toPeriod(), + ioConfig.getConsumerProperties(), + ioConfig.getAutoScalerConfig(), + ioConfig.getLagAggregator(), + ioConfig.getPollTimeout(), + ioConfig.getStartDelay().toPeriod(), + ioConfig.getPeriod().toPeriod(), + ioConfig.isUseEarliestSequenceNumber(), + ioConfig.getCompletionTimeout().toPeriod(), + ioConfig.getLateMessageRejectionPeriod().isPresent() ? ioConfig.getLateMessageRejectionPeriod().get().toPeriod() : null, + ioConfig.getEarlyMessageRejectionPeriod().isPresent() ? ioConfig.getEarlyMessageRejectionPeriod().get().toPeriod() : null, + ioConfig.getLateMessageRejectionStartDateTime().isPresent() ? ioConfig.getLateMessageRejectionStartDateTime().get() : null, + ioConfig.getConfigOverrides(), + ioConfig.getIdleConfig(), + ioConfig.getStopTaskCount(), + ioConfig.isEmitTimeLagMetrics(), + ioConfig.getServerPriorityToReplicas(), + boundedStreamConfig + ); + return new KafkaSupervisorSpec( + backfillId, + null, + getDataSchema(), + getTuningConfig(), + backfillIoConfig, + getContext(), + isSuspended(), + taskStorage, + taskMaster, + indexerMetadataStorageCoordinator, + (KafkaIndexTaskClientFactory) indexTaskClientFactory, + mapper, + emitter, + monitorSchedulerConfig, + rowIngestionMetersFactory, + supervisorStateManagerConfig + ); + } + /** * Extends {@link SeekableStreamSupervisorSpec#validateSpecUpdateTo} to ensure that the proposed spec and current spec are either both multi-topic or both single-topic. *
<p>
diff --git a/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorSpec.java b/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorSpec.java index 8e6615716809..8c53cc881030 100644 --- a/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorSpec.java +++ b/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorSpec.java @@ -35,6 +35,7 @@ import org.apache.druid.indexing.overlord.TaskStorage; import org.apache.druid.indexing.overlord.supervisor.Supervisor; import org.apache.druid.indexing.overlord.supervisor.SupervisorStateManagerConfig; +import org.apache.druid.indexing.seekablestream.supervisor.BoundedStreamConfig; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorSpec; import org.apache.druid.java.util.emitter.service.ServiceEmitter; import org.apache.druid.java.util.metrics.DruidMonitorSchedulerConfig; @@ -193,4 +194,57 @@ protected KinesisSupervisorSpec toggleSuspend(boolean suspend) supervisorStateManagerConfig ); } + + @Override + public KinesisSupervisorSpec createBackfillSpec( + String backfillId, + BoundedStreamConfig boundedStreamConfig, + @Nullable Integer taskCount + ) + { + KinesisSupervisorIOConfig ioConfig = getIoConfig(); + KinesisSupervisorIOConfig backfillIoConfig = new KinesisSupervisorIOConfig( + ioConfig.getStream(), + ioConfig.getInputFormat(), + ioConfig.getEndpoint(), + null, + ioConfig.getReplicas(), + taskCount != null ? taskCount : ioConfig.getTaskCount(), + ioConfig.getTaskDuration().toPeriod(), + ioConfig.getStartDelay().toPeriod(), + ioConfig.getPeriod().toPeriod(), + ioConfig.isUseEarliestSequenceNumber(), + ioConfig.getCompletionTimeout().toPeriod(), + ioConfig.getLateMessageRejectionPeriod().isPresent() ? 
ioConfig.getLateMessageRejectionPeriod().get().toPeriod() : null, + ioConfig.getEarlyMessageRejectionPeriod().isPresent() ? ioConfig.getEarlyMessageRejectionPeriod().get().toPeriod() : null, + ioConfig.getLateMessageRejectionStartDateTime().isPresent() ? ioConfig.getLateMessageRejectionStartDateTime().get() : null, + ioConfig.getRecordsPerFetch(), + ioConfig.getFetchDelayMillis(), + ioConfig.getAwsAssumedRoleArn(), + ioConfig.getAwsExternalId(), + ioConfig.getAutoScalerConfig(), + ioConfig.isDeaggregate(), + ioConfig.getServerPriorityToReplicas(), + boundedStreamConfig + ); + return new KinesisSupervisorSpec( + backfillId, + null, + getDataSchema(), + getTuningConfig(), + backfillIoConfig, + getContext(), + isSuspended(), + taskStorage, + taskMaster, + indexerMetadataStorageCoordinator, + (KinesisIndexTaskClientFactory) indexTaskClientFactory, + mapper, + emitter, + monitorSchedulerConfig, + rowIngestionMetersFactory, + awsCredentialsConfig, + supervisorStateManagerConfig + ); + } } diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index 37502d1c09d5..e70c170de15d 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -21,7 +21,6 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; @@ -424,11 +423,11 @@ public Map resetSupervisorAndBackfill(String id, @Nullable Integ Map normalizedStartOffsets = jsonMapper.readValue(jsonMapper.writeValueAsString(startOffsets), Map.class); Map 
normalizedEndOffsets = jsonMapper.readValue(jsonMapper.writeValueAsString(endOffsets), Map.class); BoundedStreamConfig boundedStreamConfig = new BoundedStreamConfig(normalizedStartOffsets, normalizedEndOffsets); - SupervisorSpec backfillSpec = createBackfillSpec(streamSpec, backfillSupervisorId, boundedStreamConfig, backfillTaskCount); + SupervisorSpec backfillSpec = streamSpec.createBackfillSpec(backfillSupervisorId, boundedStreamConfig, backfillTaskCount); createOrUpdateAndStartSupervisor(backfillSpec); } catch (JsonProcessingException e) { - throw new ISE(e, "Failed to create backfill supervisor spec for supervisor[%s]", id); + throw new ISE(e, "Failed to serialize offsets for backfill supervisor[%s]", backfillSupervisorId); } log.info( @@ -480,23 +479,6 @@ private void validateResetAndBackfill( } } - SupervisorSpec createBackfillSpec( - SeekableStreamSupervisorSpec sourceSpec, - String backfillSupervisorId, - BoundedStreamConfig boundedStreamConfig, - @Nullable Integer backfillTaskCount - ) throws JsonProcessingException - { - ObjectNode specNode = jsonMapper.valueToTree(sourceSpec); - specNode.put("id", backfillSupervisorId); - ObjectNode ioConfigNode = (ObjectNode) specNode.path("spec").path("ioConfig"); - ioConfigNode.set("boundedStreamConfig", jsonMapper.valueToTree(boundedStreamConfig)); - if (backfillTaskCount != null) { - ioConfigNode.put("taskCount", backfillTaskCount); - } - return jsonMapper.treeToValue(specNode, SupervisorSpec.class); - } - public boolean checkPointDataSourceMetadata( String supervisorId, int taskGroupId, diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorSpec.java b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorSpec.java index 842f0de4774e..ecbd51757c37 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorSpec.java +++ 
b/indexing-service/src/main/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorSpec.java @@ -297,4 +297,10 @@ public void merge(@Nullable SupervisorSpec existingSpec) protected abstract SeekableStreamSupervisorSpec toggleSuspend(boolean suspend); + public abstract SeekableStreamSupervisorSpec createBackfillSpec( + String backfillId, + BoundedStreamConfig boundedStreamConfig, + @Nullable Integer taskCount + ); + } diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java index 5815952ea626..8867da2808e5 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java @@ -23,7 +23,6 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonTypeName; import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.jsontype.NamedType; import com.google.common.base.Optional; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -70,6 +69,7 @@ import org.junit.rules.ExpectedException; import org.junit.runner.RunWith; +import javax.annotation.Nullable; import java.lang.reflect.Field; import java.util.Collection; import java.util.Collections; @@ -1231,17 +1231,8 @@ public void reset(DataSourceMetadata dataSourceMetadata) } @Test - public void testCreateBackfillSpec() throws Exception + public void testCreateBackfillSpec() { - final ObjectMapper localMapper = new DefaultObjectMapper(); - localMapper.registerSubtypes( - new NamedType(TestBackfillSupervisorSpec.class, "testBackfill"), - new NamedType(TestBackfillSupervisorSpec.IngestionSpec.class, "testBackfillIngestionSpec"), - new 
NamedType(TestBackfillSupervisorSpec.IOConfig.class, "testBackfillIOConfig") - ); - - final SupervisorManager localManager = new SupervisorManager(localMapper, metadataSupervisorManager); - final TestBackfillSupervisorSpec.IOConfig ioConfig = new TestBackfillSupervisorSpec.IOConfig("test-stream", null, null); final TestBackfillSupervisorSpec.IngestionSpec ingestionSpec = new TestBackfillSupervisorSpec.IngestionSpec(ioConfig); final SeekableStreamSupervisorSpec sourceSpec = new TestBackfillSupervisorSpec("original-id", ingestionSpec); @@ -1252,12 +1243,7 @@ public void testCreateBackfillSpec() throws Exception ); // Without overriding taskCount - final SupervisorSpec backfillSpec = localManager.createBackfillSpec( - sourceSpec, - "backfill-id", - boundedStreamConfig, - null - ); + final SupervisorSpec backfillSpec = sourceSpec.createBackfillSpec("backfill-id", boundedStreamConfig, null); Assert.assertEquals("backfill-id", backfillSpec.getId()); final TestBackfillSupervisorSpec backfillCast = (TestBackfillSupervisorSpec) backfillSpec; final BoundedStreamConfig actualConfig = backfillCast.getIoConfig().getBoundedStreamConfig(); @@ -1267,12 +1253,7 @@ public void testCreateBackfillSpec() throws Exception Assert.assertEquals(1, backfillCast.getIoConfig().getTaskCount()); // With overriding taskCount - final SupervisorSpec backfillSpecWithCount = localManager.createBackfillSpec( - sourceSpec, - "backfill-id-2", - boundedStreamConfig, - 5 - ); + final SupervisorSpec backfillSpecWithCount = sourceSpec.createBackfillSpec("backfill-id-2", boundedStreamConfig, 5); Assert.assertEquals("backfill-id-2", backfillSpecWithCount.getId()); final TestBackfillSupervisorSpec backfillWithCount = (TestBackfillSupervisorSpec) backfillSpecWithCount; Assert.assertEquals(5, backfillWithCount.getIoConfig().getTaskCount()); @@ -1392,6 +1373,19 @@ protected SeekableStreamSupervisorSpec toggleSuspend(boolean suspend) return this; } + @Override + public SeekableStreamSupervisorSpec 
createBackfillSpec( + String backfillId, + BoundedStreamConfig boundedStreamConfig, + @Nullable Integer taskCount + ) + { + return new TestBackfillSupervisorSpec( + backfillId, + new IngestionSpec(new IOConfig(getIoConfig().getStream(), taskCount, boundedStreamConfig)) + ); + } + @Override public SeekableStreamSupervisorIOConfig getIoConfig() { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java index 4c1ff299d98e..61761f4e5ee8 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java @@ -34,6 +34,7 @@ import org.apache.druid.indexing.seekablestream.SeekableStreamIndexTaskClientFactory; import org.apache.druid.indexing.seekablestream.SeekableStreamStartSequenceNumbers; import org.apache.druid.indexing.seekablestream.TestSeekableStreamDataSourceMetadata; +import org.apache.druid.indexing.seekablestream.supervisor.BoundedStreamConfig; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorIOConfig; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorIngestionSpec; import org.apache.druid.indexing.seekablestream.supervisor.SeekableStreamSupervisorSpec; @@ -1762,6 +1763,16 @@ protected SeekableStreamSupervisorSpec toggleSuspend(boolean suspend) return null; } + @Override + public SeekableStreamSupervisorSpec createBackfillSpec( + String backfillId, + BoundedStreamConfig boundedStreamConfig, + @Nullable Integer taskCount + ) + { + return null; + } + @JsonIgnore @Nonnull @Override diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorSpecTest.java 
b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorSpecTest.java index 3d0c6426feb2..8182ae482ca6 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorSpecTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorSpecTest.java @@ -884,6 +884,16 @@ protected SeekableStreamSupervisorSpec toggleSuspend(boolean suspend) return null; } + @Override + public SeekableStreamSupervisorSpec createBackfillSpec( + String backfillId, + BoundedStreamConfig boundedStreamConfig, + @Nullable Integer taskCount + ) + { + return null; + } + @Override public String getType() { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorTestBase.java b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorTestBase.java index 0488670e1e48..c96a64211b97 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorTestBase.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/seekablestream/supervisor/SeekableStreamSupervisorTestBase.java @@ -436,6 +436,16 @@ protected SeekableStreamSupervisorSpec toggleSuspend(boolean suspend) { return null; } + + @Override + public SeekableStreamSupervisorSpec createBackfillSpec( + String backfillId, + BoundedStreamConfig boundedStreamConfig, + @Nullable Integer taskCount + ) + { + return null; + } } protected static SeekableStreamSupervisorTuningConfig getTuningConfig() From 99755dc61982b907dab178c4c0d75b61211c004b Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 28 May 2026 20:59:30 -0700 Subject: [PATCH 20/23] Unit test createBackfillSpec --- .../supervisor/KafkaSupervisorSpecTest.java | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git 
a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpecTest.java b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpecTest.java index 8879ff6d9753..0d127583f266 100644 --- a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpecTest.java +++ b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpecTest.java @@ -32,6 +32,7 @@ import org.apache.druid.indexing.overlord.TaskStorage; import org.apache.druid.indexing.overlord.supervisor.SupervisorSpec; import org.apache.druid.indexing.overlord.supervisor.SupervisorStateManagerConfig; +import org.apache.druid.indexing.seekablestream.supervisor.BoundedStreamConfig; import org.apache.druid.indexing.seekablestream.supervisor.LagAggregator; import org.apache.druid.indexing.seekablestream.supervisor.autoscaler.CostBasedAutoScalerConfig; import org.apache.druid.jackson.DefaultObjectMapper; @@ -564,6 +565,38 @@ public void test_validateSpecUpdateTo() sourceSpec.validateSpecUpdateTo(validDestSpec); } + @Test + public void testCreateBackfillSpec() + { + KafkaSupervisorSpec spec = new KafkaSupervisorSpecBuilder() + .withDataSchema( + schema -> schema + .withTimestamp(TimestampSpec.DEFAULT) + .withAggregators(new CountAggregatorFactory("rows")) + .withGranularity(new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, null)) + ) + .withIoConfig( + ioConfig -> ioConfig + .withJsonInputFormat() + .withConsumerProperties(Map.of("bootstrap.servers", "localhost:9092")) + .withTaskCount(3) + ) + .build("testDs", "metrics"); + + BoundedStreamConfig boundedStreamConfig = new BoundedStreamConfig( + Map.of("0", 100L, "1", 200L), + Map.of("0", 500L, "1", 600L) + ); + + KafkaSupervisorSpec backfill = (KafkaSupervisorSpec) spec.createBackfillSpec("backfill-id", boundedStreamConfig, 2); + + 
Assert.assertEquals("backfill-id", backfill.getId()); + Assert.assertEquals("testDs", backfill.getDataSchema().getDataSource()); + Assert.assertEquals("metrics", backfill.getIoConfig().getTopic()); + Assert.assertEquals(2, backfill.getIoConfig().getTaskCount()); + Assert.assertEquals(boundedStreamConfig, backfill.getIoConfig().getBoundedStreamConfig()); + } + private KafkaSupervisorSpec getSpec(String topic, String topicPattern) { KafkaSupervisorSpecBuilder builder = new KafkaSupervisorSpecBuilder() From e00f33e2852d4c2518005697b4aa6e1e750cf51d Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Thu, 28 May 2026 22:52:19 -0700 Subject: [PATCH 21/23] Fix deprecation notices --- .../supervisor/RabbitStreamSupervisorSpec.java | 6 +++--- .../indexing/kafka/supervisor/KafkaSupervisorSpec.java | 6 +++--- .../kafka/supervisor/KafkaSupervisorSpecTest.java | 8 ++++---- .../kinesis/supervisor/KinesisSupervisorSpec.java | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorSpec.java b/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorSpec.java index 899498d4b9b5..4763a949a615 100644 --- a/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorSpec.java +++ b/extensions-contrib/rabbit-stream-indexing-service/src/main/java/org/apache/druid/indexing/rabbitstream/supervisor/RabbitStreamSupervisorSpec.java @@ -163,7 +163,7 @@ public RabbitStreamSupervisorSpec createBackfillSpec( @Nullable Integer taskCount ) { - RabbitStreamSupervisorIOConfig ioConfig = getIoConfig(); + RabbitStreamSupervisorIOConfig ioConfig = getSpec().getIOConfig(); RabbitStreamSupervisorIOConfig backfillIoConfig = new RabbitStreamSupervisorIOConfig( ioConfig.getStream(), ioConfig.getUri(), @@ 
-188,8 +188,8 @@ public RabbitStreamSupervisorSpec createBackfillSpec( return new RabbitStreamSupervisorSpec( backfillId, null, - getDataSchema(), - getTuningConfig(), + getSpec().getDataSchema(), + getSpec().getTuningConfig(), backfillIoConfig, getContext(), isSuspended(), diff --git a/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpec.java b/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpec.java index d00f387b53ec..31d3e8fad691 100644 --- a/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpec.java +++ b/extensions-core/kafka-indexing-service/src/main/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpec.java @@ -181,7 +181,7 @@ public KafkaSupervisorSpec createBackfillSpec( @Nullable Integer taskCount ) { - KafkaSupervisorIOConfig ioConfig = getIoConfig(); + KafkaSupervisorIOConfig ioConfig = getSpec().getIOConfig(); KafkaSupervisorIOConfig backfillIoConfig = new KafkaSupervisorIOConfig( ioConfig.getTopic(), ioConfig.getTopicPattern(), @@ -210,8 +210,8 @@ public KafkaSupervisorSpec createBackfillSpec( return new KafkaSupervisorSpec( backfillId, null, - getDataSchema(), - getTuningConfig(), + getSpec().getDataSchema(), + getSpec().getTuningConfig(), backfillIoConfig, getContext(), isSuspended(), diff --git a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpecTest.java b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpecTest.java index 0d127583f266..06ca9b64ced5 100644 --- a/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpecTest.java +++ b/extensions-core/kafka-indexing-service/src/test/java/org/apache/druid/indexing/kafka/supervisor/KafkaSupervisorSpecTest.java @@ 
-591,10 +591,10 @@ public void testCreateBackfillSpec() KafkaSupervisorSpec backfill = (KafkaSupervisorSpec) spec.createBackfillSpec("backfill-id", boundedStreamConfig, 2); Assert.assertEquals("backfill-id", backfill.getId()); - Assert.assertEquals("testDs", backfill.getDataSchema().getDataSource()); - Assert.assertEquals("metrics", backfill.getIoConfig().getTopic()); - Assert.assertEquals(2, backfill.getIoConfig().getTaskCount()); - Assert.assertEquals(boundedStreamConfig, backfill.getIoConfig().getBoundedStreamConfig()); + Assert.assertEquals("testDs", backfill.getSpec().getDataSchema().getDataSource()); + Assert.assertEquals("metrics", backfill.getSpec().getIOConfig().getTopic()); + Assert.assertEquals(2, backfill.getSpec().getIOConfig().getTaskCount()); + Assert.assertEquals(boundedStreamConfig, backfill.getSpec().getIOConfig().getBoundedStreamConfig()); } private KafkaSupervisorSpec getSpec(String topic, String topicPattern) diff --git a/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorSpec.java b/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorSpec.java index 8c53cc881030..4899337797bf 100644 --- a/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorSpec.java +++ b/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/supervisor/KinesisSupervisorSpec.java @@ -202,7 +202,7 @@ public KinesisSupervisorSpec createBackfillSpec( @Nullable Integer taskCount ) { - KinesisSupervisorIOConfig ioConfig = getIoConfig(); + KinesisSupervisorIOConfig ioConfig = getSpec().getIOConfig(); KinesisSupervisorIOConfig backfillIoConfig = new KinesisSupervisorIOConfig( ioConfig.getStream(), ioConfig.getInputFormat(), @@ -230,8 +230,8 @@ public KinesisSupervisorSpec createBackfillSpec( return new KinesisSupervisorSpec( backfillId, null, - 
getDataSchema(), - getTuningConfig(), + getSpec().getDataSchema(), + getSpec().getTuningConfig(), backfillIoConfig, getContext(), isSuspended(), From 21a4ce7157d8f69d89fd9f4749fcf4eaffcf8618 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Fri, 29 May 2026 15:42:20 -0700 Subject: [PATCH 22/23] Rename functions to align with new endpoint name --- docs/api-reference/supervisor-api.md | 4 ++-- .../indexing/KafkaBoundedSupervisorTest.java | 5 +++-- .../supervisor/SupervisorManager.java | 2 +- .../supervisor/SupervisorResource.java | 6 +++--- .../supervisor/SupervisorManagerTest.java | 20 +++++++++---------- .../supervisor/SupervisorResourceTest.java | 6 +++--- .../MSQWorkerTaskLauncherRetryTest.java | 2 +- .../rpc/indexing/NoopOverlordClient.java | 2 +- .../druid/rpc/indexing/OverlordClient.java | 2 +- .../rpc/indexing/OverlordClientImpl.java | 2 +- .../testing/embedded/EmbeddedClusterApis.java | 4 ++-- 11 files changed, 28 insertions(+), 27 deletions(-) diff --git a/docs/api-reference/supervisor-api.md b/docs/api-reference/supervisor-api.md index dfddf31719f7..8f9c5c36dc5c 100644 --- a/docs/api-reference/supervisor-api.md +++ b/docs/api-reference/supervisor-api.md @@ -3539,7 +3539,7 @@ when the supervisor's tasks restart, they resume reading from `{"0": 100, "1": 1 ``` -### Reset offsets and start a backfill supervisor +### Reset offsets to latest and start a backfill supervisor This endpoint is supported for Apache Kafka and RabbitMQ Stream supervisors. Amazon Kinesis is not supported yet. @@ -3570,7 +3570,7 @@ The backfill supervisor has the same configuration as the source supervisor exce | Parameter | Type | Description | Default | |---------|---------|---------|---------| -| `backfillTaskCount` | Integer | Number of parallel tasks for the backfill supervisor. If not specified, inherits `taskCount` from the source supervisor. | None | +| `backfillTaskCount` | Integer | Number of parallel tasks for the backfill supervisor. 
| Defaults to `taskCount` from the source supervisor if not specified | #### Responses diff --git a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java index ce65e6263529..fa184418df52 100644 --- a/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java +++ b/embedded-tests/src/test/java/org/apache/druid/testing/embedded/indexing/KafkaBoundedSupervisorTest.java @@ -293,7 +293,7 @@ public void test_boundedSupervisor_doesNotSilentlyCompleteWhenStaleOffsetExceeds } @Test - public void test_resetSupervisorAndBackfill() + public void test_resetToLatestAndBackfill() { final String topic = IdUtils.getRandomId(); kafkaServer.createTopicWithPartitions(topic, 2); @@ -317,7 +317,8 @@ public void test_resetSupervisorAndBackfill() // Reset the main supervisor and spin up a backfill supervisor. // Since all records are already ingested before the call, the backfill // supervisor will complete immediately without ingesting anything. 
- final Map result = cluster.callApi().resetSupervisorAndBackfill(supervisor.getId()); + final Map result = cluster.callApi().resetToLatestAndBackfill(supervisor.getId()); + Assertions.assertEquals(supervisor.getId(), result.get("id")); final String backfillSupervisorId = (String) result.get("backfillSupervisorId"); // Wait for the backfill to finish diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index e70c170de15d..2f902884272d 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -387,7 +387,7 @@ public boolean resetSupervisor(String id, @Nullable DataSourceMetadata resetData * @throws IllegalStateException if the latest or checkpointed offsets cannot be retrieved, * or if the backfill spec cannot be serialized */ - public Map resetSupervisorAndBackfill(String id, @Nullable Integer backfillTaskCount) + public Map resetToLatestAndBackfill(String id, @Nullable Integer backfillTaskCount) { Preconditions.checkState(started, "SupervisorManager not started"); Preconditions.checkNotNull(id, "id"); diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java index 6116733c1950..8d0e04eb7988 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResource.java @@ -649,10 +649,10 @@ public Response resetToLatestAndBackfill( @QueryParam("backfillTaskCount") @Nullable final Integer backfillTaskCount ) { - return handleResetAndBackfill(id, 
backfillTaskCount); + return handleResetToLatestAndBackfill(id, backfillTaskCount); } - private Response handleResetAndBackfill(final String id, @Nullable final Integer backfillTaskCount) + private Response handleResetToLatestAndBackfill(final String id, @Nullable final Integer backfillTaskCount) { if (backfillTaskCount != null && backfillTaskCount < 1) { return Response.status(Response.Status.BAD_REQUEST) @@ -667,7 +667,7 @@ private Response handleResetAndBackfill(final String id, @Nullable final Integer .build(); } try { - Map result = manager.resetSupervisorAndBackfill(id, backfillTaskCount); + Map result = manager.resetToLatestAndBackfill(id, backfillTaskCount); return Response.ok(result).build(); } catch (IllegalArgumentException e) { diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java index 8867da2808e5..199e004b4243 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManagerTest.java @@ -1080,7 +1080,7 @@ public void test_isAnotherTaskGroupPublishingToPartitions() } @Test - public void testResetSupervisorAndBackfill() throws Exception + public void testResetToLatestAndBackfill() throws Exception { EasyMock.expect(metadataSupervisorManager.getLatest()).andReturn(ImmutableMap.of()); replayAll(); @@ -1125,7 +1125,7 @@ public void reset(DataSourceMetadata dataSourceMetadata) supervisorsMap.put("id1", Pair.of(nonStreamSupervisor, streamSpec)); Assert.assertThrows( IllegalArgumentException.class, - () -> manager.resetSupervisorAndBackfill("id1", null) + () -> manager.resetToLatestAndBackfill("id1", null) ); // useEarliestSequenceNumber=true → IAE @@ -1135,7 +1135,7 @@ public void reset(DataSourceMetadata dataSourceMetadata) 
EasyMock.replay(streamSupervisor, streamSpec, ioConfig); Assert.assertThrows( IllegalArgumentException.class, - () -> manager.resetSupervisorAndBackfill("id1", null) + () -> manager.resetToLatestAndBackfill("id1", null) ); EasyMock.reset(streamSupervisor, streamSpec, ioConfig); @@ -1146,7 +1146,7 @@ public void reset(DataSourceMetadata dataSourceMetadata) EasyMock.replay(streamSupervisor, streamSpec, ioConfig); Assert.assertThrows( IllegalArgumentException.class, - () -> manager.resetSupervisorAndBackfill("id1", null) + () -> manager.resetToLatestAndBackfill("id1", null) ); EasyMock.reset(streamSupervisor, streamSpec, ioConfig); @@ -1157,7 +1157,7 @@ public void reset(DataSourceMetadata dataSourceMetadata) EasyMock.replay(streamSupervisor, streamSpec, ioConfig); Assert.assertThrows( IllegalArgumentException.class, - () -> manager.resetSupervisorAndBackfill("id1", null) + () -> manager.resetToLatestAndBackfill("id1", null) ); EasyMock.reset(streamSupervisor, streamSpec, ioConfig); @@ -1169,7 +1169,7 @@ public void reset(DataSourceMetadata dataSourceMetadata) EasyMock.replay(streamSupervisor, streamSpec, ioConfig); Assert.assertThrows( IllegalArgumentException.class, - () -> manager.resetSupervisorAndBackfill("id1", null) + () -> manager.resetToLatestAndBackfill("id1", null) ); EasyMock.reset(streamSupervisor, streamSpec, ioConfig); @@ -1181,7 +1181,7 @@ public void reset(DataSourceMetadata dataSourceMetadata) EasyMock.replay(streamSupervisor, streamSpec, ioConfig); Assert.assertThrows( IllegalArgumentException.class, - () -> manager.resetSupervisorAndBackfill("id1", null) + () -> manager.resetToLatestAndBackfill("id1", null) ); EasyMock.reset(streamSupervisor, streamSpec, ioConfig); @@ -1193,7 +1193,7 @@ public void reset(DataSourceMetadata dataSourceMetadata) EasyMock.replay(streamSupervisor, streamSpec, ioConfig); Assert.assertThrows( IllegalArgumentException.class, - () -> manager.resetSupervisorAndBackfill("id1", null) + () -> 
manager.resetToLatestAndBackfill("id1", null) ); EasyMock.reset(streamSupervisor, streamSpec, ioConfig); @@ -1208,7 +1208,7 @@ public void reset(DataSourceMetadata dataSourceMetadata) EasyMock.replay(streamSupervisor, streamSpec, ioConfig); Assert.assertThrows( IllegalStateException.class, - () -> manager.resetSupervisorAndBackfill("id1", null) + () -> manager.resetToLatestAndBackfill("id1", null) ); EasyMock.reset(streamSupervisor, streamSpec, ioConfig); @@ -1224,7 +1224,7 @@ public void reset(DataSourceMetadata dataSourceMetadata) EasyMock.replay(streamSupervisor, streamSpec, ioConfig); Assert.assertThrows( IllegalStateException.class, - () -> manager.resetSupervisorAndBackfill("id1", null) + () -> manager.resetToLatestAndBackfill("id1", null) ); verifyAll(); diff --git a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java index 61761f4e5ee8..31e0d604a222 100644 --- a/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java +++ b/indexing-service/src/test/java/org/apache/druid/indexing/overlord/supervisor/SupervisorResourceTest.java @@ -1386,7 +1386,7 @@ public void testResetToLatestAndBackfill() // 200 - success EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.of(supervisorManager)); EasyMock.expect(supervisorManager.getSupervisorIds()).andReturn(ImmutableSet.of("my-id")); - EasyMock.expect(supervisorManager.resetSupervisorAndBackfill("my-id", null)) + EasyMock.expect(supervisorManager.resetToLatestAndBackfill("my-id", null)) .andReturn(ImmutableMap.of("id", "my-id", "backfillSupervisorId", "my-id_backfill_abcdefgh")); replayAll(); @@ -1412,7 +1412,7 @@ public void testResetToLatestAndBackfill() // 400 - IAE (e.g. 
supervisor not running) EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.of(supervisorManager)); EasyMock.expect(supervisorManager.getSupervisorIds()).andReturn(ImmutableSet.of("my-id")); - EasyMock.expect(supervisorManager.resetSupervisorAndBackfill("my-id", null)) + EasyMock.expect(supervisorManager.resetToLatestAndBackfill("my-id", null)) .andThrow(new IllegalArgumentException("Supervisor[my-id] must be in a RUNNING state")); replayAll(); @@ -1428,7 +1428,7 @@ public void testResetToLatestAndBackfill() // 500 - ISE (e.g. failed to retrieve offsets) EasyMock.expect(taskMaster.getSupervisorManager()).andReturn(Optional.of(supervisorManager)); EasyMock.expect(supervisorManager.getSupervisorIds()).andReturn(ImmutableSet.of("my-id")); - EasyMock.expect(supervisorManager.resetSupervisorAndBackfill("my-id", null)) + EasyMock.expect(supervisorManager.resetToLatestAndBackfill("my-id", null)) .andThrow(new IllegalStateException("Failed to get latest offsets from stream")); replayAll(); diff --git a/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncherRetryTest.java b/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncherRetryTest.java index 257aafa69388..35488be081b1 100644 --- a/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncherRetryTest.java +++ b/multi-stage-query/src/test/java/org/apache/druid/msq/indexing/MSQWorkerTaskLauncherRetryTest.java @@ -323,7 +323,7 @@ public ListenableFuture> terminateSupervisor(String supervis } @Override - public ListenableFuture> resetSupervisorAndBackfill(String supervisorId) + public ListenableFuture> resetToLatestAndBackfill(String supervisorId) { throw new UOE("Not implemented"); } diff --git a/server/src/main/java/org/apache/druid/rpc/indexing/NoopOverlordClient.java b/server/src/main/java/org/apache/druid/rpc/indexing/NoopOverlordClient.java index d14474491fc5..2b1ad6a555a7 100644 --- 
a/server/src/main/java/org/apache/druid/rpc/indexing/NoopOverlordClient.java +++ b/server/src/main/java/org/apache/druid/rpc/indexing/NoopOverlordClient.java @@ -115,7 +115,7 @@ public ListenableFuture<Map<String, String>> terminateSupervisor(String supervis } @Override - public ListenableFuture<Map<String, String>> resetSupervisorAndBackfill(String supervisorId) + public ListenableFuture<Map<String, String>> resetToLatestAndBackfill(String supervisorId) { throw new UnsupportedOperationException(); } diff --git a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java index d8bcd10550bb..baf7e4297c9d 100644 --- a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java +++ b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClient.java @@ -204,7 +204,7 @@ ListenableFuture> taskStatuses( * * @return Map containing "id" and "backfillSupervisorId" */ - ListenableFuture<Map<String, String>> resetSupervisorAndBackfill(String supervisorId); + ListenableFuture<Map<String, String>> resetToLatestAndBackfill(String supervisorId); /** * Returns all current supervisor statuses. 
diff --git a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java index a24fdc9c6f67..3657d8b83a6f 100644 --- a/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java +++ b/server/src/main/java/org/apache/druid/rpc/indexing/OverlordClientImpl.java @@ -266,7 +266,7 @@ public ListenableFuture<Map<String, String>> terminateSupervisor(String supervis } @Override - public ListenableFuture<Map<String, String>> resetSupervisorAndBackfill(String supervisorId) + public ListenableFuture<Map<String, String>> resetToLatestAndBackfill(String supervisorId) { final String path = StringUtils.format( "/druid/indexer/v1/supervisor/%s/resetToLatestAndBackfill", diff --git a/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java b/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java index 02ff65de67f9..6ae8750b8d8e 100644 --- a/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java +++ b/services/src/test/java/org/apache/druid/testing/embedded/EmbeddedClusterApis.java @@ -435,9 +435,9 @@ public String postSupervisor(SupervisorSpec supervisor) * * @return Map containing "id" and "backfillSupervisorId" */ - public Map<String, String> resetSupervisorAndBackfill(String supervisorId) + public Map<String, String> resetToLatestAndBackfill(String supervisorId) { - return onLeaderOverlord(o -> o.resetSupervisorAndBackfill(supervisorId)); + return onLeaderOverlord(o -> o.resetToLatestAndBackfill(supervisorId)); } /** From 1af4a5fad9ec9e425de6687595f6746014100c07 Mon Sep 17 00:00:00 2001 From: Andrew Ho Date: Fri, 29 May 2026 16:45:59 -0700 Subject: [PATCH 23/23] Add null check and rename for consistency --- .../overlord/supervisor/SupervisorManager.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java 
b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java index 2f902884272d..fa7d96634ae6 100644 --- a/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java +++ b/indexing-service/src/main/java/org/apache/druid/indexing/overlord/supervisor/SupervisorManager.java @@ -392,14 +392,18 @@ public Map<String, String> resetToLatestAndBackfill(String id, @Nullable Integer Preconditions.checkState(started, "SupervisorManager not started"); Preconditions.checkNotNull(id, "id"); - Pair<Supervisor, SupervisorSpec> supervisorPair = supervisors.get(id); + Pair<Supervisor, SupervisorSpec> supervisor = supervisors.get(id); + + if (supervisor == null) { + throw new IAE("Supervisor[%s] does not exist", id); + } - if (!(supervisorPair.lhs instanceof SeekableStreamSupervisor)) { + if (!(supervisor.lhs instanceof SeekableStreamSupervisor)) { throw new IAE("Supervisor[%s] is not a streaming supervisor", id); } - SeekableStreamSupervisor streamSupervisor = (SeekableStreamSupervisor) supervisorPair.lhs; - SeekableStreamSupervisorSpec streamSpec = (SeekableStreamSupervisorSpec) supervisorPair.rhs; + SeekableStreamSupervisor streamSupervisor = (SeekableStreamSupervisor) supervisor.lhs; + SeekableStreamSupervisorSpec streamSpec = (SeekableStreamSupervisorSpec) supervisor.rhs; validateResetAndBackfill(id, streamSupervisor, streamSpec);