diff --git a/storage/src/main/java/org/zstack/storage/volume/VolumeBase.java b/storage/src/main/java/org/zstack/storage/volume/VolumeBase.java index 95d7bb86060..4c40b7a95dc 100755 --- a/storage/src/main/java/org/zstack/storage/volume/VolumeBase.java +++ b/storage/src/main/java/org/zstack/storage/volume/VolumeBase.java @@ -46,6 +46,8 @@ import org.zstack.identity.AccountManager; import org.zstack.storage.primary.EstimateVolumeTemplateSizeOnPrimaryStorageMsg; import org.zstack.storage.primary.EstimateVolumeTemplateSizeOnPrimaryStorageReply; +import org.zstack.storage.primary.PrimaryStorageDeleteBitGC; +import org.zstack.storage.primary.PrimaryStorageGlobalConfig; import org.zstack.storage.snapshot.group.VolumeSnapshotGroupOperationValidator; import org.zstack.storage.snapshot.reference.VolumeSnapshotReferenceUtils; import org.zstack.tag.SystemTagCreator; @@ -59,6 +61,7 @@ import javax.persistence.TypedQuery; import java.util.*; +import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; import static org.zstack.core.Platform.*; @@ -3561,7 +3564,7 @@ public void run(MessageReply reply) { }); flow(new NoRollbackFlow() { - String __name__ = "delete-origin-volume-bits"; + String __name__ = "update-db-install-path"; @Override public boolean skip(Map data) { @@ -3570,24 +3573,18 @@ public boolean skip(Map data) { @Override public void run(FlowTrigger trigger, Map data) { - DeleteVolumeBitsOnPrimaryStorageMsg dmsg = new DeleteVolumeBitsOnPrimaryStorageMsg(); - dmsg.setPrimaryStorageUuid(self.getPrimaryStorageUuid()); - dmsg.setInstallPath(originVolumePath); - dmsg.setSize(self.getSize()); - dmsg.setBitsType(VolumeVO.class.getSimpleName()); - dmsg.setBitsUuid(self.getUuid()); - dmsg.setHypervisorType(VolumeFormat.getMasterHypervisorTypeByVolumeFormat(getSelfInventory().getFormat()).toString()); - dmsg.setFolder(false); - dmsg.setFromRecycle(true); - bus.makeTargetServiceIdByResourceUuid(dmsg, PrimaryStorageConstant.SERVICE_ID, self.getPrimaryStorageUuid()); - bus.send(dmsg, new CloudBusCallBack(trigger) { + MarkSnapshotAsVolumeMsg mmsg = new MarkSnapshotAsVolumeMsg(); + mmsg.setVolumeUuid(self.getUuid()); + mmsg.setSnapshotUuid(snapShot.getUuid()); + mmsg.setSize(size); + mmsg.setVolumePath(newVolumeInstallPath); + mmsg.setTreeUuid(snapShot.getTreeUuid()); + bus.makeTargetServiceIdByResourceUuid(mmsg, VolumeSnapshotConstant.SERVICE_ID, snapShot.getUuid()); + bus.send(mmsg, new CloudBusCallBack(trigger) { @Override public void run(MessageReply reply) { if (!reply.isSuccess()) { - if (reply.getError().isError(VolumeErrors.VOLUME_IN_USE)) { - logger.warn(String.format("unable to delete path:%s right now", originVolumePath)); - } - + logger.warn(String.format("mark snapshot:%s as volume failed", snapShot.getUuid())); trigger.fail(reply.getError()); return; } @@ -3599,7 +3596,7 @@ public void run(MessageReply reply) { }); flow(new NoRollbackFlow() { - String __name__ = "update-db-install-path"; + String __name__ = "delete-origin-volume-bits"; @Override public boolean skip(Map data) { @@ -3608,20 +3605,31 @@ public boolean skip(Map data) { @Override public void run(FlowTrigger trigger, Map data) { - MarkSnapshotAsVolumeMsg mmsg = new MarkSnapshotAsVolumeMsg(); - mmsg.setVolumeUuid(self.getUuid()); - mmsg.setSnapshotUuid(snapShot.getUuid()); - mmsg.setSize(size); - mmsg.setVolumePath(newVolumeInstallPath); - mmsg.setTreeUuid(snapShot.getTreeUuid()); - bus.makeTargetServiceIdByResourceUuid(mmsg, VolumeSnapshotConstant.SERVICE_ID, snapShot.getUuid()); - bus.send(mmsg, new CloudBusCallBack(trigger) { + DeleteVolumeBitsOnPrimaryStorageMsg dmsg = new DeleteVolumeBitsOnPrimaryStorageMsg(); + dmsg.setPrimaryStorageUuid(self.getPrimaryStorageUuid()); + dmsg.setInstallPath(originVolumePath); + dmsg.setSize(self.getSize()); + dmsg.setBitsType(VolumeVO.class.getSimpleName()); + dmsg.setBitsUuid(self.getUuid()); + dmsg.setHypervisorType(VolumeFormat.getMasterHypervisorTypeByVolumeFormat(getSelfInventory().getFormat()).toString()); + dmsg.setFolder(false); + dmsg.setFromRecycle(true); + bus.makeTargetServiceIdByResourceUuid(dmsg, PrimaryStorageConstant.SERVICE_ID, self.getPrimaryStorageUuid()); + bus.send(dmsg, new CloudBusCallBack(trigger) { @Override public void run(MessageReply reply) { if (!reply.isSuccess()) { - logger.warn(String.format("mark snapshot:%s as volume failed", snapShot.getUuid())); - trigger.fail(reply.getError()); - return; + if (reply.getError().isError(VolumeErrors.VOLUME_IN_USE)) { + logger.warn(String.format("unable to delete path:%s right now", originVolumePath)); + } + + PrimaryStorageDeleteBitGC gc = new PrimaryStorageDeleteBitGC(); + gc.NAME = String.format("gc-delete-bits-volume-%s-on-primary-storage-%s", self.getUuid(), self.getPrimaryStorageUuid()); + gc.primaryStorageInstallPath = originVolumePath; + gc.primaryStorageUuid = self.getPrimaryStorageUuid(); + gc.volume = self; + gc.submit(PrimaryStorageGlobalConfig.PRIMARY_STORAGE_DELETEBITS_GARBAGE_COLLECTOR_INTERVAL.value(Long.class), + TimeUnit.SECONDS); } trigger.next(); diff --git a/test/src/test/groovy/org/zstack/test/integration/storage/snapshot/UndoSnapshotCreationDeleteBitsFailureCase.groovy b/test/src/test/groovy/org/zstack/test/integration/storage/snapshot/UndoSnapshotCreationDeleteBitsFailureCase.groovy new file mode 100644 index 00000000000..d14926555b4 --- /dev/null +++ b/test/src/test/groovy/org/zstack/test/integration/storage/snapshot/UndoSnapshotCreationDeleteBitsFailureCase.groovy @@ -0,0 +1,133 @@ +package org.zstack.test.integration.storage.snapshot + +import org.zstack.core.db.Q +import org.zstack.header.volume.VolumeVO +import org.zstack.header.volume.VolumeVO_ +import org.zstack.sdk.GarbageCollectorInventory +import org.zstack.sdk.VmInstanceInventory +import org.zstack.sdk.VolumeInventory +import org.zstack.sdk.VolumeSnapshotInventory +import org.zstack.storage.primary.local.LocalStorageKvmBackend +import org.zstack.test.integration.storage.Env +import org.zstack.test.integration.storage.StorageTest +import org.zstack.testlib.EnvSpec +import org.zstack.testlib.SubCase + +/** + * Cover ZSTAC-76704 / TIC-5745: + * + * APIUndoSnapshotCreationMsg: after blockCommit succeeds on the data plane, + * a failure when deleting the origin volume bits must NOT roll back the + * control plane. Before the fix, MarkSnapshotAsVolume ran AFTER the + * delete-origin-volume-bits flow, so a delete failure aborted the undo and + * left the management-plane install path pointing at the origin path that + * had already been committed away on the data plane (control/data plane + * inconsistency). + * + * The fix re-orders the flows so MarkSnapshotAsVolume (update-db-install-path) + * runs BEFORE delete-origin-volume-bits, and a delete failure only submits a + * PrimaryStorageDeleteBitGC job instead of failing the whole undo. + * + * This case injects a delete-bits failure during undoSnapshotCreation and + * verifies: + * 1. the undo API still succeeds despite the delete-bits failure; + * 2. the volume install path is updated to the new (committed) path; + * 3. MarkSnapshotAsVolume ran BEFORE the delete-bits call (ordering); + * 4. the failed delete only submits a PrimaryStorageDeleteBitGC job. + * + * Note: the delete-bits failure is injected as a VOLUME_IN_USE error rather + * than a raw HTTP error, because local storage swallows raw transport errors + * with its own internal GC and still replies success - a VOLUME_IN_USE reply + * is the way to make DeleteVolumeBitsOnPrimaryStorageMsg actually fail back + * to the undo flow, which is the path the fix changes. + */ +class UndoSnapshotCreationDeleteBitsFailureCase extends SubCase { + EnvSpec env + VmInstanceInventory vm + + @Override + void clean() { + env.delete() + } + + @Override + void setup() { + useSpring(StorageTest.springSpec) + } + + @Override + void environment() { + env = Env.localStorageOneVmEnv() + } + + @Override + void test() { + env.create { + vm = env.inventoryByName("vm") as VmInstanceInventory + testDeleteBitsFailureStillUndoSnapshot() + } + } + + void testDeleteBitsFailureStillUndoSnapshot() { + VolumeSnapshotInventory snapshot = createVolumeSnapshot { + volumeUuid = vm.rootVolumeUuid + name = "snapshot-for-undo" + } as VolumeSnapshotInventory + + VolumeInventory rootBefore = queryVolume { + conditions = ["uuid=${vm.rootVolumeUuid}"] + }[0] as VolumeInventory + String originPath = rootBefore.installPath + assert originPath != snapshot.primaryStorageInstallPath + + // install path observed inside the delete-bits simulator: proves + // whether MarkSnapshotAsVolume already ran by the time delete-bits + // is reached (ordering assertion). + String installPathWhenDeleteHit = null + env.afterSimulator(LocalStorageKvmBackend.DELETE_BITS_PATH) { LocalStorageKvmBackend.DeleteBitsRsp rsp -> + installPathWhenDeleteHit = Q.New(VolumeVO.class) + .select(VolumeVO_.installPath) + .eq(VolumeVO_.uuid, vm.rootVolumeUuid) + .findValue() + // make DeleteVolumeBitsOnPrimaryStorageMsg fail back to the undo + // flow (local storage does not run its internal GC for in-use). + rsp.setError("volume in use - on purpose - ZSTAC-76704") + rsp.inUse = true + return rsp + } + + // before the fix the failed delete-bits would fail the whole undo; + // after the fix this API must still succeed. + undoSnapshotCreation { + uuid = vm.rootVolumeUuid + snapShotUuid = snapshot.uuid + } + + env.cleanSimulatorAndMessageHandlers() + + VolumeInventory rootAfter = queryVolume { + conditions = ["uuid=${vm.rootVolumeUuid}"] + }[0] as VolumeInventory + + // 1. control plane committed: volume install path updated to the new path + assert rootAfter.installPath != originPath + assert rootAfter.installPath == snapshot.primaryStorageInstallPath + + // 2. ordering: MarkSnapshotAsVolume ran BEFORE delete-origin-volume-bits, + // so the DB install path was already the new path when delete was hit. + assert installPathWhenDeleteHit != null + assert installPathWhenDeleteHit == snapshot.primaryStorageInstallPath + + // 3. the failed delete only submits a PrimaryStorageDeleteBitGC job; + // the undo itself still succeeds. + retryInSecs { + def gcs = queryGCJob { + conditions = ["context~=%${vm.rootVolumeUuid}%".toString()] + } as List + GarbageCollectorInventory gc = gcs.find { + it.name.contains("gc-delete-bits-volume-${vm.rootVolumeUuid}") + } as GarbageCollectorInventory + assert gc != null : "expected PrimaryStorageDeleteBitGC submitted, got: ${gcs.collect { it.name }}" + } + } +}