Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 36 additions & 28 deletions storage/src/main/java/org/zstack/storage/volume/VolumeBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
import org.zstack.identity.AccountManager;
import org.zstack.storage.primary.EstimateVolumeTemplateSizeOnPrimaryStorageMsg;
import org.zstack.storage.primary.EstimateVolumeTemplateSizeOnPrimaryStorageReply;
import org.zstack.storage.primary.PrimaryStorageDeleteBitGC;
import org.zstack.storage.primary.PrimaryStorageGlobalConfig;
import org.zstack.storage.snapshot.group.VolumeSnapshotGroupOperationValidator;
import org.zstack.storage.snapshot.reference.VolumeSnapshotReferenceUtils;
import org.zstack.tag.SystemTagCreator;
Expand All @@ -59,6 +61,7 @@

import javax.persistence.TypedQuery;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

import static org.zstack.core.Platform.*;
Expand Down Expand Up @@ -3561,7 +3564,7 @@ public void run(MessageReply reply) {
});

flow(new NoRollbackFlow() {
String __name__ = "delete-origin-volume-bits";
String __name__ = "update-db-install-path";

@Override
public boolean skip(Map data) {
Expand All @@ -3570,24 +3573,18 @@ public boolean skip(Map data) {

@Override
public void run(FlowTrigger trigger, Map data) {
DeleteVolumeBitsOnPrimaryStorageMsg dmsg = new DeleteVolumeBitsOnPrimaryStorageMsg();
dmsg.setPrimaryStorageUuid(self.getPrimaryStorageUuid());
dmsg.setInstallPath(originVolumePath);
dmsg.setSize(self.getSize());
dmsg.setBitsType(VolumeVO.class.getSimpleName());
dmsg.setBitsUuid(self.getUuid());
dmsg.setHypervisorType(VolumeFormat.getMasterHypervisorTypeByVolumeFormat(getSelfInventory().getFormat()).toString());
dmsg.setFolder(false);
dmsg.setFromRecycle(true);
bus.makeTargetServiceIdByResourceUuid(dmsg, PrimaryStorageConstant.SERVICE_ID, self.getPrimaryStorageUuid());
bus.send(dmsg, new CloudBusCallBack(trigger) {
MarkSnapshotAsVolumeMsg mmsg = new MarkSnapshotAsVolumeMsg();
mmsg.setVolumeUuid(self.getUuid());
mmsg.setSnapshotUuid(snapShot.getUuid());
mmsg.setSize(size);
mmsg.setVolumePath(newVolumeInstallPath);
mmsg.setTreeUuid(snapShot.getTreeUuid());
bus.makeTargetServiceIdByResourceUuid(mmsg, VolumeSnapshotConstant.SERVICE_ID, snapShot.getUuid());
bus.send(mmsg, new CloudBusCallBack(trigger) {
@Override
public void run(MessageReply reply) {
if (!reply.isSuccess()) {
if (reply.getError().isError(VolumeErrors.VOLUME_IN_USE)) {
logger.warn(String.format("unable to delete path:%s right now", originVolumePath));
}

logger.warn(String.format("mark snapshot:%s as volume failed", snapShot.getUuid()));
trigger.fail(reply.getError());
return;
}
Expand All @@ -3599,7 +3596,7 @@ public void run(MessageReply reply) {
});

flow(new NoRollbackFlow() {
String __name__ = "update-db-install-path";
String __name__ = "delete-origin-volume-bits";

@Override
public boolean skip(Map data) {
Expand All @@ -3608,20 +3605,31 @@ public boolean skip(Map data) {

@Override
public void run(FlowTrigger trigger, Map data) {
MarkSnapshotAsVolumeMsg mmsg = new MarkSnapshotAsVolumeMsg();
mmsg.setVolumeUuid(self.getUuid());
mmsg.setSnapshotUuid(snapShot.getUuid());
mmsg.setSize(size);
mmsg.setVolumePath(newVolumeInstallPath);
mmsg.setTreeUuid(snapShot.getTreeUuid());
bus.makeTargetServiceIdByResourceUuid(mmsg, VolumeSnapshotConstant.SERVICE_ID, snapShot.getUuid());
bus.send(mmsg, new CloudBusCallBack(trigger) {
DeleteVolumeBitsOnPrimaryStorageMsg dmsg = new DeleteVolumeBitsOnPrimaryStorageMsg();
dmsg.setPrimaryStorageUuid(self.getPrimaryStorageUuid());
dmsg.setInstallPath(originVolumePath);
dmsg.setSize(self.getSize());
dmsg.setBitsType(VolumeVO.class.getSimpleName());
dmsg.setBitsUuid(self.getUuid());
dmsg.setHypervisorType(VolumeFormat.getMasterHypervisorTypeByVolumeFormat(getSelfInventory().getFormat()).toString());
dmsg.setFolder(false);
dmsg.setFromRecycle(true);
bus.makeTargetServiceIdByResourceUuid(dmsg, PrimaryStorageConstant.SERVICE_ID, self.getPrimaryStorageUuid());
bus.send(dmsg, new CloudBusCallBack(trigger) {
@Override
public void run(MessageReply reply) {
if (!reply.isSuccess()) {
logger.warn(String.format("mark snapshot:%s as volume failed", snapShot.getUuid()));
trigger.fail(reply.getError());
return;
if (reply.getError().isError(VolumeErrors.VOLUME_IN_USE)) {
logger.warn(String.format("unable to delete path:%s right now", originVolumePath));
}

PrimaryStorageDeleteBitGC gc = new PrimaryStorageDeleteBitGC();
gc.NAME = String.format("gc-delete-bits-volume-%s-on-primary-storage-%s", self.getUuid(), self.getPrimaryStorageUuid());
gc.primaryStorageInstallPath = originVolumePath;
gc.primaryStorageUuid = self.getPrimaryStorageUuid();
gc.volume = self;
gc.submit(PrimaryStorageGlobalConfig.PRIMARY_STORAGE_DELETEBITS_GARBAGE_COLLECTOR_INTERVAL.value(Long.class),
TimeUnit.SECONDS);
}

trigger.next();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
package org.zstack.test.integration.storage.snapshot

import org.zstack.core.db.Q
import org.zstack.header.volume.VolumeVO
import org.zstack.header.volume.VolumeVO_
import org.zstack.sdk.GarbageCollectorInventory
import org.zstack.sdk.VmInstanceInventory
import org.zstack.sdk.VolumeInventory
import org.zstack.sdk.VolumeSnapshotInventory
import org.zstack.storage.primary.local.LocalStorageKvmBackend
import org.zstack.test.integration.storage.Env
import org.zstack.test.integration.storage.StorageTest
import org.zstack.testlib.EnvSpec
import org.zstack.testlib.SubCase

/**
* Cover ZSTAC-76704 / TIC-5745:
*
* APIUndoSnapshotCreationMsg: after blockCommit succeeds on the data plane,
* a failure when deleting the origin volume bits must NOT roll back the
* control plane. Before the fix, MarkSnapshotAsVolume ran AFTER the
* delete-origin-volume-bits flow, so a delete failure aborted the undo and
* left the management-plane install path pointing at the origin path that
* had already been committed away on the data plane (control/data plane
* inconsistency).
*
* The fix re-orders the flows so MarkSnapshotAsVolume (update-db-install-path)
* runs BEFORE delete-origin-volume-bits, and a delete failure only submits a
* PrimaryStorageDeleteBitGC job instead of failing the whole undo.
*
* This case injects a delete-bits failure during undoSnapshotCreation and
* verifies:
* 1. the undo API still succeeds despite the delete-bits failure;
* 2. the volume install path is updated to the new (committed) path;
* 3. MarkSnapshotAsVolume ran BEFORE the delete-bits call (ordering);
* 4. the failed delete only submits a PrimaryStorageDeleteBitGC job.
*
* Note: the delete-bits failure is injected as a VOLUME_IN_USE error rather
* than a raw HTTP error, because local storage swallows raw transport errors
* with its own internal GC and still replies success - a VOLUME_IN_USE reply
* is the way to make DeleteVolumeBitsOnPrimaryStorageMsg actually fail back
* to the undo flow, which is the path the fix changes.
*/
class UndoSnapshotCreationDeleteBitsFailureCase extends SubCase {
EnvSpec env
VmInstanceInventory vm

@Override
void clean() {
env.delete()
}

@Override
void setup() {
useSpring(StorageTest.springSpec)
}

@Override
void environment() {
env = Env.localStorageOneVmEnv()
}

@Override
void test() {
env.create {
vm = env.inventoryByName("vm") as VmInstanceInventory
testDeleteBitsFailureStillUndoSnapshot()
}
}

void testDeleteBitsFailureStillUndoSnapshot() {
VolumeSnapshotInventory snapshot = createVolumeSnapshot {
volumeUuid = vm.rootVolumeUuid
name = "snapshot-for-undo"
} as VolumeSnapshotInventory

VolumeInventory rootBefore = queryVolume {
conditions = ["uuid=${vm.rootVolumeUuid}"]
}[0] as VolumeInventory
String originPath = rootBefore.installPath
assert originPath != snapshot.primaryStorageInstallPath

// install path observed inside the delete-bits simulator: proves
// whether MarkSnapshotAsVolume already ran by the time delete-bits
// is reached (ordering assertion).
String installPathWhenDeleteHit = null
env.afterSimulator(LocalStorageKvmBackend.DELETE_BITS_PATH) { LocalStorageKvmBackend.DeleteBitsRsp rsp ->
installPathWhenDeleteHit = Q.New(VolumeVO.class)
.select(VolumeVO_.installPath)
.eq(VolumeVO_.uuid, vm.rootVolumeUuid)
.findValue()
// make DeleteVolumeBitsOnPrimaryStorageMsg fail back to the undo
// flow (local storage does not run its internal GC for in-use).
rsp.setError("volume in use - on purpose - ZSTAC-76704")
rsp.inUse = true
return rsp
}

// before the fix the failed delete-bits would fail the whole undo;
// after the fix this API must still succeed.
undoSnapshotCreation {
uuid = vm.rootVolumeUuid
snapShotUuid = snapshot.uuid
}

env.cleanSimulatorAndMessageHandlers()

VolumeInventory rootAfter = queryVolume {
conditions = ["uuid=${vm.rootVolumeUuid}"]
}[0] as VolumeInventory

// 1. control plane committed: volume install path updated to the new path
assert rootAfter.installPath != originPath
assert rootAfter.installPath == snapshot.primaryStorageInstallPath

// 2. ordering: MarkSnapshotAsVolume ran BEFORE delete-origin-volume-bits,
// so the DB install path was already the new path when delete was hit.
assert installPathWhenDeleteHit != null
assert installPathWhenDeleteHit == snapshot.primaryStorageInstallPath

// 3. the failed delete only submits a PrimaryStorageDeleteBitGC job;
// the undo itself still succeeds.
retryInSecs {
def gcs = queryGCJob {
conditions = ["context~=%${vm.rootVolumeUuid}%".toString()]
} as List<GarbageCollectorInventory>
GarbageCollectorInventory gc = gcs.find {
it.name.contains("gc-delete-bits-volume-${vm.rootVolumeUuid}")
} as GarbageCollectorInventory
assert gc != null : "expected PrimaryStorageDeleteBitGC submitted, got: ${gcs.collect { it.name }}"
}
}
}