From 6736c47b3325efb0fc85f9ca95f75055e57df963 Mon Sep 17 00:00:00 2001 From: "shan.wu" Date: Thu, 14 May 2026 23:23:52 +0800 Subject: [PATCH] [vm]: verify dest state in migrate flow Check the destination host when the hypervisor migration call fails. If the VM is Running there, continue the normal migration flow. That lets DB sync and post hooks run through the standard path. Otherwise fail the flow and keep the rollback behavior. Resolves: ZSTAC-83894 Change-Id: I8b4774a405fc3b1c05d21b6742facd26bc8d03e6 --- .../compute/vm/VmMigrateOnHypervisorFlow.java | 51 ++++- .../kvm/vm/VmLastHostUuidCase.groovy | 4 +- ...MigrateVmFailureCheckTargetHostCase.groovy | 213 ++++++++++++++++++ .../MaintainHostMultiTypePsCase.groovy | 12 + 4 files changed, 277 insertions(+), 3 deletions(-) create mode 100644 test/src/test/groovy/org/zstack/test/integration/kvm/vm/migrate/MigrateVmFailureCheckTargetHostCase.groovy diff --git a/compute/src/main/java/org/zstack/compute/vm/VmMigrateOnHypervisorFlow.java b/compute/src/main/java/org/zstack/compute/vm/VmMigrateOnHypervisorFlow.java index 24529a31df3..d11226c86d9 100755 --- a/compute/src/main/java/org/zstack/compute/vm/VmMigrateOnHypervisorFlow.java +++ b/compute/src/main/java/org/zstack/compute/vm/VmMigrateOnHypervisorFlow.java @@ -10,7 +10,10 @@ import org.zstack.header.core.workflow.FlowRollback; import org.zstack.header.core.workflow.FlowTrigger; import org.zstack.header.errorcode.ErrorCode; +import org.zstack.header.host.CheckVmStateOnHypervisorMsg; +import org.zstack.header.host.CheckVmStateOnHypervisorReply; import org.zstack.header.host.HostConstant; +import org.zstack.header.host.HostErrors; import org.zstack.header.message.MessageReply; import org.zstack.header.host.MigrateVmOnHypervisorMsg; import org.zstack.header.vm.*; @@ -18,6 +21,8 @@ import java.util.Map; +import static org.zstack.utils.CollectionDSL.list; + @Configurable(preConstruction = true, autowire = Autowire.BY_TYPE) public class VmMigrateOnHypervisorFlow implements Flow { @Autowired @@ -60,8 +65,52 @@ public void run(MessageReply reply) { if (reply.isSuccess()) { chain.next(); } else { - chain.fail(reply.getError()); + ErrorCode canceledError = LongJobUtils.buildErrIfCanceled(); + if (canceledError != null) { + chain.fail(canceledError); + return; + } + + ErrorCode error = reply.getError(); + if (HostErrors.FAILED_TO_MIGRATE_VM_ON_HYPERVISOR.isEqual(error.getCode())) { + checkVmStateOnDestinationHost(spec, error, chain); + return; + } + + chain.fail(error); + } + } + }); + } + + private void checkVmStateOnDestinationHost(final VmInstanceSpec spec, final ErrorCode migrateError, + final FlowTrigger chain) { + CheckVmStateOnHypervisorMsg msg = new CheckVmStateOnHypervisorMsg(); + msg.setVmInstanceUuids(list(spec.getVmInventory().getUuid())); + msg.setHostUuid(spec.getDestHost().getUuid()); + bus.makeTargetServiceIdByResourceUuid(msg, HostConstant.SERVICE_ID, msg.getHostUuid()); + bus.send(msg, new CloudBusCallBack(chain) { + @Override + public void run(MessageReply reply) { + if (!reply.isSuccess()) { + chain.fail(migrateError); + return; + } + + CheckVmStateOnHypervisorReply r = reply.castReply(); + Map states = r.getStates(); + if (states == null || states.isEmpty()) { + chain.fail(migrateError); + return; } + + String state = states.get(spec.getVmInventory().getUuid()); + if (VmInstanceState.Running.toString().equals(state)) { + chain.next(); + return; + } + + chain.fail(migrateError); } }); } diff --git a/test/src/test/groovy/org/zstack/test/integration/kvm/vm/VmLastHostUuidCase.groovy b/test/src/test/groovy/org/zstack/test/integration/kvm/vm/VmLastHostUuidCase.groovy index 6016c206264..aa05f217bff 100644 --- a/test/src/test/groovy/org/zstack/test/integration/kvm/vm/VmLastHostUuidCase.groovy +++ b/test/src/test/groovy/org/zstack/test/integration/kvm/vm/VmLastHostUuidCase.groovy @@ -103,7 +103,7 @@ class VmLastHostUuidCase extends SubCase{ env.message(CheckVmStateOnHypervisorMsg.class) { CheckVmStateOnHypervisorMsg msg, CloudBus bus -> def reply = new CheckVmStateOnHypervisorReply() def list = new HashMap() - list.put(vm.uuid, VmInstanceState.Running.toString()) + list.put(vm.uuid, expect ? VmInstanceState.Running.toString() : VmInstanceState.Stopped.toString()) reply.setStates(list) reply.success = true bus.reply(msg, reply) @@ -125,7 +125,7 @@ class VmLastHostUuidCase extends SubCase{ assert afterDstHostMem == originDstHostMem - vmMemInGB assert afterSrcHostMem == originSrcHostMem + vmMemInGB }else { - assert a.call().error != null + assert ret.error != null assert afterDstHostMem == originDstHostMem assert afterSrcHostMem == originSrcHostMem } diff --git a/test/src/test/groovy/org/zstack/test/integration/kvm/vm/migrate/MigrateVmFailureCheckTargetHostCase.groovy b/test/src/test/groovy/org/zstack/test/integration/kvm/vm/migrate/MigrateVmFailureCheckTargetHostCase.groovy new file mode 100644 index 00000000000..711fd5bb96b --- /dev/null +++ b/test/src/test/groovy/org/zstack/test/integration/kvm/vm/migrate/MigrateVmFailureCheckTargetHostCase.groovy @@ -0,0 +1,213 @@ +package org.zstack.test.integration.kvm.vm.migrate + +import org.zstack.core.cloudbus.CloudBus +import org.zstack.header.host.CheckVmStateOnHypervisorMsg +import org.zstack.header.host.CheckVmStateOnHypervisorReply +import org.zstack.header.network.service.NetworkServiceType +import org.zstack.header.vm.VmInstanceState +import org.zstack.kvm.KVMAgentCommands +import org.zstack.kvm.KVMConstant +import org.zstack.network.securitygroup.SecurityGroupConstant +import org.zstack.network.service.flat.FlatNetworkServiceConstant +import org.zstack.network.service.userdata.UserdataConstant +import org.zstack.sdk.HostInventory +import org.zstack.sdk.MigrateVmAction +import org.zstack.sdk.VmInstanceInventory +import org.zstack.test.integration.kvm.KvmTest +import org.zstack.testlib.EnvSpec +import org.zstack.testlib.SubCase +import org.zstack.utils.data.SizeUnit + +class MigrateVmFailureCheckTargetHostCase extends SubCase { + EnvSpec env + + @Override + void setup() { + useSpring(KvmTest.springSpec) + } + + @Override + void environment() { + env = env { + instanceOffering { + name = "instanceOffering" + memory = SizeUnit.GIGABYTE.toByte(1) + cpu = 1 + } + + cephBackupStorage { + name = "ceph-bk" + fsid = "7ff218d9-f525-435f-8a40-3618d1772a64" + monUrls = ["root:password@localhost:23", "root:password@127.0.0.1:23"] + + image { + name = "image1" + url = "http://zstack.org/download/test.qcow2" + } + } + + zone { + name = "zone" + + cluster { + name = "cluster" + hypervisorType = "KVM" + + kvm { + name = "kvm1" + managementIp = "127.0.0.1" + username = "root" + password = "password" + } + + kvm { + name = "kvm2" + managementIp = "127.0.0.2" + username = "root" + password = "password" + } + + attachPrimaryStorage("ceph-pri") + attachL2Network("l2") + } + + cephPrimaryStorage { + name = "ceph-pri" + fsid = "7ff218d9-f525-435f-8a40-3618d1772a64" + monUrls = ["root:password@localhost/?monPort=7777", "root:password@127.0.0.1/?monPort=7777"] + } + + l2NoVlanNetwork { + name = "l2" + physicalInterface = "eth0" + + l3Network { + name = "l3" + + service { + provider = FlatNetworkServiceConstant.FLAT_NETWORK_SERVICE_TYPE_STRING + types = [NetworkServiceType.DHCP.toString(), UserdataConstant.USERDATA_TYPE_STRING] + } + + service { + provider = SecurityGroupConstant.SECURITY_GROUP_PROVIDER_TYPE + types = [SecurityGroupConstant.SECURITY_GROUP_NETWORK_SERVICE_TYPE] + } + + ip { + startIp = "192.168.100.10" + endIp = "192.168.100.100" + netmask = "255.255.255.0" + gateway = "192.168.100.1" + } + } + } + + attachBackupStorage("ceph-bk") + } + + vm { + name = "vm" + useInstanceOffering("instanceOffering") + useImage("image1") + useL3Networks("l3") + } + } + } + + @Override + void test() { + env.create { + testRollbackWhenTargetHostReportsVmNotRunning() + testMigrationSuccessWhenTargetHostReportsVmRunning() + } + } + + @Override + void clean() { + env.delete() + } + + void testRollbackWhenTargetHostReportsVmNotRunning() { + VmInstanceInventory vm = env.inventoryByName("vm") as VmInstanceInventory + HostInventory destHost = findAnotherHost(vm.hostUuid) + + assertRollbackWhenTargetReports(vm, destHost, VmInstanceState.Stopped.toString()) + assertRollbackWhenTargetReports(vm, destHost, VmInstanceState.Paused.toString()) + } + + void assertRollbackWhenTargetReports(VmInstanceInventory vm, HostInventory destHost, String targetHostState) { + List checkedHosts = [] + + mockMigrateVmFailure() + mockVmState(vm.uuid, destHost.uuid, targetHostState, checkedHosts) + + MigrateVmAction.Result result = migrateVmAction(vm.uuid, destHost.uuid).call() + + assert result.error != null + VmInstanceInventory after = queryVmInstance { + conditions = ["uuid=${vm.uuid}".toString()] + }[0] as VmInstanceInventory + assert after.hostUuid == vm.hostUuid + assert after.state == VmInstanceState.Running.toString() + assert checkedHosts[0] == destHost.uuid + } + + void testMigrationSuccessWhenTargetHostReportsVmRunning() { + VmInstanceInventory vm = queryVmInstance { + conditions = ["name=vm"] + }[0] as VmInstanceInventory + HostInventory destHost = findAnotherHost(vm.hostUuid) + List checkedHosts = [] + + mockMigrateVmFailure() + mockVmState(vm.uuid, destHost.uuid, VmInstanceState.Running.toString(), checkedHosts) + + MigrateVmAction.Result result = migrateVmAction(vm.uuid, destHost.uuid).call() + + assert result.error == null + VmInstanceInventory after = queryVmInstance { + conditions = ["uuid=${vm.uuid}".toString()] + }[0] as VmInstanceInventory + assert after.hostUuid == destHost.uuid + assert after.lastHostUuid == vm.hostUuid + assert checkedHosts + assert checkedHosts.every { it == destHost.uuid } + } + + HostInventory findAnotherHost(String hostUuid) { + return queryHost { + conditions = ["uuid!=${hostUuid}".toString()] + }[0] as HostInventory + } + + void mockMigrateVmFailure() { + env.simulator(KVMConstant.KVM_MIGRATE_VM_PATH) { + KVMAgentCommands.MigrateVmResponse rsp = new KVMAgentCommands.MigrateVmResponse() + rsp.setError("mock migration API failure") + return rsp + } + } + + void mockVmState(String vmUuid, String hostUuid, String targetHostState, List checkedHosts) { + env.revokeMessage(CheckVmStateOnHypervisorMsg.class, null) + env.message(CheckVmStateOnHypervisorMsg.class) { CheckVmStateOnHypervisorMsg msg, CloudBus bus -> + CheckVmStateOnHypervisorReply reply = new CheckVmStateOnHypervisorReply() + Map states = new HashMap<>() + checkedHosts.add(msg.hostUuid) + msg.vmInstanceUuids.each { + states.put(it, it == vmUuid && msg.hostUuid == hostUuid ? targetHostState : VmInstanceState.Running.toString()) + } + reply.setStates(states) + bus.reply(msg, reply) + } + } + + MigrateVmAction migrateVmAction(String vmUuid, String destHostUuid) { + MigrateVmAction action = new MigrateVmAction() + action.sessionId = adminSession() + action.vmInstanceUuid = vmUuid + action.hostUuid = destHostUuid + return action + } +} diff --git a/test/src/test/groovy/org/zstack/test/integration/storage/primary/local_nfs/MaintainHostMultiTypePsCase.groovy b/test/src/test/groovy/org/zstack/test/integration/storage/primary/local_nfs/MaintainHostMultiTypePsCase.groovy index c1d495606c3..5f2b555b623 100644 --- a/test/src/test/groovy/org/zstack/test/integration/storage/primary/local_nfs/MaintainHostMultiTypePsCase.groovy +++ b/test/src/test/groovy/org/zstack/test/integration/storage/primary/local_nfs/MaintainHostMultiTypePsCase.groovy @@ -122,6 +122,18 @@ class MaintainHostMultiTypePsCase extends SubCase{ return rsp } + env.simulator(KVMConstant.KVM_VM_CHECK_STATE) { HttpEntity e -> + def cmd = JSONObjectUtil.toObject(e.getBody(), KVMAgentCommands.CheckVmStateCmd) + def rsp = new KVMAgentCommands.CheckVmStateRsp() + rsp.states = [:] + cmd.vmUuids.each { String vmUuid -> + rsp.states[vmUuid] = vmUuid == vm2OnNfs.uuid ? + KVMConstant.KvmVmState.Shutdown.toString() : + KVMConstant.KvmVmState.Running.toString() + } + return rsp + } + SQL.New(VmInstanceVO.class).eq(VmInstanceVO_.uuid, vm1.uuid).set(VmInstanceVO_.state, VmInstanceState.Unknown).update() changeHostState {