Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package org.zstack.compute.vm;

import org.springframework.beans.factory.annotation.Autowired;
import org.zstack.core.cloudbus.CloudBusCallBack;
import org.zstack.core.componentloader.PluginRegistry;
import org.zstack.core.db.Q;
import org.zstack.core.gc.GC;
import org.zstack.core.gc.GCCompletion;
import org.zstack.core.gc.TimeBasedGarbageCollector;
import org.zstack.header.host.HostVO;
import org.zstack.header.message.MessageReply;
import org.zstack.header.storage.primary.CleanupVmInstanceMetadataOnPrimaryStorageMsg;
import org.zstack.header.storage.primary.PrimaryStorageAO_;
import org.zstack.header.storage.primary.PrimaryStorageConstant;
import org.zstack.header.storage.primary.PrimaryStorageVO;
import org.zstack.header.vm.metadata.VmMetadataPathBuildExtensionPoint;
import org.zstack.utils.Utils;
import org.zstack.utils.logging.CLogger;

public class CleanupVmInstanceMetadataOnPrimaryStorageGC extends TimeBasedGarbageCollector {
private static final CLogger logger = Utils.getLogger(CleanupVmInstanceMetadataOnPrimaryStorageGC.class);

@Autowired
private PluginRegistry pluginRgty;

@GC
public String primaryStorageUuid;
@GC
public String vmUuid;
@GC
public String rootVolumeUuid;
@GC
public String metadataPath;
@GC
public String hostUuid;

public static String getGCName(String vmUuid, String primaryStorageUuid) {
return String.format("gc-cleanup-vm-metadata-%s-%s", vmUuid, primaryStorageUuid);
}

@Override
protected void triggerNow(GCCompletion completion) {
if (!dbf.isExist(primaryStorageUuid, PrimaryStorageVO.class)) {
logger.debug(String.format("[MetadataCleanupGC] primary storage[uuid:%s] no longer exists, " +
"cancel gc for vm[uuid:%s]", primaryStorageUuid, vmUuid));
completion.cancel();
return;
}

String psType = Q.New(PrimaryStorageVO.class).select(PrimaryStorageAO_.type).eq(PrimaryStorageAO_.uuid, primaryStorageUuid).findValue();
if (psType == null) {
logger.debug(String.format("[MetadataCleanupGC] primary storage[uuid:%s] type not found, " +
"cancel gc for vm[uuid:%s]", primaryStorageUuid, vmUuid));
completion.cancel();
return;
}

VmMetadataPathBuildExtensionPoint ext = pluginRgty.getExtensionFromMap(psType, VmMetadataPathBuildExtensionPoint.class);
boolean requireHost = ext != null && ext.requireHostForCleanup();

if (hostUuid == null && requireHost) {
logger.debug(String.format("[MetadataCleanupGC] hostUuid is null and ps[uuid:%s, type:%s] " +
"requires host for cleanup, cancel gc for vm[uuid:%s]",
primaryStorageUuid, psType, vmUuid));
completion.cancel();
return;
}

if (hostUuid != null && !dbf.isExist(hostUuid, HostVO.class)) {
if (requireHost) {
logger.debug(String.format("[MetadataCleanupGC] host[uuid:%s] no longer exists " +
"and ps[uuid:%s, type:%s] requires host for cleanup, cancel gc for vm[uuid:%s]",
hostUuid, primaryStorageUuid, psType, vmUuid));
completion.cancel();
return;
}

logger.info(String.format("[MetadataCleanupGC] host[uuid:%s] no longer exists for vm[uuid:%s], " +
"clear hostUuid and let the primary storage backend pick an available host", hostUuid, vmUuid));
hostUuid = null;
Comment on lines +58 to +80
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

requireHost 路径还缺少“host 仍能访问该主存储”的校验。

当前唯一返回 requireHostForCleanup()==true 的实现是 LocalStorage,而 Line 69 这里只判断 HostVO 是否存在。LocalStorageBase 后续还会在 backend 解析时校验该 host 是否仍绑定这块 PS(Line 3163-3167、Line 3577-3583);如果 host 还在库里但已经从这块 PS 脱离,这个 GC 会持续发送注定失败的 cleanup 并一直重试。这里最好通过扩展点或 PS helper 在发送前把“host 是否仍映射到目标 PS”也判掉,不满足时直接 cancel

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@compute/src/main/java/org/zstack/compute/vm/CleanupVmInstanceMetadataOnPrimaryStorageGC.java`
around lines 58 - 80, The requireHost branch currently only checks HostVO
existence; enhance CleanupVmInstanceMetadataOnPrimaryStorageGC so that when
ext.requireHostForCleanup() is true you also verify the host is still mapped to
the target primary storage before sending cleanup; obtain the extension via
pluginRgty.getExtensionFromMap(...) or use the PS helper to check the
host-primary-storage mapping (e.g. validate a PrimaryStorageHostRef or use a
helper on PrimaryStorageVO) and if the host is not mapped/correctly bound to
primaryStorageUuid then call completion.cancel() and return (mirror the existing
log style and messages used around hostUuid checks to explain cancellation).
Ensure the check runs where hostUuid != null && requireHost is evaluated so the
GC won’t keep retrying against a detached host.

}

CleanupVmInstanceMetadataOnPrimaryStorageMsg msg = new CleanupVmInstanceMetadataOnPrimaryStorageMsg();
msg.setPrimaryStorageUuid(primaryStorageUuid);
msg.setVmInstanceUuid(vmUuid);
msg.setRootVolumeUuid(rootVolumeUuid);
msg.setMetadataPath(metadataPath);
msg.setHostUuid(hostUuid);

bus.makeTargetServiceIdByResourceUuid(msg, PrimaryStorageConstant.SERVICE_ID, primaryStorageUuid);
bus.send(msg, new CloudBusCallBack(completion) {
@Override
public void run(MessageReply reply) {
if (reply.isSuccess()) {
logger.info(String.format("[MetadataCleanupGC] successfully cleaned up metadata " +
"for vm[uuid:%s] on ps[uuid:%s]", vmUuid, primaryStorageUuid));
completion.success();
} else {
logger.warn(String.format("[MetadataCleanupGC] failed to clean up metadata " +
"for vm[uuid:%s] on ps[uuid:%s]: %s", vmUuid, primaryStorageUuid, reply.getError()));
completion.fail(reply.getError());
}
}
});
}
}
135 changes: 135 additions & 0 deletions compute/src/main/java/org/zstack/compute/vm/VmExpungeMetadataFlow.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
package org.zstack.compute.vm;

import org.springframework.beans.factory.annotation.Autowire;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Configurable;
import org.zstack.core.cloudbus.CloudBus;
import org.zstack.core.cloudbus.CloudBusCallBack;
import org.zstack.core.componentloader.PluginRegistry;
import org.zstack.core.db.Q;
import org.zstack.header.core.workflow.FlowTrigger;
import org.zstack.header.core.workflow.NoRollbackFlow;
import org.zstack.header.message.MessageReply;
import org.zstack.header.storage.primary.CleanupVmInstanceMetadataOnPrimaryStorageMsg;
import org.zstack.header.storage.primary.PrimaryStorageConstant;
import org.zstack.header.storage.primary.PrimaryStorageVO;
import org.zstack.header.storage.primary.PrimaryStorageVO_;
import org.zstack.header.vm.VmInstanceConstant;
import org.zstack.header.vm.VmInstanceSpec;
import org.zstack.header.vm.metadata.VmMetadataPathBuildExtensionPoint;
import org.zstack.header.volume.VolumeInventory;
import org.zstack.utils.Utils;
import org.zstack.utils.logging.CLogger;

import java.util.Map;
import java.util.concurrent.TimeUnit;

@Configurable(preConstruction = true, autowire = Autowire.BY_TYPE)
public class VmExpungeMetadataFlow extends NoRollbackFlow {
private static final CLogger logger = Utils.getLogger(VmExpungeMetadataFlow.class);

@Autowired
private CloudBus bus;
@Autowired
private PluginRegistry pluginRgty;

@Override
public void run(FlowTrigger trigger, Map data) {
final VmInstanceSpec spec = (VmInstanceSpec) data.get(VmInstanceConstant.Params.VmInstanceSpec.toString());
if (spec == null || spec.getVmInventory() == null) {
logger.warn("[MetadataExpunge] missing VmInstanceSpec or VmInventory, skip metadata cleanup");
trigger.next();
return;
}

final String vmUuid = spec.getVmInventory().getUuid();

VolumeInventory rootVolume = spec.getVmInventory().getRootVolume();
String psUuid = rootVolume != null ? rootVolume.getPrimaryStorageUuid() : null;
if (psUuid == null) {
logger.debug(String.format("[MetadataExpunge] vm[uuid:%s] root volume has no primaryStorageUuid, " +
"skipping metadata cleanup", vmUuid));
trigger.next();
return;
}


String psType = Q.New(PrimaryStorageVO.class).select(PrimaryStorageVO_.type).eq(PrimaryStorageVO_.uuid, psUuid).findValue();
if (psType == null) {
logger.warn(String.format("[MetadataExpunge] primary storage[uuid:%s] not found for vm[uuid:%s], " +
"skip metadata cleanup", psUuid, vmUuid));
trigger.next();
return;
}

VmMetadataPathBuildExtensionPoint ext = pluginRgty.getExtensionFromMap(psType, VmMetadataPathBuildExtensionPoint.class);
if (ext == null) {
logger.warn(String.format("[MetadataExpunge] no VmMetadataPathBuildExtensionPoint found for ps[uuid:%s, type:%s], " +
"skip metadata cleanup", psUuid, psType));
trigger.next();
return;
}
final String metadataPath;
try {
metadataPath = ext.buildVmMetadataPath(psUuid, vmUuid);
} catch (Exception e) {
logger.warn(String.format("[MetadataExpunge] failed to build metadata path for vm[uuid:%s] on ps[uuid:%s], " +
"skip metadata cleanup: %s", vmUuid, psUuid, e.getMessage()));
trigger.next();
return;
}

String hostUuid = spec.getVmInventory().getHostUuid();
if (hostUuid == null) {
hostUuid = spec.getVmInventory().getLastHostUuid();
}

if (hostUuid == null && ext.requireHostForCleanup()) {
logger.warn(String.format("[MetadataExpunge] vm[uuid:%s] hostUuid is null, " +
"ps[uuid:%s, type:%s] requires host for cleanup, skip without submitting GC",
vmUuid, psUuid, psType));
trigger.next();
return;
}
Comment on lines +87 to +93
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

缺少 host 时不要直接放弃 metadata cleanup。

这是 expunge 链里的 cleanup flow。对 requireHostForCleanup() 的后端,这个分支既不下发清理也不提交 GC;只要 expunge 时 hostUuid/lastHostUuid 已经为空,这份 metadata 就没有任何后续重试路径,最终会永久遗留在主存上。至少要提交 GC,或者在更早的流程里把 cleanup 所需的 hostUuid 固化到 VmInstanceSpec/flow context。

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@compute/src/main/java/org/zstack/compute/vm/VmExpungeMetadataFlow.java`
around lines 87 - 93, The current branch in VmExpungeMetadataFlow that checks if
hostUuid == null && ext.requireHostForCleanup() incorrectly skips cleanup by
calling trigger.next() and returning; instead ensure metadata is not left behind
by submitting a GC task (or otherwise enqueue metadata cleanup) when hostUuid is
null for backends that require a host. Modify the branch around hostUuid,
ext.requireHostForCleanup(), psUuid/psType so that rather than only logging and
calling trigger.next(), it invokes the existing GC submission path (or creates a
submitMetadataGC(vmUuid, psUuid, psType) helper) and then proceeds with
trigger.next(); alternatively, ensure the required hostUuid is persisted earlier
into VmInstanceSpec/flow context before this flow runs (update the earlier flow
that sets ext/lastHostUuid if you choose that approach).


String rootVolumeUuid = rootVolume.getUuid();
CleanupVmInstanceMetadataOnPrimaryStorageMsg cmsg = new CleanupVmInstanceMetadataOnPrimaryStorageMsg();
cmsg.setPrimaryStorageUuid(psUuid);
cmsg.setVmInstanceUuid(vmUuid);
cmsg.setMetadataPath(metadataPath);
cmsg.setRootVolumeUuid(rootVolumeUuid);
cmsg.setHostUuid(hostUuid);
final String finalPsUuid = psUuid;
final String finalHostUuid = hostUuid;

bus.makeTargetServiceIdByResourceUuid(cmsg, PrimaryStorageConstant.SERVICE_ID, psUuid);
bus.send(cmsg, new CloudBusCallBack(trigger) {
@Override
public void run(MessageReply reply) {
if (reply.isSuccess()) {
logger.info(String.format("[MetadataExpunge] successfully deleted metadata for vm[uuid:%s] on ps[uuid:%s]",
vmUuid, finalPsUuid));
} else {
logger.warn(String.format("[MetadataExpunge] failed to delete metadata for vm[uuid:%s] on ps[uuid:%s]: %s, " +
"submitting GC job for retry", vmUuid, finalPsUuid, reply.getError()));
submitGC(finalPsUuid, vmUuid, rootVolumeUuid, metadataPath, finalHostUuid);
}
trigger.next();
}
});
}

private void submitGC(String psUuid, String vmUuid, String rootVolumeUuid, String metadataPath, String hostUuid) {
CleanupVmInstanceMetadataOnPrimaryStorageGC gc = new CleanupVmInstanceMetadataOnPrimaryStorageGC();
gc.NAME = CleanupVmInstanceMetadataOnPrimaryStorageGC.getGCName(vmUuid, psUuid);
gc.primaryStorageUuid = psUuid;
gc.vmUuid = vmUuid;
gc.rootVolumeUuid = rootVolumeUuid;
gc.metadataPath = metadataPath;
gc.hostUuid = hostUuid;
long gcIntervalSec = TimeUnit.HOURS.toSeconds(VmGlobalConfig.VM_METADATA_CLEANUP_GC_INTERVAL.value(Long.class));
gc.deduplicateSubmit(gcIntervalSec, TimeUnit.SECONDS);

logger.info(String.format("[MetadataExpunge] submitted GC job [%s] for vm[uuid:%s] on ps[uuid:%s]", gc.NAME, vmUuid, psUuid));
}
}
49 changes: 49 additions & 0 deletions compute/src/main/java/org/zstack/compute/vm/VmGlobalConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,53 @@ public class VmGlobalConfig {
@GlobalConfigValidation(validValues = {"true", "false"})
@BindResourceConfig(value = {VmInstanceVO.class, ClusterVO.class})
public static GlobalConfig RESET_TPM_AFTER_VM_CLONE = new GlobalConfig(CATEGORY, "reset.tpm.after.vm.clone");

@GlobalConfigValidation(validValues = {"true", "false"})
public static GlobalConfig VM_METADATA_ENABLED = new GlobalConfig(CATEGORY, "vm.metadata.enabled");

@GlobalConfigValidation()
public static GlobalConfig VM_METADATA_LAST_REFRESH_VERSION = new GlobalConfig(CATEGORY, "vm.metadata.lastRefreshVersion");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 100)
public static GlobalConfig VM_METADATA_FLUSH_CONCURRENCY = new GlobalConfig(CATEGORY, "vm.metadata.flush.concurrency");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 300)
public static GlobalConfig VM_METADATA_FLUSH_POLL_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.flush.pollInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 1000)
public static GlobalConfig VM_METADATA_FLUSH_BATCH_SIZE = new GlobalConfig(CATEGORY, "vm.metadata.flush.batchSize");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 168)
public static GlobalConfig VM_METADATA_CLEANUP_GC_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.cleanup.gc.interval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 100)
public static GlobalConfig VM_METADATA_FLUSH_MAX_RETRY = new GlobalConfig(CATEGORY, "vm.metadata.flush.maxRetry");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 120)
public static GlobalConfig VM_METADATA_FLUSH_ZOMBIE_CLAIM_THRESHOLD = new GlobalConfig(CATEGORY, "vm.metadata.flush.zombieClaimThreshold");

// 6h = 21600s, 48h = 172800s
@GlobalConfigValidation(numberGreaterThan = 21599, numberLessThan = 172801)
public static GlobalConfig VM_METADATA_MAINTENANCE_CONTENT_DRIFT_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.contentDriftInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 86400)
public static GlobalConfig VM_METADATA_MAINTENANCE_STALE_RECOVERY_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.staleRecoveryInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 1000)
public static GlobalConfig VM_METADATA_MAINTENANCE_STALE_RECOVERY_MAX_CYCLES = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.staleRecoveryMaxCycles");

@GlobalConfigValidation(numberGreaterThan = 0)
public static GlobalConfig VM_METADATA_PAYLOAD_REJECT_THRESHOLD = new GlobalConfig(CATEGORY, "vm.metadata.payload.rejectThreshold");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 86400)
public static GlobalConfig VM_METADATA_MAINTENANCE_ORPHAN_CHECK_INTERVAL = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.orphanCheckInterval");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 20)
public static GlobalConfig VM_METADATA_MAINTENANCE_STALE_RECOVERY_BATCH_SIZE = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.staleRecoveryBatchSize");

@GlobalConfigValidation(numberGreaterThan = 9, numberLessThan = 31)
public static GlobalConfig VM_METADATA_MAINTENANCE_CONTENT_DRIFT_BATCH_SIZE = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.contentDriftBatchSize");

@GlobalConfigValidation(numberGreaterThan = 0, numberLessThan = 11)
public static GlobalConfig VM_METADATA_MAINTENANCE_CONTENT_DRIFT_BATCH_SLEEP_SEC = new GlobalConfig(CATEGORY, "vm.metadata.maintenance.contentDriftBatchSleepSec");
}
Loading