Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 150 additions & 39 deletions compute/src/main/java/org/zstack/compute/vm/VmInstanceBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -941,6 +941,34 @@ public void run(MessageReply re) {
});
}

/**
* ZSTAC-83890: pick a Connected+Enabled+KVM sibling host (excluding the suspect) to act as the
* pre-fence peer. Prefers the hinted siblings vetted by HaKvmHostSiblingChecker. Returns null
* if none are usable, in which case pre-fence MUST refuse to start to prevent split-brain.
*/
private String pickFencePeerHostUuid(List<String> hintedSiblings, String suspectHostUuid) {
if (hintedSiblings == null || hintedSiblings.isEmpty()) {
return null;
}
List<String> filtered = new ArrayList<>(hintedSiblings);
filtered.remove(suspectHostUuid);
if (filtered.isEmpty()) {
return null;
}
List<String> alive = Q.New(HostVO.class)
.select(HostVO_.uuid)
.in(HostVO_.uuid, filtered)
.eq(HostVO_.status, HostStatus.Connected)
.eq(HostVO_.state, HostState.Enabled)
.eq(HostVO_.hypervisorType, VmInstanceConstant.KVM_HYPERVISOR_TYPE)
.listValues();
if (alive == null || alive.isEmpty()) {
return null;
}
Collections.shuffle(alive);
return alive.get(0);
}

private void handle(final HaStartVmInstanceMsg msg) {
thdf.chainSubmit(new ChainTask(msg) {
@Override
Expand All @@ -952,7 +980,7 @@ public String getSyncSignature() {
public void run(final SyncTaskChain chain) {
refreshVO();

HaStartVmJudger judger;
final HaStartVmJudger judger;
try {
Class clz = Class.forName(msg.getJudgerClassName());
judger = (HaStartVmJudger) clz.newInstance();
Expand All @@ -967,74 +995,157 @@ public void run(final SyncTaskChain chain) {
return;
}

// It is better to monitor HaStartVmInstanceMsg and HaStartVmInstanceReply,
// instead of intrusively recording the scheduling record here.
// The problem is, we have two early exits:
// 1. throwing exception;
// 2. judges no need to start VM.
// thus, with monitoring, there might be false records.
final VmSchedHistoryRecorder recorder = VmSchedHistoryRecorder.ofHA(msg.getVmInstanceUuid())
.withSchedReason(msg.getHaReason())
.begin();
ErrorCodeList errList = new ErrorCodeList();
new While<>(pluginRgty.getExtensionList(BeforeHaStartVmInstanceExtensionPoint.class)).each((ext, whileCompletion) -> {
ext.beforeHaStartVmInstance(msg.getVmInstanceUuid(), msg.getJudgerClassName(), msg.getSoftAvoidHostUuids(), new Completion(msg) {
@Override
public void success() {
whileCompletion.done();
}

@Override
public void fail(ErrorCode errorCode) {
errList.getCauses().add(errorCode);
whileCompletion.done();
}
});
}).run(new WhileDoneCompletion(msg, chain) {
FlowChain fchain = FlowChainBuilder.newSimpleFlowChain();
fchain.setName(String.format("ha-start-vm-%s", msg.getVmInstanceUuid()));

fchain.then(new NoRollbackFlow() {
String __name__ = "ha-pre-fence-vm";

@Override
public void done(ErrorCodeList errorCodeList) {
if (!errList.getCauses().isEmpty()) {
reply.setError(errList.getCauses().get(0));
bus.reply(msg, reply);
recorder.withFailReason(reply.getError().getDetails())
.end(null);
chain.next();
public void run(FlowTrigger trigger, Map data) {
// ZSTAC-83890: self.hostUuid is usually cleared by the prior
// abnormal-lifecycle flow before HA-start runs; fall back to the
// HA-vetted softAvoid suspect, then to lastHostUuid.
String suspectHostUuid = self.getHostUuid();
if (suspectHostUuid == null) {
List<String> avoid = msg.getSoftAvoidHostUuids();
if (avoid != null && !avoid.isEmpty()) {
suspectHostUuid = avoid.get(0);
}
}
if (suspectHostUuid == null) {
suspectHostUuid = self.getLastHostUuid();
}
if (suspectHostUuid == null) {
trigger.next();
return;
}
List<String> siblings = msg.getSiblingHostUuids();
if (siblings == null || siblings.isEmpty()) {
trigger.next();
return;
}
if (!VmInstanceConstant.KVM_HYPERVISOR_TYPE.equals(self.getHypervisorType())) {
trigger.next();
return;
}
// ZSTAC-83890: route to a Connected sibling KVMHost; never to the suspect itself.
String peerHostUuid = pickFencePeerHostUuid(siblings, suspectHostUuid);
if (peerHostUuid == null) {
trigger.fail(operr("HA-start vm[%s]: no Connected sibling KVM host available to fence" +
" suspect host[%s]. Refuse to start to prevent split-brain.",
self.getUuid(), suspectHostUuid));
return;
}
FenceVmOnHostMsg fmsg = new FenceVmOnHostMsg();
fmsg.setHostUuid(peerHostUuid);
fmsg.setSuspectHostUuid(suspectHostUuid);
fmsg.setVmUuid(self.getUuid());
bus.makeTargetServiceIdByResourceUuid(fmsg, HostConstant.SERVICE_ID, peerHostUuid);
bus.send(fmsg, new CloudBusCallBack(trigger) {
@Override
public void run(MessageReply reply) {
if (!reply.isSuccess()) {
trigger.fail(reply.getError());
return;
}
trigger.next();
}
});
}
});

fchain.then(new NoRollbackFlow() {
String __name__ = "before-ha-start-vm-instance";

@Override
public void run(FlowTrigger trigger, Map data) {
ErrorCodeList errList = new ErrorCodeList();
new While<>(pluginRgty.getExtensionList(BeforeHaStartVmInstanceExtensionPoint.class)).each((ext, wc) -> {
ext.beforeHaStartVmInstance(msg.getVmInstanceUuid(), msg.getJudgerClassName(), msg.getSoftAvoidHostUuids(), new Completion(msg) {
@Override
public void success() {
wc.done();
}

@Override
public void fail(ErrorCode errorCode) {
errList.getCauses().add(errorCode);
wc.done();
}
});
}).run(new WhileDoneCompletion(trigger) {
@Override
public void done(ErrorCodeList errorCodeList) {
if (!errList.getCauses().isEmpty()) {
trigger.fail(errList.getCauses().get(0));
return;
}
trigger.next();
}
});
}
});

fchain.then(new NoRollbackFlow() {
String __name__ = "mark-vm-stopped-and-save-last-host";

@Override
public void run(FlowTrigger trigger, Map data) {
logger.debug(String.format("HaStartVmJudger[%s] says the VM[uuid:%s, name:%s] is qualified for HA start, now we are starting it",
judger.getClass(), self.getUuid(), self.getName()));
UpdateQuery sql = SQL.New(VmInstanceVO.class)
.eq(VmInstanceVO_.uuid, self.getUuid())
.set(VmInstanceVO_.state, VmInstanceState.Stopped)
.set(VmInstanceVO_.hostUuid, null);

if (self.getHostUuid() != null) {
sql.set(VmInstanceVO_.lastHostUuid, self.getHostUuid());
}

sql.update();
trigger.next();
}
});

startVm(msg, new Completion(msg, chain) {
fchain.then(new NoRollbackFlow() {
String __name__ = "start-ha-vm";

@Override
public void run(FlowTrigger trigger, Map data) {
startVm(msg, new Completion(trigger) {
@Override
public void success() {
reply.setInventory(getSelfInventory());
bus.reply(msg, reply);
recorder.end(reply.getInventory().getHostUuid());
chain.next();
trigger.next();
Comment on lines +1094 to +1122
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

不要在真正启动前先把 VM 落库成 Stopped

这里先改库,再调用 startVm();而 startVm() 一进来会 refreshVO(),随后把已经被改写后的状态当作自己的 originState。这样一来,只要后面的分配、pre-start 或 start 失败,回滚就只会回到 Stopped/hostUuid = null,而不是 HA 进入前的原始状态。这个改动会直接改写 5.4.8 的 HA 失败语义,而且还绕过了 changeVmStateInDb(...) 的状态事件/扩展点回调。

建议把原始 state/hostUuid/lastHostUuid 存进 flow data 并在失败时恢复,或者把这次落库延后到 HA start 真正成功之后。

As per coding guidelines: “向后兼容原则……不应直接改动已有行为……开关控制等”.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@compute/src/main/java/org/zstack/compute/vm/VmInstanceBase.java` around lines
1051 - 1079, The change persists VmInstanceVO state/hostUuid to Stopped before
calling startVm(), which causes startVm()'s refreshVO() to treat the modified
state as originState and breaks HA-failure semantics and changeVmStateInDb
callbacks; fix by reverting the "mark-vm-stopped-and-save-last-host" flow:
instead of updating the DB before start, capture and store the original
VmInstanceVO fields (state, hostUuid, lastHostUuid) into the flow data map
(inside the first NoRollbackFlow) and either (a) perform the UpdateQuery that
sets state=Stopped/hostUuid=null only after startVm() success (move the DB
update into the success() callback of startVm) or (b) if you must keep the DB
update early, ensure failures restore the original values by invoking
changeVmStateInDb or executing an UpdateQuery using the saved originals in the
failure path (trigger.fail/trigger.rollback). Reference: the NoRollbackFlow
named "mark-vm-stopped-and-save-last-host", startVm(...),
changeVmStateInDb(...), and VmInstanceVO/VmInstanceVO_.uuid to locate the code.

}

@Override
public void fail(ErrorCode errorCode) {
reply.setError(errorCode);
bus.reply(msg, reply);
recorder.withFailReason(errorCode.getDetails())
.end(null);
chain.next();
trigger.fail(errorCode);
}
});
}
});

fchain.done(new FlowDoneHandler(msg, chain) {
@Override
public void handle(Map data) {
bus.reply(msg, reply);
recorder.end(reply.getInventory() == null ? null : reply.getInventory().getHostUuid());
chain.next();
}
}).error(new FlowErrorHandler(msg, chain) {
@Override
public void handle(ErrorCode errCode, Map data) {
reply.setError(errCode);
bus.reply(msg, reply);
recorder.withFailReason(errCode.getDetails()).end(null);
chain.next();
}
}).start();
}

@Override
Expand Down
50 changes: 50 additions & 0 deletions header/src/main/java/org/zstack/header/vm/FenceVmOnHostMsg.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.zstack.header.vm;

import org.zstack.header.host.HostMessage;
import org.zstack.header.message.NeedReplyMessage;

/**
* ZSTAC-83890 / TIC-5513
*
* Sent from {@code VmInstanceBase.handle(HaStartVmInstanceMsg)} during HA pre-fence. Routed to a
* vetted Connected KVM sibling host's KVMHost service (the "peer"). The peer is picked at the
* management node from {@code siblingHostUuids} (hinted by HA decision) so that this message never
* has to be delivered to the suspect host whose service is by definition unreliable / Disconnected.
*
* The peer's {@code KVMHost.handle(FenceVmOnHostMsg)} loads the suspect host's SSH credentials from
* its KVMHostVO and asks its local kvmagent to SSH the suspect and force-destroy any leftover qemu
* of {@code vmUuid}, so the new VM start during HA is safe against split-brain even if Ceph
* watchers were transiently emptied by an OSD watch_ping timeout.
*/
public class FenceVmOnHostMsg extends NeedReplyMessage implements HostMessage {
/** Routing target: the peer (sibling) KVM host that will execute the fence. */
private String hostUuid;
/** The suspect host whose qemu must be killed; supplied by the management node. */
private String suspectHostUuid;
private String vmUuid;

@Override
public String getHostUuid() {
return hostUuid;
}

public void setHostUuid(String hostUuid) {
this.hostUuid = hostUuid;
}

public String getSuspectHostUuid() {
return suspectHostUuid;
}

public void setSuspectHostUuid(String suspectHostUuid) {
this.suspectHostUuid = suspectHostUuid;
}

public String getVmUuid() {
return vmUuid;
}

public void setVmUuid(String vmUuid) {
this.vmUuid = vmUuid;
}
}
14 changes: 14 additions & 0 deletions header/src/main/java/org/zstack/header/vm/FenceVmOnHostReply.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package org.zstack.header.vm;

import org.zstack.header.message.MessageReply;

/**
* ZSTAC-83890 - reply to {@link FenceVmOnHostMsg}.
*
* If a sibling kvmagent confirmed the suspect qemu is gone (or could not even reach the suspect
* host), the reply is a plain success and HA-start is allowed to proceed. If the sibling reported
* that qemu is still alive on the suspect host, the reply is a failure with a descriptive error,
* and HA-start is refused to prevent split-brain.
*/
public class FenceVmOnHostReply extends MessageReply {
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ public class HaStartVmInstanceMsg extends NeedReplyMessage implements VmInstance
private String judgerClassName;
private List<String> softAvoidHostUuids;
private String haReason;
// ZSTAC-83890 - Sibling KVM hosts already picked during HA decision (e.g. by HaKvmHostSiblingChecker).
// The HA-start flow uses one of them to SSH-fence any leftover qemu on the suspect host before allowing
// the new VM to start, hardening against transient empty-watcher / stale-qemu split-brain.
private List<String> siblingHostUuids;

public String getJudgerClassName() {
return judgerClassName;
Expand Down Expand Up @@ -46,4 +50,12 @@ public String getHaReason() {
public void setHaReason(String haReason) {
this.haReason = haReason;
}

public List<String> getSiblingHostUuids() {
return siblingHostUuids;
}

public void setSiblingHostUuids(List<String> siblingHostUuids) {
this.siblingHostUuids = siblingHostUuids;
}
}
57 changes: 57 additions & 0 deletions plugin/kvm/src/main/java/org/zstack/kvm/KVMAgentCommands.java
Original file line number Diff line number Diff line change
Expand Up @@ -5040,4 +5040,61 @@ public void setMemoryUsage(long memoryUsage) {
}
}

/**
* ZSTAC-83890 / TIC-5513
*
* Sent from MN to a sibling KVM host's kvmagent. The sibling SSHes to the suspect host
* (where the HA-started VM was running) and force-destroys any leftover qemu process
* before the new VM is allowed to start on a different host.
*
* Verdict is reported back via {@link FenceVmFromPeerRsp}.
*/
public static class FenceVmFromPeerCmd extends AgentCommand implements java.io.Serializable {
@GrayVersion(value = "5.4.8")
public String vmUuid;

@GrayVersion(value = "5.4.8")
public String targetHostUuid;

@GrayVersion(value = "5.4.8")
public String targetHostIp;

@GrayVersion(value = "5.4.8")
public String targetHostUsername;

@GrayVersion(value = "5.4.8")
@NoLogging
public String targetHostPassword;

@GrayVersion(value = "5.4.8")
public Integer targetHostSshPort;

@GrayVersion(value = "5.4.8")
public Integer sshTimeoutSec;
}

/**
* ZSTAC-83890 / TIC-5513 - reply to {@link FenceVmFromPeerCmd}.
*
* Boxed Boolean fields are intentional: under @GrayVersion an older agent (< 5.4.8) returns no
* value here, and a primitive boolean would silently default to false, making "absent"
* indistinguishable from "explicit false". Consumers must treat null as "absent".
*/
public static class FenceVmFromPeerRsp extends AgentResponse {
@GrayVersion(value = "5.4.8")
public Boolean qemuConfirmedDead;

@GrayVersion(value = "5.4.8")
public Boolean qemuStillAlive;

@GrayVersion(value = "5.4.8")
public Boolean targetHostUnreachable;

@GrayVersion(value = "5.4.8")
public String stdout;

@GrayVersion(value = "5.4.8")
public String stderr;
}

}
Loading