diff --git a/agent/conf/agent.properties b/agent/conf/agent.properties index 0dc5b8211e0d..a8e3b73e8dd9 100644 --- a/agent/conf/agent.properties +++ b/agent/conf/agent.properties @@ -310,6 +310,31 @@ iscsi.session.cleanup.enabled=false # This parameter specifies if the host must be rebooted when something goes wrong with the heartbeat. #reboot.host.and.alert.management.on.heartbeat.timeout=true +# Action taken by kvmheartbeat.sh / kvmspheartbeat.sh when a storage heartbeat +# write fails persistently. Supersedes the legacy boolean property +# 'reboot.host.and.alert.management.on.heartbeat.timeout' when set to a non-default value. +# +# Allowed values: +# hard-reboot - immediate sysrq-trigger reboot (default; 'reboot' is kept as an alias, and unrecognized values are treated the same way). +# Keep this value on setups where a stale NFSv3 mount can prevent +# a graceful shutdown from completing. +# graceful-reboot - 'systemctl reboot' instead of sysrq; allows VMs to stop cleanly. +# Use only if a stale storage mount cannot block shutdown. +# restart-agent - restart cloudstack-agent only; running VMs are preserved. +# log-only - log + alert; take no automatic action (admin must investigate). +# custom - invoke the script at 'kvm.heartbeat.fence.custom.script' (see below). +# The script is called with one positional argument: the heartbeat script name +# (e.g. 'kvmheartbeat.sh'). Falls back to hard-reboot if the script is missing or +# not executable. +# +# The non-default values are recommended for setups using LINSTOR/DRBD or any local +# storage with replication, where transient I/O contention can cause a heartbeat +# write to time out without the host actually being unhealthy. +#kvm.heartbeat.fence.action=hard-reboot + +# Path to the operator-supplied script invoked when kvm.heartbeat.fence.action=custom. +#kvm.heartbeat.fence.custom.script=/etc/cloudstack/agent/heartbeat-fence-custom.sh + # Enables manually setting CPU's topology on KVM's VM. 
#enable.manually.setting.cpu.topology.on.kvm.vm=true
diff --git a/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java b/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java
index 3364f9708cf5..7951e37b035a 100644
--- a/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java
+++ b/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java
@@ -598,6 +598,39 @@ public class AgentProperties{
     public static final Property REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT
             = new Property<>("reboot.host.and.alert.management.on.heartbeat.timeout", true);
 
+    /**
+     * Action taken by the KVM agent's storage heartbeat scripts (kvmheartbeat.sh / kvmspheartbeat.sh)
+     * when a heartbeat write fails persistently. Allowed values:
+     * <ul>
+     *   <li>{@code hard-reboot}: immediate sysrq-trigger reboot (default; {@code reboot} is kept as an alias).</li>
+     *   <li>{@code graceful-reboot}: {@code systemctl reboot} instead of sysrq.</li>
+     *   <li>{@code restart-agent}: restarts cloudstack-agent only.</li>
+     *   <li>{@code log-only}: logs the failure and takes no automatic action.</li>
+     *   <li>{@code custom}: runs the script at {@link #KVM_HEARTBEAT_FENCE_CUSTOM_SCRIPT}; falls back to
+     *       {@code hard-reboot} if that script is missing or not executable.</li>
+     * </ul>
+     * The non-default values are recommended for setups using LINSTOR/DRBD or other replicated
+     * local storage, where transient I/O contention can cause a heartbeat write to time out
+     * without the host actually being unhealthy.<br>
+     * Read by the heartbeat shell scripts directly from agent.properties.<br>
+     * Data type: String.<br>
+     * Default value: {@code hard-reboot}
+     */
+    public static final Property<String> KVM_HEARTBEAT_FENCE_ACTION
+            = new Property<>("kvm.heartbeat.fence.action", "hard-reboot");
+
+    /**
+     * Path to the operator-supplied script invoked when
+     * {@link #KVM_HEARTBEAT_FENCE_ACTION} is set to {@code custom}. The script must be
+     * executable and is called with a single positional argument: the heartbeat script name
+     * that triggered the fence (e.g. {@code kvmheartbeat.sh}). Read by the heartbeat shell
+     * scripts directly from agent.properties.<br>
+     * Data type: String.<br>
+     * Default value: {@code /etc/cloudstack/agent/heartbeat-fence-custom.sh}
+     */
+    public static final Property<String> KVM_HEARTBEAT_FENCE_CUSTOM_SCRIPT
+            = new Property<>("kvm.heartbeat.fence.custom.script", "/etc/cloudstack/agent/heartbeat-fence-custom.sh");
+
     /**
      * Enables manually setting CPU's topology on KVM's VM.
* Data type: Boolean.
diff --git a/scripts/vm/hypervisor/kvm/kvmha-fence.sh b/scripts/vm/hypervisor/kvm/kvmha-fence.sh
new file mode 100755
index 000000000000..5f3006f71d7b
--- /dev/null
+++ b/scripts/vm/hypervisor/kvm/kvmha-fence.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Shared fence-action helper for kvmheartbeat.sh and kvmspheartbeat.sh.
+# Sourced by both scripts; do not invoke directly.
+#
+# Usage from caller:
+#   source "$(dirname "$0")/kvmha-fence.sh"
+#   fence_action "kvmheartbeat.sh"   # script name passed for log tagging
+
+AGENT_PROPS="${AGENT_PROPS:-/etc/cloudstack/agent/agent.properties}"
+
+# Logs "$1" under the 'heartbeat' tag, then reboots the host immediately via
+# sysrq-trigger. Shared by every branch of fence_action that ends in a hard reboot.
+_fence_hard_reboot() {
+    /usr/bin/logger -t heartbeat "$1"
+    # Flush in the background: a stale storage mount must not block the reboot.
+    sync &
+    sleep 5
+    echo b > /proc/sysrq-trigger
+    exit $?
+}
+
+# Runs the fence action configured by 'kvm.heartbeat.fence.action' in $AGENT_PROPS.
+#   $1 - name of the calling heartbeat script, used to tag log messages
+#        (defaults to 'kvmha').
+# The action name is matched case-insensitively. A missing, unreadable or
+# unrecognized setting results in a hard reboot. Every branch ends in 'exit',
+# so this function never returns to the caller.
+fence_action() {
+    local source_script="${1:-kvmha}"
+    local FENCE_ACTION="hard-reboot"
+    local CUSTOM_SCRIPT="/etc/cloudstack/agent/heartbeat-fence-custom.sh"
+
+    if [ -r "$AGENT_PROPS" ]; then
+        # Last uncommented assignment wins; whitespace is stripped and the value lowercased.
+        local val
+        val=$(grep -E '^[[:space:]]*kvm\.heartbeat\.fence\.action[[:space:]]*=' "$AGENT_PROPS" | tail -n 1 | cut -d= -f2- | tr -d '[:space:]' | tr '[:upper:]' '[:lower:]')
+        [ -n "$val" ] && FENCE_ACTION="$val"
+        # The script path keeps inner whitespace; only the ends are trimmed.
+        local cval
+        cval=$(grep -E '^[[:space:]]*kvm\.heartbeat\.fence\.custom\.script[[:space:]]*=' "$AGENT_PROPS" | tail -n 1 | cut -d= -f2- | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
+        [ -n "$cval" ] && CUSTOM_SCRIPT="$cval"
+    fi
+
+    case "$FENCE_ACTION" in
+        log-only)
+            /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
+            exit 0
+            ;;
+        restart-agent)
+            /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
+            sync &
+            sleep 2
+            systemctl restart cloudstack-agent
+            exit $?
+            ;;
+        graceful-reboot)
+            /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
+            sync &
+            sleep 5
+            systemctl reboot
+            exit $?
+            ;;
+        custom)
+            # Require a regular file: '-x' alone is also true for directories.
+            if [ -f "$CUSTOM_SCRIPT" ] && [ -x "$CUSTOM_SCRIPT" ]; then
+                /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'custom' — running ${CUSTOM_SCRIPT}."
+                sync &
+                sleep 2
+                "$CUSTOM_SCRIPT" "$source_script"
+                exit $?
+            else
+                _fence_hard_reboot "${source_script}: heartbeat write to storage failed; fence action 'custom' selected but ${CUSTOM_SCRIPT} is missing or not executable — falling back to hard-reboot."
+            fi
+            ;;
+        hard-reboot|reboot)
+            # 'reboot' kept as alias for back-compat with pre-existing deployments.
+            _fence_hard_reboot "${source_script} will reboot system because it was unable to write the heartbeat to the storage."
+            ;;
+        *)
+            # Fail safe on typos or unknown values, but say so in the log.
+            _fence_hard_reboot "${source_script}: heartbeat write to storage failed; unrecognized fence action '${FENCE_ACTION}' — falling back to hard-reboot."
+            ;;
+    esac
+}
diff --git a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
index 9b7eadada69f..0947798dcfcd 100755
--- a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
+++ b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh
@@ -156,11 +156,18 @@ then
   exit 0
 elif [ "$cflag" == "1" ]
 then
-  /usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
-  sync &
-  sleep 5
-  echo b > /proc/sysrq-trigger
-  exit $?
+  # shellcheck disable=SC1091
+  . "$(dirname "$0")/kvmha-fence.sh"
+  # Fail safe: if the helper could not be loaded, fence the host with the legacy hard reboot.
+  if ! command -v fence_action > /dev/null 2>&1
+  then
+    /usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
+    sync &
+    sleep 5
+    echo b > /proc/sysrq-trigger
+    exit $?
+  fi
+  fence_action "kvmheartbeat.sh"
 else
   write_hbLog
   exit $?
diff --git a/scripts/vm/hypervisor/kvm/kvmspheartbeat.sh b/scripts/vm/hypervisor/kvm/kvmspheartbeat.sh
index 3cb459e3e854..1a5b2a24dd9f 100755
--- a/scripts/vm/hypervisor/kvm/kvmspheartbeat.sh
+++ b/scripts/vm/hypervisor/kvm/kvmspheartbeat.sh
@@ -58,9 +58,16 @@ deleteVMs() {
 
 if [ "$cflag" == "1" ]
 then
-  /usr/bin/logger -t heartbeat "kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
-  sync &
-  sleep 5
-  echo b > /proc/sysrq-trigger
-  exit $?
+  # shellcheck disable=SC1091
+  . "$(dirname "$0")/kvmha-fence.sh"
+  # Fail safe: if the helper could not be loaded, fence the host with the legacy hard reboot.
+  if ! command -v fence_action > /dev/null 2>&1
+  then
+    /usr/bin/logger -t heartbeat "kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
+    sync &
+    sleep 5
+    echo b > /proc/sysrq-trigger
+    exit $?
+  fi
+  fence_action "kvmspheartbeat.sh"
 fi