#!/bin/sh
# shellcheck disable=SC2154
#
# Turn off disk's enclosure slot if an I/O is hung triggering the deadman.
#
# It's possible for outstanding I/O to a misbehaving SCSI disk to neither
# promptly complete nor return an error.  This can occur due to retry and
# recovery actions taken by the SCSI layer, driver, or disk.  When it occurs
# the pool will be unresponsive even though there may be sufficient redundancy
# configured to proceed without this single disk.
#
# When a hung I/O is detected by the kmods it will be posted as a deadman
# event.  By default an I/O is considered to be hung after 5 minutes.  This
# value can be changed with the zfs_deadman_ziotime_ms module parameter.
# If ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is set the disk's enclosure
# slot will be powered off causing the outstanding I/O to fail.  The ZED
# will then handle this like a normal disk failure and FAULT the vdev.
#
# We assume the user will be responsible for turning the slot back on
# after replacing the disk.
#
# Note that this script requires that your enclosure be supported by the
# Linux SCSI Enclosure services (SES) driver.  The script will do nothing
# if you have no enclosure, or if your enclosure isn't supported.
#
# Inputs (environment):
#   ZED_ZEDLET_DIR                           directory holding zed.rc and
#                                            zed-functions.sh
#   ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN  must be "1" to enable this script
#   ZEVENT_POOL_FAILMODE                     must be "wait"
#   ZEVENT_VDEV_ENC_SYSFS_PATH               sysfs directory of the slot
#   ZEVENT_VDEV_PATH                         only used in the log message
#
# Exit codes:
#   0: slot successfully powered off
#   1: enclosure not available
#   2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN disabled
#   3: System not configured to wait on deadman
#   4: The enclosure sysfs path passed from ZFS does not exist
#   5: Enclosure slot didn't actually turn off after we told it to

[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"

if [ ! -d /sys/class/enclosure ] ; then
	# No JBOD enclosure or NVMe slots
	exit 1
fi

if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN}" != "1" ] ; then
	exit 2
fi

if [ "$ZEVENT_POOL_FAILMODE" != "wait" ] ; then
	exit 3
fi

# Every later read and write goes through this one sysfs attribute.
slot_power_status="$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status"

if [ ! -f "$slot_power_status" ] ; then
	exit 4
fi

# Turn off the slot and wait for sysfs to report that the slot is off.
# It can take ~400ms on some enclosures and multiple retries may be needed.
#
# Up to 20 write attempts are made; after each write the status is polled
# up to 5 times, 0.1s apart.  Plain counters are used instead of seq(1) so
# the loops only rely on POSIX shell arithmetic.
attempt=0
while [ "$attempt" -lt 20 ] ; do
	attempt=$((attempt + 1))

	echo "off" | tee "$slot_power_status"

	poll=0
	while [ "$poll" -lt 5 ] ; do
		poll=$((poll + 1))

		# Use '=' here: '==' is not defined for POSIX test(1), and in
		# shells that reject it the comparison errors out, so the
		# early exit below would never trigger.
		if [ "$(cat "$slot_power_status")" = "off" ] ; then
			break 2
		fi
		sleep 0.1
	done
done

# Re-read the status once more so that running out of retries is reported
# as a failure rather than silently logged as a success.
if [ "$(cat "$slot_power_status")" != "off" ] ; then
	exit 5
fi

zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH"