#!/bin/sh
# shellcheck disable=SC2154
#
# Turn off disk's enclosure slot if an I/O is hung triggering the deadman.
#
# It's possible for outstanding I/O to a misbehaving SCSI disk to neither
# promptly complete nor return an error.  This can occur due to retry and
# recovery actions taken by the SCSI layer, driver, or disk.  When it occurs
# the pool will be unresponsive even though there may be sufficient redundancy
# configured to proceed without this single disk.
#
# When a hung I/O is detected by the kmods it will be posted as a deadman
# event.  By default an I/O is considered to be hung after 5 minutes.  This
# value can be changed with the zfs_deadman_ziotime_ms module parameter.
# If ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN is set the disk's enclosure
# slot will be powered off causing the outstanding I/O to fail.  The ZED
# will then handle this like a normal disk failure and FAULT the vdev.
#
# We assume the user will be responsible for turning the slot back on
# after replacing the disk.
#
# Note that this script requires that your enclosure be supported by the
# Linux SCSI Enclosure services (SES) driver.  The script will do nothing
# if you have no enclosure, or if your enclosure isn't supported.
#
# Environment read by this script:
#   ZED_ZEDLET_DIR: directory holding zed.rc and zed-functions.sh
#   ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN: must be "1" to enable
#   ZEVENT_POOL_FAILMODE: must be "wait"
#   ZEVENT_VDEV_ENC_SYSFS_PATH: sysfs directory of the vdev's enclosure slot
#   ZEVENT_VDEV_PATH: vdev path, used only in the log message
#
# Exit codes:
#   0: slot successfully powered off
#   1: enclosure not available
#   2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN disabled
#   3: System not configured to wait on deadman
#   4: The enclosure sysfs path passed from ZFS does not exist
#   5: Enclosure slot didn't actually turn off after we told it to

[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"

if [ ! -d /sys/class/enclosure ] ; then
	# No JBOD enclosure or NVMe slots
	exit 1
fi

if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_DEADMAN}" != "1" ] ; then
	exit 2
fi

if [ "$ZEVENT_POOL_FAILMODE" != "wait" ] ; then
	exit 3
fi

# sysfs attribute used both to request power-off and to read back the state.
slot_power_status="$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status"

if [ ! -f "$slot_power_status" ] ; then
	exit 4
fi

# Turn off the slot and wait for sysfs to report that the slot is off.
# It can take ~400ms on some enclosures and multiple retries may be needed.
# Each of up to 20 writes is followed by up to 5 read-backs, 0.1s apart.
attempt=0
while [ "$attempt" -lt 20 ] ; do
	attempt=$((attempt + 1))
	echo "off" | tee "$slot_power_status"

	poll=0
	while [ "$poll" -lt 5 ] ; do
		poll=$((poll + 1))
		# Use the POSIX "=" operator: "==" is rejected by some /bin/sh
		# implementations, which would keep this check from ever
		# succeeding and force the full retry budget to be spent.
		if [ "$(cat "$slot_power_status")" = "off" ] ; then
			break 2
		fi
		sleep 0.1
	done
done

# Final read-back decides the exit status whether or not the loop broke early.
if [ "$(cat "$slot_power_status")" != "off" ] ; then
	exit 5
fi

zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH"