#!/bin/ksh -p
# SPDX-License-Identifier: CDDL-1.0

#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source.  A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#

#
# Copyright (c) 2017 by Intel Corporation. All rights reserved.
# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
#

. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg

#
# DESCRIPTION:
# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when
# multiple drives are faulted.
#
# STRATEGY:
# 1. Create a pool with two hot spares
# 2. Inject IO ERRORS with a zinject error handler on the first device
# 3. Start a scrub
# 4. Verify the ZED kicks in a hot spare and expected pool/device status
# 5. Inject IO ERRORS on a second device
# 6. Start a scrub
# 7. Verify the ZED kicks in a second hot spare
# 8. Clear the fault on both devices
# 9. Verify the hot spares are available and expected pool/device status
# 10. Rinse and repeat, this time faulting both devices at the same time
#

verify_runnable "both"

# Cancel all outstanding zinject handlers, destroy the test pool, and remove
# the file-backed vdevs.  Registered via log_onexit and also invoked at the
# end of every loop iteration so each pool type starts from a clean slate.
function cleanup
{
	log_must zinject -c all
	destroy_pool $TESTPOOL
	rm -f $DATA_DEVS $SPARE_DEVS
}

log_assert "ZED should be able to handle multiple faulted devices"
log_onexit cleanup

# Events not supported on FreeBSD
if ! is_freebsd; then
	# Clear events from previous runs
	zed_events_drain
fi

# File-backed vdevs: two that will receive injected IO errors, four that
# remain healthy, and up to two hot spares.
FAULT_DEV1="$TEST_BASE_DIR/fault-dev1"
FAULT_DEV2="$TEST_BASE_DIR/fault-dev2"
SAFE_DEV1="$TEST_BASE_DIR/safe-dev1"
SAFE_DEV2="$TEST_BASE_DIR/safe-dev2"
SAFE_DEV3="$TEST_BASE_DIR/safe-dev3"
SAFE_DEV4="$TEST_BASE_DIR/safe-dev4"
DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2 $SAFE_DEV3 $SAFE_DEV4"
SPARE_DEV1="$TEST_BASE_DIR/spare-dev1"
SPARE_DEV2="$TEST_BASE_DIR/spare-dev2"
SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2"

# First pass: fault the two devices one at a time (sequentially), letting the
# ZED spare in a replacement after each fault.
for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do
	if [ "$type" = "draid2:1s" ]; then
		# 1. Create a dRAID pool with a distributed and traditional
		# hot spare to provide test coverage for both configurations.
		#
		# Corruption is injected in the third and fourth vdevs
		# since the dRAID permutation at these offsets maps to
		# distributed spare space and not data devices.
		#
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
		# "draid2-0-0" is the pool's built-in distributed spare.
		SPARE2="draid2-0-0"
	elif [ "$type" = "mirror" ]; then
		# 1. Create a 3-way mirror pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	else
		# 1. Create a raidz pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	fi

	# 2. Inject IO ERRORS with a zinject error handler on the first device
	log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in a hot spare and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Inject IO ERRORS on a second device
	log_must zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL

	# 6. Start a scrub
	# Wait for any scrub (or the resilver triggered by the first
	# auto-spare) to finish first; a new scrub cannot be started while
	# one is already in progress.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done
	log_must zpool scrub $TESTPOOL

	# 7. Verify the ZED kicks in a second hot spare
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# Let the second scrub/resilver settle before clearing the faults.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done

	# 8. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# 9. Verify the hot spares are available and expected pool/device status
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "AVAIL"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "AVAIL"
	log_must check_state $TESTPOOL "" "ONLINE"

	# Cleanup
	cleanup
done

# Rinse and repeat, this time faulting both devices at the same time
# NOTE: "raidz" is excluded since it cannot survive 2 faulted devices
# NOTE: "mirror" is a 3-way mirror here and should survive this test
for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do
	if [ "$type" = "draid2:1s" ]; then
		# 1. Create a dRAID pool with a distributed and traditional
		# hot spare to provide test coverage for both configurations.
		#
		# Corruption is injected in the third and fourth vdevs
		# since the dRAID permutation at these offsets maps to
		# distributed spare space and not data devices.
		#
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
		# "draid2-0-0" is the pool's built-in distributed spare.
		SPARE2="draid2-0-0"
	elif [ "$type" = "mirror" ]; then
		# 1. Create a 3-way mirror pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	else
		# 1. Create a raidz pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	fi

	# 2. Inject IO ERRORS with a zinject error handler on two devices
	# NOTE(review): the handlers are backgrounded ("&") so both devices
	# are armed near-simultaneously — presumably so a single scrub faults
	# both at once; confirm against zinject behavior.
	log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &"
	log_must eval "zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL &"

	# 3. Start a scrub
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in two hot spares and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Clear the fault on both devices
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# Cleanup
	cleanup
done

log_pass "ZED successfully handles multiple faulted devices"