xref: /freebsd/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1#!/bin/ksh -p
2# SPDX-License-Identifier: CDDL-1.0
3
4#
5# CDDL HEADER START
6#
7# This file and its contents are supplied under the terms of the
8# Common Development and Distribution License ("CDDL"), version 1.0.
9# You may only use this file in accordance with the terms of version
10# 1.0 of the CDDL.
11#
12# A full copy of the text of the CDDL should have accompanied this
13# source.  A copy of the CDDL is also available via the Internet at
14# http://www.illumos.org/license/CDDL.
15#
16# CDDL HEADER END
17#
18
19#
20# Copyright (c) 2017 by Intel Corporation. All rights reserved.
21# Copyright 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
22#
23
24. $STF_SUITE/include/libtest.shlib
25. $STF_SUITE/tests/functional/fault/fault.cfg
26
27#
28# DESCRIPTION:
29# Testing Fault Management Agent ZED Logic - Automated Auto-Spare Test when
30# multiple drives are faulted.
31#
32# STRATEGY:
33# 1. Create a pool with two hot spares
34# 2. Inject IO ERRORS with a zinject error handler on the first device
35# 3. Start a scrub
36# 4. Verify the ZED kicks in a hot spare and expected pool/device status
37# 5. Inject IO ERRORS on a second device
38# 6. Start a scrub
39# 7. Verify the ZED kicks in a second hot spare
40# 8. Clear the fault on both devices
41# 9. Verify the hot spares are available and expected pool/device status
42# 10. Rinse and repeat, this time faulting both devices at the same time
43#
44
45verify_runnable "both"
46
#
# Tear down everything this test created: cancel all outstanding zinject
# error handlers, destroy the test pool, then delete the file-backed
# vdevs.  Registered with log_onexit so it runs on every exit path.
#
function cleanup
{
	# Handlers must be cleared first so the pool destroy is clean.
	log_must zinject -c all
	destroy_pool $TESTPOOL
	# The lists are intentionally unquoted: each variable holds
	# several space-separated paths.
	for dev in $DATA_DEVS $SPARE_DEVS; do
		rm -f $dev
	done
}
53
log_assert "ZED should be able to handle multiple faulted devices"
log_onexit cleanup

# Events not supported on FreeBSD
if ! is_freebsd; then
	# Clear events from previous runs
	zed_events_drain
fi

# File-backed vdev paths: two devices that will receive injected IO
# errors, four that stay healthy, and two hot spares.  These names are
# also consumed by cleanup(), which deletes the files on exit.
FAULT_DEV1="$TEST_BASE_DIR/fault-dev1"
FAULT_DEV2="$TEST_BASE_DIR/fault-dev2"
SAFE_DEV1="$TEST_BASE_DIR/safe-dev1"
SAFE_DEV2="$TEST_BASE_DIR/safe-dev2"
SAFE_DEV3="$TEST_BASE_DIR/safe-dev3"
SAFE_DEV4="$TEST_BASE_DIR/safe-dev4"
# Space-separated lists; callers rely on word splitting, so these are
# passed unquoted throughout.
DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2 $SAFE_DEV3 $SAFE_DEV4"
SPARE_DEV1="$TEST_BASE_DIR/spare-dev1"
SPARE_DEV2="$TEST_BASE_DIR/spare-dev2"
SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2"
73
# First pass: fault the two devices one at a time, verifying that ZED
# kicks in a hot spare after each fault and that both spares return to
# AVAIL once the faults are cleared.
for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do
	if [ "$type" = "draid2:1s" ]; then
		# 1. Create a dRAID pool with a distributed and traditional
		# hot spare to provide test coverage for both configurations.
		#
		# Corruption is injected in the third and fourth vdevs
		# since the dRAID permutation at these offsets maps to
		# distributed spare space and not data devices.
		#
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
		# Name of the pool's built-in distributed spare.
		SPARE2="draid2-0-0"
	elif [ "$type" = "mirror" ]; then
		# 1. Create a 3-way mirror pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	else
		# 1. Create a raidz pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	fi

	# 2. Inject IO ERRORS with a zinject error handler on the first device
	# (-T all: all IO types, -f 100: fail 100% of IOs).
	log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL

	# 3. Start a scrub to force IO to the faulted device
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in a hot spare and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Inject IO ERRORS on a second device
	log_must zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL

	# 6. Start a scrub, but only after any scrub/resilver still running
	# from step 3 (or the spare kick-in) has finished.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done
	log_must zpool scrub $TESTPOOL

	# 7. Verify the ZED kicks in a second hot spare
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# Let any in-progress scrub/resilver settle before clearing faults.
	while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do
		sleep 1
	done

	# 8. Clear the fault on both devices: cancel the injection handlers,
	# then clear the device error states so the vdevs come back online.
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# 9. Verify the hot spares are available and expected pool/device status
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "AVAIL"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "AVAIL"
	log_must check_state $TESTPOOL "" "ONLINE"

	# Cleanup so the next pool type starts from scratch
	cleanup
done
153
# Rinse and repeat, this time faulting both devices at the same time
# NOTE: "raidz" is excluded since it cannot survive 2 faulted devices
# NOTE: "mirror" is a 3-way mirror here and should survive this test
for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do
	if [ "$type" = "draid2:1s" ]; then
		# 1. Create a dRAID pool with a distributed and traditional
		# hot spare to provide test coverage for both configurations.
		#
		# Corruption is injected in the third and fourth vdevs
		# since the dRAID permutation at these offsets maps to
		# distributed spare space and not data devices.
		#
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1
		log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \
		    $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \
		    spare $SPARE_DEV1
		SPARE1=$SPARE_DEV1
		# Name of the pool's built-in distributed spare.
		SPARE2="draid2-0-0"
	elif [ "$type" = "mirror" ]; then
		# 1. Create a 3-way mirror pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type \
		    $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	else
		# 1. Create a raidz pool with two hot spares
		truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS
		log_must zpool create -f $TESTPOOL $type $DATA_DEVS \
		    spare $SPARE_DEVS
		SPARE1=$SPARE_DEV1
		SPARE2=$SPARE_DEV2
	fi

	# 2. Inject IO ERRORS with a zinject error handler on two devices.
	# Backgrounded so both injections are set up near-simultaneously.
	# NOTE(review): with "&" inside eval, log_must only checks that the
	# job launched, not that zinject itself succeeded — confirm intended.
	log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &"
	log_must eval "zinject -d $FAULT_DEV2 -e io -T all -f 100 $TESTPOOL &"

	# 3. Start a scrub to force IO to both faulted devices
	log_must zpool scrub $TESTPOOL

	# 4. Verify the ZED kicks in two hot spares and the pool/device status
	log_note "Wait for ZED to auto-spare"
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60
	log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60
	log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60
	log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE"
	log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE"
	log_must check_state $TESTPOOL "" "DEGRADED"

	# 5. Clear the fault on both devices: cancel the injection handlers,
	# then clear the device error states.
	log_must zinject -c all
	log_must zpool clear $TESTPOOL $FAULT_DEV1
	log_must zpool clear $TESTPOOL $FAULT_DEV2

	# Cleanup so the next pool type starts from scratch
	cleanup
done

log_pass "ZED successfully handles multiple faulted devices"
215