xref: /freebsd/sys/contrib/openzfs/module/zfs/zio_inject.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2024-2025, Klara, Inc.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault.  These are kept in a global list.  Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device-level injection is done using the 'zi_guid' field.  If this is set,
 * it means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */

#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

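/*
 * Count of registered injection handlers; nonzero is the global switch
 * that enables the fault injection checks in the ZIO pipeline.
 */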
uint32_t zio_injection_enabled = 0;

/*
 * Describes each zinject handler registered on the system; also
 * contains the list node that links the handler into the global
 * zinject handler list.
 */
typedef struct inject_handler {
	int			zi_id;
	spa_t			*zi_spa;
	char			*zi_spa_name; /* ZINJECT_DELAY_IMPORT only */
	zinject_record_t	zi_record;
	uint64_t		*zi_lanes;
	int			zi_next_lane;
	list_node_t		zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above, as well as inject_delay_count. Any time a
 * handler is inserted into or removed from the list, this lock must be
 * held as RW_WRITER; any time the list is traversed without being
 * modified, it must be held as RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus, modifications to this count must be made while holding
 * the inject_lock as RW_WRITER, and reads while holding it as (at
 * least) RW_READER.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(), refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Test if the requested frequency was triggered
 */
static boolean_t
freq_triggered(uint32_t frequency)
{
	/*
	 * zero implies always (100%)
	 */
	if (frequency == 0)
		return (B_TRUE);

	/*
	 * Note: we still handle legacy (unscaled) frequency values
	 */
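	 *
	 * For example, frequency == 25 is a legacy (percent) value that
	 * fires ~25% of the time, while frequency == ZI_PERCENTAGE_MAX / 4
	 * is a scaled value that also fires ~25% of the time.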
	 */
	uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;

	return (random_in_range(maximum) < frequency);
}

/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
	boolean_t matched = B_FALSE;
	boolean_t injected = B_FALSE;

	/*
	 * Check for a match against the MOS, which is based on type
	 */
	if (zb->zb_objset == DMU_META_OBJSET &&
	    record->zi_objset == DMU_META_OBJSET &&
	    record->zi_object == DMU_META_DNODE_OBJECT) {
		if (record->zi_type == DMU_OT_NONE ||
		    type == record->zi_type)
			matched = B_TRUE;
		goto done;
	}

	/*
	 * Check for an exact match.
	 */
	if (zb->zb_objset == record->zi_objset &&
	    zb->zb_object == record->zi_object &&
	    zb->zb_level == record->zi_level &&
	    zb->zb_blkid >= record->zi_start &&
	    zb->zb_blkid <= record->zi_end &&
	    (record->zi_dvas == 0 ||
	    (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) &&
	    error == record->zi_error) {
		matched = B_TRUE;
		goto done;
	}

done:
	if (matched) {
		record->zi_match_count++;
		injected = freq_triggered(record->zi_freq);
	}

	if (injected)
		record->zi_inject_count++;

	return (injected);
}

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
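 * Handlers for this are registered with zi_func set to the function
 * name (e.g. via zinject(8)'s -p <function> option).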
 */
void
zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa)
			continue;

		if (handler->zi_record.zi_type == type &&
		    strcmp(tag, handler->zi_record.zi_func) == 0) {
			handler->zi_record.zi_match_count++;
			handler->zi_record.zi_inject_count++;
			panic("Panic requested in function %s\n", tag);
		}
	}

	rw_exit(&inject_lock);
}

/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
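 * Returns the requested error if a registered ZINJECT_DECRYPT_FAULT
 * handler matches the given bookmark, or 0 otherwise.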
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
			continue;

		if (zio_match_handler(zb, type, ZI_NO_DVA,
		    &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);
	return (ret);
}

/*
 * If this is a physical I/O for a vdev child, determine which DVA it is
 * for. We iterate backwards through the DVAs matching on the offset so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
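 * The index found here is later tested against the handler's zi_dvas
 * bitmask by zio_match_handler().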
 */
static int
zio_match_dva(zio_t *zio)
{
	int i = ZI_NO_DVA;

	if (zio->io_bp != NULL && zio->io_vd != NULL &&
	    zio->io_child_type == ZIO_CHILD_VDEV) {
		for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
			dva_t *dva = &zio->io_bp->blk_dva[i];
			uint64_t off = DVA_GET_OFFSET(dva);
			vdev_t *vd = vdev_lookup_top(zio->io_spa,
			    DVA_GET_VDEV(dva));

			/* Compensate for vdev label added to leaves */
			if (zio->io_vd->vdev_ops->vdev_op_leaf)
				off += VDEV_LABEL_START_SIZE;

			if (zio->io_vd == vd && zio->io_offset == off)
				break;
		}
	}

	return (i);
}

/*
 * Determine if the I/O in question should return failure.  Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	/*
	 * Ignore I/O not associated with any logical data.
	 */
	if (zio->io_logical == NULL)
		return (0);

	/*
	 * Currently, we only support fault injection on reads.
	 */
	if (zio->io_type != ZIO_TYPE_READ)
		return (0);

	/*
	 * A rebuild I/O has no checksum to verify.
	 */
	if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
			continue;

		/* If this handler matches, return the specified error */
		if (zio_match_handler(&zio->io_logical->io_bookmark,
		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
		    zio_match_dva(zio), &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region
 * of the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
	inject_handler_t *handler;
	vdev_t *vd = zio->io_vd;
	uint64_t offset = zio->io_offset;
	int label;
	int ret = 0;

	if (offset >= VDEV_LABEL_START_SIZE &&
	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		uint64_t start = handler->zi_record.zi_start;
		uint64_t end = handler->zi_record.zi_end;

		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
			continue;

		/*
		 * The injection region is the relative offsets within a
		 * vdev label. We must determine the label which is being
		 * updated and adjust our region accordingly.
		 */
		label = vdev_label_number(vd->vdev_psize, offset);
		start = vdev_label_offset(vd->vdev_psize, label, start);
		end = vdev_label_offset(vd->vdev_psize, label, end);

		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
		    (offset >= start && offset <= end)) {
			handler->zi_record.zi_match_count++;
			handler->zi_record.zi_inject_count++;
			ret = error;
			break;
		}
	}
	rw_exit(&inject_lock);
	return (ret);
}

static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
	zio_t *zio = private;
	uint8_t *buffer = data;
	uint_t byte = random_in_range(len);

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	/* flip a single random bit in an abd data buffer */
	buffer[byte] ^= 1 << random_in_range(8);

	return (1);	/* stop after first flip */
}

/* Test if this zio matches the iotype from the injection record. */
static boolean_t
zio_match_iotype(zio_t *zio, uint32_t iotype)
{
	ASSERT3P(zio, !=, NULL);

	/* Unknown iotype, maybe from a newer version of zinject. Reject it. */
	if (iotype >= ZINJECT_IOTYPES)
		return (B_FALSE);

	/* Probe IOs only match IOTYPE_PROBE, regardless of their type. */
	if (zio->io_flags & ZIO_FLAG_PROBE)
		return (iotype == ZINJECT_IOTYPE_PROBE);

	/* Standard IO types, match against ZIO type. */
	if (iotype < ZINJECT_IOTYPE_ALL)
		return (iotype == zio->io_type);

	/* Match any standard IO type. */
	if (iotype == ZINJECT_IOTYPE_ALL)
		return (B_TRUE);

	return (B_FALSE);
}

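/*
 * Check a zio (or a device open, when zio == NULL) against all registered
 * ZINJECT_DEVICE_FAULT handlers for this vdev, returning the errno to
 * inject, or 0. err1/err2 let the caller match either of two errnos in a
 * single pass.
 */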
static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	inject_handler_t *handler;
	int ret = 0;

	/*
	 * We skip over faults in the labels unless it's during device open
	 * (i.e. zio == NULL) or a device flush (offset is meaningless). We let
	 * probe IOs through so we can match them to probe inject records.
	 */
	if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH &&
	    !(zio->io_flags & ZIO_FLAG_PROBE)) {
		uint64_t offset = zio->io_offset;

		if (offset < VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
			return (0);
	}

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
			continue;

		if (vd->vdev_guid == handler->zi_record.zi_guid) {
			if (handler->zi_record.zi_failfast &&
			    (zio == NULL || (zio->io_flags &
			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
				continue;
			}

			/* Handle type specific I/O failures */
			if (zio != NULL && !zio_match_iotype(zio,
			    handler->zi_record.zi_iotype))
				continue;

			if (handler->zi_record.zi_error == err1 ||
			    handler->zi_record.zi_error == err2) {
				handler->zi_record.zi_match_count++;

				/*
				 * limit error injection if requested
				 */
				if (!freq_triggered(handler->zi_record.zi_freq))
					continue;

				handler->zi_record.zi_inject_count++;

				/*
				 * For a failed open, pretend like the device
				 * has gone away.
				 */
				if (err1 == ENXIO)
					vd->vdev_stat.vs_aux =
					    VDEV_AUX_OPEN_FAILED;

				/*
				 * Treat these errors as if they had been
				 * retried so that all the appropriate stats
				 * and FMA events are generated.
				 */
				if (!handler->zi_record.zi_failfast &&
				    zio != NULL)
					zio->io_flags |= ZIO_FLAG_IO_RETRY;

				/*
				 * EILSEQ means flip a bit after a read
				 */
				if (handler->zi_record.zi_error == EILSEQ) {
					if (zio == NULL)
						break;

					/* locate buffer data and flip a bit */
					(void) abd_iterate_func(zio->io_abd, 0,
					    zio->io_size, zio_inject_bitflip_cb,
					    zio);
					break;
				}

				ret = handler->zi_record.zi_error;
				break;
			}
			if (handler->zi_record.zi_error == ENXIO) {
				handler->zi_record.zi_match_count++;
				handler->zi_record.zi_inject_count++;
				ret = SET_ERROR(EIO);
				break;
			}
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
	return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}

/*
 * Simulate hardware that ignores cache flushes.  For the requested
 * duration (expressed in seconds, or in txgs if zi_duration is
 * negative), suppress the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		/* Ignore errors not destined for this pool */
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		handler->zi_record.zi_match_count++;

		/*
		 * Positive duration implies # of seconds, negative
		 * a number of txgs
		 */
		if (handler->zi_record.zi_timer == 0) {
			if (handler->zi_record.zi_duration > 0)
				handler->zi_record.zi_timer = ddi_get_lbolt64();
			else
				handler->zi_record.zi_timer = zio->io_txg;
		}

		/* Have a "problem" writing 60% of the time */
		if (random_in_range(100) < 60) {
			handler->zi_record.zi_inject_count++;
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
		break;
	}

	rw_exit(&inject_lock);
}

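/*
 * Verify that any ignored-writes handler registered for this pool has
 * not outlived its configured duration (in seconds or txgs).
 */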
void
spa_handle_ignored_writes(spa_t *spa)
{
	inject_handler_t *handler;

	if (zio_injection_enabled == 0)
		return;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		handler->zi_record.zi_match_count++;
		handler->zi_record.zi_inject_count++;

		if (handler->zi_record.zi_duration > 0) {
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    ddi_time_after64(
			    (int64_t)handler->zi_record.zi_timer +
			    handler->zi_record.zi_duration * hz,
			    ddi_get_lbolt64()));
		} else {
			/* duration is negative so the subtraction here adds */
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer -
			    handler->zi_record.zi_duration >=
			    spa_syncing_txg(spa));
		}
	}

	rw_exit(&inject_lock);
}

hrtime_t
zio_handle_io_delay(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	inject_handler_t *min_handler = NULL;
	hrtime_t min_target = 0;

	rw_enter(&inject_lock, RW_READER);

	/*
	 * inject_delay_count is a subset of zio_injection_enabled that
	 * is only incremented for delay handlers. These checks are
	 * mainly added to remind the reader why we're not explicitly
	 * checking zio_injection_enabled like the other functions.
	 */
	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

	/*
	 * If there aren't any inject delay handlers registered, then we
	 * can short circuit and simply return 0 here. A value of zero
	 * informs zio_delay_interrupt() that this request should not be
	 * delayed. This short circuit keeps us from acquiring the
	 * inject_delay_mtx unnecessarily.
	 */
	if (inject_delay_count == 0) {
		rw_exit(&inject_lock);
		return (0);
	}

	/*
	 * Each inject handler has a number of "lanes" associated with
	 * it. Each lane is able to handle requests independently of one
	 * another, and at a latency defined by the inject handler
	 * record's zi_timer field. Thus, if a handler is configured with
	 * a single lane with a 10ms latency, it will delay requests
	 * such that only a single request is completed every 10ms. So,
	 * if more than one request is attempted per each 10ms interval,
	 * the average latency of the requests will be greater than
	 * 10ms; but if only a single request is submitted each 10ms
	 * interval the average latency will be 10ms.
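	 *
	 * For example, a handler configured with two lanes and a 10ms
	 * latency models a device that can service two requests
	 * concurrently: three requests arriving together would complete
	 * after roughly 10ms, 10ms, and 20ms respectively.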
	 *
	 * We need to acquire this mutex to prevent multiple concurrent
	 * threads being assigned to the same lane of a given inject
	 * handler. The mutex allows us to perform the following two
	 * operations atomically:
	 *
	 *	1. determine the minimum handler and minimum target
	 *	   value of all the possible handlers
	 *	2. update that minimum handler's lane array
	 *
	 * Without atomicity, two (or more) threads could pick the same
	 * lane in step (1), and then conflict with each other in step
	 * (2). This could allow a single lane handler to process
	 * multiple requests simultaneously, which shouldn't be possible.
	 */
	mutex_enter(&inject_delay_mtx);

	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
			continue;

		if (vd->vdev_guid != handler->zi_record.zi_guid)
			continue;

		/* also match on I/O type (e.g., -T read) */
		if (!zio_match_iotype(zio, handler->zi_record.zi_iotype))
			continue;

		/*
		 * Defensive; should never happen as the array allocation
		 * occurs prior to inserting this handler on the list.
		 */
		ASSERT3P(handler->zi_lanes, !=, NULL);

		/*
		 * This should never happen; the zinject command should
		 * prevent a user from setting an IO delay with zero lanes.
		 */
		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

		ASSERT3U(handler->zi_record.zi_nlanes, >,
		    handler->zi_next_lane);

		handler->zi_record.zi_match_count++;

		/* Limit the use of this handler if requested */
		if (!freq_triggered(handler->zi_record.zi_freq))
			continue;

		/*
		 * We want to issue this IO to the lane that will become
		 * idle the soonest, so we compare the soonest this
		 * specific handler can complete the IO with all other
		 * handlers, to find the lowest value of all possible
		 * lanes. We then use this lane to submit the request.
		 *
		 * Since each handler has a constant value for its
		 * delay, we can just use the "next" lane for that
		 * handler, as it will always be the lane with the
		 * lowest value for that particular handler (i.e. the
		 * lane that will become idle the soonest). This saves a
		 * scan of each handler's lanes array.
		 *
		 * There are two cases to consider when determining when
		 * this specific IO request should complete. If this
		 * lane is idle, we want to "submit" the request now so
		 * it will complete zi_timer nanoseconds from now. Thus,
		 * we set the target to now + zi_timer.
		 *
		 * If the lane is busy, we want this request to complete
		 * zi_timer nanoseconds after the lane becomes idle.
		 * Since the 'zi_lanes' array holds the time at which
		 * each lane will become idle, we use that value to
		 * determine when this request should complete.
		 */
		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
		hrtime_t busy = handler->zi_record.zi_timer +
		    handler->zi_lanes[handler->zi_next_lane];
		hrtime_t target = MAX(idle, busy);

		if (min_handler == NULL) {
			min_handler = handler;
			min_target = target;
			continue;
		}

		ASSERT3P(min_handler, !=, NULL);
		ASSERT3U(min_target, !=, 0);

		/*
		 * We don't yet increment the "next lane" variable since
		 * we still might find a lower value lane in another
		 * handler during any remaining iterations. Once we're
		 * sure we've selected the absolute minimum, we'll claim
		 * the lane and increment the handler's "next lane"
		 * field below.
		 */

		if (target < min_target) {
			min_handler = handler;
			min_target = target;
		}
	}

	/*
	 * 'min_handler' will be NULL if no IO delays are registered for
	 * this vdev, otherwise it will point to the handler containing
	 * the lane that will become idle the soonest.
	 */
	if (min_handler != NULL) {
		ASSERT3U(min_target, !=, 0);
		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

		/*
		 * If we've used all possible lanes for this handler,
		 * loop back and start using the first lane again;
		 * otherwise, just increment the lane index.
		 */
		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
		    min_handler->zi_record.zi_nlanes;

		min_handler->zi_record.zi_inject_count++;
	}

	mutex_exit(&inject_delay_mtx);
	rw_exit(&inject_lock);

	return (min_target);
}

static void
zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command)
{
	inject_handler_t *handler;
	hrtime_t delay = 0;
	int id = 0;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers);
	    handler != NULL && handler->zi_record.zi_cmd == command;
	    handler = list_next(&inject_handlers, handler)) {
		ASSERT3P(handler->zi_spa_name, !=, NULL);
		if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) {
			handler->zi_record.zi_match_count++;
			uint64_t pause =
			    SEC2NSEC(handler->zi_record.zi_duration);
			if (pause > elapsed) {
				handler->zi_record.zi_inject_count++;
				delay = pause - elapsed;
			}
			id = handler->zi_id;
			break;
		}
	}

	rw_exit(&inject_lock);

	if (delay) {
		if (command == ZINJECT_DELAY_IMPORT) {
			spa_import_progress_set_notes(spa, "injecting %llu "
			    "sec delay", (u_longlong_t)NSEC2SEC(delay));
		}
		zfs_sleep_until(gethrtime() + delay);
	}
	if (id) {
		/* all done with this one-shot handler */
		zio_clear_fault(id);
	}
}

/*
 * For testing, inject a delay during an import
 */
void
zio_handle_import_delay(spa_t *spa, hrtime_t elapsed)
{
	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT);
}

/*
 * For testing, inject a delay during an export
 */
void
zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
{
	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
}

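/*
 * Translate a byte range (zi_start/zi_end) in the given object into
 * block IDs at the requested indirection level, so that
 * zio_match_handler() can compare them directly against bookmark
 * block IDs.
 */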
static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	objset_t *os = NULL;
	dnode_t *dn = NULL;
	int error;

	/*
	 * Obtain the dnode for object using pool, objset, and object
	 */
	error = dsl_pool_hold(pool, FTAG, &dp);
	if (error)
		return (error);

	error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
	dsl_pool_rele(dp, FTAG);
	if (error)
		return (error);

	error = dmu_objset_from_ds(ds, &os);
	dsl_dataset_rele(ds, FTAG);
	if (error)
		return (error);

	error = dnode_hold(os, record->zi_object, FTAG, &dn);
	if (error)
		return (error);

	/*
	 * Translate the range into block IDs
	 */
	if (record->zi_start != 0 || record->zi_end != -1ULL) {
		record->zi_start >>= dn->dn_datablkshift;
		record->zi_end >>= dn->dn_datablkshift;
	}
	if (record->zi_level > 0) {
		if (record->zi_level >= dn->dn_nlevels) {
			dnode_rele(dn, FTAG);
			return (SET_ERROR(EDOM));
		}

		if (record->zi_start != 0 || record->zi_end != 0) {
			int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			for (int level = record->zi_level; level > 0; level--) {
				record->zi_start >>= shift;
				record->zi_end >>= shift;
			}
		}
	}

	dnode_rele(dn, FTAG);
	return (0);
}

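/*
 * Check whether a pool-wide (import/export delay) handler is already
 * registered for the named pool; only one such handler per pool is
 * allowed.
 */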
static boolean_t
zio_pool_handler_exists(const char *name, zinject_type_t command)
{
	boolean_t exists = B_FALSE;

	rw_enter(&inject_lock, RW_READER);
	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (command != handler->zi_record.zi_cmd)
			continue;

		const char *pool = (handler->zi_spa_name != NULL) ?
		    handler->zi_spa_name : spa_name(handler->zi_spa);
		if (strcmp(name, pool) == 0) {
			exists = B_TRUE;
			break;
		}
	}
	rw_exit(&inject_lock);

	return (exists);
}

/*
 * Create a new handler for the given record.  We add it to the list, adding
 * a reference to the spa_t in the process.  We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
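 *
 * Handlers are normally created via zinject(8); e.g. an IO delay record
 * (ZINJECT_DELAY_IO) is created with something like
 * "zinject -d <vdev> -D 10:2 <pool>" (illustrative), which lands a 10ms
 * delay in zi_timer and two lanes in zi_nlanes.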
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
	inject_handler_t *handler;
	int error;
	spa_t *spa;

	/*
	 * If this is pool-wide metadata, make sure we unload the corresponding
	 * spa_t, so that the next attempt to load it will trigger the fault.
	 * We call spa_reset() to unload the pool appropriately.
	 */
	if (flags & ZINJECT_UNLOAD_SPA)
		if ((error = spa_reset(name)) != 0)
			return (error);

	if (record->zi_cmd == ZINJECT_DELAY_IO) {
		/*
		 * A value of zero for the number of lanes or for the
		 * delay time doesn't make sense.
		 */
		if (record->zi_timer == 0 || record->zi_nlanes == 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The number of lanes is directly mapped to the size of
		 * an array used by the handler. Thus, to ensure the
		 * user doesn't trigger an allocation that's "too large"
		 * we cap the number of lanes here.
		 */
		if (record->zi_nlanes >= UINT16_MAX)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * If the supplied range was in bytes -- calculate the actual blkid
	 */
	if (flags & ZINJECT_CALC_RANGE) {
		error = zio_calculate_range(name, record);
		if (error != 0)
			return (error);
	}

	if (!(flags & ZINJECT_NULL)) {
		/*
		 * Pool delays for import or export don't take an
		 * injection reference on the spa. Instead they
		 * rely on matching by name.
		 */
		if (record->zi_cmd == ZINJECT_DELAY_IMPORT ||
		    record->zi_cmd == ZINJECT_DELAY_EXPORT) {
			if (record->zi_duration <= 0)
				return (SET_ERROR(EINVAL));
			/*
			 * Only one import or export delay handler per pool.
			 */
			if (zio_pool_handler_exists(name, record->zi_cmd))
				return (SET_ERROR(EEXIST));

			mutex_enter(&spa_namespace_lock);
			boolean_t has_spa = spa_lookup(name) != NULL;
			mutex_exit(&spa_namespace_lock);

			if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa)
				return (SET_ERROR(EEXIST));
			if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa)
				return (SET_ERROR(ENOENT));
			spa = NULL;
		} else {
			/*
			 * spa_inject_addref() will add an injection
			 * reference, which will prevent the pool from being
			 * removed from the namespace while still allowing it
			 * to be unloaded.
			 */
			if ((spa = spa_inject_addref(name)) == NULL)
				return (SET_ERROR(ENOENT));
		}

		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
		handler->zi_spa = spa;	/* note: can be NULL */
		handler->zi_record = *record;

		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			handler->zi_lanes = kmem_zalloc(
			    sizeof (*handler->zi_lanes) *
			    handler->zi_record.zi_nlanes, KM_SLEEP);
			handler->zi_next_lane = 0;
		} else {
			handler->zi_lanes = NULL;
			handler->zi_next_lane = 0;
		}

		if (handler->zi_spa == NULL)
			handler->zi_spa_name = spa_strdup(name);
		else
			handler->zi_spa_name = NULL;

		rw_enter(&inject_lock, RW_WRITER);

		/*
		 * We can't move this increment into the conditional
		 * above because we need to hold the RW_WRITER lock of
		 * inject_lock, and we don't want to hold that while
		 * allocating the handler's zi_lanes array.
		 */
		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			ASSERT3S(inject_delay_count, >=, 0);
			inject_delay_count++;
			ASSERT3S(inject_delay_count, >, 0);
		}

		*id = handler->zi_id = inject_next_id++;
		list_insert_tail(&inject_handlers, handler);
		atomic_inc_32(&zio_injection_enabled);

		rw_exit(&inject_lock);
	}

	/*
	 * Flush the ARC, so that any attempts to read this data will end up
	 * going to the ZIO layer.  Note that this is a little overkill, but
	 * we don't have the necessary ARC interfaces to do anything else, and
	 * fault injection isn't a performance critical path.
	 */
	if (flags & ZINJECT_FLUSH_ARC)
		/*
		 * We must use FALSE to ensure arc_flush returns, since
		 * we're not preventing concurrent ARC insertions.
		 */
		arc_flush(NULL, FALSE);

	return (0);
}

/*
 * Returns the next record with an ID greater than that supplied to the
 * function.  Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
	inject_handler_t *handler;
	int ret;

	mutex_enter(&spa_namespace_lock);
	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id > *id)
			break;

	if (handler) {
		*record = handler->zi_record;
		*id = handler->zi_id;
		ASSERT(handler->zi_spa || handler->zi_spa_name);
		if (handler->zi_spa != NULL)
			(void) strlcpy(name, spa_name(handler->zi_spa), buflen);
		else
			(void) strlcpy(name, handler->zi_spa_name, buflen);
		ret = 0;
	} else {
		ret = SET_ERROR(ENOENT);
	}

	rw_exit(&inject_lock);
	mutex_exit(&spa_namespace_lock);

	return (ret);
}

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
 */
int
zio_clear_fault(int id)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_WRITER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id == id)
			break;

	if (handler == NULL) {
		rw_exit(&inject_lock);
		return (SET_ERROR(ENOENT));
	}

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3S(inject_delay_count, >, 0);
		inject_delay_count--;
		ASSERT3S(inject_delay_count, >=, 0);
	}

	list_remove(&inject_handlers, handler);
	rw_exit(&inject_lock);

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3P(handler->zi_lanes, !=, NULL);
		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
		    handler->zi_record.zi_nlanes);
	} else {
		ASSERT3P(handler->zi_lanes, ==, NULL);
	}

	if (handler->zi_spa_name != NULL)
		spa_strfree(handler->zi_spa_name);

	if (handler->zi_spa != NULL)
		spa_inject_delref(handler->zi_spa);
	kmem_free(handler, sizeof (inject_handler_t));
	atomic_dec_32(&zio_injection_enabled);

	return (0);
}

void
zio_inject_init(void)
{
	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
	list_create(&inject_handlers, sizeof (inject_handler_t),
	    offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
	list_destroy(&inject_handlers);
	mutex_destroy(&inject_delay_mtx);
	rw_destroy(&inject_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
#endif