xref: /freebsd/sys/contrib/openzfs/module/zfs/zio_inject.c (revision 5c4aa6257210502c93ad65882a8a4842d984bae2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright (c) 2017, Intel Corporation.
25  */
26 
27 /*
28  * ZFS fault injection
29  *
30  * To handle fault injection, we keep track of a series of zinject_record_t
31  * structures which describe which logical block(s) should be injected with a
32  * fault.  These are kept in a global list.  Each record corresponds to a given
33  * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
34  * or exported while the injection record exists.
35  *
36  * Device level injection is done using the 'zi_guid' field.  If this is set, it
37  * means that the error is destined for a particular device, not a piece of
38  * data.
39  *
40  * This is a rather poor data structure and algorithm, but we don't expect more
41  * than a few faults at any one time, so it should be sufficient for our needs.
42  */
43 
44 #include <sys/arc.h>
45 #include <sys/zio.h>
46 #include <sys/zfs_ioctl.h>
47 #include <sys/vdev_impl.h>
48 #include <sys/dmu_objset.h>
49 #include <sys/dsl_dataset.h>
50 #include <sys/fs/zfs.h>
51 
52 uint32_t zio_injection_enabled = 0;
53 
54 /*
55  * Data describing each zinject handler registered on the system; it also
56  * contains the list node linking the handler into the global zinject
57  * handler list.
58  */
59 typedef struct inject_handler {
60 	int			zi_id;
61 	spa_t			*zi_spa;
62 	zinject_record_t	zi_record;
63 	uint64_t		*zi_lanes;
64 	int			zi_next_lane;
65 	list_node_t		zi_link;
66 } inject_handler_t;
67 
68 /*
69  * List of all zinject handlers registered on the system, protected by
70  * the inject_lock defined below.
71  */
72 static list_t inject_handlers;
73 
74 /*
75  * This protects insertion into, and traversal of, the inject handler
76  * list defined above, as well as the inject_delay_count. Any time a
77  * handler is inserted or removed from the list, this lock should be
78  * taken as a RW_WRITER; and any time traversal is done over the list
79  * (without modification to it) this lock should be taken as a RW_READER.
80  */
81 static krwlock_t inject_lock;
82 
83 /*
84  * This holds the number of zinject delay handlers that have been
85  * registered on the system. It is protected by the inject_lock defined
86  * above. Thus modifications to this count must be made while holding the
87  * inject_lock as RW_WRITER, and reads of this count must hold it as (at
88  * least) RW_READER.
89  */
90 static int inject_delay_count = 0;
91 
92 /*
93  * This lock is used only in zio_handle_io_delay(); refer to the comment
94  * in that function for more details.
95  */
96 static kmutex_t inject_delay_mtx;
97 
98 /*
99  * Used to assign unique identifying numbers to each new zinject handler.
100  */
101 static int inject_next_id = 1;
102 
103 /*
104  * Test if the requested frequency was triggered
105  */
106 static boolean_t
107 freq_triggered(uint32_t frequency)
108 {
109 	/*
110 	 * zero implies always (100%)
111 	 */
112 	if (frequency == 0)
113 		return (B_TRUE);
114 
115 	/*
116 	 * Note: we still handle legacy (unscaled) frequency values
117 	 */
118 	uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;
119 
120 	return (random_in_range(maximum) < frequency);
121 }
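
/*
 * Illustrative example (editorial note, not part of the original source):
 * a legacy handler registered with zi_freq = 25 uses maximum = 100, so
 * random_in_range(100) < 25 holds on roughly 25% of calls; a scaled
 * handler with zi_freq = ZI_PERCENTAGE_MAX / 4 uses
 * maximum = ZI_PERCENTAGE_MAX and fires at the same ~25% rate, just with
 * finer granularity available to the user.
 */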
122 
123 /*
124  * Returns true if the given record matches the I/O in progress.
125  */
126 static boolean_t
127 zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
128     zinject_record_t *record, int error)
129 {
130 	/*
131 	 * Check for a match against the MOS, which is based on type
132 	 */
133 	if (zb->zb_objset == DMU_META_OBJSET &&
134 	    record->zi_objset == DMU_META_OBJSET &&
135 	    record->zi_object == DMU_META_DNODE_OBJECT) {
136 		if (record->zi_type == DMU_OT_NONE ||
137 		    type == record->zi_type)
138 			return (freq_triggered(record->zi_freq));
139 		else
140 			return (B_FALSE);
141 	}
142 
143 	/*
144 	 * Check for an exact match.
145 	 */
146 	if (zb->zb_objset == record->zi_objset &&
147 	    zb->zb_object == record->zi_object &&
148 	    zb->zb_level == record->zi_level &&
149 	    zb->zb_blkid >= record->zi_start &&
150 	    zb->zb_blkid <= record->zi_end &&
151 	    (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
152 	    error == record->zi_error) {
153 		return (freq_triggered(record->zi_freq));
154 	}
155 
156 	return (B_FALSE);
157 }
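
/*
 * Illustrative example (editorial note, not part of the original source):
 * a record with zi_objset = 54, zi_object = 7, zi_level = 0,
 * zi_start = 0, zi_end = 3, zi_dvas = 0x5 and zi_error = EIO matches
 * level-0 reads of blkids 0 through 3 of that object, but only when the
 * I/O targets DVA 0 or DVA 2 (bits 0 and 2 of the zi_dvas mask); a
 * zi_dvas value of 0 matches any DVA.
 */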
158 
159 /*
160  * Panic the system when a config change happens in the function
161  * specified by tag.
162  */
163 void
164 zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
165 {
166 	inject_handler_t *handler;
167 
168 	rw_enter(&inject_lock, RW_READER);
169 
170 	for (handler = list_head(&inject_handlers); handler != NULL;
171 	    handler = list_next(&inject_handlers, handler)) {
172 
173 		if (spa != handler->zi_spa)
174 			continue;
175 
176 		if (handler->zi_record.zi_type == type &&
177 		    strcmp(tag, handler->zi_record.zi_func) == 0)
178 			panic("Panic requested in function %s\n", tag);
179 	}
180 
181 	rw_exit(&inject_lock);
182 }
183 
184 /*
185  * Inject a decryption failure. Decryption failures can occur in
186  * both the ARC and the ZIO layers.
187  */
188 int
189 zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
190     uint64_t type, int error)
191 {
192 	int ret = 0;
193 	inject_handler_t *handler;
194 
195 	rw_enter(&inject_lock, RW_READER);
196 
197 	for (handler = list_head(&inject_handlers); handler != NULL;
198 	    handler = list_next(&inject_handlers, handler)) {
199 
200 		if (spa != handler->zi_spa ||
201 		    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
202 			continue;
203 
204 		if (zio_match_handler(zb, type, ZI_NO_DVA,
205 		    &handler->zi_record, error)) {
206 			ret = error;
207 			break;
208 		}
209 	}
210 
211 	rw_exit(&inject_lock);
212 	return (ret);
213 }
214 
215 /*
216  * If this is a physical I/O for a vdev child, determine which DVA it is
217  * for. We iterate backwards through the DVAs matching on the offset so
218  * that we end up with ZI_NO_DVA (-1) if we don't find a match.
219  */
220 static int
221 zio_match_dva(zio_t *zio)
222 {
223 	int i = ZI_NO_DVA;
224 
225 	if (zio->io_bp != NULL && zio->io_vd != NULL &&
226 	    zio->io_child_type == ZIO_CHILD_VDEV) {
227 		for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
228 			dva_t *dva = &zio->io_bp->blk_dva[i];
229 			uint64_t off = DVA_GET_OFFSET(dva);
230 			vdev_t *vd = vdev_lookup_top(zio->io_spa,
231 			    DVA_GET_VDEV(dva));
232 
233 			/* Compensate for vdev label added to leaves */
234 			if (zio->io_vd->vdev_ops->vdev_op_leaf)
235 				off += VDEV_LABEL_START_SIZE;
236 
237 			if (zio->io_vd == vd && zio->io_offset == off)
238 				break;
239 		}
240 	}
241 
242 	return (i);
243 }
244 
245 
246 /*
247  * Determine if the I/O in question should return failure.  Returns the errno
248  * to be returned to the caller.
249  */
250 int
251 zio_handle_fault_injection(zio_t *zio, int error)
252 {
253 	int ret = 0;
254 	inject_handler_t *handler;
255 
256 	/*
257 	 * Ignore I/O not associated with any logical data.
258 	 */
259 	if (zio->io_logical == NULL)
260 		return (0);
261 
262 	/*
263 	 * Currently, we only support fault injection on reads.
264 	 */
265 	if (zio->io_type != ZIO_TYPE_READ)
266 		return (0);
267 
268 	/*
269 	 * A rebuild I/O has no checksum to verify.
270 	 */
271 	if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
272 		return (0);
273 
274 	rw_enter(&inject_lock, RW_READER);
275 
276 	for (handler = list_head(&inject_handlers); handler != NULL;
277 	    handler = list_next(&inject_handlers, handler)) {
278 		if (zio->io_spa != handler->zi_spa ||
279 		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
280 			continue;
281 
282 		/* If this handler matches, return the specified error */
283 		if (zio_match_handler(&zio->io_logical->io_bookmark,
284 		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
285 		    zio_match_dva(zio), &handler->zi_record, error)) {
286 			ret = error;
287 			break;
288 		}
289 	}
290 
291 	rw_exit(&inject_lock);
292 
293 	return (ret);
294 }
295 
296 /*
297  * Determine if the zio is part of a label update and has an injection
298  * handler associated with that portion of the label. Currently, we
299  * allow error injection in either the nvlist or the uberblock region of
300  * the vdev label.
301  */
302 int
303 zio_handle_label_injection(zio_t *zio, int error)
304 {
305 	inject_handler_t *handler;
306 	vdev_t *vd = zio->io_vd;
307 	uint64_t offset = zio->io_offset;
308 	int label;
309 	int ret = 0;
310 
311 	if (offset >= VDEV_LABEL_START_SIZE &&
312 	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
313 		return (0);
314 
315 	rw_enter(&inject_lock, RW_READER);
316 
317 	for (handler = list_head(&inject_handlers); handler != NULL;
318 	    handler = list_next(&inject_handlers, handler)) {
319 		uint64_t start = handler->zi_record.zi_start;
320 		uint64_t end = handler->zi_record.zi_end;
321 
322 		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
323 			continue;
324 
325 		/*
326  * The injection region is given as relative offsets within a
327 		 * vdev label. We must determine the label which is being
328 		 * updated and adjust our region accordingly.
329 		 */
330 		label = vdev_label_number(vd->vdev_psize, offset);
331 		start = vdev_label_offset(vd->vdev_psize, label, start);
332 		end = vdev_label_offset(vd->vdev_psize, label, end);
333 
334 		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
335 		    (offset >= start && offset <= end)) {
336 			ret = error;
337 			break;
338 		}
339 	}
340 	rw_exit(&inject_lock);
341 	return (ret);
342 }
343 
344 static int
345 zio_inject_bitflip_cb(void *data, size_t len, void *private)
346 {
347 	zio_t *zio = private;
348 	uint8_t *buffer = data;
349 	uint_t byte = random_in_range(len);
350 
351 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
352 
353 	/* flip a single random bit in an abd data buffer */
354 	buffer[byte] ^= 1 << random_in_range(8);
355 
356 	return (1);	/* stop after first flip */
357 }
358 
359 static int
360 zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
361 {
362 	inject_handler_t *handler;
363 	int ret = 0;
364 
365 	/*
366 	 * We skip over faults in the labels unless it's during
367 	 * device open (i.e. zio == NULL).
368 	 */
369 	if (zio != NULL) {
370 		uint64_t offset = zio->io_offset;
371 
372 		if (offset < VDEV_LABEL_START_SIZE ||
373 		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
374 			return (0);
375 	}
376 
377 	rw_enter(&inject_lock, RW_READER);
378 
379 	for (handler = list_head(&inject_handlers); handler != NULL;
380 	    handler = list_next(&inject_handlers, handler)) {
381 
382 		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
383 			continue;
384 
385 		if (vd->vdev_guid == handler->zi_record.zi_guid) {
386 			if (handler->zi_record.zi_failfast &&
387 			    (zio == NULL || (zio->io_flags &
388 			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
389 				continue;
390 			}
391 
392 			/* Handle type specific I/O failures */
393 			if (zio != NULL &&
394 			    handler->zi_record.zi_iotype != ZIO_TYPES &&
395 			    handler->zi_record.zi_iotype != zio->io_type)
396 				continue;
397 
398 			if (handler->zi_record.zi_error == err1 ||
399 			    handler->zi_record.zi_error == err2) {
400 				/*
401 				 * limit error injection if requested
402 				 */
403 				if (!freq_triggered(handler->zi_record.zi_freq))
404 					continue;
405 
406 				/*
407 				 * For a failed open, pretend like the device
408 				 * has gone away.
409 				 */
410 				if (err1 == ENXIO)
411 					vd->vdev_stat.vs_aux =
412 					    VDEV_AUX_OPEN_FAILED;
413 
414 				/*
415 				 * Treat these errors as if they had been
416 				 * retried so that all the appropriate stats
417 				 * and FMA events are generated.
418 				 */
419 				if (!handler->zi_record.zi_failfast &&
420 				    zio != NULL)
421 					zio->io_flags |= ZIO_FLAG_IO_RETRY;
422 
423 				/*
424 				 * EILSEQ means flip a bit after a read
425 				 */
426 				if (handler->zi_record.zi_error == EILSEQ) {
427 					if (zio == NULL)
428 						break;
429 
430 					/* locate buffer data and flip a bit */
431 					(void) abd_iterate_func(zio->io_abd, 0,
432 					    zio->io_size, zio_inject_bitflip_cb,
433 					    zio);
434 					break;
435 				}
436 
437 				ret = handler->zi_record.zi_error;
438 				break;
439 			}
440 			if (handler->zi_record.zi_error == ENXIO) {
441 				ret = SET_ERROR(EIO);
442 				break;
443 			}
444 		}
445 	}
446 
447 	rw_exit(&inject_lock);
448 
449 	return (ret);
450 }
451 
452 int
453 zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
454 {
455 	return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
456 }
457 
458 int
459 zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
460 {
461 	return (zio_handle_device_injection_impl(vd, zio, err1, err2));
462 }
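
/*
 * Illustrative example (editorial note, not part of the original source):
 * a caller in the I/O completion path might probe for two injected
 * errors at once, e.g.
 *
 *	zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ);
 *
 * which returns the handler's zi_error when it equals either errno
 * (flipping one bit of the read buffer in the EILSEQ case), translates
 * an injected ENXIO into EIO for non-open I/O, and returns 0 when no
 * handler matches.
 */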
463 
464 /*
465  * Simulate hardware that ignores cache flushes.  For the requested number
466  * of seconds, suppress the actual writes to disk.
467  */
468 void
469 zio_handle_ignored_writes(zio_t *zio)
470 {
471 	inject_handler_t *handler;
472 
473 	rw_enter(&inject_lock, RW_READER);
474 
475 	for (handler = list_head(&inject_handlers); handler != NULL;
476 	    handler = list_next(&inject_handlers, handler)) {
477 
478 		/* Ignore errors not destined for this pool */
479 		if (zio->io_spa != handler->zi_spa ||
480 		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
481 			continue;
482 
483 		/*
484  * A positive duration is a number of seconds; a negative
485  * duration is a number of txgs.
486 		 */
487 		if (handler->zi_record.zi_timer == 0) {
488 			if (handler->zi_record.zi_duration > 0)
489 				handler->zi_record.zi_timer = ddi_get_lbolt64();
490 			else
491 				handler->zi_record.zi_timer = zio->io_txg;
492 		}
493 
494 		/* Have a "problem" writing 60% of the time */
495 		if (random_in_range(100) < 60)
496 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
497 		break;
498 	}
499 
500 	rw_exit(&inject_lock);
501 }
502 
503 void
504 spa_handle_ignored_writes(spa_t *spa)
505 {
506 	inject_handler_t *handler;
507 
508 	if (zio_injection_enabled == 0)
509 		return;
510 
511 	rw_enter(&inject_lock, RW_READER);
512 
513 	for (handler = list_head(&inject_handlers); handler != NULL;
514 	    handler = list_next(&inject_handlers, handler)) {
515 
516 		if (spa != handler->zi_spa ||
517 		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
518 			continue;
519 
520 		if (handler->zi_record.zi_duration > 0) {
521 			VERIFY(handler->zi_record.zi_timer == 0 ||
522 			    ddi_time_after64(
523 			    (int64_t)handler->zi_record.zi_timer +
524 			    handler->zi_record.zi_duration * hz,
525 			    ddi_get_lbolt64()));
526 		} else {
527 			/* duration is negative so the subtraction here adds */
528 			VERIFY(handler->zi_record.zi_timer == 0 ||
529 			    handler->zi_record.zi_timer -
530 			    handler->zi_record.zi_duration >=
531 			    spa_syncing_txg(spa));
532 		}
533 	}
534 
535 	rw_exit(&inject_lock);
536 }
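
/*
 * Illustrative example (editorial note, not part of the original source):
 * with zi_duration = 3, zio_handle_ignored_writes() latches zi_timer to
 * the current lbolt and the handler is expected to be cleared within
 * 3 seconds (3 * hz ticks); with zi_duration = -2, zi_timer latches the
 * ignored write's txg and the handler is expected to be cleared before
 * two further txgs sync.  The VERIFYs above simply assert that an
 * active handler is still inside its window.
 */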
537 
538 hrtime_t
539 zio_handle_io_delay(zio_t *zio)
540 {
541 	vdev_t *vd = zio->io_vd;
542 	inject_handler_t *min_handler = NULL;
543 	hrtime_t min_target = 0;
544 
545 	rw_enter(&inject_lock, RW_READER);
546 
547 	/*
548 	 * inject_delay_count is a subset of zio_injection_enabled that
549 	 * is only incremented for delay handlers. These checks are
550 	 * mainly added to remind the reader why we're not explicitly
551 	 * checking zio_injection_enabled like the other functions.
552 	 */
553 	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
554 	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
555 
556 	/*
557 	 * If there aren't any inject delay handlers registered, then we
558 	 * can short circuit and simply return 0 here. A value of zero
559 	 * informs zio_delay_interrupt() that this request should not be
560 	 * delayed. This short circuit keeps us from acquiring the
561  * inject_delay_mtx unnecessarily.
562 	 */
563 	if (inject_delay_count == 0) {
564 		rw_exit(&inject_lock);
565 		return (0);
566 	}
567 
568 	/*
569 	 * Each inject handler has a number of "lanes" associated with
570 	 * it. Each lane is able to handle requests independently of one
571 	 * another, and at a latency defined by the inject handler
572  * record's zi_timer field. Thus if a handler is configured with
573 	 * a single lane with a 10ms latency, it will delay requests
574 	 * such that only a single request is completed every 10ms. So,
575  * if more than one request is attempted in each 10ms interval,
576 	 * the average latency of the requests will be greater than
577 	 * 10ms; but if only a single request is submitted each 10ms
578 	 * interval the average latency will be 10ms.
579 	 *
580 	 * We need to acquire this mutex to prevent multiple concurrent
581 	 * threads being assigned to the same lane of a given inject
582 	 * handler. The mutex allows us to perform the following two
583 	 * operations atomically:
584 	 *
585 	 *	1. determine the minimum handler and minimum target
586 	 *	   value of all the possible handlers
587 	 *	2. update that minimum handler's lane array
588 	 *
589 	 * Without atomicity, two (or more) threads could pick the same
590 	 * lane in step (1), and then conflict with each other in step
591 	 * (2). This could allow a single lane handler to process
592 	 * multiple requests simultaneously, which shouldn't be possible.
593 	 */
594 	mutex_enter(&inject_delay_mtx);
595 
596 	for (inject_handler_t *handler = list_head(&inject_handlers);
597 	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
598 		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
599 			continue;
600 
601 		if (!freq_triggered(handler->zi_record.zi_freq))
602 			continue;
603 
604 		if (vd->vdev_guid != handler->zi_record.zi_guid)
605 			continue;
606 
607 		/*
608 		 * Defensive; should never happen as the array allocation
609 		 * occurs prior to inserting this handler on the list.
610 		 */
611 		ASSERT3P(handler->zi_lanes, !=, NULL);
612 
613 		/*
614 		 * This should never happen, the zinject command should
615 		 * prevent a user from setting an IO delay with zero lanes.
616 		 */
617 		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
618 
619 		ASSERT3U(handler->zi_record.zi_nlanes, >,
620 		    handler->zi_next_lane);
621 
622 		/*
623 		 * We want to issue this IO to the lane that will become
624 		 * idle the soonest, so we compare the soonest this
625 		 * specific handler can complete the IO with all other
626 		 * handlers, to find the lowest value of all possible
627 		 * lanes. We then use this lane to submit the request.
628 		 *
629 		 * Since each handler has a constant value for its
630 		 * delay, we can just use the "next" lane for that
631 		 * handler; as it will always be the lane with the
632 		 * lowest value for that particular handler (i.e. the
633 		 * lane that will become idle the soonest). This saves a
634 		 * scan of each handler's lanes array.
635 		 *
636  * There are two cases to consider when determining when
637 		 * this specific IO request should complete. If this
638 		 * lane is idle, we want to "submit" the request now so
639 		 * it will complete after zi_timer milliseconds. Thus,
640 		 * we set the target to now + zi_timer.
641 		 *
642 		 * If the lane is busy, we want this request to complete
643 		 * zi_timer milliseconds after the lane becomes idle.
644 		 * Since the 'zi_lanes' array holds the time at which
645 		 * each lane will become idle, we use that value to
646 		 * determine when this request should complete.
647 		 */
648 		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
649 		hrtime_t busy = handler->zi_record.zi_timer +
650 		    handler->zi_lanes[handler->zi_next_lane];
651 		hrtime_t target = MAX(idle, busy);
652 
653 		if (min_handler == NULL) {
654 			min_handler = handler;
655 			min_target = target;
656 			continue;
657 		}
658 
659 		ASSERT3P(min_handler, !=, NULL);
660 		ASSERT3U(min_target, !=, 0);
661 
662 		/*
663 		 * We don't yet increment the "next lane" variable since
664 		 * we still might find a lower value lane in another
665 		 * handler during any remaining iterations. Once we're
666 		 * sure we've selected the absolute minimum, we'll claim
667 		 * the lane and increment the handler's "next lane"
668 		 * field below.
669 		 */
670 
671 		if (target < min_target) {
672 			min_handler = handler;
673 			min_target = target;
674 		}
675 	}
676 
677 	/*
678 	 * 'min_handler' will be NULL if no IO delays are registered for
679  * this vdev; otherwise it will point to the handler containing
680 	 * the lane that will become idle the soonest.
681 	 */
682 	if (min_handler != NULL) {
683 		ASSERT3U(min_target, !=, 0);
684 		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
685 
686 		/*
687 		 * If we've used all possible lanes for this handler,
688 		 * loop back and start using the first lane again;
689 		 * otherwise, just increment the lane index.
690 		 */
691 		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
692 		    min_handler->zi_record.zi_nlanes;
693 	}
694 
695 	mutex_exit(&inject_delay_mtx);
696 	rw_exit(&inject_lock);
697 
698 	return (min_target);
699 }
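
/*
 * Illustrative example (editorial note, not part of the original source):
 * consider a handler with a single lane and zi_timer equivalent to 10ms.
 * If three I/Os arrive at t = 0 while the lane is idle, the first gets a
 * target of t = 10ms, the second is queued behind the now-busy lane for
 * t = 20ms, and the third for t = 30ms: a sustained stream is throttled
 * to one completion per 10ms, while an isolated request sees only 10ms
 * of added latency.
 */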
700 
701 static int
702 zio_calculate_range(const char *pool, zinject_record_t *record)
703 {
704 	dsl_pool_t *dp;
705 	dsl_dataset_t *ds;
706 	objset_t *os = NULL;
707 	dnode_t *dn = NULL;
708 	int error;
709 
710 	/*
711 	 * Obtain the dnode for object using pool, objset, and object
712 	 */
713 	error = dsl_pool_hold(pool, FTAG, &dp);
714 	if (error)
715 		return (error);
716 
717 	error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
718 	dsl_pool_rele(dp, FTAG);
719 	if (error)
720 		return (error);
721 
722 	error = dmu_objset_from_ds(ds, &os);
723 	dsl_dataset_rele(ds, FTAG);
724 	if (error)
725 		return (error);
726 
727 	error = dnode_hold(os, record->zi_object, FTAG, &dn);
728 	if (error)
729 		return (error);
730 
731 	/*
732 	 * Translate the range into block IDs
733 	 */
734 	if (record->zi_start != 0 || record->zi_end != -1ULL) {
735 		record->zi_start >>= dn->dn_datablkshift;
736 		record->zi_end >>= dn->dn_datablkshift;
737 	}
738 	if (record->zi_level > 0) {
739 		if (record->zi_level >= dn->dn_nlevels) {
740 			dnode_rele(dn, FTAG);
741 			return (SET_ERROR(EDOM));
742 		}
743 
744 		if (record->zi_start != 0 || record->zi_end != 0) {
745 			int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
746 
747 			for (int level = record->zi_level; level > 0; level--) {
748 				record->zi_start >>= shift;
749 				record->zi_end >>= shift;
750 			}
751 		}
752 	}
753 
754 	dnode_rele(dn, FTAG);
755 	return (0);
756 }
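
/*
 * Illustrative example (editorial note, not part of the original source):
 * for an object with 128K data blocks (dn_datablkshift = 17) and 128K
 * indirect blocks (dn_indblkshift = 17, giving shift = 17 -
 * SPA_BLKPTRSHIFT = 10), a byte range of [0, 262143] first becomes L0
 * blkids [0, 1]; if zi_level = 1 was requested, both ends are then
 * shifted right by another 10 bits, yielding L1 blkid 0 for the same
 * range.
 */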
757 
758 /*
759  * Create a new handler for the given record.  We add it to the list, adding
760  * a reference to the spa_t in the process.  We increment zio_injection_enabled,
761  * which is the switch to trigger all fault injection.
762  */
763 int
764 zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
765 {
766 	inject_handler_t *handler;
767 	int error;
768 	spa_t *spa;
769 
770 	/*
771 	 * If this is pool-wide metadata, make sure we unload the corresponding
772 	 * spa_t, so that the next attempt to load it will trigger the fault.
773 	 * We call spa_reset() to unload the pool appropriately.
774 	 */
775 	if (flags & ZINJECT_UNLOAD_SPA)
776 		if ((error = spa_reset(name)) != 0)
777 			return (error);
778 
779 	if (record->zi_cmd == ZINJECT_DELAY_IO) {
780 		/*
781 		 * A value of zero for the number of lanes or for the
782 		 * delay time doesn't make sense.
783 		 */
784 		if (record->zi_timer == 0 || record->zi_nlanes == 0)
785 			return (SET_ERROR(EINVAL));
786 
787 		/*
788 		 * The number of lanes is directly mapped to the size of
789 		 * an array used by the handler. Thus, to ensure the
790 		 * user doesn't trigger an allocation that's "too large"
791 		 * we cap the number of lanes here.
792 		 */
793 		if (record->zi_nlanes >= UINT16_MAX)
794 			return (SET_ERROR(EINVAL));
795 	}
796 
797 	/*
798 	 * If the supplied range was in bytes -- calculate the actual blkid
799 	 */
800 	if (flags & ZINJECT_CALC_RANGE) {
801 		error = zio_calculate_range(name, record);
802 		if (error != 0)
803 			return (error);
804 	}
805 
806 	if (!(flags & ZINJECT_NULL)) {
807 		/*
808  * spa_inject_addref() will add an injection reference, which will
809 		 * prevent the pool from being removed from the namespace while
810 		 * still allowing it to be unloaded.
811 		 */
812 		if ((spa = spa_inject_addref(name)) == NULL)
813 			return (SET_ERROR(ENOENT));
814 
815 		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
816 
817 		handler->zi_spa = spa;
818 		handler->zi_record = *record;
819 
820 		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
821 			handler->zi_lanes = kmem_zalloc(
822 			    sizeof (*handler->zi_lanes) *
823 			    handler->zi_record.zi_nlanes, KM_SLEEP);
824 			handler->zi_next_lane = 0;
825 		} else {
826 			handler->zi_lanes = NULL;
827 			handler->zi_next_lane = 0;
828 		}
829 
830 		rw_enter(&inject_lock, RW_WRITER);
831 
832 		/*
833 		 * We can't move this increment into the conditional
834 		 * above because we need to hold the RW_WRITER lock of
835 		 * inject_lock, and we don't want to hold that while
836 		 * allocating the handler's zi_lanes array.
837 		 */
838 		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
839 			ASSERT3S(inject_delay_count, >=, 0);
840 			inject_delay_count++;
841 			ASSERT3S(inject_delay_count, >, 0);
842 		}
843 
844 		*id = handler->zi_id = inject_next_id++;
845 		list_insert_tail(&inject_handlers, handler);
846 		atomic_inc_32(&zio_injection_enabled);
847 
848 		rw_exit(&inject_lock);
849 	}
850 
851 	/*
852 	 * Flush the ARC, so that any attempts to read this data will end up
853 	 * going to the ZIO layer.  Note that this is a little overkill, but
854 	 * we don't have the necessary ARC interfaces to do anything else, and
855 	 * fault injection isn't a performance critical path.
856 	 */
857 	if (flags & ZINJECT_FLUSH_ARC)
858 		/*
859 		 * We must use FALSE to ensure arc_flush returns, since
860 		 * we're not preventing concurrent ARC insertions.
861 		 */
862 		arc_flush(NULL, FALSE);
863 
864 	return (0);
865 }
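
/*
 * Illustrative example (editorial note, not part of the original source):
 * to register a device-wide delay handler, user space would fill in
 * something like
 *
 *	zinject_record_t record = { 0 };
 *	record.zi_cmd = ZINJECT_DELAY_IO;
 *	record.zi_guid = vdev_guid;
 *	record.zi_nlanes = 2;
 *	record.zi_timer = delay;	/* in gethrtime() units */
 *
 * (vdev_guid and delay are placeholders for the target device's GUID and
 * the desired latency) and pass it here with flags = 0; the *id returned
 * is later handed to zio_clear_fault() to remove the handler.
 */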
866 
867 /*
868  * Returns the next record with an ID greater than that supplied to the
869  * function.  Used to iterate over all handlers in the system.
870  */
871 int
872 zio_inject_list_next(int *id, char *name, size_t buflen,
873     zinject_record_t *record)
874 {
875 	inject_handler_t *handler;
876 	int ret;
877 
878 	mutex_enter(&spa_namespace_lock);
879 	rw_enter(&inject_lock, RW_READER);
880 
881 	for (handler = list_head(&inject_handlers); handler != NULL;
882 	    handler = list_next(&inject_handlers, handler))
883 		if (handler->zi_id > *id)
884 			break;
885 
886 	if (handler) {
887 		*record = handler->zi_record;
888 		*id = handler->zi_id;
889 		(void) strncpy(name, spa_name(handler->zi_spa), buflen);
890 		ret = 0;
891 	} else {
892 		ret = SET_ERROR(ENOENT);
893 	}
894 
895 	rw_exit(&inject_lock);
896 	mutex_exit(&spa_namespace_lock);
897 
898 	return (ret);
899 }
900 
901 /*
902  * Clear the fault handler with the given identifier, or return ENOENT if none
903  * exists.
904  */
905 int
906 zio_clear_fault(int id)
907 {
908 	inject_handler_t *handler;
909 
910 	rw_enter(&inject_lock, RW_WRITER);
911 
912 	for (handler = list_head(&inject_handlers); handler != NULL;
913 	    handler = list_next(&inject_handlers, handler))
914 		if (handler->zi_id == id)
915 			break;
916 
917 	if (handler == NULL) {
918 		rw_exit(&inject_lock);
919 		return (SET_ERROR(ENOENT));
920 	}
921 
922 	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
923 		ASSERT3S(inject_delay_count, >, 0);
924 		inject_delay_count--;
925 		ASSERT3S(inject_delay_count, >=, 0);
926 	}
927 
928 	list_remove(&inject_handlers, handler);
929 	rw_exit(&inject_lock);
930 
931 	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
932 		ASSERT3P(handler->zi_lanes, !=, NULL);
933 		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
934 		    handler->zi_record.zi_nlanes);
935 	} else {
936 		ASSERT3P(handler->zi_lanes, ==, NULL);
937 	}
938 
939 	spa_inject_delref(handler->zi_spa);
940 	kmem_free(handler, sizeof (inject_handler_t));
941 	atomic_dec_32(&zio_injection_enabled);
942 
943 	return (0);
944 }
945 
946 void
947 zio_inject_init(void)
948 {
949 	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
950 	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
951 	list_create(&inject_handlers, sizeof (inject_handler_t),
952 	    offsetof(inject_handler_t, zi_link));
953 }
954 
955 void
956 zio_inject_fini(void)
957 {
958 	list_destroy(&inject_handlers);
959 	mutex_destroy(&inject_delay_mtx);
960 	rw_destroy(&inject_lock);
961 }
962 
963 #if defined(_KERNEL)
964 EXPORT_SYMBOL(zio_injection_enabled);
965 EXPORT_SYMBOL(zio_inject_fault);
966 EXPORT_SYMBOL(zio_inject_list_next);
967 EXPORT_SYMBOL(zio_clear_fault);
968 EXPORT_SYMBOL(zio_handle_fault_injection);
969 EXPORT_SYMBOL(zio_handle_device_injection);
970 EXPORT_SYMBOL(zio_handle_label_injection);
971 #endif
972