// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2024-2025, Klara, Inc.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault. These are kept in a global list. Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device level injection is done using the 'zi_guid' field. If this is set, it
 * means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */
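
/*
 * Illustration (hypothetical values, not part of the mechanism itself):
 * a record created by zinject(8) to fault reads of object 7 in objset 54
 * would carry roughly zi_objset = 54, zi_object = 7, zi_error = EIO and
 * zi_cmd = ZINJECT_DATA_FAULT. Each read that reaches
 * zio_handle_fault_injection() below is compared against such records,
 * and a match causes the I/O to fail with the recorded errno.
 */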

#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

uint32_t zio_injection_enabled = 0;

/*
 * Data describing each zinject handler registered on the system. It also
 * contains the list node that links the handler into the global zinject
 * handler list.
 */
typedef struct inject_handler {
	int		zi_id;
	spa_t		*zi_spa;
	char		*zi_spa_name;	/* ZINJECT_DELAY_IMPORT only */
	zinject_record_t zi_record;
	uint64_t	*zi_lanes;
	int		zi_next_lane;
	list_node_t	zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above; as well as the inject_delay_count. Any time a
 * handler is inserted or removed from the list, this lock should be
 * taken as a RW_WRITER; and any time traversal is done over the list
 * (without modification to it) this lock should be taken as a RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus modifications to this count must be a RW_WRITER of the
 * inject_lock, and reads of this count must be (at least) a RW_READER
 * of the lock.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(), refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Test if the requested frequency was triggered
 */
static boolean_t
freq_triggered(uint32_t frequency)
{
	/*
	 * zero implies always (100%)
	 */
	if (frequency == 0)
		return (B_TRUE);

	/*
	 * Note: we still handle legacy (unscaled) frequency values
	 */
	uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;

	return (random_in_range(maximum) < frequency);
}
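
/*
 * Worked example (illustrative numbers): a legacy value of frequency = 25
 * is compared against random_in_range(100), so it triggers on roughly 25%
 * of matched I/Os. A scaled value such as ZI_PERCENTAGE_MAX / 10 is
 * compared against random_in_range(ZI_PERCENTAGE_MAX) and triggers on
 * roughly 10% of matched I/Os.
 */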

/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
	boolean_t matched = B_FALSE;
	boolean_t injected = B_FALSE;

	/*
	 * Check for a match against the MOS, which is based on type
	 */
	if (zb->zb_objset == DMU_META_OBJSET &&
	    record->zi_objset == DMU_META_OBJSET &&
	    record->zi_object == DMU_META_DNODE_OBJECT) {
		if (record->zi_type == DMU_OT_NONE ||
		    type == record->zi_type)
			matched = B_TRUE;
		goto done;
	}

	/*
	 * Check for an exact match.
	 */
	if (zb->zb_objset == record->zi_objset &&
	    zb->zb_object == record->zi_object &&
	    zb->zb_level == record->zi_level &&
	    zb->zb_blkid >= record->zi_start &&
	    zb->zb_blkid <= record->zi_end &&
	    (record->zi_dvas == 0 ||
	    (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) &&
	    error == record->zi_error) {
		matched = B_TRUE;
		goto done;
	}

done:
	if (matched) {
		record->zi_match_count++;
		injected = freq_triggered(record->zi_freq);
	}

	if (injected)
		record->zi_inject_count++;

	return (injected);
}
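
/*
 * For illustration (hypothetical record): with zi_dvas = 0x5 only DVAs 0
 * and 2 of a block pointer can match, and with zi_freq set, a matched I/O
 * bumps zi_match_count unconditionally but bumps zi_inject_count (and is
 * actually injected) only when freq_triggered() fires.
 */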

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa)
			continue;

		if (handler->zi_record.zi_type == type &&
		    strcmp(tag, handler->zi_record.zi_func) == 0) {
			handler->zi_record.zi_match_count++;
			handler->zi_record.zi_inject_count++;
			panic("Panic requested in function %s\n", tag);
		}
	}

	rw_exit(&inject_lock);
}

/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
			continue;

		if (zio_match_handler(zb, type, ZI_NO_DVA,
		    &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);
	return (ret);
}

/*
 * If this is a physical I/O for a vdev child determine which DVA it is
 * for. We iterate backwards through the DVAs matching on the offset so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
 */
static int
zio_match_dva(zio_t *zio)
{
	int i = ZI_NO_DVA;

	if (zio->io_bp != NULL && zio->io_vd != NULL &&
	    zio->io_child_type == ZIO_CHILD_VDEV) {
		for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
			dva_t *dva = &zio->io_bp->blk_dva[i];
			uint64_t off = DVA_GET_OFFSET(dva);
			vdev_t *vd = vdev_lookup_top(zio->io_spa,
			    DVA_GET_VDEV(dva));

			/* Compensate for vdev label added to leaves */
			if (zio->io_vd->vdev_ops->vdev_op_leaf)
				off += VDEV_LABEL_START_SIZE;

			if (zio->io_vd == vd && zio->io_offset == off)
				break;
		}
	}

	return (i);
}


/*
 * Determine if the I/O in question should return failure. Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	/*
	 * Ignore I/O not associated with any logical data.
	 */
	if (zio->io_logical == NULL)
		return (0);

	/*
	 * Currently, we only support fault injection on reads.
	 */
	if (zio->io_type != ZIO_TYPE_READ)
		return (0);

	/*
	 * A rebuild I/O has no checksum to verify.
	 */
	if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
			continue;

		/* If this handler matches, return the specified error */
		if (zio_match_handler(&zio->io_logical->io_bookmark,
		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
		    zio_match_dva(zio), &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region
 * of the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
	inject_handler_t *handler;
	vdev_t *vd = zio->io_vd;
	uint64_t offset = zio->io_offset;
	int label;
	int ret = 0;

	if (offset >= VDEV_LABEL_START_SIZE &&
	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		uint64_t start = handler->zi_record.zi_start;
		uint64_t end = handler->zi_record.zi_end;

		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
			continue;

		/*
		 * The injection region is the relative offsets within a
		 * vdev label. We must determine the label which is being
		 * updated and adjust our region accordingly.
		 */
		label = vdev_label_number(vd->vdev_psize, offset);
		start = vdev_label_offset(vd->vdev_psize, label, start);
		end = vdev_label_offset(vd->vdev_psize, label, end);

		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
		    (offset >= start && offset <= end)) {
			handler->zi_record.zi_match_count++;
			handler->zi_record.zi_inject_count++;
			ret = error;
			break;
		}
	}
	rw_exit(&inject_lock);
	return (ret);
}

static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
	zio_t *zio = private;
	uint8_t *buffer = data;
	uint_t byte = random_in_range(len);

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	/* flip a single random bit in an abd data buffer */
	buffer[byte] ^= 1 << random_in_range(8);

	return (1);	/* stop after first flip */
}

/* Test if this zio matches the iotype from the injection record. */
static boolean_t
zio_match_iotype(zio_t *zio, uint32_t iotype)
{
	ASSERT3P(zio, !=, NULL);

	/* Unknown iotype, maybe from a newer version of zinject. Reject it. */
	if (iotype >= ZINJECT_IOTYPES)
		return (B_FALSE);

	/* Probe IOs only match IOTYPE_PROBE, regardless of their type. */
	if (zio->io_flags & ZIO_FLAG_PROBE)
		return (iotype == ZINJECT_IOTYPE_PROBE);

	/* Standard IO types, match against ZIO type. */
	if (iotype < ZINJECT_IOTYPE_ALL)
		return (iotype == zio->io_type);

	/* Match any standard IO type. */
	if (iotype == ZINJECT_IOTYPE_ALL)
		return (B_TRUE);

	return (B_FALSE);
}
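
/*
 * Informal summary of the checks above (illustrative, not normative):
 * a handler whose zi_iotype compares equal to ZIO_TYPE_READ matches only
 * non-probe read zios, ZINJECT_IOTYPE_ALL matches any non-probe zio, and
 * probe zios (ZIO_FLAG_PROBE) are matched solely by ZINJECT_IOTYPE_PROBE.
 */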

static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	inject_handler_t *handler;
	int ret = 0;

	/*
	 * We skip over faults in the labels unless it's during device open
	 * (i.e. zio == NULL) or a device flush (offset is meaningless). We let
	 * probe IOs through so we can match them to probe inject records.
	 */
	if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH &&
	    !(zio->io_flags & ZIO_FLAG_PROBE)) {
		uint64_t offset = zio->io_offset;

		if (offset < VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
			return (0);
	}

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
			continue;

		if (vd->vdev_guid == handler->zi_record.zi_guid) {
			if (handler->zi_record.zi_failfast &&
			    (zio == NULL || (zio->io_flags &
			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
				continue;
			}

			/* Handle type specific I/O failures */
			if (zio != NULL && !zio_match_iotype(zio,
			    handler->zi_record.zi_iotype))
				continue;

			if (handler->zi_record.zi_error == err1 ||
			    handler->zi_record.zi_error == err2) {
				handler->zi_record.zi_match_count++;

				/*
				 * limit error injection if requested
				 */
				if (!freq_triggered(handler->zi_record.zi_freq))
					continue;

				handler->zi_record.zi_inject_count++;

				/*
				 * For a failed open, pretend like the device
				 * has gone away.
				 */
				if (err1 == ENXIO)
					vd->vdev_stat.vs_aux =
					    VDEV_AUX_OPEN_FAILED;

				/*
				 * Treat these errors as if they had been
				 * retried so that all the appropriate stats
				 * and FMA events are generated.
				 */
				if (!handler->zi_record.zi_failfast &&
				    zio != NULL)
					zio->io_flags |= ZIO_FLAG_IO_RETRY;

				/*
				 * EILSEQ means flip a bit after a read
				 */
				if (handler->zi_record.zi_error == EILSEQ) {
					if (zio == NULL)
						break;

					/* locate buffer data and flip a bit */
					(void) abd_iterate_func(zio->io_abd, 0,
					    zio->io_size, zio_inject_bitflip_cb,
					    zio);
					break;
				}

				ret = handler->zi_record.zi_error;
				break;
			}
			if (handler->zi_record.zi_error == ENXIO) {
				handler->zi_record.zi_match_count++;
				handler->zi_record.zi_inject_count++;
				ret = SET_ERROR(EIO);
				break;
			}
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
	return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}

/*
 * Simulate hardware that ignores cache flushes. For the requested number
 * of seconds, nix the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		/* Ignore errors not destined for this pool */
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		handler->zi_record.zi_match_count++;

		/*
		 * Positive duration implies # of seconds, negative
		 * a number of txgs
		 */
		if (handler->zi_record.zi_timer == 0) {
			if (handler->zi_record.zi_duration > 0)
				handler->zi_record.zi_timer = ddi_get_lbolt64();
			else
				handler->zi_record.zi_timer = zio->io_txg;
		}

		/* Have a "problem" writing 60% of the time */
		if (random_in_range(100) < 60) {
			handler->zi_record.zi_inject_count++;
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
		break;
	}

	rw_exit(&inject_lock);
}

void
spa_handle_ignored_writes(spa_t *spa)
{
	inject_handler_t *handler;

	if (zio_injection_enabled == 0)
		return;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		handler->zi_record.zi_match_count++;
		handler->zi_record.zi_inject_count++;

		if (handler->zi_record.zi_duration > 0) {
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    ddi_time_after64(
			    (int64_t)handler->zi_record.zi_timer +
			    handler->zi_record.zi_duration * hz,
			    ddi_get_lbolt64()));
		} else {
			/* duration is negative so the subtraction here adds */
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer -
			    handler->zi_record.zi_duration >=
			    spa_syncing_txg(spa));
		}
	}

	rw_exit(&inject_lock);
}

hrtime_t
zio_handle_io_delay(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	inject_handler_t *min_handler = NULL;
	hrtime_t min_target = 0;

	rw_enter(&inject_lock, RW_READER);

	/*
	 * inject_delay_count is a subset of zio_injection_enabled that
	 * is only incremented for delay handlers. These checks are
	 * mainly added to remind the reader why we're not explicitly
	 * checking zio_injection_enabled like the other functions.
	 */
	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

	/*
	 * If there aren't any inject delay handlers registered, then we
	 * can short circuit and simply return 0 here. A value of zero
	 * informs zio_delay_interrupt() that this request should not be
	 * delayed. This short circuit keeps us from acquiring the
	 * inject_delay_mutex unnecessarily.
	 */
	if (inject_delay_count == 0) {
		rw_exit(&inject_lock);
		return (0);
	}

	/*
	 * Each inject handler has a number of "lanes" associated with
	 * it. Each lane is able to handle requests independently of one
	 * another, and at a latency defined by the inject handler
	 * record's zi_timer field. Thus if a handler is configured with
	 * a single lane and a 10ms latency, it will delay requests
	 * such that only a single request is completed every 10ms. So,
	 * if more than one request is attempted per each 10ms interval,
	 * the average latency of the requests will be greater than
	 * 10ms; but if only a single request is submitted each 10ms
	 * interval the average latency will be 10ms.
	 *
	 * We need to acquire this mutex to prevent multiple concurrent
	 * threads being assigned to the same lane of a given inject
	 * handler. The mutex allows us to perform the following two
	 * operations atomically:
	 *
	 *	1. determine the minimum handler and minimum target
	 *	   value of all the possible handlers
	 *	2. update that minimum handler's lane array
	 *
	 * Without atomicity, two (or more) threads could pick the same
	 * lane in step (1), and then conflict with each other in step
	 * (2). This could allow a single lane handler to process
	 * multiple requests simultaneously, which shouldn't be possible.
	 */
	mutex_enter(&inject_delay_mtx);

	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
			continue;

		if (vd->vdev_guid != handler->zi_record.zi_guid)
			continue;

		/* also match on I/O type (e.g., -T read) */
		if (!zio_match_iotype(zio, handler->zi_record.zi_iotype))
			continue;

		/*
		 * Defensive; should never happen as the array allocation
		 * occurs prior to inserting this handler on the list.
		 */
		ASSERT3P(handler->zi_lanes, !=, NULL);

		/*
		 * This should never happen, the zinject command should
		 * prevent a user from setting an IO delay with zero lanes.
		 */
		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

		ASSERT3U(handler->zi_record.zi_nlanes, >,
		    handler->zi_next_lane);

		handler->zi_record.zi_match_count++;

		/* Limit the use of this handler if requested */
		if (!freq_triggered(handler->zi_record.zi_freq))
			continue;

		/*
		 * We want to issue this IO to the lane that will become
		 * idle the soonest, so we compare the soonest this
		 * specific handler can complete the IO with all other
		 * handlers, to find the lowest value of all possible
		 * lanes. We then use this lane to submit the request.
		 *
		 * Since each handler has a constant value for its
		 * delay, we can just use the "next" lane for that
		 * handler; as it will always be the lane with the
		 * lowest value for that particular handler (i.e. the
		 * lane that will become idle the soonest). This saves a
		 * scan of each handler's lanes array.
		 *
		 * There are two cases to consider when determining when
		 * this specific IO request should complete. If this
		 * lane is idle, we want to "submit" the request now so
		 * it will complete after zi_timer milliseconds. Thus,
		 * we set the target to now + zi_timer.
		 *
		 * If the lane is busy, we want this request to complete
		 * zi_timer milliseconds after the lane becomes idle.
		 * Since the 'zi_lanes' array holds the time at which
		 * each lane will become idle, we use that value to
		 * determine when this request should complete.
		 */
		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
		hrtime_t busy = handler->zi_record.zi_timer +
		    handler->zi_lanes[handler->zi_next_lane];
		hrtime_t target = MAX(idle, busy);

		if (min_handler == NULL) {
			min_handler = handler;
			min_target = target;
			continue;
		}

		ASSERT3P(min_handler, !=, NULL);
		ASSERT3U(min_target, !=, 0);

		/*
		 * We don't yet increment the "next lane" variable since
		 * we still might find a lower value lane in another
		 * handler during any remaining iterations. Once we're
		 * sure we've selected the absolute minimum, we'll claim
		 * the lane and increment the handler's "next lane"
		 * field below.
		 */

		if (target < min_target) {
			min_handler = handler;
			min_target = target;
		}
	}

	/*
	 * 'min_handler' will be NULL if no IO delays are registered for
	 * this vdev, otherwise it will point to the handler containing
	 * the lane that will become idle the soonest.
	 */
	if (min_handler != NULL) {
		ASSERT3U(min_target, !=, 0);
		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

		/*
		 * If we've used all possible lanes for this handler,
		 * loop back and start using the first lane again;
		 * otherwise, just increment the lane index.
		 */
		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
		    min_handler->zi_record.zi_nlanes;

		min_handler->zi_record.zi_inject_count++;
	}

	mutex_exit(&inject_delay_mtx);
	rw_exit(&inject_lock);

	return (min_target);
}
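
/*
 * Worked example (hypothetical handler): with zi_nlanes = 2 and a 10ms
 * delay, four zios arriving at once land on lanes 0, 1, 0, 1; the first
 * two get a target of now + 10ms and the other two of now + 20ms, so
 * throughput through this handler is capped at two I/Os per 10ms
 * interval.
 */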

static void
zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command)
{
	inject_handler_t *handler;
	hrtime_t delay = 0;
	int id = 0;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers);
	    handler != NULL && handler->zi_record.zi_cmd == command;
	    handler = list_next(&inject_handlers, handler)) {
		ASSERT3P(handler->zi_spa_name, !=, NULL);
		if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) {
			handler->zi_record.zi_match_count++;
			uint64_t pause =
			    SEC2NSEC(handler->zi_record.zi_duration);
			if (pause > elapsed) {
				handler->zi_record.zi_inject_count++;
				delay = pause - elapsed;
			}
			id = handler->zi_id;
			break;
		}
	}

	rw_exit(&inject_lock);

	if (delay) {
		if (command == ZINJECT_DELAY_IMPORT) {
			spa_import_progress_set_notes(spa, "injecting %llu "
			    "sec delay", (u_longlong_t)NSEC2SEC(delay));
		}
		zfs_sleep_until(gethrtime() + delay);
	}
	if (id) {
		/* all done with this one-shot handler */
		zio_clear_fault(id);
	}
}

/*
 * For testing, inject a delay during an import
 */
void
zio_handle_import_delay(spa_t *spa, hrtime_t elapsed)
{
	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT);
}

/*
 * For testing, inject a delay during an export
 */
void
zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
{
	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
}

static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	objset_t *os = NULL;
	dnode_t *dn = NULL;
	int error;

	/*
	 * Obtain the dnode for object using pool, objset, and object
	 */
	error = dsl_pool_hold(pool, FTAG, &dp);
	if (error)
		return (error);

	error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
	dsl_pool_rele(dp, FTAG);
	if (error)
		return (error);

	error = dmu_objset_from_ds(ds, &os);
	dsl_dataset_rele(ds, FTAG);
	if (error)
		return (error);

	error = dnode_hold(os, record->zi_object, FTAG, &dn);
	if (error)
		return (error);

	/*
	 * Translate the range into block IDs
	 */
	if (record->zi_start != 0 || record->zi_end != -1ULL) {
		record->zi_start >>= dn->dn_datablkshift;
		record->zi_end >>= dn->dn_datablkshift;
	}
	if (record->zi_level > 0) {
		if (record->zi_level >= dn->dn_nlevels) {
			dnode_rele(dn, FTAG);
			return (SET_ERROR(EDOM));
		}

		if (record->zi_start != 0 || record->zi_end != 0) {
			int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			for (int level = record->zi_level; level > 0; level--) {
				record->zi_start >>= shift;
				record->zi_end >>= shift;
			}
		}
	}

	dnode_rele(dn, FTAG);
	return (0);
}
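
/*
 * Worked example (assumed, illustrative geometry): for a dnode with 128K
 * data blocks (dn_datablkshift = 17), a byte range of [1M, 2M - 1]
 * becomes block IDs [8, 15]. For zi_level = 1 with 128K indirect blocks
 * (dn_indblkshift = 17, SPA_BLKPTRSHIFT = 7), those IDs are shifted right
 * by another 10 bits, so both endpoints map to L1 blkid 0.
 */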

static boolean_t
zio_pool_handler_exists(const char *name, zinject_type_t command)
{
	boolean_t exists = B_FALSE;

	rw_enter(&inject_lock, RW_READER);
	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (command != handler->zi_record.zi_cmd)
			continue;

		const char *pool = (handler->zi_spa_name != NULL) ?
		    handler->zi_spa_name : spa_name(handler->zi_spa);
		if (strcmp(name, pool) == 0) {
			exists = B_TRUE;
			break;
		}
	}
	rw_exit(&inject_lock);

	return (exists);
}

/*
 * Create a new handler for the given record. We add it to the list, adding
 * a reference to the spa_t in the process. We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
	inject_handler_t *handler;
	int error;
	spa_t *spa;

	/*
	 * If this is pool-wide metadata, make sure we unload the corresponding
	 * spa_t, so that the next attempt to load it will trigger the fault.
	 * We call spa_reset() to unload the pool appropriately.
	 */
	if (flags & ZINJECT_UNLOAD_SPA)
		if ((error = spa_reset(name)) != 0)
			return (error);

	if (record->zi_cmd == ZINJECT_DELAY_IO) {
		/*
		 * A value of zero for the number of lanes or for the
		 * delay time doesn't make sense.
		 */
		if (record->zi_timer == 0 || record->zi_nlanes == 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The number of lanes is directly mapped to the size of
		 * an array used by the handler. Thus, to ensure the
		 * user doesn't trigger an allocation that's "too large"
		 * we cap the number of lanes here.
		 */
		if (record->zi_nlanes >= UINT16_MAX)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * If the supplied range was in bytes -- calculate the actual blkid
	 */
	if (flags & ZINJECT_CALC_RANGE) {
		error = zio_calculate_range(name, record);
		if (error != 0)
			return (error);
	}

	if (!(flags & ZINJECT_NULL)) {
		/*
		 * Pool delays for import or export don't take an
		 * injection reference on the spa. Instead they
		 * rely on matching by name.
		 */
		if (record->zi_cmd == ZINJECT_DELAY_IMPORT ||
		    record->zi_cmd == ZINJECT_DELAY_EXPORT) {
			if (record->zi_duration <= 0)
				return (SET_ERROR(EINVAL));
			/*
			 * Only one import | export delay handler per pool.
			 */
			if (zio_pool_handler_exists(name, record->zi_cmd))
				return (SET_ERROR(EEXIST));

			mutex_enter(&spa_namespace_lock);
			boolean_t has_spa = spa_lookup(name) != NULL;
			mutex_exit(&spa_namespace_lock);

			if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa)
				return (SET_ERROR(EEXIST));
			if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa)
				return (SET_ERROR(ENOENT));
			spa = NULL;
		} else {
			/*
			 * spa_inject_addref() will add an injection
			 * reference, which will prevent the pool from being
			 * removed from the namespace while still allowing it
			 * to be unloaded.
			 */
			if ((spa = spa_inject_addref(name)) == NULL)
				return (SET_ERROR(ENOENT));
		}

		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
		handler->zi_spa = spa;	/* note: can be NULL */
		handler->zi_record = *record;

		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			handler->zi_lanes = kmem_zalloc(
			    sizeof (*handler->zi_lanes) *
			    handler->zi_record.zi_nlanes, KM_SLEEP);
			handler->zi_next_lane = 0;
		} else {
			handler->zi_lanes = NULL;
			handler->zi_next_lane = 0;
		}

		if (handler->zi_spa == NULL)
			handler->zi_spa_name = spa_strdup(name);
		else
			handler->zi_spa_name = NULL;

		rw_enter(&inject_lock, RW_WRITER);

		/*
		 * We can't move this increment into the conditional
		 * above because we need to hold the RW_WRITER lock of
		 * inject_lock, and we don't want to hold that while
		 * allocating the handler's zi_lanes array.
		 */
		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			ASSERT3S(inject_delay_count, >=, 0);
			inject_delay_count++;
			ASSERT3S(inject_delay_count, >, 0);
		}

		*id = handler->zi_id = inject_next_id++;
		list_insert_tail(&inject_handlers, handler);
		atomic_inc_32(&zio_injection_enabled);

		rw_exit(&inject_lock);
	}

	/*
	 * Flush the ARC, so that any attempts to read this data will end up
	 * going to the ZIO layer. Note that this is a little overkill, but
	 * we don't have the necessary ARC interfaces to do anything else, and
	 * fault injection isn't a performance critical path.
	 */
	if (flags & ZINJECT_FLUSH_ARC)
		/*
		 * We must use FALSE to ensure arc_flush returns, since
		 * we're not preventing concurrent ARC insertions.
		 */
		arc_flush(NULL, FALSE);

	return (0);
}
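
/*
 * Typical lifecycle (informal sketch): an ioctl issued by zinject(8)
 * calls zio_inject_fault() with a populated record, the returned *id
 * names the handler, zio_inject_list_next() enumerates it, and
 * zio_clear_fault(id) tears it down and drops the spa reference when
 * the test is finished.
 */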

/*
 * Returns the next record with an ID greater than that supplied to the
 * function. Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
	inject_handler_t *handler;
	int ret;

	mutex_enter(&spa_namespace_lock);
	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id > *id)
			break;

	if (handler) {
		*record = handler->zi_record;
		*id = handler->zi_id;
		ASSERT(handler->zi_spa || handler->zi_spa_name);
		if (handler->zi_spa != NULL)
			(void) strlcpy(name, spa_name(handler->zi_spa), buflen);
		else
			(void) strlcpy(name, handler->zi_spa_name, buflen);
		ret = 0;
	} else {
		ret = SET_ERROR(ENOENT);
	}

	rw_exit(&inject_lock);
	mutex_exit(&spa_namespace_lock);

	return (ret);
}

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
 */
int
zio_clear_fault(int id)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_WRITER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id == id)
			break;

	if (handler == NULL) {
		rw_exit(&inject_lock);
		return (SET_ERROR(ENOENT));
	}

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3S(inject_delay_count, >, 0);
		inject_delay_count--;
		ASSERT3S(inject_delay_count, >=, 0);
	}

	list_remove(&inject_handlers, handler);
	rw_exit(&inject_lock);

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3P(handler->zi_lanes, !=, NULL);
		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
		    handler->zi_record.zi_nlanes);
	} else {
		ASSERT3P(handler->zi_lanes, ==, NULL);
	}

	if (handler->zi_spa_name != NULL)
		spa_strfree(handler->zi_spa_name);

	if (handler->zi_spa != NULL)
		spa_inject_delref(handler->zi_spa);
	kmem_free(handler, sizeof (inject_handler_t));
	atomic_dec_32(&zio_injection_enabled);

	return (0);
}

void
zio_inject_init(void)
{
	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
	list_create(&inject_handlers, sizeof (inject_handler_t),
	    offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
	list_destroy(&inject_handlers);
	mutex_destroy(&inject_delay_mtx);
	rw_destroy(&inject_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
#endif