// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2024-2025, Klara, Inc.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault. These are kept in a global list. Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device level injection is done using the 'zi_guid' field. If this is set, it
 * means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */

#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

uint32_t zio_injection_enabled = 0;

/*
 * Data describing each zinject handler registered on the system; it also
 * contains the list node linking the handler into the global zinject
 * handler list.
 */
typedef struct inject_handler {
	int zi_id;
	spa_t *zi_spa;
	char *zi_spa_name;	/* ZINJECT_DELAY_IMPORT only */
	zinject_record_t zi_record;
	uint64_t *zi_lanes;
	int zi_next_lane;
	list_node_t zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above; as well as the inject_delay_count. Any time a
 * handler is inserted or removed from the list, this lock should be
 * taken as a RW_WRITER; and any time traversal is done over the list
 * (without modification to it) this lock should be taken as a RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus, modifications to this count must be made while holding
 * the inject_lock as RW_WRITER, and reads of this count must hold it
 * (at least) as RW_READER.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(), refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Test if the requested frequency was triggered
 */
static boolean_t
freq_triggered(uint32_t frequency)
{
	/*
	 * zero implies always (100%)
	 */
	if (frequency == 0)
		return (B_TRUE);

	/*
	 * Note: we still handle legacy (unscaled) frequency values
	 */
	uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;

	return (random_in_range(maximum) < frequency);
}

/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
	boolean_t matched = B_FALSE;
	boolean_t injected = B_FALSE;

	/*
	 * Check for a match against the MOS, which is based on type
	 */
	if (zb->zb_objset == DMU_META_OBJSET &&
	    record->zi_objset == DMU_META_OBJSET &&
	    record->zi_object == DMU_META_DNODE_OBJECT) {
		if (record->zi_type == DMU_OT_NONE ||
		    type == record->zi_type)
			matched = B_TRUE;
		goto done;
	}

	/*
	 * Check for an exact match.
	 */
	if (zb->zb_objset == record->zi_objset &&
	    zb->zb_object == record->zi_object &&
	    zb->zb_level == record->zi_level &&
	    zb->zb_blkid >= record->zi_start &&
	    zb->zb_blkid <= record->zi_end &&
	    (record->zi_dvas == 0 ||
	    (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) &&
	    error == record->zi_error) {
		matched = B_TRUE;
		goto done;
	}

done:
	if (matched) {
		record->zi_match_count++;
		injected = freq_triggered(record->zi_freq);
	}

	if (injected)
		record->zi_inject_count++;

	return (injected);
}

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa)
			continue;

		if (handler->zi_record.zi_type == type &&
		    strcmp(tag, handler->zi_record.zi_func) == 0) {
			handler->zi_record.zi_match_count++;
			handler->zi_record.zi_inject_count++;
			panic("Panic requested in function %s\n", tag);
		}
	}

	rw_exit(&inject_lock);
}
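
/*
 * For reference: a panic record matches on zi_func (the function name passed
 * as the tag above) and zi_type, so the panic fires at the instrumented call
 * site named by the record. Userland typically registers such a record with
 * zinject's panic option (exact syntax assumed; see zinject(8)).
 */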

/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
			continue;

		if (zio_match_handler(zb, type, ZI_NO_DVA,
		    &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);
	return (ret);
}

/*
 * If this is a physical I/O for a vdev child, determine which DVA it is
 * for. We iterate backwards through the DVAs matching on the offset so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
 */
static int
zio_match_dva(zio_t *zio)
{
	int i = ZI_NO_DVA;

	if (zio->io_bp != NULL && zio->io_vd != NULL &&
	    zio->io_child_type == ZIO_CHILD_VDEV) {
		for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
			dva_t *dva = &zio->io_bp->blk_dva[i];
			uint64_t off = DVA_GET_OFFSET(dva);
			vdev_t *vd = vdev_lookup_top(zio->io_spa,
			    DVA_GET_VDEV(dva));

			/* Compensate for vdev label added to leaves */
			if (zio->io_vd->vdev_ops->vdev_op_leaf)
				off += VDEV_LABEL_START_SIZE;

			if (zio->io_vd == vd && zio->io_offset == off)
				break;
		}
	}

	return (i);
}


/*
 * Determine if the I/O in question should return failure. Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	/*
	 * Ignore I/O not associated with any logical data.
	 */
	if (zio->io_logical == NULL)
		return (0);

	/*
	 * Currently, we only support fault injection on reads.
	 */
	if (zio->io_type != ZIO_TYPE_READ)
		return (0);

	/*
	 * A rebuild I/O has no checksum to verify.
	 */
	if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
			continue;

		/* If this handler matches, return the specified error */
		if (zio_match_handler(&zio->io_logical->io_bookmark,
		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
		    zio_match_dva(zio), &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}
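
/*
 * Note that the match above is against the *logical* zio's bookmark, so a
 * record created for, say, objset 54 / object 7 / level 0 / blkid 0 fires
 * for any read issued on behalf of that logical block, optionally narrowed
 * to specific copies via the zi_dvas bitmask. The objset/object/range
 * fields are normally filled in by the userland tooling from a file or
 * object name.
 */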

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region
 * of the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
	inject_handler_t *handler;
	vdev_t *vd = zio->io_vd;
	uint64_t offset = zio->io_offset;
	int label;
	int ret = 0;

	if (offset >= VDEV_LABEL_START_SIZE &&
	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		uint64_t start = handler->zi_record.zi_start;
		uint64_t end = handler->zi_record.zi_end;

		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
			continue;

		/*
		 * The injection region is the relative offsets within a
		 * vdev label. We must determine the label which is being
		 * updated and adjust our region accordingly.
		 */
		label = vdev_label_number(vd->vdev_psize, offset);
		start = vdev_label_offset(vd->vdev_psize, label, start);
		end = vdev_label_offset(vd->vdev_psize, label, end);

		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
		    (offset >= start && offset <= end)) {
			handler->zi_record.zi_match_count++;
			handler->zi_record.zi_inject_count++;
			ret = error;
			break;
		}
	}
	rw_exit(&inject_lock);
	return (ret);
}

static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
	zio_t *zio = private;
	uint8_t *buffer = data;
	uint_t byte = random_in_range(len);

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	/* flip a single random bit in an abd data buffer */
	buffer[byte] ^= 1 << random_in_range(8);

	return (1);	/* stop after first flip */
}

/* Test if this zio matches the iotype from the injection record. */
static boolean_t
zio_match_iotype(zio_t *zio, uint32_t iotype)
{
	ASSERT3P(zio, !=, NULL);

	/* Unknown iotype, maybe from a newer version of zinject. Reject it. */
	if (iotype >= ZINJECT_IOTYPES)
		return (B_FALSE);

	/* Probe IOs only match IOTYPE_PROBE, regardless of their type. */
	if (zio->io_flags & ZIO_FLAG_PROBE)
		return (iotype == ZINJECT_IOTYPE_PROBE);

	/* Standard IO types, match against ZIO type. */
	if (iotype < ZINJECT_IOTYPE_ALL)
		return (iotype == zio->io_type);

	/* Match any standard IO type. */
	if (iotype == ZINJECT_IOTYPE_ALL)
		return (B_TRUE);

	return (B_FALSE);
}
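
/*
 * Core device-level injection check: walk the handler list for a
 * ZINJECT_DEVICE_FAULT record whose guid matches this vdev and whose
 * injected error matches err1 or err2, honoring the failfast, I/O type,
 * and frequency restrictions in the record. EILSEQ is special-cased to
 * flip a bit in the read buffer rather than return an error, and a
 * record registered with ENXIO forces an EIO result even when neither
 * err1 nor err2 is ENXIO.
 */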
static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	inject_handler_t *handler;
	int ret = 0;

	/*
	 * We skip over faults in the labels unless it's during device open
	 * (i.e. zio == NULL) or a device flush (offset is meaningless). We let
	 * probe IOs through so we can match them to probe inject records.
	 */
	if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH &&
	    !(zio->io_flags & ZIO_FLAG_PROBE)) {
		uint64_t offset = zio->io_offset;

		if (offset < VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
			return (0);
	}

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
			continue;

		if (vd->vdev_guid == handler->zi_record.zi_guid) {
			if (handler->zi_record.zi_failfast &&
			    (zio == NULL || (zio->io_flags &
			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
				continue;
			}

			/* Handle type specific I/O failures */
			if (zio != NULL && !zio_match_iotype(zio,
			    handler->zi_record.zi_iotype))
				continue;

			if (handler->zi_record.zi_error == err1 ||
			    handler->zi_record.zi_error == err2) {
				handler->zi_record.zi_match_count++;

				/*
				 * limit error injection if requested
				 */
				if (!freq_triggered(handler->zi_record.zi_freq))
					continue;

				handler->zi_record.zi_inject_count++;

				/*
				 * For a failed open, pretend like the device
				 * has gone away.
				 */
				if (err1 == ENXIO)
					vd->vdev_stat.vs_aux =
					    VDEV_AUX_OPEN_FAILED;

				/*
				 * Treat these errors as if they had been
				 * retried so that all the appropriate stats
				 * and FMA events are generated.
				 */
				if (!handler->zi_record.zi_failfast &&
				    zio != NULL)
					zio->io_flags |= ZIO_FLAG_IO_RETRY;

				/*
				 * EILSEQ means flip a bit after a read
				 */
				if (handler->zi_record.zi_error == EILSEQ) {
					if (zio == NULL)
						break;

					/* locate buffer data and flip a bit */
					(void) abd_iterate_func(zio->io_abd, 0,
					    zio->io_size, zio_inject_bitflip_cb,
					    zio);
					break;
				}

				ret = handler->zi_record.zi_error;
				break;
			}
			if (handler->zi_record.zi_error == ENXIO) {
				handler->zi_record.zi_match_count++;
				handler->zi_record.zi_inject_count++;
				ret = SET_ERROR(EIO);
				break;
			}
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}
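
/*
 * Convenience wrappers: the single-error form passes INT_MAX as an
 * "impossible" second errno, while the two-error form lets a caller accept
 * either of two injected errnos (e.g. EIO or ENXIO) in a single pass.
 */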
int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
	return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}

/*
 * Simulate hardware that ignores cache flushes. For the requested number
 * of seconds, nix the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		/* Ignore errors not destined for this pool */
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		handler->zi_record.zi_match_count++;

		/*
		 * Positive duration implies # of seconds, negative
		 * a number of txgs
		 */
		if (handler->zi_record.zi_timer == 0) {
			if (handler->zi_record.zi_duration > 0)
				handler->zi_record.zi_timer = ddi_get_lbolt64();
			else
				handler->zi_record.zi_timer = zio->io_txg;
		}

		/* Have a "problem" writing 60% of the time */
		if (random_in_range(100) < 60) {
			handler->zi_record.zi_inject_count++;
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
		break;
	}

	rw_exit(&inject_lock);
}

void
spa_handle_ignored_writes(spa_t *spa)
{
	inject_handler_t *handler;

	if (zio_injection_enabled == 0)
		return;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		handler->zi_record.zi_match_count++;
		handler->zi_record.zi_inject_count++;

		if (handler->zi_record.zi_duration > 0) {
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    ddi_time_after64(
			    (int64_t)handler->zi_record.zi_timer +
			    handler->zi_record.zi_duration * hz,
			    ddi_get_lbolt64()));
		} else {
			/* duration is negative so the subtraction here adds */
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer -
			    handler->zi_record.zi_duration >=
			    spa_syncing_txg(spa));
		}
	}

	rw_exit(&inject_lock);
}
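
/*
 * If a ZINJECT_DELAY_IO handler matches this zio, return the absolute
 * hrtime at which the I/O should be allowed to complete; a return value
 * of zero means the zio should not be delayed. For example, a handler
 * with a single lane and a zi_timer of 10ms that sees three simultaneous
 * requests will hand back targets of now+10ms, now+20ms, and now+30ms.
 */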
hrtime_t
zio_handle_io_delay(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	inject_handler_t *min_handler = NULL;
	hrtime_t min_target = 0;

	rw_enter(&inject_lock, RW_READER);

	/*
	 * inject_delay_count is a subset of zio_injection_enabled that
	 * is only incremented for delay handlers. These checks are
	 * mainly added to remind the reader why we're not explicitly
	 * checking zio_injection_enabled like the other functions.
	 */
	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

	/*
	 * If there aren't any inject delay handlers registered, then we
	 * can short circuit and simply return 0 here. A value of zero
	 * informs zio_delay_interrupt() that this request should not be
	 * delayed. This short circuit keeps us from acquiring the
	 * inject_delay_mutex unnecessarily.
	 */
	if (inject_delay_count == 0) {
		rw_exit(&inject_lock);
		return (0);
	}

	/*
	 * Each inject handler has a number of "lanes" associated with
	 * it. Each lane is able to handle requests independently of one
	 * another, and at a latency defined by the inject handler
	 * record's zi_timer field. Thus, if a handler is configured with
	 * a single lane with a 10ms latency, it will delay requests
	 * such that only a single request is completed every 10ms. So,
	 * if more than one request is attempted per each 10ms interval,
	 * the average latency of the requests will be greater than
	 * 10ms; but if only a single request is submitted each 10ms
	 * interval the average latency will be 10ms.
	 *
	 * We need to acquire this mutex to prevent multiple concurrent
	 * threads being assigned to the same lane of a given inject
	 * handler. The mutex allows us to perform the following two
	 * operations atomically:
	 *
	 *	1. determine the minimum handler and minimum target
	 *	   value of all the possible handlers
	 *	2. update that minimum handler's lane array
	 *
	 * Without atomicity, two (or more) threads could pick the same
	 * lane in step (1), and then conflict with each other in step
	 * (2). This could allow a single lane handler to process
	 * multiple requests simultaneously, which shouldn't be possible.
	 */
	mutex_enter(&inject_delay_mtx);

	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
			continue;

		if (vd->vdev_guid != handler->zi_record.zi_guid)
			continue;

		/* also match on I/O type (e.g., -T read) */
		if (!zio_match_iotype(zio, handler->zi_record.zi_iotype))
			continue;

		/*
		 * Defensive; should never happen as the array allocation
		 * occurs prior to inserting this handler on the list.
		 */
		ASSERT3P(handler->zi_lanes, !=, NULL);

		/*
		 * This should never happen, the zinject command should
		 * prevent a user from setting an IO delay with zero lanes.
		 */
		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

		ASSERT3U(handler->zi_record.zi_nlanes, >,
		    handler->zi_next_lane);

		handler->zi_record.zi_match_count++;

		/* Limit the use of this handler if requested */
		if (!freq_triggered(handler->zi_record.zi_freq))
			continue;

		/*
		 * We want to issue this IO to the lane that will become
		 * idle the soonest, so we compare the soonest this
		 * specific handler can complete the IO with all other
		 * handlers, to find the lowest value of all possible
		 * lanes. We then use this lane to submit the request.
		 *
		 * Since each handler has a constant value for its
		 * delay, we can just use the "next" lane for that
		 * handler; as it will always be the lane with the
		 * lowest value for that particular handler (i.e. the
		 * lane that will become idle the soonest). This saves a
		 * scan of each handler's lanes array.
		 *
		 * There are two cases to consider when determining when
		 * this specific IO request should complete. If this
		 * lane is idle, we want to "submit" the request now so
		 * it will complete after zi_timer milliseconds. Thus,
		 * we set the target to now + zi_timer.
		 *
		 * If the lane is busy, we want this request to complete
		 * zi_timer milliseconds after the lane becomes idle.
		 * Since the 'zi_lanes' array holds the time at which
		 * each lane will become idle, we use that value to
		 * determine when this request should complete.
		 */
		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
		hrtime_t busy = handler->zi_record.zi_timer +
		    handler->zi_lanes[handler->zi_next_lane];
		hrtime_t target = MAX(idle, busy);

		if (min_handler == NULL) {
			min_handler = handler;
			min_target = target;
			continue;
		}

		ASSERT3P(min_handler, !=, NULL);
		ASSERT3U(min_target, !=, 0);

		/*
		 * We don't yet increment the "next lane" variable since
		 * we still might find a lower value lane in another
		 * handler during any remaining iterations. Once we're
		 * sure we've selected the absolute minimum, we'll claim
		 * the lane and increment the handler's "next lane"
		 * field below.
		 */

		if (target < min_target) {
			min_handler = handler;
			min_target = target;
		}
	}

	/*
	 * 'min_handler' will be NULL if no IO delays are registered for
	 * this vdev, otherwise it will point to the handler containing
	 * the lane that will become idle the soonest.
	 */
	if (min_handler != NULL) {
		ASSERT3U(min_target, !=, 0);
		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

		/*
		 * If we've used all possible lanes for this handler,
		 * loop back and start using the first lane again;
		 * otherwise, just increment the lane index.
		 */
		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
		    min_handler->zi_record.zi_nlanes;

		min_handler->zi_record.zi_inject_count++;

	}

	mutex_exit(&inject_delay_mtx);
	rw_exit(&inject_lock);

	return (min_target);
}

static void
zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command)
{
	inject_handler_t *handler;
	hrtime_t delay = 0;
	int id = 0;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers);
	    handler != NULL && handler->zi_record.zi_cmd == command;
	    handler = list_next(&inject_handlers, handler)) {
		ASSERT3P(handler->zi_spa_name, !=, NULL);
		if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) {
			handler->zi_record.zi_match_count++;
			uint64_t pause =
			    SEC2NSEC(handler->zi_record.zi_duration);
			if (pause > elapsed) {
				handler->zi_record.zi_inject_count++;
				delay = pause - elapsed;
			}
			id = handler->zi_id;
			break;
		}
	}

	rw_exit(&inject_lock);

	if (delay) {
		if (command == ZINJECT_DELAY_IMPORT) {
			spa_import_progress_set_notes(spa, "injecting %llu "
			    "sec delay", (u_longlong_t)NSEC2SEC(delay));
		}
		zfs_sleep_until(gethrtime() + delay);
	}
	if (id) {
		/* all done with this one-shot handler */
		zio_clear_fault(id);
	}
}

/*
 * For testing, inject a delay during an import
 */
void
zio_handle_import_delay(spa_t *spa, hrtime_t elapsed)
{
	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT);
}

/*
 * For testing, inject a delay during an export
 */
void
zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
{
	zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
}
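
/*
 * Translate the byte range supplied in the injection record (zi_start and
 * zi_end) into block ids for the target object, shifting further when an
 * indirect block level (zi_level > 0) is requested.
 */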
static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	objset_t *os = NULL;
	dnode_t *dn = NULL;
	int error;

	/*
	 * Obtain the dnode for the object using pool, objset, and object id
	 */
	error = dsl_pool_hold(pool, FTAG, &dp);
	if (error)
		return (error);

	error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
	dsl_pool_rele(dp, FTAG);
	if (error)
		return (error);

	error = dmu_objset_from_ds(ds, &os);
	dsl_dataset_rele(ds, FTAG);
	if (error)
		return (error);

	error = dnode_hold(os, record->zi_object, FTAG, &dn);
	if (error)
		return (error);

	/*
	 * Translate the range into block IDs
	 */
	if (record->zi_start != 0 || record->zi_end != -1ULL) {
		record->zi_start >>= dn->dn_datablkshift;
		record->zi_end >>= dn->dn_datablkshift;
	}
	if (record->zi_level > 0) {
		if (record->zi_level >= dn->dn_nlevels) {
			dnode_rele(dn, FTAG);
			return (SET_ERROR(EDOM));
		}

		if (record->zi_start != 0 || record->zi_end != 0) {
			int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			for (int level = record->zi_level; level > 0; level--) {
				record->zi_start >>= shift;
				record->zi_end >>= shift;
			}
		}
	}

	dnode_rele(dn, FTAG);
	return (0);
}

static boolean_t
zio_pool_handler_exists(const char *name, zinject_type_t command)
{
	boolean_t exists = B_FALSE;

	rw_enter(&inject_lock, RW_READER);
	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (command != handler->zi_record.zi_cmd)
			continue;

		const char *pool = (handler->zi_spa_name != NULL) ?
		    handler->zi_spa_name : spa_name(handler->zi_spa);
		if (strcmp(name, pool) == 0) {
			exists = B_TRUE;
			break;
		}
	}
	rw_exit(&inject_lock);

	return (exists);
}
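
/*
 * Import/export delay handlers (ZINJECT_DELAY_IMPORT/EXPORT) are matched by
 * pool name rather than by spa_t reference, since the pool may not be loaded
 * (or may be about to go away) when the delay fires; they are one-shot and
 * are cleared by zio_handle_pool_delay() once consumed.
 */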

/*
 * Create a new handler for the given record. We add it to the list, adding
 * a reference to the spa_t in the process. We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
	inject_handler_t *handler;
	int error;
	spa_t *spa;

	/*
	 * If this is pool-wide metadata, make sure we unload the corresponding
	 * spa_t, so that the next attempt to load it will trigger the fault.
	 * We call spa_reset() to unload the pool appropriately.
	 */
	if (flags & ZINJECT_UNLOAD_SPA)
		if ((error = spa_reset(name)) != 0)
			return (error);

	if (record->zi_cmd == ZINJECT_DELAY_IO) {
		/*
		 * A value of zero for the number of lanes or for the
		 * delay time doesn't make sense.
		 */
		if (record->zi_timer == 0 || record->zi_nlanes == 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The number of lanes is directly mapped to the size of
		 * an array used by the handler. Thus, to ensure the
		 * user doesn't trigger an allocation that's "too large"
		 * we cap the number of lanes here.
		 */
		if (record->zi_nlanes >= UINT16_MAX)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * If the supplied range was in bytes -- calculate the actual blkid
	 */
	if (flags & ZINJECT_CALC_RANGE) {
		error = zio_calculate_range(name, record);
		if (error != 0)
			return (error);
	}

	if (!(flags & ZINJECT_NULL)) {
		/*
		 * Pool delays for import or export don't take an
		 * injection reference on the spa. Instead they
		 * rely on matching by name.
		 */
		if (record->zi_cmd == ZINJECT_DELAY_IMPORT ||
		    record->zi_cmd == ZINJECT_DELAY_EXPORT) {
			if (record->zi_duration <= 0)
				return (SET_ERROR(EINVAL));
			/*
			 * Only one import | export delay handler per pool.
			 */
			if (zio_pool_handler_exists(name, record->zi_cmd))
				return (SET_ERROR(EEXIST));

			mutex_enter(&spa_namespace_lock);
			boolean_t has_spa = spa_lookup(name) != NULL;
			mutex_exit(&spa_namespace_lock);

			if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa)
				return (SET_ERROR(EEXIST));
			if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa)
				return (SET_ERROR(ENOENT));
			spa = NULL;
		} else {
			/*
			 * spa_inject_addref() will add an injection
			 * reference, which will prevent the pool from
			 * being removed from the namespace while still
			 * allowing it to be unloaded.
			 */
			if ((spa = spa_inject_addref(name)) == NULL)
				return (SET_ERROR(ENOENT));
		}

		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
		handler->zi_spa = spa;	/* note: can be NULL */
		handler->zi_record = *record;

		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			handler->zi_lanes = kmem_zalloc(
			    sizeof (*handler->zi_lanes) *
			    handler->zi_record.zi_nlanes, KM_SLEEP);
			handler->zi_next_lane = 0;
		} else {
			handler->zi_lanes = NULL;
			handler->zi_next_lane = 0;
		}

		if (handler->zi_spa == NULL)
			handler->zi_spa_name = spa_strdup(name);
		else
			handler->zi_spa_name = NULL;

		rw_enter(&inject_lock, RW_WRITER);

		/*
		 * We can't move this increment into the conditional
		 * above because we need to hold the RW_WRITER lock of
		 * inject_lock, and we don't want to hold that while
		 * allocating the handler's zi_lanes array.
		 */
		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			ASSERT3S(inject_delay_count, >=, 0);
			inject_delay_count++;
			ASSERT3S(inject_delay_count, >, 0);
		}

		*id = handler->zi_id = inject_next_id++;
		list_insert_tail(&inject_handlers, handler);
		atomic_inc_32(&zio_injection_enabled);

		rw_exit(&inject_lock);
	}

	/*
	 * Flush the ARC, so that any attempts to read this data will end up
	 * going to the ZIO layer. Note that this is a little overkill, but
	 * we don't have the necessary ARC interfaces to do anything else, and
	 * fault injection isn't a performance critical path.
	 */
	if (flags & ZINJECT_FLUSH_ARC)
		/*
		 * We must use FALSE to ensure arc_flush returns, since
		 * we're not preventing concurrent ARC insertions.
		 */
		arc_flush(NULL, FALSE);

	return (0);
}
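
/*
 * Handler lifecycle, typically driven from the zfs ioctl layer:
 * zio_inject_fault() registers a record and returns its id,
 * zio_inject_list_next() iterates the registered handlers by increasing id,
 * and zio_clear_fault() removes a handler and drops its spa reference.
 */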

/*
 * Returns the next record with an ID greater than that supplied to the
 * function. Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
	inject_handler_t *handler;
	int ret;

	mutex_enter(&spa_namespace_lock);
	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id > *id)
			break;

	if (handler) {
		*record = handler->zi_record;
		*id = handler->zi_id;
		ASSERT(handler->zi_spa || handler->zi_spa_name);
		if (handler->zi_spa != NULL)
			(void) strlcpy(name, spa_name(handler->zi_spa), buflen);
		else
			(void) strlcpy(name, handler->zi_spa_name, buflen);
		ret = 0;
	} else {
		ret = SET_ERROR(ENOENT);
	}

	rw_exit(&inject_lock);
	mutex_exit(&spa_namespace_lock);

	return (ret);
}

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
 */
int
zio_clear_fault(int id)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_WRITER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id == id)
			break;

	if (handler == NULL) {
		rw_exit(&inject_lock);
		return (SET_ERROR(ENOENT));
	}

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3S(inject_delay_count, >, 0);
		inject_delay_count--;
		ASSERT3S(inject_delay_count, >=, 0);
	}

	list_remove(&inject_handlers, handler);
	rw_exit(&inject_lock);

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3P(handler->zi_lanes, !=, NULL);
		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
		    handler->zi_record.zi_nlanes);
	} else {
		ASSERT3P(handler->zi_lanes, ==, NULL);
	}

	if (handler->zi_spa_name != NULL)
		spa_strfree(handler->zi_spa_name);

	if (handler->zi_spa != NULL)
		spa_inject_delref(handler->zi_spa);
	kmem_free(handler, sizeof (inject_handler_t));
	atomic_dec_32(&zio_injection_enabled);

	return (0);
}

void
zio_inject_init(void)
{
	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
	list_create(&inject_handlers, sizeof (inject_handler_t),
	    offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
	list_destroy(&inject_handlers);
	mutex_destroy(&inject_delay_mtx);
	rw_destroy(&inject_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
#endif