1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2017, Intel Corporation. 25 * Copyright (c) 2024, Klara Inc. 26 */ 27 28 /* 29 * ZFS fault injection 30 * 31 * To handle fault injection, we keep track of a series of zinject_record_t 32 * structures which describe which logical block(s) should be injected with a 33 * fault. These are kept in a global list. Each record corresponds to a given 34 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted 35 * or exported while the injection record exists. 36 * 37 * Device level injection is done using the 'zi_guid' field. If this is set, it 38 * means that the error is destined for a particular device, not a piece of 39 * data. 40 * 41 * This is a rather poor data structure and algorithm, but we don't expect more 42 * than a few faults at any one time, so it should be sufficient for our needs. 
*/

#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

/*
 * Number of handlers currently registered.  Incremented atomically by
 * zio_inject_fault() and decremented atomically by zio_clear_fault();
 * a value of zero means no fault injection is active.
 */
uint32_t zio_injection_enabled = 0;

/*
 * Data describing each zinject handler registered on the system, and
 * contains the list node linking the handler in the global zinject
 * handler list.
 */
typedef struct inject_handler {
	/* unique identifier, assigned from inject_next_id */
	int			zi_id;
	/* pool holding the injection reference; NULL for pool delays */
	spa_t			*zi_spa;
	/* pool name; set only for ZINJECT_DELAY_IMPORT / _EXPORT */
	char			*zi_spa_name;
	/* private copy of the caller's injection record */
	zinject_record_t	zi_record;
	/* ZINJECT_DELAY_IO only: time at which each lane becomes idle */
	uint64_t		*zi_lanes;
	/* ZINJECT_DELAY_IO only: index of the next lane to hand out */
	int			zi_next_lane;
	/* linkage on the global inject_handlers list */
	list_node_t		zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above; as well as the inject_delay_count. Any time a
 * handler is inserted or removed from the list, this lock should be
 * taken as a RW_WRITER; and any time traversal is done over the list
 * (without modification to it) this lock should be taken as a RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus modifications to this count must be a RW_WRITER of the
 * inject_lock, and reads of this count must be (at least) a RW_READER
 * of the lock.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(), refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
102 */ 103 static int inject_next_id = 1; 104 105 /* 106 * Test if the requested frequency was triggered 107 */ 108 static boolean_t 109 freq_triggered(uint32_t frequency) 110 { 111 /* 112 * zero implies always (100%) 113 */ 114 if (frequency == 0) 115 return (B_TRUE); 116 117 /* 118 * Note: we still handle legacy (unscaled) frequency values 119 */ 120 uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX; 121 122 return (random_in_range(maximum) < frequency); 123 } 124 125 /* 126 * Returns true if the given record matches the I/O in progress. 127 */ 128 static boolean_t 129 zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, 130 zinject_record_t *record, int error) 131 { 132 /* 133 * Check for a match against the MOS, which is based on type 134 */ 135 if (zb->zb_objset == DMU_META_OBJSET && 136 record->zi_objset == DMU_META_OBJSET && 137 record->zi_object == DMU_META_DNODE_OBJECT) { 138 if (record->zi_type == DMU_OT_NONE || 139 type == record->zi_type) 140 return (freq_triggered(record->zi_freq)); 141 else 142 return (B_FALSE); 143 } 144 145 /* 146 * Check for an exact match. 147 */ 148 if (zb->zb_objset == record->zi_objset && 149 zb->zb_object == record->zi_object && 150 zb->zb_level == record->zi_level && 151 zb->zb_blkid >= record->zi_start && 152 zb->zb_blkid <= record->zi_end && 153 (record->zi_dvas == 0 || 154 (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) && 155 error == record->zi_error) { 156 return (freq_triggered(record->zi_freq)); 157 } 158 159 return (B_FALSE); 160 } 161 162 /* 163 * Panic the system when a config change happens in the function 164 * specified by tag. 
165 */ 166 void 167 zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type) 168 { 169 inject_handler_t *handler; 170 171 rw_enter(&inject_lock, RW_READER); 172 173 for (handler = list_head(&inject_handlers); handler != NULL; 174 handler = list_next(&inject_handlers, handler)) { 175 176 if (spa != handler->zi_spa) 177 continue; 178 179 if (handler->zi_record.zi_type == type && 180 strcmp(tag, handler->zi_record.zi_func) == 0) 181 panic("Panic requested in function %s\n", tag); 182 } 183 184 rw_exit(&inject_lock); 185 } 186 187 /* 188 * Inject a decryption failure. Decryption failures can occur in 189 * both the ARC and the ZIO layers. 190 */ 191 int 192 zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, 193 uint64_t type, int error) 194 { 195 int ret = 0; 196 inject_handler_t *handler; 197 198 rw_enter(&inject_lock, RW_READER); 199 200 for (handler = list_head(&inject_handlers); handler != NULL; 201 handler = list_next(&inject_handlers, handler)) { 202 203 if (spa != handler->zi_spa || 204 handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT) 205 continue; 206 207 if (zio_match_handler(zb, type, ZI_NO_DVA, 208 &handler->zi_record, error)) { 209 ret = error; 210 break; 211 } 212 } 213 214 rw_exit(&inject_lock); 215 return (ret); 216 } 217 218 /* 219 * If this is a physical I/O for a vdev child determine which DVA it is 220 * for. We iterate backwards through the DVAs matching on the offset so 221 * that we end up with ZI_NO_DVA (-1) if we don't find a match. 
*/
static int
zio_match_dva(zio_t *zio)
{
	int i = ZI_NO_DVA;

	/* Only vdev child I/Os that carry a block pointer can be matched. */
	if (zio->io_bp != NULL && zio->io_vd != NULL &&
	    zio->io_child_type == ZIO_CHILD_VDEV) {
		for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
			dva_t *dva = &zio->io_bp->blk_dva[i];
			uint64_t off = DVA_GET_OFFSET(dva);
			vdev_t *vd = vdev_lookup_top(zio->io_spa,
			    DVA_GET_VDEV(dva));

			/* Compensate for vdev label added to leaves */
			if (zio->io_vd->vdev_ops->vdev_op_leaf)
				off += VDEV_LABEL_START_SIZE;

			if (zio->io_vd == vd && zio->io_offset == off)
				break;
		}
	}

	/* Falling off the loop leaves i at -1, i.e. ZI_NO_DVA. */
	return (i);
}


/*
 * Determine if the I/O in question should return failure.  Returns the errno
 * to be returned to the caller.
 *
 * Only ZINJECT_DATA_FAULT handlers registered for the zio's pool are
 * considered; the first one that matches wins and 'error' is returned.
 * Returns 0 when nothing is to be injected.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	/*
	 * Ignore I/O not associated with any logical data.
	 */
	if (zio->io_logical == NULL)
		return (0);

	/*
	 * Currently, we only support fault injection on reads.
	 */
	if (zio->io_type != ZIO_TYPE_READ)
		return (0);

	/*
	 * A rebuild I/O has no checksum to verify.
	 */
	if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
			continue;

		/* If this handler matches, return the specified error */
		if (zio_match_handler(&zio->io_logical->io_bookmark,
		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
		    zio_match_dva(zio), &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region
 * of the vdev label.
 *
 * Returns 'error' if a ZINJECT_LABEL_FAULT handler for this vdev covers
 * the zio's offset, otherwise 0.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
	inject_handler_t *handler;
	vdev_t *vd = zio->io_vd;
	uint64_t offset = zio->io_offset;
	int label;
	int ret = 0;

	/* Offsets between the front and back label areas are not labels. */
	if (offset >= VDEV_LABEL_START_SIZE &&
	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		uint64_t start = handler->zi_record.zi_start;
		uint64_t end = handler->zi_record.zi_end;

		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
			continue;

		/*
		 * The injection region is the relative offsets within a
		 * vdev label.  We must determine the label which is being
		 * updated and adjust our region accordingly.
		 */
		label = vdev_label_number(vd->vdev_psize, offset);
		start = vdev_label_offset(vd->vdev_psize, label, start);
		end = vdev_label_offset(vd->vdev_psize, label, end);

		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
		    (offset >= start && offset <= end)) {
			ret = error;
			break;
		}
	}
	rw_exit(&inject_lock);
	return (ret);
}

/*
 * Callback handed to abd_iterate_func(): flips one randomly chosen bit
 * in the buffer chunk it is given.  'private' is the zio being corrupted.
 * Always returns 1 so that only a single chunk is modified.
 */
static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
	zio_t *zio = private;
	uint8_t *buffer = data;
	uint_t byte = random_in_range(len);

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	/* flip a single random bit in an abd data buffer */
	buffer[byte] ^= 1 << random_in_range(8);

	return (1);	/* stop after first flip */
}

/*
 * Common implementation of device-level fault injection.
 *
 * Walks the ZINJECT_DEVICE_FAULT handlers whose guid matches 'vd' and
 * returns the errno to inject, or 0 for none.  A handler applies when its
 * zi_error equals 'err1' or 'err2'.  'zio' may be NULL, which the comment
 * below identifies as the device-open case.
 */
static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	inject_handler_t *handler;
	int ret = 0;

	/*
	 * We skip over faults in the labels unless it's during device open
	 * (i.e. zio == NULL) or a device flush (offset is meaningless)
	 */
	if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) {
		uint64_t offset = zio->io_offset;

		if (offset < VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
			return (0);
	}

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
			continue;

		if (vd->vdev_guid == handler->zi_record.zi_guid) {
			/*
			 * Fail-fast handlers do not fire when there is no
			 * zio or when the zio is a retry / try-hard I/O.
			 */
			if (handler->zi_record.zi_failfast &&
			    (zio == NULL || (zio->io_flags &
			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
				continue;
			}

			/* Handle type specific I/O failures */
			if (zio != NULL &&
			    handler->zi_record.zi_iotype != ZIO_TYPES &&
			    handler->zi_record.zi_iotype != zio->io_type)
				continue;

			if (handler->zi_record.zi_error == err1 ||
			    handler->zi_record.zi_error == err2) {
				/*
				 * limit error injection if requested
				 */
				if (!freq_triggered(handler->zi_record.zi_freq))
					continue;

				/*
				 * For a failed open, pretend like the device
				 * has gone away.
				 */
				if (err1 == ENXIO)
					vd->vdev_stat.vs_aux =
					    VDEV_AUX_OPEN_FAILED;

				/*
				 * Treat these errors as if they had been
				 * retried so that all the appropriate stats
				 * and FMA events are generated.
				 */
				if (!handler->zi_record.zi_failfast &&
				    zio != NULL)
					zio->io_flags |= ZIO_FLAG_IO_RETRY;

				/*
				 * EILSEQ means flip a bit after a read
				 */
				if (handler->zi_record.zi_error == EILSEQ) {
					/* nothing to corrupt without a zio */
					if (zio == NULL)
						break;

					/* locate buffer data and flip a bit */
					(void) abd_iterate_func(zio->io_abd, 0,
					    zio->io_size, zio_inject_bitflip_cb,
					    zio);
					/* ret stays 0: the I/O "succeeds" */
					break;
				}

				ret = handler->zi_record.zi_error;
				break;
			}
			/*
			 * Only reached when the handler's error matched
			 * neither err1 nor err2: an ENXIO handler on this
			 * vdev still fails the request, with EIO.
			 */
			if (handler->zi_record.zi_error == ENXIO) {
				ret = SET_ERROR(EIO);
				break;
			}
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

/*
 * Device fault injection for a single errno.  INT_MAX is passed as the
 * second code; presumably no record ever carries that value as zi_error,
 * so only 'error' can match.
 */
int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
	return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

/*
 * Device fault injection where either of two errnos is acceptable.
 */
int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
	return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}

/*
 * Simulate hardware that ignores cache flushes.  For requested number
 * of seconds nix the actual writing to disk.
470 */ 471 void 472 zio_handle_ignored_writes(zio_t *zio) 473 { 474 inject_handler_t *handler; 475 476 rw_enter(&inject_lock, RW_READER); 477 478 for (handler = list_head(&inject_handlers); handler != NULL; 479 handler = list_next(&inject_handlers, handler)) { 480 481 /* Ignore errors not destined for this pool */ 482 if (zio->io_spa != handler->zi_spa || 483 handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) 484 continue; 485 486 /* 487 * Positive duration implies # of seconds, negative 488 * a number of txgs 489 */ 490 if (handler->zi_record.zi_timer == 0) { 491 if (handler->zi_record.zi_duration > 0) 492 handler->zi_record.zi_timer = ddi_get_lbolt64(); 493 else 494 handler->zi_record.zi_timer = zio->io_txg; 495 } 496 497 /* Have a "problem" writing 60% of the time */ 498 if (random_in_range(100) < 60) 499 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 500 break; 501 } 502 503 rw_exit(&inject_lock); 504 } 505 506 void 507 spa_handle_ignored_writes(spa_t *spa) 508 { 509 inject_handler_t *handler; 510 511 if (zio_injection_enabled == 0) 512 return; 513 514 rw_enter(&inject_lock, RW_READER); 515 516 for (handler = list_head(&inject_handlers); handler != NULL; 517 handler = list_next(&inject_handlers, handler)) { 518 519 if (spa != handler->zi_spa || 520 handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) 521 continue; 522 523 if (handler->zi_record.zi_duration > 0) { 524 VERIFY(handler->zi_record.zi_timer == 0 || 525 ddi_time_after64( 526 (int64_t)handler->zi_record.zi_timer + 527 handler->zi_record.zi_duration * hz, 528 ddi_get_lbolt64())); 529 } else { 530 /* duration is negative so the subtraction here adds */ 531 VERIFY(handler->zi_record.zi_timer == 0 || 532 handler->zi_record.zi_timer - 533 handler->zi_record.zi_duration >= 534 spa_syncing_txg(spa)); 535 } 536 } 537 538 rw_exit(&inject_lock); 539 } 540 541 hrtime_t 542 zio_handle_io_delay(zio_t *zio) 543 { 544 vdev_t *vd = zio->io_vd; 545 inject_handler_t *min_handler = NULL; 546 hrtime_t min_target = 0; 547 548 
rw_enter(&inject_lock, RW_READER); 549 550 /* 551 * inject_delay_count is a subset of zio_injection_enabled that 552 * is only incremented for delay handlers. These checks are 553 * mainly added to remind the reader why we're not explicitly 554 * checking zio_injection_enabled like the other functions. 555 */ 556 IMPLY(inject_delay_count > 0, zio_injection_enabled > 0); 557 IMPLY(zio_injection_enabled == 0, inject_delay_count == 0); 558 559 /* 560 * If there aren't any inject delay handlers registered, then we 561 * can short circuit and simply return 0 here. A value of zero 562 * informs zio_delay_interrupt() that this request should not be 563 * delayed. This short circuit keeps us from acquiring the 564 * inject_delay_mutex unnecessarily. 565 */ 566 if (inject_delay_count == 0) { 567 rw_exit(&inject_lock); 568 return (0); 569 } 570 571 /* 572 * Each inject handler has a number of "lanes" associated with 573 * it. Each lane is able to handle requests independently of one 574 * another, and at a latency defined by the inject handler 575 * record's zi_timer field. Thus if a handler in configured with 576 * a single lane with a 10ms latency, it will delay requests 577 * such that only a single request is completed every 10ms. So, 578 * if more than one request is attempted per each 10ms interval, 579 * the average latency of the requests will be greater than 580 * 10ms; but if only a single request is submitted each 10ms 581 * interval the average latency will be 10ms. 582 * 583 * We need to acquire this mutex to prevent multiple concurrent 584 * threads being assigned to the same lane of a given inject 585 * handler. The mutex allows us to perform the following two 586 * operations atomically: 587 * 588 * 1. determine the minimum handler and minimum target 589 * value of all the possible handlers 590 * 2. 
update that minimum handler's lane array 591 * 592 * Without atomicity, two (or more) threads could pick the same 593 * lane in step (1), and then conflict with each other in step 594 * (2). This could allow a single lane handler to process 595 * multiple requests simultaneously, which shouldn't be possible. 596 */ 597 mutex_enter(&inject_delay_mtx); 598 599 for (inject_handler_t *handler = list_head(&inject_handlers); 600 handler != NULL; handler = list_next(&inject_handlers, handler)) { 601 if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) 602 continue; 603 604 if (!freq_triggered(handler->zi_record.zi_freq)) 605 continue; 606 607 if (vd->vdev_guid != handler->zi_record.zi_guid) 608 continue; 609 610 if (handler->zi_record.zi_iotype != ZIO_TYPES && 611 handler->zi_record.zi_iotype != zio->io_type) 612 continue; 613 614 /* 615 * Defensive; should never happen as the array allocation 616 * occurs prior to inserting this handler on the list. 617 */ 618 ASSERT3P(handler->zi_lanes, !=, NULL); 619 620 /* 621 * This should never happen, the zinject command should 622 * prevent a user from setting an IO delay with zero lanes. 623 */ 624 ASSERT3U(handler->zi_record.zi_nlanes, !=, 0); 625 626 ASSERT3U(handler->zi_record.zi_nlanes, >, 627 handler->zi_next_lane); 628 629 /* 630 * We want to issue this IO to the lane that will become 631 * idle the soonest, so we compare the soonest this 632 * specific handler can complete the IO with all other 633 * handlers, to find the lowest value of all possible 634 * lanes. We then use this lane to submit the request. 635 * 636 * Since each handler has a constant value for its 637 * delay, we can just use the "next" lane for that 638 * handler; as it will always be the lane with the 639 * lowest value for that particular handler (i.e. the 640 * lane that will become idle the soonest). This saves a 641 * scan of each handler's lanes array. 
642 * 643 * There's two cases to consider when determining when 644 * this specific IO request should complete. If this 645 * lane is idle, we want to "submit" the request now so 646 * it will complete after zi_timer milliseconds. Thus, 647 * we set the target to now + zi_timer. 648 * 649 * If the lane is busy, we want this request to complete 650 * zi_timer milliseconds after the lane becomes idle. 651 * Since the 'zi_lanes' array holds the time at which 652 * each lane will become idle, we use that value to 653 * determine when this request should complete. 654 */ 655 hrtime_t idle = handler->zi_record.zi_timer + gethrtime(); 656 hrtime_t busy = handler->zi_record.zi_timer + 657 handler->zi_lanes[handler->zi_next_lane]; 658 hrtime_t target = MAX(idle, busy); 659 660 if (min_handler == NULL) { 661 min_handler = handler; 662 min_target = target; 663 continue; 664 } 665 666 ASSERT3P(min_handler, !=, NULL); 667 ASSERT3U(min_target, !=, 0); 668 669 /* 670 * We don't yet increment the "next lane" variable since 671 * we still might find a lower value lane in another 672 * handler during any remaining iterations. Once we're 673 * sure we've selected the absolute minimum, we'll claim 674 * the lane and increment the handler's "next lane" 675 * field below. 676 */ 677 678 if (target < min_target) { 679 min_handler = handler; 680 min_target = target; 681 } 682 } 683 684 /* 685 * 'min_handler' will be NULL if no IO delays are registered for 686 * this vdev, otherwise it will point to the handler containing 687 * the lane that will become idle the soonest. 688 */ 689 if (min_handler != NULL) { 690 ASSERT3U(min_target, !=, 0); 691 min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; 692 693 /* 694 * If we've used all possible lanes for this handler, 695 * loop back and start using the first lane again; 696 * otherwise, just increment the lane index. 
697 */ 698 min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % 699 min_handler->zi_record.zi_nlanes; 700 } 701 702 mutex_exit(&inject_delay_mtx); 703 rw_exit(&inject_lock); 704 705 return (min_target); 706 } 707 708 static void 709 zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command) 710 { 711 inject_handler_t *handler; 712 hrtime_t delay = 0; 713 int id = 0; 714 715 rw_enter(&inject_lock, RW_READER); 716 717 for (handler = list_head(&inject_handlers); 718 handler != NULL && handler->zi_record.zi_cmd == command; 719 handler = list_next(&inject_handlers, handler)) { 720 ASSERT3P(handler->zi_spa_name, !=, NULL); 721 if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) { 722 uint64_t pause = 723 SEC2NSEC(handler->zi_record.zi_duration); 724 if (pause > elapsed) { 725 delay = pause - elapsed; 726 } 727 id = handler->zi_id; 728 break; 729 } 730 } 731 732 rw_exit(&inject_lock); 733 734 if (delay) { 735 if (command == ZINJECT_DELAY_IMPORT) { 736 spa_import_progress_set_notes(spa, "injecting %llu " 737 "sec delay", (u_longlong_t)NSEC2SEC(delay)); 738 } 739 zfs_sleep_until(gethrtime() + delay); 740 } 741 if (id) { 742 /* all done with this one-shot handler */ 743 zio_clear_fault(id); 744 } 745 } 746 747 /* 748 * For testing, inject a delay during an import 749 */ 750 void 751 zio_handle_import_delay(spa_t *spa, hrtime_t elapsed) 752 { 753 zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT); 754 } 755 756 /* 757 * For testing, inject a delay during an export 758 */ 759 void 760 zio_handle_export_delay(spa_t *spa, hrtime_t elapsed) 761 { 762 zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT); 763 } 764 765 static int 766 zio_calculate_range(const char *pool, zinject_record_t *record) 767 { 768 dsl_pool_t *dp; 769 dsl_dataset_t *ds; 770 objset_t *os = NULL; 771 dnode_t *dn = NULL; 772 int error; 773 774 /* 775 * Obtain the dnode for object using pool, objset, and object 776 */ 777 error = dsl_pool_hold(pool, FTAG, &dp); 778 if 
(error) 779 return (error); 780 781 error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds); 782 dsl_pool_rele(dp, FTAG); 783 if (error) 784 return (error); 785 786 error = dmu_objset_from_ds(ds, &os); 787 dsl_dataset_rele(ds, FTAG); 788 if (error) 789 return (error); 790 791 error = dnode_hold(os, record->zi_object, FTAG, &dn); 792 if (error) 793 return (error); 794 795 /* 796 * Translate the range into block IDs 797 */ 798 if (record->zi_start != 0 || record->zi_end != -1ULL) { 799 record->zi_start >>= dn->dn_datablkshift; 800 record->zi_end >>= dn->dn_datablkshift; 801 } 802 if (record->zi_level > 0) { 803 if (record->zi_level >= dn->dn_nlevels) { 804 dnode_rele(dn, FTAG); 805 return (SET_ERROR(EDOM)); 806 } 807 808 if (record->zi_start != 0 || record->zi_end != 0) { 809 int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 810 811 for (int level = record->zi_level; level > 0; level--) { 812 record->zi_start >>= shift; 813 record->zi_end >>= shift; 814 } 815 } 816 } 817 818 dnode_rele(dn, FTAG); 819 return (0); 820 } 821 822 static boolean_t 823 zio_pool_handler_exists(const char *name, zinject_type_t command) 824 { 825 boolean_t exists = B_FALSE; 826 827 rw_enter(&inject_lock, RW_READER); 828 for (inject_handler_t *handler = list_head(&inject_handlers); 829 handler != NULL; handler = list_next(&inject_handlers, handler)) { 830 if (command != handler->zi_record.zi_cmd) 831 continue; 832 833 const char *pool = (handler->zi_spa_name != NULL) ? 834 handler->zi_spa_name : spa_name(handler->zi_spa); 835 if (strcmp(name, pool) == 0) { 836 exists = B_TRUE; 837 break; 838 } 839 } 840 rw_exit(&inject_lock); 841 842 return (exists); 843 } 844 /* 845 * Create a new handler for the given record. We add it to the list, adding 846 * a reference to the spa_t in the process. We increment zio_injection_enabled, 847 * which is the switch to trigger all fault injection. 
848 */ 849 int 850 zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) 851 { 852 inject_handler_t *handler; 853 int error; 854 spa_t *spa; 855 856 /* 857 * If this is pool-wide metadata, make sure we unload the corresponding 858 * spa_t, so that the next attempt to load it will trigger the fault. 859 * We call spa_reset() to unload the pool appropriately. 860 */ 861 if (flags & ZINJECT_UNLOAD_SPA) 862 if ((error = spa_reset(name)) != 0) 863 return (error); 864 865 if (record->zi_cmd == ZINJECT_DELAY_IO) { 866 /* 867 * A value of zero for the number of lanes or for the 868 * delay time doesn't make sense. 869 */ 870 if (record->zi_timer == 0 || record->zi_nlanes == 0) 871 return (SET_ERROR(EINVAL)); 872 873 /* 874 * The number of lanes is directly mapped to the size of 875 * an array used by the handler. Thus, to ensure the 876 * user doesn't trigger an allocation that's "too large" 877 * we cap the number of lanes here. 878 */ 879 if (record->zi_nlanes >= UINT16_MAX) 880 return (SET_ERROR(EINVAL)); 881 } 882 883 /* 884 * If the supplied range was in bytes -- calculate the actual blkid 885 */ 886 if (flags & ZINJECT_CALC_RANGE) { 887 error = zio_calculate_range(name, record); 888 if (error != 0) 889 return (error); 890 } 891 892 if (!(flags & ZINJECT_NULL)) { 893 /* 894 * Pool delays for import or export don't take an 895 * injection reference on the spa. Instead they 896 * rely on matching by name. 897 */ 898 if (record->zi_cmd == ZINJECT_DELAY_IMPORT || 899 record->zi_cmd == ZINJECT_DELAY_EXPORT) { 900 if (record->zi_duration <= 0) 901 return (SET_ERROR(EINVAL)); 902 /* 903 * Only one import | export delay handler per pool. 
904 */ 905 if (zio_pool_handler_exists(name, record->zi_cmd)) 906 return (SET_ERROR(EEXIST)); 907 908 mutex_enter(&spa_namespace_lock); 909 boolean_t has_spa = spa_lookup(name) != NULL; 910 mutex_exit(&spa_namespace_lock); 911 912 if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa) 913 return (SET_ERROR(EEXIST)); 914 if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa) 915 return (SET_ERROR(ENOENT)); 916 spa = NULL; 917 } else { 918 /* 919 * spa_inject_ref() will add an injection reference, 920 * which will prevent the pool from being removed 921 * from the namespace while still allowing it to be 922 * unloaded. 923 */ 924 if ((spa = spa_inject_addref(name)) == NULL) 925 return (SET_ERROR(ENOENT)); 926 } 927 928 handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); 929 handler->zi_spa = spa; /* note: can be NULL */ 930 handler->zi_record = *record; 931 932 if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { 933 handler->zi_lanes = kmem_zalloc( 934 sizeof (*handler->zi_lanes) * 935 handler->zi_record.zi_nlanes, KM_SLEEP); 936 handler->zi_next_lane = 0; 937 } else { 938 handler->zi_lanes = NULL; 939 handler->zi_next_lane = 0; 940 } 941 942 if (handler->zi_spa == NULL) 943 handler->zi_spa_name = spa_strdup(name); 944 else 945 handler->zi_spa_name = NULL; 946 947 rw_enter(&inject_lock, RW_WRITER); 948 949 /* 950 * We can't move this increment into the conditional 951 * above because we need to hold the RW_WRITER lock of 952 * inject_lock, and we don't want to hold that while 953 * allocating the handler's zi_lanes array. 
954 */ 955 if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { 956 ASSERT3S(inject_delay_count, >=, 0); 957 inject_delay_count++; 958 ASSERT3S(inject_delay_count, >, 0); 959 } 960 961 *id = handler->zi_id = inject_next_id++; 962 list_insert_tail(&inject_handlers, handler); 963 atomic_inc_32(&zio_injection_enabled); 964 965 rw_exit(&inject_lock); 966 } 967 968 /* 969 * Flush the ARC, so that any attempts to read this data will end up 970 * going to the ZIO layer. Note that this is a little overkill, but 971 * we don't have the necessary ARC interfaces to do anything else, and 972 * fault injection isn't a performance critical path. 973 */ 974 if (flags & ZINJECT_FLUSH_ARC) 975 /* 976 * We must use FALSE to ensure arc_flush returns, since 977 * we're not preventing concurrent ARC insertions. 978 */ 979 arc_flush(NULL, FALSE); 980 981 return (0); 982 } 983 984 /* 985 * Returns the next record with an ID greater than that supplied to the 986 * function. Used to iterate over all handlers in the system. 987 */ 988 int 989 zio_inject_list_next(int *id, char *name, size_t buflen, 990 zinject_record_t *record) 991 { 992 inject_handler_t *handler; 993 int ret; 994 995 mutex_enter(&spa_namespace_lock); 996 rw_enter(&inject_lock, RW_READER); 997 998 for (handler = list_head(&inject_handlers); handler != NULL; 999 handler = list_next(&inject_handlers, handler)) 1000 if (handler->zi_id > *id) 1001 break; 1002 1003 if (handler) { 1004 *record = handler->zi_record; 1005 *id = handler->zi_id; 1006 ASSERT(handler->zi_spa || handler->zi_spa_name); 1007 if (handler->zi_spa != NULL) 1008 (void) strlcpy(name, spa_name(handler->zi_spa), buflen); 1009 else 1010 (void) strlcpy(name, handler->zi_spa_name, buflen); 1011 ret = 0; 1012 } else { 1013 ret = SET_ERROR(ENOENT); 1014 } 1015 1016 rw_exit(&inject_lock); 1017 mutex_exit(&spa_namespace_lock); 1018 1019 return (ret); 1020 } 1021 1022 /* 1023 * Clear the fault handler with the given identifier, or return ENOENT if none 1024 * exists. 
1025 */ 1026 int 1027 zio_clear_fault(int id) 1028 { 1029 inject_handler_t *handler; 1030 1031 rw_enter(&inject_lock, RW_WRITER); 1032 1033 for (handler = list_head(&inject_handlers); handler != NULL; 1034 handler = list_next(&inject_handlers, handler)) 1035 if (handler->zi_id == id) 1036 break; 1037 1038 if (handler == NULL) { 1039 rw_exit(&inject_lock); 1040 return (SET_ERROR(ENOENT)); 1041 } 1042 1043 if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { 1044 ASSERT3S(inject_delay_count, >, 0); 1045 inject_delay_count--; 1046 ASSERT3S(inject_delay_count, >=, 0); 1047 } 1048 1049 list_remove(&inject_handlers, handler); 1050 rw_exit(&inject_lock); 1051 1052 if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { 1053 ASSERT3P(handler->zi_lanes, !=, NULL); 1054 kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) * 1055 handler->zi_record.zi_nlanes); 1056 } else { 1057 ASSERT3P(handler->zi_lanes, ==, NULL); 1058 } 1059 1060 if (handler->zi_spa_name != NULL) 1061 spa_strfree(handler->zi_spa_name); 1062 1063 if (handler->zi_spa != NULL) 1064 spa_inject_delref(handler->zi_spa); 1065 kmem_free(handler, sizeof (inject_handler_t)); 1066 atomic_dec_32(&zio_injection_enabled); 1067 1068 return (0); 1069 } 1070 1071 void 1072 zio_inject_init(void) 1073 { 1074 rw_init(&inject_lock, NULL, RW_DEFAULT, NULL); 1075 mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL); 1076 list_create(&inject_handlers, sizeof (inject_handler_t), 1077 offsetof(inject_handler_t, zi_link)); 1078 } 1079 1080 void 1081 zio_inject_fini(void) 1082 { 1083 list_destroy(&inject_handlers); 1084 mutex_destroy(&inject_delay_mtx); 1085 rw_destroy(&inject_lock); 1086 } 1087 1088 #if defined(_KERNEL) 1089 EXPORT_SYMBOL(zio_injection_enabled); 1090 EXPORT_SYMBOL(zio_inject_fault); 1091 EXPORT_SYMBOL(zio_inject_list_next); 1092 EXPORT_SYMBOL(zio_clear_fault); 1093 EXPORT_SYMBOL(zio_handle_fault_injection); 1094 EXPORT_SYMBOL(zio_handle_device_injection); 1095 EXPORT_SYMBOL(zio_handle_label_injection); 1096 
#endif 1097