/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2024, Klara Inc.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault. These are kept in a global list. Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device level injection is done using the 'zi_guid' field. If this is set, it
 * means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */

#include <sys/arc.h>
#include <sys/zio.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

uint32_t zio_injection_enabled = 0;

/*
 * Data describing each zinject handler registered on the system; it also
 * contains the list node linking the handler into the global zinject
 * handler list.
 */
typedef struct inject_handler {
        int zi_id;
        spa_t *zi_spa;
        char *zi_spa_name;      /* ZINJECT_DELAY_IMPORT only */
        zinject_record_t zi_record;
        uint64_t *zi_lanes;
        int zi_next_lane;
        list_node_t zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above; as well as the inject_delay_count. Any time a
 * handler is inserted or removed from the list, this lock should be
 * taken as a RW_WRITER; and any time traversal is done over the list
 * (without modification to it) this lock should be taken as a RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus modifications to this count must be a RW_WRITER of the
 * inject_lock, and reads of this count must be (at least) a RW_READER
 * of the lock.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(), refer to the comment
 * in that function for more details.
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Test if the requested frequency was triggered
 */
static boolean_t
freq_triggered(uint32_t frequency)
{
        /*
         * zero implies always (100%)
         */
        if (frequency == 0)
                return (B_TRUE);

        /*
         * Note: we still handle legacy (unscaled) frequency values
         */
        uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;

        return (random_in_range(maximum) < frequency);
}
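
/*
 * For example, a legacy frequency value of 25 is treated as 25 out of
 * 100 (roughly a 25% chance of triggering), while values above 100 are
 * assumed to be pre-scaled against ZI_PERCENTAGE_MAX, which allows
 * callers to request sub-1% injection rates.
 */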

/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
        /*
         * Check for a match against the MOS, which is based on type
         */
        if (zb->zb_objset == DMU_META_OBJSET &&
            record->zi_objset == DMU_META_OBJSET &&
            record->zi_object == DMU_META_DNODE_OBJECT) {
                if (record->zi_type == DMU_OT_NONE ||
                    type == record->zi_type)
                        return (freq_triggered(record->zi_freq));
                else
                        return (B_FALSE);
        }

        /*
         * Check for an exact match.
         */
        if (zb->zb_objset == record->zi_objset &&
            zb->zb_object == record->zi_object &&
            zb->zb_level == record->zi_level &&
            zb->zb_blkid >= record->zi_start &&
            zb->zb_blkid <= record->zi_end &&
            (record->zi_dvas == 0 ||
            (dva != ZI_NO_DVA && (record->zi_dvas & (1ULL << dva)))) &&
            error == record->zi_error) {
                return (freq_triggered(record->zi_freq));
        }

        return (B_FALSE);
}

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa)
                        continue;

                if (handler->zi_record.zi_type == type &&
                    strcmp(tag, handler->zi_record.zi_func) == 0)
                        panic("Panic requested in function %s\n", tag);
        }

        rw_exit(&inject_lock);
}

/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
        int ret = 0;
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
                        continue;

                if (zio_match_handler(zb, type, ZI_NO_DVA,
                    &handler->zi_record, error)) {
                        ret = error;
                        break;
                }
        }

        rw_exit(&inject_lock);
        return (ret);
}

/*
 * If this is a physical I/O for a vdev child, determine which DVA it is
 * for. We iterate backwards through the DVAs matching on the offset so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
 */
static int
zio_match_dva(zio_t *zio)
{
        int i = ZI_NO_DVA;

        if (zio->io_bp != NULL && zio->io_vd != NULL &&
            zio->io_child_type == ZIO_CHILD_VDEV) {
                for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
                        dva_t *dva = &zio->io_bp->blk_dva[i];
                        uint64_t off = DVA_GET_OFFSET(dva);
                        vdev_t *vd = vdev_lookup_top(zio->io_spa,
                            DVA_GET_VDEV(dva));

                        /* Compensate for vdev label added to leaves */
                        if (zio->io_vd->vdev_ops->vdev_op_leaf)
                                off += VDEV_LABEL_START_SIZE;

                        if (zio->io_vd == vd && zio->io_offset == off)
                                break;
                }
        }

        return (i);
}
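
/*
 * The DVA index computed here feeds zio_match_handler(), where zi_dvas
 * acts as a bitmask: with, say, only bits 0 and 1 set, the handler
 * matches I/O issued against DVA 0 or DVA 1 of a block pointer and
 * leaves any third copy untouched, so individual copies of replicated
 * blocks can be damaged selectively.
 */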

/*
 * Determine if the I/O in question should return failure. Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
        int ret = 0;
        inject_handler_t *handler;

        /*
         * Ignore I/O not associated with any logical data.
         */
        if (zio->io_logical == NULL)
                return (0);

        /*
         * Currently, we only support fault injection on reads.
         */
        if (zio->io_type != ZIO_TYPE_READ)
                return (0);

        /*
         * A rebuild I/O has no checksum to verify.
         */
        if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
                return (0);

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
                if (zio->io_spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
                        continue;

                /* If this handler matches, return the specified error */
                if (zio_match_handler(&zio->io_logical->io_bookmark,
                    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
                    zio_match_dva(zio), &handler->zi_record, error)) {
                        ret = error;
                        break;
                }
        }

        rw_exit(&inject_lock);

        return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region of
 * the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
        inject_handler_t *handler;
        vdev_t *vd = zio->io_vd;
        uint64_t offset = zio->io_offset;
        int label;
        int ret = 0;

        if (offset >= VDEV_LABEL_START_SIZE &&
            offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
                return (0);

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {
                uint64_t start = handler->zi_record.zi_start;
                uint64_t end = handler->zi_record.zi_end;

                if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
                        continue;

                /*
                 * The injection region is specified as relative offsets
                 * within a vdev label. We must determine the label which
                 * is being updated and adjust our region accordingly.
                 */
                label = vdev_label_number(vd->vdev_psize, offset);
                start = vdev_label_offset(vd->vdev_psize, label, start);
                end = vdev_label_offset(vd->vdev_psize, label, end);

                if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
                    (offset >= start && offset <= end)) {
                        ret = error;
                        break;
                }
        }
        rw_exit(&inject_lock);
        return (ret);
}
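
/*
 * Because each vdev keeps four copies of its label (two at the front of
 * the device and two at the end), the same relative range can match a
 * write to any of them; vdev_label_number() and vdev_label_offset()
 * above translate the relative offsets to absolute offsets for
 * whichever label is currently being updated.
 */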

static int
zio_inject_bitflip_cb(void *data, size_t len, void *private)
{
        zio_t *zio = private;
        uint8_t *buffer = data;
        uint_t byte = random_in_range(len);

        ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

        /* flip a single random bit in an abd data buffer */
        buffer[byte] ^= 1 << random_in_range(8);

        return (1);     /* stop after first flip */
}

static int
zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
{
        inject_handler_t *handler;
        int ret = 0;

        /*
         * We skip over faults in the labels unless it's during device open
         * (i.e. zio == NULL) or a device flush (offset is meaningless)
         */
        if (zio != NULL && zio->io_type != ZIO_TYPE_FLUSH) {
                uint64_t offset = zio->io_offset;

                if (offset < VDEV_LABEL_START_SIZE ||
                    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
                        return (0);
        }

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
                        continue;

                if (vd->vdev_guid == handler->zi_record.zi_guid) {
                        if (handler->zi_record.zi_failfast &&
                            (zio == NULL || (zio->io_flags &
                            (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
                                continue;
                        }

                        /* Handle type specific I/O failures */
                        if (zio != NULL &&
                            handler->zi_record.zi_iotype != ZIO_TYPES &&
                            handler->zi_record.zi_iotype != zio->io_type)
                                continue;

                        if (handler->zi_record.zi_error == err1 ||
                            handler->zi_record.zi_error == err2) {
                                /*
                                 * limit error injection if requested
                                 */
                                if (!freq_triggered(handler->zi_record.zi_freq))
                                        continue;

                                /*
                                 * For a failed open, pretend like the device
                                 * has gone away.
                                 */
                                if (err1 == ENXIO)
                                        vd->vdev_stat.vs_aux =
                                            VDEV_AUX_OPEN_FAILED;

                                /*
                                 * Treat these errors as if they had been
                                 * retried so that all the appropriate stats
                                 * and FMA events are generated.
                                 */
                                if (!handler->zi_record.zi_failfast &&
                                    zio != NULL)
                                        zio->io_flags |= ZIO_FLAG_IO_RETRY;

                                /*
                                 * EILSEQ means flip a bit after a read
                                 */
                                if (handler->zi_record.zi_error == EILSEQ) {
                                        if (zio == NULL)
                                                break;

                                        /* locate buffer data and flip a bit */
                                        (void) abd_iterate_func(zio->io_abd, 0,
                                            zio->io_size, zio_inject_bitflip_cb,
                                            zio);
                                        break;
                                }

                                ret = handler->zi_record.zi_error;
                                break;
                        }
                        if (handler->zi_record.zi_error == ENXIO) {
                                ret = SET_ERROR(EIO);
                                break;
                        }
                }
        }

        rw_exit(&inject_lock);

        return (ret);
}

int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
        return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
}

int
zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
{
        return (zio_handle_device_injection_impl(vd, zio, err1, err2));
}
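
/*
 * Note that a device handler registered with an EILSEQ error does not
 * fail the I/O at all; instead zio_inject_bitflip_cb() above flips a
 * single random bit in the data that was read, so the checksum
 * verification and repair paths can be exercised without marking the
 * device itself as faulty.
 */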

/*
 * Simulate hardware that ignores cache flushes. For the requested number
 * of seconds, nix the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                /* Ignore errors not destined for this pool */
                if (zio->io_spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;

                /*
                 * A positive duration implies a number of seconds; a
                 * negative duration implies a number of txgs.
                 */
                if (handler->zi_record.zi_timer == 0) {
                        if (handler->zi_record.zi_duration > 0)
                                handler->zi_record.zi_timer = ddi_get_lbolt64();
                        else
                                handler->zi_record.zi_timer = zio->io_txg;
                }

                /* Have a "problem" writing 60% of the time */
                if (random_in_range(100) < 60)
                        zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
                break;
        }

        rw_exit(&inject_lock);
}

void
spa_handle_ignored_writes(spa_t *spa)
{
        inject_handler_t *handler;

        if (zio_injection_enabled == 0)
                return;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler)) {

                if (spa != handler->zi_spa ||
                    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
                        continue;

                if (handler->zi_record.zi_duration > 0) {
                        VERIFY(handler->zi_record.zi_timer == 0 ||
                            ddi_time_after64(
                            (int64_t)handler->zi_record.zi_timer +
                            handler->zi_record.zi_duration * hz,
                            ddi_get_lbolt64()));
                } else {
                        /* duration is negative so the subtraction here adds */
                        VERIFY(handler->zi_record.zi_timer == 0 ||
                            handler->zi_record.zi_timer -
                            handler->zi_record.zi_duration >=
                            spa_syncing_txg(spa));
                }
        }

        rw_exit(&inject_lock);
}

hrtime_t
zio_handle_io_delay(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        inject_handler_t *min_handler = NULL;
        hrtime_t min_target = 0;

        rw_enter(&inject_lock, RW_READER);

        /*
         * inject_delay_count is a subset of zio_injection_enabled that
         * is only incremented for delay handlers. These checks are
         * mainly added to remind the reader why we're not explicitly
         * checking zio_injection_enabled like the other functions.
         */
        IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
        IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

        /*
         * If there aren't any inject delay handlers registered, then we
         * can short circuit and simply return 0 here. A value of zero
         * informs zio_delay_interrupt() that this request should not be
         * delayed. This short circuit keeps us from acquiring the
         * inject_delay_mtx unnecessarily.
         */
        if (inject_delay_count == 0) {
                rw_exit(&inject_lock);
                return (0);
        }

        /*
         * Each inject handler has a number of "lanes" associated with
         * it. Each lane is able to handle requests independently of one
         * another, and at a latency defined by the inject handler
         * record's zi_timer field. Thus, if a handler is configured with
         * a single lane and a 10ms latency, it will delay requests
         * such that only a single request is completed every 10ms. So,
         * if more than one request is attempted per each 10ms interval,
         * the average latency of the requests will be greater than
         * 10ms; but if only a single request is submitted each 10ms
         * interval the average latency will be 10ms.
         *
         * We need to acquire this mutex to prevent multiple concurrent
         * threads being assigned to the same lane of a given inject
         * handler. The mutex allows us to perform the following two
         * operations atomically:
         *
         *      1. determine the minimum handler and minimum target
         *         value of all the possible handlers
         *      2. update that minimum handler's lane array
         *
         * Without atomicity, two (or more) threads could pick the same
         * lane in step (1), and then conflict with each other in step
         * (2). This could allow a single lane handler to process
         * multiple requests simultaneously, which shouldn't be possible.
         */
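
        /*
         * For example, with zi_nlanes = 2 and a 10ms zi_timer, two
         * requests submitted at the same time land on different lanes
         * and both complete roughly 10ms later; a third request
         * submitted at the same moment must wait for a lane to become
         * idle and completes roughly 20ms after submission.
         */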
        mutex_enter(&inject_delay_mtx);

        for (inject_handler_t *handler = list_head(&inject_handlers);
            handler != NULL; handler = list_next(&inject_handlers, handler)) {
                if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
                        continue;

                if (!freq_triggered(handler->zi_record.zi_freq))
                        continue;

                if (vd->vdev_guid != handler->zi_record.zi_guid)
                        continue;

                /* also match on I/O type (e.g., -T read) */
                if (handler->zi_record.zi_iotype != ZIO_TYPES &&
                    handler->zi_record.zi_iotype != zio->io_type) {
                        continue;
                }

                /*
                 * Defensive; should never happen as the array allocation
                 * occurs prior to inserting this handler on the list.
                 */
                ASSERT3P(handler->zi_lanes, !=, NULL);

                /*
                 * This should never happen, the zinject command should
                 * prevent a user from setting an IO delay with zero lanes.
                 */
                ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

                ASSERT3U(handler->zi_record.zi_nlanes, >,
                    handler->zi_next_lane);

                /*
                 * We want to issue this IO to the lane that will become
                 * idle the soonest, so we compare the soonest this
                 * specific handler can complete the IO with all other
                 * handlers, to find the lowest value of all possible
                 * lanes. We then use this lane to submit the request.
                 *
                 * Since each handler has a constant value for its
                 * delay, we can just use the "next" lane for that
                 * handler; as it will always be the lane with the
                 * lowest value for that particular handler (i.e. the
                 * lane that will become idle the soonest). This saves a
                 * scan of each handler's lanes array.
                 *
                 * There are two cases to consider when determining when
                 * this specific IO request should complete. If this
                 * lane is idle, we want to "submit" the request now so
                 * it will complete after zi_timer milliseconds. Thus,
                 * we set the target to now + zi_timer.
                 *
                 * If the lane is busy, we want this request to complete
                 * zi_timer milliseconds after the lane becomes idle.
                 * Since the 'zi_lanes' array holds the time at which
                 * each lane will become idle, we use that value to
                 * determine when this request should complete.
                 */
                hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
                hrtime_t busy = handler->zi_record.zi_timer +
                    handler->zi_lanes[handler->zi_next_lane];
                hrtime_t target = MAX(idle, busy);

                if (min_handler == NULL) {
                        min_handler = handler;
                        min_target = target;
                        continue;
                }

                ASSERT3P(min_handler, !=, NULL);
                ASSERT3U(min_target, !=, 0);

                /*
                 * We don't yet increment the "next lane" variable since
                 * we still might find a lower value lane in another
                 * handler during any remaining iterations. Once we're
                 * sure we've selected the absolute minimum, we'll claim
                 * the lane and increment the handler's "next lane"
                 * field below.
                 */

                if (target < min_target) {
                        min_handler = handler;
                        min_target = target;
                }
        }

        /*
         * 'min_handler' will be NULL if no IO delays are registered for
         * this vdev, otherwise it will point to the handler containing
         * the lane that will become idle the soonest.
         */
        if (min_handler != NULL) {
                ASSERT3U(min_target, !=, 0);
                min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

                /*
                 * If we've used all possible lanes for this handler,
                 * loop back and start using the first lane again;
                 * otherwise, just increment the lane index.
                 */
                min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
                    min_handler->zi_record.zi_nlanes;
        }

        mutex_exit(&inject_delay_mtx);
        rw_exit(&inject_lock);

        return (min_target);
}

static void
zio_handle_pool_delay(spa_t *spa, hrtime_t elapsed, zinject_type_t command)
{
        inject_handler_t *handler;
        hrtime_t delay = 0;
        int id = 0;

        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers);
            handler != NULL && handler->zi_record.zi_cmd == command;
            handler = list_next(&inject_handlers, handler)) {
                ASSERT3P(handler->zi_spa_name, !=, NULL);
                if (strcmp(spa_name(spa), handler->zi_spa_name) == 0) {
                        uint64_t pause =
                            SEC2NSEC(handler->zi_record.zi_duration);
                        if (pause > elapsed) {
                                delay = pause - elapsed;
                        }
                        id = handler->zi_id;
                        break;
                }
        }

        rw_exit(&inject_lock);

        if (delay) {
                if (command == ZINJECT_DELAY_IMPORT) {
                        spa_import_progress_set_notes(spa, "injecting %llu "
                            "sec delay", (u_longlong_t)NSEC2SEC(delay));
                }
                zfs_sleep_until(gethrtime() + delay);
        }
        if (id) {
                /* all done with this one-shot handler */
                zio_clear_fault(id);
        }
}

/*
 * For testing, inject a delay during an import
 */
void
zio_handle_import_delay(spa_t *spa, hrtime_t elapsed)
{
        zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_IMPORT);
}

/*
 * For testing, inject a delay during an export
 */
void
zio_handle_export_delay(spa_t *spa, hrtime_t elapsed)
{
        zio_handle_pool_delay(spa, elapsed, ZINJECT_DELAY_EXPORT);
}

static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
        objset_t *os = NULL;
        dnode_t *dn = NULL;
        int error;

        /*
         * Obtain the dnode for the object using the pool, objset, and
         * object numbers.
         */
        error = dsl_pool_hold(pool, FTAG, &dp);
        if (error)
                return (error);

        error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
        dsl_pool_rele(dp, FTAG);
        if (error)
                return (error);

        error = dmu_objset_from_ds(ds, &os);
        dsl_dataset_rele(ds, FTAG);
        if (error)
                return (error);

        error = dnode_hold(os, record->zi_object, FTAG, &dn);
        if (error)
                return (error);

        /*
         * Translate the range into block IDs
         */
        if (record->zi_start != 0 || record->zi_end != -1ULL) {
                record->zi_start >>= dn->dn_datablkshift;
                record->zi_end >>= dn->dn_datablkshift;
        }
        if (record->zi_level > 0) {
                if (record->zi_level >= dn->dn_nlevels) {
                        dnode_rele(dn, FTAG);
                        return (SET_ERROR(EDOM));
                }

                if (record->zi_start != 0 || record->zi_end != 0) {
                        int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

                        for (int level = record->zi_level; level > 0; level--) {
                                record->zi_start >>= shift;
                                record->zi_end >>= shift;
                        }
                }
        }

        dnode_rele(dn, FTAG);
        return (0);
}
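
/*
 * As a rough example, with 128K data blocks (dn_datablkshift = 17), 128K
 * indirect blocks (dn_indblkshift = 17) and 128-byte block pointers
 * (SPA_BLKPTRSHIFT = 7), the per-level shift is 10, so each level-1
 * indirect block covers 1024 level-0 block IDs; a byte range targeting
 * zi_level > 0 is first converted to level-0 block IDs and then shifted
 * once per level to get the block IDs at the requested level.
 */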

static boolean_t
zio_pool_handler_exists(const char *name, zinject_type_t command)
{
        boolean_t exists = B_FALSE;

        rw_enter(&inject_lock, RW_READER);
        for (inject_handler_t *handler = list_head(&inject_handlers);
            handler != NULL; handler = list_next(&inject_handlers, handler)) {
                if (command != handler->zi_record.zi_cmd)
                        continue;

                const char *pool = (handler->zi_spa_name != NULL) ?
                    handler->zi_spa_name : spa_name(handler->zi_spa);
                if (strcmp(name, pool) == 0) {
                        exists = B_TRUE;
                        break;
                }
        }
        rw_exit(&inject_lock);

        return (exists);
}

/*
 * Create a new handler for the given record. We add it to the list, adding
 * a reference to the spa_t in the process. We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
        inject_handler_t *handler;
        int error;
        spa_t *spa;

        /*
         * If this is pool-wide metadata, make sure we unload the corresponding
         * spa_t, so that the next attempt to load it will trigger the fault.
         * We call spa_reset() to unload the pool appropriately.
         */
        if (flags & ZINJECT_UNLOAD_SPA)
                if ((error = spa_reset(name)) != 0)
                        return (error);

        if (record->zi_cmd == ZINJECT_DELAY_IO) {
                /*
                 * A value of zero for the number of lanes or for the
                 * delay time doesn't make sense.
                 */
                if (record->zi_timer == 0 || record->zi_nlanes == 0)
                        return (SET_ERROR(EINVAL));

                /*
                 * The number of lanes is directly mapped to the size of
                 * an array used by the handler. Thus, to ensure the
                 * user doesn't trigger an allocation that's "too large"
                 * we cap the number of lanes here.
                 */
                if (record->zi_nlanes >= UINT16_MAX)
                        return (SET_ERROR(EINVAL));
        }

        /*
         * If the supplied range was in bytes -- calculate the actual blkid
         */
        if (flags & ZINJECT_CALC_RANGE) {
                error = zio_calculate_range(name, record);
                if (error != 0)
                        return (error);
        }

        if (!(flags & ZINJECT_NULL)) {
                /*
                 * Pool delays for import or export don't take an
                 * injection reference on the spa. Instead they
                 * rely on matching by name.
                 */
                if (record->zi_cmd == ZINJECT_DELAY_IMPORT ||
                    record->zi_cmd == ZINJECT_DELAY_EXPORT) {
                        if (record->zi_duration <= 0)
                                return (SET_ERROR(EINVAL));
                        /*
                         * Only one import | export delay handler per pool.
                         */
                        if (zio_pool_handler_exists(name, record->zi_cmd))
                                return (SET_ERROR(EEXIST));

                        mutex_enter(&spa_namespace_lock);
                        boolean_t has_spa = spa_lookup(name) != NULL;
                        mutex_exit(&spa_namespace_lock);

                        if (record->zi_cmd == ZINJECT_DELAY_IMPORT && has_spa)
                                return (SET_ERROR(EEXIST));
                        if (record->zi_cmd == ZINJECT_DELAY_EXPORT && !has_spa)
                                return (SET_ERROR(ENOENT));
                        spa = NULL;
                } else {
                        /*
                         * spa_inject_addref() will add an injection
                         * reference, which will prevent the pool from
                         * being removed from the namespace while still
                         * allowing it to be unloaded.
                         */
                        if ((spa = spa_inject_addref(name)) == NULL)
                                return (SET_ERROR(ENOENT));
                }

                handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
                handler->zi_spa = spa;  /* note: can be NULL */
                handler->zi_record = *record;

                if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                        handler->zi_lanes = kmem_zalloc(
                            sizeof (*handler->zi_lanes) *
                            handler->zi_record.zi_nlanes, KM_SLEEP);
                        handler->zi_next_lane = 0;
                } else {
                        handler->zi_lanes = NULL;
                        handler->zi_next_lane = 0;
                }

                if (handler->zi_spa == NULL)
                        handler->zi_spa_name = spa_strdup(name);
                else
                        handler->zi_spa_name = NULL;

                rw_enter(&inject_lock, RW_WRITER);

                /*
                 * We can't move this increment into the conditional
                 * above because we need to hold the RW_WRITER lock of
                 * inject_lock, and we don't want to hold that while
                 * allocating the handler's zi_lanes array.
                 */
                if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                        ASSERT3S(inject_delay_count, >=, 0);
                        inject_delay_count++;
                        ASSERT3S(inject_delay_count, >, 0);
                }

                *id = handler->zi_id = inject_next_id++;
                list_insert_tail(&inject_handlers, handler);
                atomic_inc_32(&zio_injection_enabled);

                rw_exit(&inject_lock);
        }

        /*
         * Flush the ARC, so that any attempts to read this data will end up
         * going to the ZIO layer. Note that this is a little overkill, but
         * we don't have the necessary ARC interfaces to do anything else, and
         * fault injection isn't a performance critical path.
         */
        if (flags & ZINJECT_FLUSH_ARC)
                /*
                 * We must use FALSE to ensure arc_flush returns, since
                 * we're not preventing concurrent ARC insertions.
                 */
                arc_flush(NULL, FALSE);

        return (0);
}

/*
 * Returns the next record with an ID greater than that supplied to the
 * function. Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
        inject_handler_t *handler;
        int ret;

        mutex_enter(&spa_namespace_lock);
        rw_enter(&inject_lock, RW_READER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler))
                if (handler->zi_id > *id)
                        break;

        if (handler) {
                *record = handler->zi_record;
                *id = handler->zi_id;
                ASSERT(handler->zi_spa || handler->zi_spa_name);
                if (handler->zi_spa != NULL)
                        (void) strlcpy(name, spa_name(handler->zi_spa), buflen);
                else
                        (void) strlcpy(name, handler->zi_spa_name, buflen);
                ret = 0;
        } else {
                ret = SET_ERROR(ENOENT);
        }

        rw_exit(&inject_lock);
        mutex_exit(&spa_namespace_lock);

        return (ret);
}
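
/*
 * Callers typically iterate by starting with *id set to 0 and invoking
 * this repeatedly; each call returns the handler with the next-higher id
 * and updates *id, and the loop terminates once ENOENT is returned.
 */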

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
 */
int
zio_clear_fault(int id)
{
        inject_handler_t *handler;

        rw_enter(&inject_lock, RW_WRITER);

        for (handler = list_head(&inject_handlers); handler != NULL;
            handler = list_next(&inject_handlers, handler))
                if (handler->zi_id == id)
                        break;

        if (handler == NULL) {
                rw_exit(&inject_lock);
                return (SET_ERROR(ENOENT));
        }

        if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                ASSERT3S(inject_delay_count, >, 0);
                inject_delay_count--;
                ASSERT3S(inject_delay_count, >=, 0);
        }

        list_remove(&inject_handlers, handler);
        rw_exit(&inject_lock);

        if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
                ASSERT3P(handler->zi_lanes, !=, NULL);
                kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
                    handler->zi_record.zi_nlanes);
        } else {
                ASSERT3P(handler->zi_lanes, ==, NULL);
        }

        if (handler->zi_spa_name != NULL)
                spa_strfree(handler->zi_spa_name);

        if (handler->zi_spa != NULL)
                spa_inject_delref(handler->zi_spa);
        kmem_free(handler, sizeof (inject_handler_t));
        atomic_dec_32(&zio_injection_enabled);

        return (0);
}

void
zio_inject_init(void)
{
        rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
        mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
        list_create(&inject_handlers, sizeof (inject_handler_t),
            offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
        list_destroy(&inject_handlers);
        mutex_destroy(&inject_delay_mtx);
        rw_destroy(&inject_lock);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zio_injection_enabled);
EXPORT_SYMBOL(zio_inject_fault);
EXPORT_SYMBOL(zio_inject_list_next);
EXPORT_SYMBOL(zio_clear_fault);
EXPORT_SYMBOL(zio_handle_fault_injection);
EXPORT_SYMBOL(zio_handle_device_injection);
EXPORT_SYMBOL(zio_handle_label_injection);
#endif