/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

/*
 * ZFS fault injection
 *
 * To handle fault injection, we keep track of a series of zinject_record_t
 * structures which describe which logical block(s) should be injected with a
 * fault. These are kept in a global list. Each record corresponds to a given
 * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
 * or exported while the injection record exists.
 *
 * Device level injection is done using the 'zi_guid' field. If this is set, it
 * means that the error is destined for a particular device, not a piece of
 * data.
 *
 * This is a rather poor data structure and algorithm, but we don't expect more
 * than a few faults at any one time, so it should be sufficient for our needs.
 */

#include <sys/arc.h>
#include <sys/zio_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/vdev_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/fs/zfs.h>

uint32_t zio_injection_enabled = 0;

/*
 * Describes each zinject handler registered on the system, and contains
 * the list node linking the handler into the global zinject handler list.
 */
typedef struct inject_handler {
	int zi_id;
	spa_t *zi_spa;
	zinject_record_t zi_record;
	uint64_t *zi_lanes;
	int zi_next_lane;
	list_node_t zi_link;
} inject_handler_t;

/*
 * List of all zinject handlers registered on the system, protected by
 * the inject_lock defined below.
 */
static list_t inject_handlers;

/*
 * This protects insertion into, and traversal of, the inject handler
 * list defined above, as well as inject_delay_count. Any time a handler
 * is inserted into or removed from the list, this lock should be taken
 * as RW_WRITER; any time the list is traversed (without modification)
 * it should be taken as RW_READER.
 */
static krwlock_t inject_lock;

/*
 * This holds the number of zinject delay handlers that have been
 * registered on the system. It is protected by the inject_lock defined
 * above. Thus modifications to this count must be made while holding the
 * inject_lock as RW_WRITER, and reads of this count must hold (at least)
 * the lock as RW_READER.
 */
static int inject_delay_count = 0;

/*
 * This lock is used only in zio_handle_io_delay(); refer to the comment
 * in that function for more details.
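 * Briefly, it ensures that two concurrent threads are never assigned to
 * the same lane of the same delay handler.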
 */
static kmutex_t inject_delay_mtx;

/*
 * Used to assign unique identifying numbers to each new zinject handler.
 */
static int inject_next_id = 1;

/*
 * Returns true if the given record matches the I/O in progress.
 */
static boolean_t
zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva,
    zinject_record_t *record, int error)
{
	/*
	 * Check for a match against the MOS, which is based on type
	 */
	if (zb->zb_objset == DMU_META_OBJSET &&
	    record->zi_objset == DMU_META_OBJSET &&
	    record->zi_object == DMU_META_DNODE_OBJECT) {
		if (record->zi_type == DMU_OT_NONE ||
		    type == record->zi_type)
			return (record->zi_freq == 0 ||
			    spa_get_random(100) < record->zi_freq);
		else
			return (B_FALSE);
	}

	/*
	 * Check for an exact match.
	 */
	if (zb->zb_objset == record->zi_objset &&
	    zb->zb_object == record->zi_object &&
	    zb->zb_level == record->zi_level &&
	    zb->zb_blkid >= record->zi_start &&
	    zb->zb_blkid <= record->zi_end &&
	    (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
	    error == record->zi_error) {
		return (record->zi_freq == 0 ||
		    spa_get_random(100) < record->zi_freq);
	}

	return (B_FALSE);
}

/*
 * Panic the system when a config change happens in the function
 * specified by tag.
 */
void
zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa)
			continue;

		if (handler->zi_record.zi_type == type &&
		    strcmp(tag, handler->zi_record.zi_func) == 0)
			panic("Panic requested in function %s\n", tag);
	}

	rw_exit(&inject_lock);
}


/*
 * If this is a physical I/O for a vdev child, determine which DVA it is
 * for. We iterate backwards through the DVAs, matching on the offset, so
 * that we end up with ZI_NO_DVA (-1) if we don't find a match.
 */
static int
zio_match_dva(zio_t *zio)
{
	int i = ZI_NO_DVA;

	if (zio->io_bp != NULL && zio->io_vd != NULL &&
	    zio->io_child_type == ZIO_CHILD_VDEV) {
		for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
			dva_t *dva = &zio->io_bp->blk_dva[i];
			uint64_t off = DVA_GET_OFFSET(dva);
			vdev_t *vd = vdev_lookup_top(zio->io_spa,
			    DVA_GET_VDEV(dva));

			/* Compensate for vdev label added to leaves */
			if (zio->io_vd->vdev_ops->vdev_op_leaf)
				off += VDEV_LABEL_START_SIZE;

			if (zio->io_vd == vd && zio->io_offset == off)
				break;
		}
	}

	return (i);
}


/*
 * Inject a decryption failure. Decryption failures can occur in
 * both the ARC and the ZIO layers.
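 * If a registered ZINJECT_DECRYPT_FAULT handler matches this pool and
 * bookmark, the injected error is returned; otherwise 0 is returned.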
 */
int
zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
    uint64_t type, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
			continue;

		if (zio_match_handler((zbookmark_phys_t *)zb, type, ZI_NO_DVA,
		    &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);
	return (ret);
}

/*
 * Determine if the I/O in question should return failure. Returns the errno
 * to be returned to the caller.
 */
int
zio_handle_fault_injection(zio_t *zio, int error)
{
	int ret = 0;
	inject_handler_t *handler;

	/*
	 * Ignore I/O not associated with any logical data.
	 */
	if (zio->io_logical == NULL)
		return (0);

	/*
	 * Currently, we only support fault injection on reads.
	 */
	if (zio->io_type != ZIO_TYPE_READ)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
			continue;

		/* If this handler matches, return the specified error */
		if (zio_match_handler(&zio->io_logical->io_bookmark,
		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
		    zio_match_dva(zio), &handler->zi_record, error)) {
			ret = error;
			break;
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

/*
 * Determine if the zio is part of a label update and has an injection
 * handler associated with that portion of the label. Currently, we
 * allow error injection in either the nvlist or the uberblock region of
 * the vdev label.
 */
int
zio_handle_label_injection(zio_t *zio, int error)
{
	inject_handler_t *handler;
	vdev_t *vd = zio->io_vd;
	uint64_t offset = zio->io_offset;
	int label;
	int ret = 0;

	if (offset >= VDEV_LABEL_START_SIZE &&
	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
		return (0);

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {
		uint64_t start = handler->zi_record.zi_start;
		uint64_t end = handler->zi_record.zi_end;

		if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
			continue;

		/*
		 * The injection region is the relative offsets within a
		 * vdev label. We must determine the label which is being
		 * updated and adjust our region accordingly.
		 */
		label = vdev_label_number(vd->vdev_psize, offset);
		start = vdev_label_offset(vd->vdev_psize, label, start);
		end = vdev_label_offset(vd->vdev_psize, label, end);

		if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
		    (offset >= start && offset <= end)) {
			ret = error;
			break;
		}
	}
	rw_exit(&inject_lock);
	return (ret);
}


int
zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
{
	inject_handler_t *handler;
	int ret = 0;

	/*
	 * We skip over faults in the labels unless it's during
	 * device open (i.e. zio == NULL).
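	 * Otherwise, if a ZINJECT_DEVICE_FAULT handler matches this vdev's
	 * guid, its injected error is returned; a handler registered with
	 * ENXIO also converts any other error into EIO.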
	 */
	if (zio != NULL) {
		uint64_t offset = zio->io_offset;

		if (offset < VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
			return (0);
	}

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
			continue;

		if (vd->vdev_guid == handler->zi_record.zi_guid) {
			if (handler->zi_record.zi_failfast &&
			    (zio == NULL || (zio->io_flags &
			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
				continue;
			}

			/* Handle type specific I/O failures */
			if (zio != NULL &&
			    handler->zi_record.zi_iotype != ZIO_TYPES &&
			    handler->zi_record.zi_iotype != zio->io_type)
				continue;

			if (handler->zi_record.zi_error == error) {
				/*
				 * For a failed open, pretend like the device
				 * has gone away.
				 */
				if (error == ENXIO)
					vd->vdev_stat.vs_aux =
					    VDEV_AUX_OPEN_FAILED;

				/*
				 * Treat these errors as if they had been
				 * retried so that all the appropriate stats
				 * and FMA events are generated.
				 */
				if (!handler->zi_record.zi_failfast &&
				    zio != NULL)
					zio->io_flags |= ZIO_FLAG_IO_RETRY;

				ret = error;
				break;
			}
			if (handler->zi_record.zi_error == ENXIO) {
				ret = SET_ERROR(EIO);
				break;
			}
		}
	}

	rw_exit(&inject_lock);

	return (ret);
}

/*
 * Simulate hardware that ignores cache flushes. For the requested number
 * of seconds, nix the actual writing to disk.
 */
void
zio_handle_ignored_writes(zio_t *zio)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		/* Ignore errors not destined for this pool */
		if (zio->io_spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		/*
		 * Positive duration implies # of seconds, negative
		 * a number of txgs
		 */
		if (handler->zi_record.zi_timer == 0) {
			if (handler->zi_record.zi_duration > 0)
				handler->zi_record.zi_timer = ddi_get_lbolt64();
			else
				handler->zi_record.zi_timer = zio->io_txg;
		}

		/* Have a "problem" writing 60% of the time */
		if (spa_get_random(100) < 60)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		break;
	}

	rw_exit(&inject_lock);
}

void
spa_handle_ignored_writes(spa_t *spa)
{
	inject_handler_t *handler;

	if (zio_injection_enabled == 0)
		return;

	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler)) {

		if (spa != handler->zi_spa ||
		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
			continue;

		if (handler->zi_record.zi_duration > 0) {
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer +
			    handler->zi_record.zi_duration * hz >
			    ddi_get_lbolt64());
		} else {
			/* duration is negative so the subtraction here adds */
			VERIFY(handler->zi_record.zi_timer == 0 ||
			    handler->zi_record.zi_timer -
			    handler->zi_record.zi_duration >=
			    spa_syncing_txg(spa));
		}
	}

	rw_exit(&inject_lock);
}

hrtime_t
zio_handle_io_delay(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	inject_handler_t *min_handler = NULL;
	hrtime_t min_target = 0;

	rw_enter(&inject_lock, RW_READER);

	/*
	 * inject_delay_count is a subset of zio_injection_enabled that
	 * is only incremented for delay handlers. These checks are
	 * mainly added to remind the reader why we're not explicitly
	 * checking zio_injection_enabled like the other functions.
	 */
	IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
	IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);

	/*
	 * If there aren't any inject delay handlers registered, then we
	 * can short circuit and simply return 0 here. A value of zero
	 * informs zio_delay_interrupt() that this request should not be
	 * delayed. This short circuit keeps us from acquiring the
	 * inject_delay_mtx unnecessarily.
	 */
	if (inject_delay_count == 0) {
		rw_exit(&inject_lock);
		return (0);
	}

	/*
	 * Each inject handler has a number of "lanes" associated with
	 * it. Each lane is able to handle requests independently of one
	 * another, and at a latency defined by the inject handler
	 * record's zi_timer field. Thus if a handler is configured with
	 * a single lane with a 10ms latency, it will delay requests
	 * such that only a single request is completed every 10ms. So,
	 * if more than one request is attempted per each 10ms interval,
	 * the average latency of the requests will be greater than
	 * 10ms; but if only a single request is submitted each 10ms
	 * interval the average latency will be 10ms.
	 *
	 * We need to acquire this mutex to prevent multiple concurrent
	 * threads being assigned to the same lane of a given inject
	 * handler. The mutex allows us to perform the following two
	 * operations atomically:
	 *
	 *	1. determine the minimum handler and minimum target
	 *	   value of all the possible handlers
	 *	2. update that minimum handler's lane array
	 *
	 * Without atomicity, two (or more) threads could pick the same
	 * lane in step (1), and then conflict with each other in step
	 * (2). This could allow a single lane handler to process
	 * multiple requests simultaneously, which shouldn't be possible.
	 */
	mutex_enter(&inject_delay_mtx);

	for (inject_handler_t *handler = list_head(&inject_handlers);
	    handler != NULL; handler = list_next(&inject_handlers, handler)) {
		if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
			continue;

		if (vd->vdev_guid != handler->zi_record.zi_guid)
			continue;

		/*
		 * Defensive; should never happen as the array allocation
		 * occurs prior to inserting this handler on the list.
		 */
		ASSERT3P(handler->zi_lanes, !=, NULL);

		/*
		 * This should never happen, the zinject command should
		 * prevent a user from setting an IO delay with zero lanes.
		 */
		ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);

		ASSERT3U(handler->zi_record.zi_nlanes, >,
		    handler->zi_next_lane);

		/*
		 * We want to issue this IO to the lane that will become
		 * idle the soonest, so we compare the soonest this
		 * specific handler can complete the IO with all other
		 * handlers, to find the lowest value of all possible
		 * lanes. We then use this lane to submit the request.
		 *
		 * Since each handler has a constant value for its
		 * delay, we can just use the "next" lane for that
		 * handler; as it will always be the lane with the
		 * lowest value for that particular handler (i.e. the
		 * lane that will become idle the soonest).
		 * This saves a scan of each handler's lanes array.
		 *
		 * There are two cases to consider when determining when
		 * this specific IO request should complete. If this
		 * lane is idle, we want to "submit" the request now so
		 * it will complete after zi_timer milliseconds. Thus,
		 * we set the target to now + zi_timer.
		 *
		 * If the lane is busy, we want this request to complete
		 * zi_timer milliseconds after the lane becomes idle.
		 * Since the 'zi_lanes' array holds the time at which
		 * each lane will become idle, we use that value to
		 * determine when this request should complete.
		 */
		hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
		hrtime_t busy = handler->zi_record.zi_timer +
		    handler->zi_lanes[handler->zi_next_lane];
		hrtime_t target = MAX(idle, busy);

		if (min_handler == NULL) {
			min_handler = handler;
			min_target = target;
			continue;
		}

		ASSERT3P(min_handler, !=, NULL);
		ASSERT3U(min_target, !=, 0);

		/*
		 * We don't yet increment the "next lane" variable since
		 * we still might find a lower value lane in another
		 * handler during any remaining iterations. Once we're
		 * sure we've selected the absolute minimum, we'll claim
		 * the lane and increment the handler's "next lane"
		 * field below.
		 */

		if (target < min_target) {
			min_handler = handler;
			min_target = target;
		}
	}

	/*
	 * 'min_handler' will be NULL if no IO delays are registered for
	 * this vdev, otherwise it will point to the handler containing
	 * the lane that will become idle the soonest.
	 */
	if (min_handler != NULL) {
		ASSERT3U(min_target, !=, 0);
		min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;

		/*
		 * If we've used all possible lanes for this handler,
		 * loop back and start using the first lane again;
		 * otherwise, just increment the lane index.
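		 * For example, with zi_record.zi_nlanes == 4 the lane
		 * index cycles 0, 1, 2, 3, 0, 1, and so on.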
		 */
		min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
		    min_handler->zi_record.zi_nlanes;
	}

	mutex_exit(&inject_delay_mtx);
	rw_exit(&inject_lock);

	return (min_target);
}

static int
zio_calculate_range(const char *pool, zinject_record_t *record)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	objset_t *os = NULL;
	dnode_t *dn = NULL;
	int error;

	/*
	 * Obtain the dnode for object using pool, objset, and object
	 */
	error = dsl_pool_hold(pool, FTAG, &dp);
	if (error)
		return (error);

	error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
	dsl_pool_rele(dp, FTAG);
	if (error)
		return (error);

	error = dmu_objset_from_ds(ds, &os);
	dsl_dataset_rele(ds, FTAG);
	if (error)
		return (error);

	error = dnode_hold(os, record->zi_object, FTAG, &dn);
	if (error)
		return (error);

	/*
	 * Translate the range into block IDs
	 */
	if (record->zi_start != 0 || record->zi_end != -1ULL) {
		record->zi_start >>= dn->dn_datablkshift;
		record->zi_end >>= dn->dn_datablkshift;
	}
	if (record->zi_level > 0) {
		if (record->zi_level >= dn->dn_nlevels) {
			dnode_rele(dn, FTAG);
			return (SET_ERROR(EDOM));
		}

		if (record->zi_start != 0 || record->zi_end != 0) {
			int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			for (int level = record->zi_level; level > 0; level--) {
				record->zi_start >>= shift;
				record->zi_end >>= shift;
			}
		}
	}

	dnode_rele(dn, FTAG);
	return (0);
}

/*
 * Create a new handler for the given record. We add it to the list, adding
 * a reference to the spa_t in the process. We increment zio_injection_enabled,
 * which is the switch to trigger all fault injection.
 */
int
zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
{
	inject_handler_t *handler;
	int error;
	spa_t *spa;

	/*
	 * If this is pool-wide metadata, make sure we unload the corresponding
	 * spa_t, so that the next attempt to load it will trigger the fault.
	 * We call spa_reset() to unload the pool appropriately.
	 */
	if (flags & ZINJECT_UNLOAD_SPA)
		if ((error = spa_reset(name)) != 0)
			return (error);

	if (record->zi_cmd == ZINJECT_DELAY_IO) {
		/*
		 * A value of zero for the number of lanes or for the
		 * delay time doesn't make sense.
		 */
		if (record->zi_timer == 0 || record->zi_nlanes == 0)
			return (SET_ERROR(EINVAL));

		/*
		 * The number of lanes is directly mapped to the size of
		 * an array used by the handler. Thus, to ensure the
		 * user doesn't trigger an allocation that's "too large"
		 * we cap the number of lanes here.
		 */
		if (record->zi_nlanes >= UINT16_MAX)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * If the supplied range was in bytes -- calculate the actual blkid
	 */
	if (flags & ZINJECT_CALC_RANGE) {
		error = zio_calculate_range(name, record);
		if (error != 0)
			return (error);
	}

	if (!(flags & ZINJECT_NULL)) {
		/*
		 * spa_inject_addref() adds an injection reference, which will
		 * prevent the pool from being removed from the namespace while
		 * still allowing it to be unloaded.
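		 * The reference is dropped again in zio_clear_fault() when
		 * the handler is removed.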
		 */
		if ((spa = spa_inject_addref(name)) == NULL)
			return (SET_ERROR(ENOENT));

		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);

		handler->zi_spa = spa;
		handler->zi_record = *record;

		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			handler->zi_lanes = kmem_zalloc(
			    sizeof (*handler->zi_lanes) *
			    handler->zi_record.zi_nlanes, KM_SLEEP);
			handler->zi_next_lane = 0;
		} else {
			handler->zi_lanes = NULL;
			handler->zi_next_lane = 0;
		}

		rw_enter(&inject_lock, RW_WRITER);

		/*
		 * We can't move this increment into the conditional
		 * above because we need to hold the RW_WRITER lock of
		 * inject_lock, and we don't want to hold that while
		 * allocating the handler's zi_lanes array.
		 */
		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
			ASSERT3S(inject_delay_count, >=, 0);
			inject_delay_count++;
			ASSERT3S(inject_delay_count, >, 0);
		}

		*id = handler->zi_id = inject_next_id++;
		list_insert_tail(&inject_handlers, handler);
		atomic_inc_32(&zio_injection_enabled);

		rw_exit(&inject_lock);
	}

	/*
	 * Flush the ARC, so that any attempts to read this data will end up
	 * going to the ZIO layer. Note that this is a little overkill, but
	 * we don't have the necessary ARC interfaces to do anything else, and
	 * fault injection isn't a performance critical path.
	 */
	if (flags & ZINJECT_FLUSH_ARC)
		/*
		 * We must use FALSE to ensure arc_flush returns, since
		 * we're not preventing concurrent ARC insertions.
		 */
		arc_flush(NULL, FALSE);

	return (0);
}

/*
 * Returns the next record with an ID greater than that supplied to the
 * function. Used to iterate over all handlers in the system.
 */
int
zio_inject_list_next(int *id, char *name, size_t buflen,
    zinject_record_t *record)
{
	inject_handler_t *handler;
	int ret;

	mutex_enter(&spa_namespace_lock);
	rw_enter(&inject_lock, RW_READER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id > *id)
			break;

	if (handler) {
		*record = handler->zi_record;
		*id = handler->zi_id;
		(void) strncpy(name, spa_name(handler->zi_spa), buflen);
		ret = 0;
	} else {
		ret = SET_ERROR(ENOENT);
	}

	rw_exit(&inject_lock);
	mutex_exit(&spa_namespace_lock);

	return (ret);
}

/*
 * Clear the fault handler with the given identifier, or return ENOENT if none
 * exists.
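 * Clearing a handler removes it from the global list, frees any delay
 * lanes, drops the pool's injection reference, and decrements
 * zio_injection_enabled.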
 */
int
zio_clear_fault(int id)
{
	inject_handler_t *handler;

	rw_enter(&inject_lock, RW_WRITER);

	for (handler = list_head(&inject_handlers); handler != NULL;
	    handler = list_next(&inject_handlers, handler))
		if (handler->zi_id == id)
			break;

	if (handler == NULL) {
		rw_exit(&inject_lock);
		return (SET_ERROR(ENOENT));
	}

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3S(inject_delay_count, >, 0);
		inject_delay_count--;
		ASSERT3S(inject_delay_count, >=, 0);
	}

	list_remove(&inject_handlers, handler);
	rw_exit(&inject_lock);

	if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
		ASSERT3P(handler->zi_lanes, !=, NULL);
		kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
		    handler->zi_record.zi_nlanes);
	} else {
		ASSERT3P(handler->zi_lanes, ==, NULL);
	}

	spa_inject_delref(handler->zi_spa);
	kmem_free(handler, sizeof (inject_handler_t));
	atomic_dec_32(&zio_injection_enabled);

	return (0);
}

void
zio_inject_init(void)
{
	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
	list_create(&inject_handlers, sizeof (inject_handler_t),
	    offsetof(inject_handler_t, zi_link));
}

void
zio_inject_fini(void)
{
	list_destroy(&inject_handlers);
	mutex_destroy(&inject_delay_mtx);
	rw_destroy(&inject_lock);
}
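
/*
 * Usage sketch: registering and later clearing a read-error injection for a
 * particular vdev through the interfaces above might look roughly like the
 * following. This is illustrative only; 'poolname' and 'vdev_guid' are
 * placeholders the caller is assumed to know already.
 *
 *	zinject_record_t record = { 0 };
 *	int id, error;
 *
 *	record.zi_cmd = ZINJECT_DEVICE_FAULT;
 *	record.zi_guid = vdev_guid;
 *	record.zi_error = EIO;
 *	record.zi_iotype = ZIO_TYPE_READ;
 *
 *	error = zio_inject_fault(poolname, 0, &id, &record);
 *	...
 *	error = zio_clear_fault(id);
 *
 * In practice these entry points are normally driven from user space by the
 * zinject command via the ZFS ioctl interface rather than called directly.
 */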