/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW		*/
	0,	/* ZIO_PRIORITY_SYNC_READ	*/
	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
	4,	/* ZIO_PRIORITY_FREE		*/
	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
	10,	/* ZIO_PRIORITY_RESILVER	*/
	20,	/* ZIO_PRIORITY_SCRUB		*/
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"null", "read", "write", "free", "claim", "ioctl" };

/* Force an allocation failure when non-zero */
uint16_t zio_zil_fail_shift = 0;
uint16_t zio_io_fail_shift = 0;

/* Enable/disable the write-retry logic */
int zio_write_retry = 1;

/* Taskq to handle reissuing of I/Os */
taskq_t *zio_taskq;
int zio_resume_threads = 4;

typedef struct zio_sync_pass {
	int	zp_defer_free;		/* defer frees after this pass */
	int	zp_dontcompress;	/* don't compress after this pass */
	int	zp_rewrite;		/* rewrite new bps after this pass */
} zio_sync_pass_t;

zio_sync_pass_t zio_sync_pass = {
	1,	/* zp_defer_free */
	4,	/* zp_dontcompress */
	1,	/* zp_rewrite */
};

static boolean_t zio_io_should_fail(uint16_t);

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * Determine if we are allowed to issue the IO based on the
 * pool state.  If we must wait then block until we are told
 * that we may continue.
 */
#define	ZIO_ENTER(spa) {						\
	if (spa->spa_state == POOL_STATE_IO_FAILURE) {			\
		mutex_enter(&spa->spa_zio_lock);			\
		while (spa->spa_state == POOL_STATE_IO_FAILURE)		\
			cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock);	\
		mutex_exit(&spa->spa_zio_lock);				\
	}								\
}

/*
 * An allocation zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) \
	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif

	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
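	/*
	 * Illustrative example (assuming SPA_MINBLOCKSIZE of 512 and 4K
	 * pages): a 1.5K buffer gets a 512-byte-aligned cache, a 12K
	 * buffer gets a page-aligned cache, and a 2.5K buffer gets a
	 * cache aligned to 512 bytes (one quarter of 2K, the largest
	 * power of 2 not exceeding it).  A 4.5K request has no dedicated
	 * cache and is satisfied from the next larger one, courtesy of
	 * the backfill loop below.
	 */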

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (P2PHASE(size, PAGESIZE) == 0) {
			align = PAGESIZE;
		} else if (P2PHASE(size, p2 >> 2) == 0) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);

			(void) sprintf(name, "zio_data_buf_%lu",
			    (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	taskq_destroy(zio_taskq);

	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_data = data;
	zt->zt_size = size;
	zt->zt_bufsize = bufsize;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
{
	zio_transform_t *zt = zio->io_transform_stack;

	*data = zt->zt_data;
	*size = zt->zt_size;
	*bufsize = zt->zt_bufsize;

	zio->io_transform_stack = zt->zt_next;
	kmem_free(zt, sizeof (zio_transform_t));

	if ((zt = zio->io_transform_stack) != NULL) {
		zio->io_data = zt->zt_data;
		zio->io_size = zt->zt_size;
	}
}

static void
zio_clear_transform_stack(zio_t *zio)
{
	void *data;
	uint64_t size, bufsize;

	ASSERT(zio->io_transform_stack != NULL);

	zio_pop_transform(zio, &data, &size, &bufsize);
	while (zio->io_transform_stack != NULL) {
		zio_buf_free(data, bufsize);
		zio_pop_transform(zio, &data, &size, &bufsize);
	}
}
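
/*
 * Illustrative note: a read of a compressed block pushes a scratch buffer
 * with zio_push_transform() in zio_read_init() so the device I/O lands in
 * the scratch buffer; zio_read_decompress() later pops it, decompresses
 * into the original io_data, and frees the scratch buffer.  Any transforms
 * still outstanding at zio_done() are unwound by zio_clear_transform_stack().
 */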

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));
	zio->io_parent = pio;
	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_flags = flags;
	if (bp != NULL) {
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
	}
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_stage = stage;
	zio->io_pipeline = pipeline;
	zio->io_timestamp = lbolt64;
	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
	zio_push_transform(zio, data, size, size);

	/*
	 * Note on config lock:
	 *
	 * If CONFIG_HELD is set, then the caller already has the config
	 * lock, so we don't need it for this io.
	 *
	 * We set CONFIG_GRABBED to indicate that we have grabbed the
	 * config lock on behalf of this io, so it should be released
	 * in zio_done.
	 *
	 * Unless CONFIG_HELD is set, we will grab the config lock for
	 * any top-level (parent-less) io, *except* NULL top-level ios.
	 * The NULL top-level ios rarely have any children, so we delay
	 * grabbing the lock until the first child is added (but it is
	 * still grabbed on behalf of the top-level i/o, so additional
	 * children don't need to also grab it).  This greatly reduces
	 * contention on the config lock.
	 */
	if (pio == NULL) {
		if (type != ZIO_TYPE_NULL &&
		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
			spa_config_enter(spa, RW_READER, zio);
			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
		}
		zio->io_root = zio;
	} else {
		zio->io_root = pio->io_root;
		if (!(flags & ZIO_FLAG_NOBOOKMARK))
			zio->io_logical = pio->io_logical;
		mutex_enter(&pio->io_lock);
		if (pio->io_parent == NULL &&
		    pio->io_type == ZIO_TYPE_NULL &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
			spa_config_enter(spa, RW_READER, pio);
		}
		if (stage < ZIO_STAGE_READY)
			pio->io_children_notready++;
		pio->io_children_notdone++;
		zio->io_sibling_next = pio->io_child;
		zio->io_sibling_prev = NULL;
		if (pio->io_child != NULL)
			pio->io_child->io_sibling_prev = zio;
		pio->io_child = zio;
		zio->io_ndvas = pio->io_ndvas;
		mutex_exit(&pio->io_lock);
	}

	/*
	 * Save off the original state in case we need to retry later.
	 */
	zio->io_orig_stage = zio->io_stage;
	zio->io_orig_pipeline = zio->io_pipeline;
	zio->io_orig_flags = zio->io_flags;

	return (zio);
}

static void
zio_reset(zio_t *zio)
{
	zio_clear_transform_stack(zio);

	zio->io_flags = zio->io_orig_flags;
	zio->io_stage = zio->io_orig_stage;
	zio->io_pipeline = zio->io_orig_pipeline;
	zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    int flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
	return (zio_null(NULL, spa, done, private, flags));
}
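
/*
 * Create a read I/O for the block described by bp.  The bp is copied into
 * the zio so the caller may free it as soon as this returns; the read is
 * also marked as the logical I/O for bookmark and error-reporting purposes.
 */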

zio_t *
zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    int priority, int flags, zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT3U(size, ==, BP_GET_LSIZE(bp));

	/*
	 * If the user has specified that we allow I/Os to continue
	 * then attempt to satisfy the read.
	 */
	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
		ZIO_ENTER(spa);

	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done,
	    private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
	zio->io_bookmark = *zb;

	zio->io_logical = zio;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
    int flags, zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
	    checksum < ZIO_CHECKSUM_FUNCTIONS);

	ASSERT(compress >= ZIO_COMPRESS_OFF &&
	    compress < ZIO_COMPRESS_FUNCTIONS);

	ZIO_ENTER(spa);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;

	zio->io_bookmark = *zb;

	zio->io_logical = zio;

	zio->io_checksum = checksum;
	zio->io_compress = compress;
	zio->io_ndvas = ncopies;

	if (bp->blk_birth != txg) {
		/* XXX the bp usually (always?) gets re-zeroed later */
		BP_ZERO(bp);
		BP_SET_LSIZE(bp, size);
		BP_SET_PSIZE(bp, size);
	} else {
		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags,
    zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));

	zio->io_bookmark = *zb;
	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;

	if (pio != NULL)
		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));

	return (zio);
}

static void
zio_write_allocate_ready(zio_t *zio)
{
	/* Free up the previous block */
	if (!BP_IS_HOLE(&zio->io_bp_orig)) {
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    &zio->io_bp_orig, NULL, NULL));
	}
}

static zio_t *
zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;
	zio->io_ready = zio_write_allocate_ready;

	return (zio);
}
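
/*
 * Free the block described by bp.  If the free is issued late in a syncing
 * txg (after zio_sync_pass.zp_defer_free passes), the bp is queued on the
 * spa's deferred-free bplist and processed in a later txg instead.
 */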

zio_t *
zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg == spa->spa_syncing_txg &&
	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
		return (zio_null(pio, spa, NULL, NULL, 0));
	}

	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT3U(spa_first_txg(spa), <=, txg);

	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_vd = vd;
		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, priority, flags));
	}

	return (zio);
}
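
/*
 * Build a synthetic blkptr describing 'size' bytes at 'offset' on a leaf
 * vdev for a physical read or write: no compression, host byte order, and
 * the requested checksum, with the offset folded into the checksum verifier
 * when checksumming is enabled.
 */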

static void
zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
    int checksum, boolean_t labels)
{
	ASSERT(vd->vdev_children == 0);

	ASSERT(size <= SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

#ifdef ZFS_DEBUG
	if (labels) {
		ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	}
#endif
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	BP_ZERO(bp);

	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	if (checksum != ZIO_CHECKSUM_OFF)
		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_t *zio;
	blkptr_t blk;

	ZIO_ENTER(vd->vdev_spa);

	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_block_tail_t *zbt;
	void *wbuf;
	zio_t *zio;
	blkptr_t blk;

	ZIO_ENTER(vd->vdev_spa);

	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	zio->io_bp = &zio->io_bp_copy;
	zio->io_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * one word of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places.
		 */
		wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size);

		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
		zbt->zbt_cksum = blk.blk_cksum;
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.  It has no associated bp.
 */
zio_t *
zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, int flags,
    zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *cio;

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);

	cio->io_vd = vd;
	cio->io_offset = offset;

	return (cio);
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
static void
zio_destroy(zio_t *zio)
{
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	if (zio->io_failed_vds != NULL) {
		kmem_free(zio->io_failed_vds,
		    zio->io_failed_vds_count * sizeof (vdev_t *));
		zio->io_failed_vds = NULL;
		zio->io_failed_vds_count = 0;
	}
	kmem_cache_free(zio_cache, zio);
}

int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_stalled != ZIO_STAGE_DONE)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	zio_execute(zio);
}

void
zio_interrupt(zio_t *zio)
{
	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
}

static int
zio_issue_async(zio_t *zio)
{
	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * I/O pipeline interlocks: parent/child dependency scoreboarding
 * ==========================================================================
 */
static int
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	int rv = ZIO_PIPELINE_CONTINUE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stalled == 0);
	if (*countp != 0) {
		zio->io_stalled = stage;
		rv = ZIO_PIPELINE_STOP;
	}
	mutex_exit(&zio->io_lock);

	return (rv);
}

static void
zio_add_failed_vdev(zio_t *pio, zio_t *zio)
{
	uint64_t oldcount = pio->io_failed_vds_count;
	vdev_t **new_vds;
	int i;

	ASSERT(MUTEX_HELD(&pio->io_lock));

	if (zio->io_vd == NULL)
		return;

	for (i = 0; i < oldcount; i++) {
		if (pio->io_failed_vds[i] == zio->io_vd)
			return;
	}

	new_vds = kmem_zalloc((oldcount + 1) * sizeof (vdev_t *), KM_SLEEP);
	if (pio->io_failed_vds != NULL) {
		bcopy(pio->io_failed_vds, new_vds,
		    oldcount * sizeof (vdev_t *));
		kmem_free(pio->io_failed_vds, oldcount * sizeof (vdev_t *));
	}
	pio->io_failed_vds = new_vds;
	pio->io_failed_vds[oldcount] = zio->io_vd;
	pio->io_failed_vds_count++;
}
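
/*
 * A child zio has completed the given stage: propagate its error (and the
 * failing vdev, if any) to the parent, and if this was the last outstanding
 * child for that stage, unstall the parent and resume its pipeline.
 */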

static void
zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	zio_t *pio = zio->io_parent;

	mutex_enter(&pio->io_lock);
	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) {
		pio->io_error = zio->io_error;
		if (zio->io_error && zio->io_error != ENOTSUP)
			zio_add_failed_vdev(pio, zio);
	}
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stalled == stage) {
		pio->io_stalled = 0;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

int
zio_wait_for_children_ready(zio_t *zio)
{
	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
	    &zio->io_children_notready));
}

int
zio_wait_for_children_done(zio_t *zio)
{
	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
	    &zio->io_children_notdone));
}

static int
zio_read_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		zio_push_transform(zio, cbuf, csize, csize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
	}

	if (BP_IS_GANG(bp)) {
		uint64_t gsize = SPA_GANGBLOCKSIZE;
		void *gbuf = zio_buf_alloc(gsize);

		zio_push_transform(zio, gbuf, gsize, gsize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
	}

	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_ready(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	if (zio->io_ready)
		zio->io_ready(zio);

	if (pio != NULL)
		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
		    &pio->io_children_notready);

	if (zio->io_bp)
		zio->io_bp_copy = *zio->io_bp;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_vdev_retry_io(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	/*
	 * Preserve the failed bp so that the io_ready() callback can
	 * update the accounting accordingly.  The callback will also be
	 * responsible for freeing the previously allocated block, if one
	 * exists.
	 */
	zio->io_bp_orig = *zio->io_bp;

	/*
	 * We must zero out the old DVA and blk_birth before reallocating
	 * the bp.
	 */
	BP_ZERO_DVAS(zio->io_bp);
	zio_reset(zio);

	if (pio) {
		/*
		 * Let the parent know that we will
		 * re-alloc the write (=> new bp info).
		 */
		mutex_enter(&pio->io_lock);
		pio->io_children_notready++;

		/*
		 * If the parent I/O is still in the open stage, then
		 * don't bother telling it to retry since it hasn't
		 * progressed far enough for it to care.
		 */
		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;

		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
		mutex_exit(&pio->io_lock);
	}

	/*
	 * We are getting ready to process the retry request so clear
	 * the flag and the zio's current error status.
	 */
	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
	zio->io_error = 0;

	return (ZIO_PIPELINE_CONTINUE);
}
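
/*
 * Resume I/O on a pool that was suspended after an unrecoverable failure:
 * re-probe the failed vdevs, clear their error state, requeue the suspended
 * zios on zio_taskq, and finally wake any threads blocked in ZIO_ENTER().
 */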

int
zio_vdev_resume_io(spa_t *spa)
{
	zio_t *zio;

	mutex_enter(&spa->spa_zio_lock);

	/*
	 * Probe all of the vdevs that have experienced an I/O error.
	 * If we are still unable to verify the integrity of the vdev
	 * then we prevent the resume from proceeding.
	 */
	for (zio = list_head(&spa->spa_zio_list); zio != NULL;
	    zio = list_next(&spa->spa_zio_list, zio)) {
		int error = 0;

		/* We only care about I/Os that must succeed */
		if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
			continue;
		error = vdev_probe(zio->io_vd);
		if (error) {
			mutex_exit(&spa->spa_zio_lock);
			return (error);
		}
	}

	/*
	 * Clear the vdev stats so that I/O can flow.
	 */
	vdev_clear(spa, NULL, B_FALSE);

	spa->spa_state = POOL_STATE_ACTIVE;
	while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
		list_remove(&spa->spa_zio_list, zio);
		zio->io_error = 0;

		/*
		 * If we are resuming an allocating I/O then we force it
		 * to retry and let it resume operation where it left off.
		 * Otherwise, go back to the ready stage and pick up from
		 * there.
		 */
		if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
			zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
			zio->io_stage--;
		} else {
			zio->io_stage = ZIO_STAGE_READY;
		}

		(void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
		    zio, TQ_SLEEP);
	}
	mutex_exit(&spa->spa_zio_lock);

	/*
	 * Wait for the taskqs to finish and recheck the pool state since
	 * it's possible that a resumed I/O has failed again.
	 */
	taskq_wait(zio_taskq);
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	mutex_enter(&spa->spa_zio_lock);
	cv_broadcast(&spa->spa_zio_cv);
	mutex_exit(&spa->spa_zio_lock);

	return (0);
}

static int
zio_vdev_suspend_io(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	/*
	 * We've experienced an unrecoverable failure so
	 * set the pool state accordingly and queue all
	 * failed IOs.
	 */
	spa->spa_state = POOL_STATE_IO_FAILURE;

	mutex_enter(&spa->spa_zio_lock);
	list_insert_tail(&spa->spa_zio_list, zio);

#ifndef _KERNEL
	/* Used to notify ztest that the pool has suspended */
	cv_broadcast(&spa->spa_zio_cv);
#endif
	mutex_exit(&spa->spa_zio_lock);

	return (ZIO_PIPELINE_STOP);
}
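
/*
 * Report an uncorrectable I/O failure: log a warning (in debug builds),
 * panic if the pool's failmode property is set to panic, post an FMA
 * ereport, and fault the affected vdev.
 */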

static void
zio_handle_io_failure(zio_t *zio, vdev_t *vd)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	char *blkbuf;

#ifdef ZFS_DEBUG
	blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
	if (blkbuf) {
		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
		    bp ? bp : &zio->io_bp_copy);
	}
	cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
	    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
	    zio_type_name[zio->io_type], vdev_description(vd),
	    (u_longlong_t)zio->io_offset, (void *)zio,
	    blkbuf ? blkbuf : "", zio->io_error);
	if (blkbuf)
		kmem_free(blkbuf, BP_SPRINTF_LEN);
#endif

	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));
	}
	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
	vdev_set_state(vd, vd == spa->spa_root_vdev ? B_TRUE : B_FALSE,
	    VDEV_STATE_FAULTED, VDEV_AUX_IO_FAILURE);
}

static int
zio_assess(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;

	ASSERT(zio->io_children_notready == 0);
	ASSERT(zio->io_children_notdone == 0);

	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bp->blk_pad[2] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			if (zio->io_ndvas != 0)
				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
	}

	/*
	 * Some child I/O has indicated that a retry is necessary, so
	 * we set an error on the I/O and let the logic below do the
	 * rest.
	 */
	if (zio->io_flags & ZIO_FLAG_WRITE_RETRY)
		zio->io_error = ERESTART;

	if (vd != NULL)
		vdev_stat_update(zio);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO ||
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
		    zio->io_logical == zio) {
			/*
			 * For root I/O requests, tell the SPA to log the
			 * error appropriately.  Also, generate a logical
			 * data ereport.
			 */
			spa_log_error(spa, zio);

			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}

		/*
		 * If we are an allocating I/O then we attempt to reissue
		 * the I/O on another vdev unless the pool is out of space.
		 * We handle this condition based on the spa's failmode
		 * property.
		 */
		if (zio_write_retry && zio->io_error != ENOSPC &&
		    IO_IS_ALLOCATING(zio))
			return (zio_vdev_retry_io(zio));

		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));

		/*
		 * For I/O requests that cannot fail, we carry out
		 * the requested behavior based on the failmode pool
		 * property.
		 *
		 * XXX - Need to differentiate between an ENOSPC as
		 * a result of vdev failures vs. a full pool.
		 */
		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			int i;

			for (i = 0; i < zio->io_failed_vds_count; i++) {
				zio_handle_io_failure(zio,
				    zio->io_failed_vds[i]);
			}
			if (zio->io_failed_vds_count == 0) {
				zio_handle_io_failure(zio,
				    vd ? vd : spa->spa_root_vdev);
			}
			if (zio->io_failed_vds != NULL) {
				kmem_free(zio->io_failed_vds,
				    zio->io_failed_vds_count *
				    sizeof (vdev_t *));
				zio->io_failed_vds = NULL;
				zio->io_failed_vds_count = 0;
			}
			return (zio_vdev_suspend_io(zio));
		}
	}
	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
	ASSERT(zio->io_children_notready == 0);

	return (ZIO_PIPELINE_CONTINUE);
}
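
/*
 * Final pipeline stage: unwind any remaining transforms, invoke the done
 * callback, unlink this zio from its parent, drop the config lock if we
 * grabbed it in zio_create(), and then either wake a thread blocked in
 * zio_wait() or destroy the zio outright.
 */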

static int
zio_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	spa_t *spa = zio->io_spa;

	ASSERT(zio->io_children_notready == 0);
	ASSERT(zio->io_children_notdone == 0);

	zio_clear_transform_stack(zio);

	if (zio->io_done)
		zio->io_done(zio);

	ASSERT(zio->io_delegate_list == NULL);
	ASSERT(zio->io_delegate_next == NULL);

	if (pio != NULL) {
		zio_t *next, *prev;

		mutex_enter(&pio->io_lock);
		next = zio->io_sibling_next;
		prev = zio->io_sibling_prev;
		if (next != NULL)
			next->io_sibling_prev = prev;
		if (prev != NULL)
			prev->io_sibling_next = next;
		if (pio->io_child == zio)
			pio->io_child = next;
		mutex_exit(&pio->io_lock);

		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
		    &pio->io_children_notdone);
	}

	/*
	 * Note: this I/O is now done, and will shortly be freed, so there is
	 * no need to clear this (or any other) flag.
	 */
	if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
		spa_config_exit(spa, zio);

	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio->io_stalled = zio->io_stage;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * Compression support
 * ==========================================================================
 */
static int
zio_write_compress(zio_t *zio)
{
	int compress = zio->io_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass;

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT(BP_IS_HOLE(bp));
		pass = 1;
	}

	if (compress != ZIO_COMPRESS_OFF)
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize))
			compress = ZIO_COMPRESS_OFF;

	if (compress != ZIO_COMPRESS_OFF && csize != 0)
		zio_push_transform(zio, cbuf, csize, cbufsize);

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to reallocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > zio_sync_pass.zp_rewrite) {
		ASSERT(csize != 0);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_COMPRESS(bp, compress);
		zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
	} else {
		if (bp->blk_birth == zio->io_txg)
			BP_ZERO(bp);
		if (csize == 0) {
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
		} else {
			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
			BP_SET_LSIZE(bp, lsize);
			BP_SET_PSIZE(bp, csize);
			BP_SET_COMPRESS(bp, compress);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_read_decompress(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	void *data;
	uint64_t size;
	uint64_t bufsize;
	int compress = BP_GET_COMPRESS(bp);

	ASSERT(compress != ZIO_COMPRESS_OFF);

	zio_pop_transform(zio, &data, &size, &bufsize);

	if (zio_decompress_data(compress, data, size,
	    zio->io_data, zio->io_size))
		zio->io_error = EIO;

	zio_buf_free(data, bufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Gang block support
 * ==========================================================================
 */
static void
zio_gang_byteswap(zio_t *zio)
{
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);

	if (BP_SHOULD_BYTESWAP(zio->io_bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);
}

static int
zio_get_gang_header(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	uint64_t gsize = SPA_GANGBLOCKSIZE;
	void *gbuf = zio_buf_alloc(gsize);

	ASSERT(BP_IS_GANG(bp));

	zio_push_transform(zio, gbuf, gsize, gsize);

	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));

	return (zio_wait_for_children_done(zio));
}

static int
zio_read_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_read(zio, zio->io_spa, gbp,
		    (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority,
		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
	}

	zio_buf_free(gbh, gbufsize);

	return (zio_wait_for_children_done(zio));
}

static int
zio_rewrite_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	ASSERT(gsize == gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority,
		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
	}

	zio_push_transform(zio, gbh, gsize, gbufsize);

	return (zio_wait_for_children_ready(zio));
}

static int
zio_free_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_claim_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;
	int d;

	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}
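
/*
 * The original allocation for io_size bytes failed, so break the write up
 * into a gang block: allocate a gang header plus up to SPA_GBH_NBLKPTRS
 * member blocks, halving the member size on each ENOSPC until everything
 * fits, and issue child writes for each member.
 */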

static int
zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	spa_t *spa = zio->io_spa;
	zio_gbh_phys_t *gbh;
	uint64_t txg = zio->io_txg;
	uint64_t resid = zio->io_size;
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int ndvas = zio->io_ndvas;
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	int error;
	int i, d;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
	    B_FALSE);
	if (error) {
		zio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	for (d = 0; d < gbh_ndvas; d++)
		DVA_SET_GANG(&dva[d], 1);

	bp->blk_birth = txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = gbp->blk_dva;

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
			    txg, bp, B_FALSE);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			/* XXX - free up previous allocations? */
			if (maxalloc == SPA_MINBLOCKSIZE) {
				zio->io_error = error;
				return (ZIO_PIPELINE_CONTINUE);
			}
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = txg;
			zio_nowait(zio_rewrite(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
			    &zio->io_bookmark));
		} else {
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);

	/*
	 * As much as we'd like this to be 'ready' instead of 'done',
	 * updating our ASIZE doesn't happen until the io_done callback,
	 * so we have to wait for that to finish in order for our BP
	 * to be stable.
	 */
	return (zio_wait_for_children_done(zio));
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa->spa_normal_class;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(BP_IS_HOLE(bp));
	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
	ASSERT3U(zio->io_ndvas, >, 0);
	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));

	/*
	 * For testing purposes, we force I/Os to retry.  We don't allow
	 * retries beyond the first pass since those I/Os are non-allocating
	 * writes.
	 */
	if (zio_io_fail_shift &&
	    spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite &&
	    zio_io_should_fail(zio_io_fail_shift))
		zio->io_flags |= ZIO_FLAG_WRITE_RETRY;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
	    zio->io_txg, NULL, B_FALSE);

	if (error == 0) {
		bp->blk_birth = zio->io_txg;
	} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
		return (zio_write_allocate_gang_members(zio, mc));
	} else {
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_free(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);

	BP_ZERO(bp);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_claim(zio_t *zio)
{
	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Read and write to physical devices
 * ==========================================================================
 */

static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd ? vd->vdev_top : NULL;
	blkptr_t *bp = zio->io_bp;
	uint64_t align;
	spa_t *spa = zio->io_spa;

	/*
	 * If the pool is already in a failure state then just suspend
	 * this IO until the problem is resolved.  We will reissue them
	 * at that time.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
	    zio->io_type == ZIO_TYPE_WRITE)
		return (zio_vdev_suspend_io(zio));

	/*
	 * The mirror_ops handle multiple DVAs in a single BP
	 */
	if (vd == NULL)
		return (vdev_mirror_ops.vdev_op_io_start(zio));

	align = 1ULL << tvd->vdev_ashift;

	if (zio->io_retries == 0 && vd == tvd)
		zio->io_flags |= ZIO_FLAG_FAILFAST;

	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
		zio->io_flags |= ZIO_FLAG_PHYSICAL;
		zio->io_offset += VDEV_LABEL_START_SIZE;
	}

	if (P2PHASE(zio->io_size, align) != 0) {
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		char *abuf = zio_buf_alloc(asize);
		ASSERT(vd == tvd);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, asize);
		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
	}

	ASSERT(P2PHASE(zio->io_offset, align) == 0);
	ASSERT(P2PHASE(zio->io_size, align) == 0);
	ASSERT(bp == NULL ||
	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));

	return (vd->vdev_ops->vdev_op_io_start(zio));
}

static int
zio_vdev_io_done(zio_t *zio)
{
	if (zio->io_vd == NULL)
		return (vdev_mirror_ops.vdev_op_io_done(zio));

	return (zio->io_vd->vdev_ops->vdev_op_io_done(zio));
}
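
/*
 * Policy: retry a failed I/O only when it targets a top-level vdev (or no
 * vdev at all), has no delegated children, is not flagged
 * ZIO_FLAG_DONT_RETRY, and has not already been retried.
 */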
/* XXPOLICY */
boolean_t
zio_should_retry(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio->io_error == 0)
		return (B_FALSE);
	if (zio->io_delegate_list != NULL)
		return (B_FALSE);
	if (vd && vd != vd->vdev_top)
		return (B_FALSE);
	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
		return (B_FALSE);
	if (zio->io_retries > 0)
		return (B_FALSE);

	return (B_TRUE);
}

static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd ? vd->vdev_top : NULL;

	ASSERT(zio->io_vsd == NULL);

	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
		void *abuf;
		uint64_t asize;
		ASSERT(vd == tvd);
		zio_pop_transform(zio, &abuf, &asize, &asize);
		if (zio->io_type == ZIO_TYPE_READ)
			bcopy(abuf, zio->io_data, zio->io_size);
		zio_buf_free(abuf, asize);
		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
	}

	if (zio_injection_enabled && !zio->io_error)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 */
	/* XXPOLICY */
	if (zio_should_retry(zio)) {
		ASSERT(tvd == vd);

		zio->io_retries++;
		zio->io_error = 0;
		zio->io_flags &= ZIO_FLAG_RETRY_INHERIT;
		/* XXPOLICY */
		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;

		return (ZIO_PIPELINE_CONTINUE);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage--;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage--;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
}

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static int
zio_checksum_generate(zio_t *zio)
{
	int checksum = zio->io_checksum;
	blkptr_t *bp = zio->io_bp;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_checksum_generate(zio_t *zio)
{
	zio_cksum_t zc;
	zio_gbh_phys_t *gbh = zio->io_data;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);

	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_checksum_verify(zio_t *zio)
{
	if (zio->io_bp != NULL) {
		zio->io_error = zio_checksum_error(zio);
		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
			    zio->io_spa, zio->io_vd, zio, 0, 0);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}

/*
 * Set the external verifier for a gang block based on stuff in the bp
 */
void
zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
{
	blkptr_t *bp = zio->io_bp;

	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
	zcp->zc_word[2] = bp->blk_birth;
	zcp->zc_word[3] = 0;
}

/*
 * ==========================================================================
 * Define the pipeline
 * ==========================================================================
 */
typedef int zio_pipe_stage_t(zio_t *zio);

zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	NULL,
	zio_wait_for_children_ready,
	zio_read_init,
	zio_issue_async,
	zio_write_compress,
	zio_checksum_generate,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_for_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_assess,
	zio_done,
	NULL
};
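
/*
 * zio_pipeline[] is indexed by pipeline stage: zio_execute() advances
 * io_stage past any stages not set in io_pipeline and then calls
 * zio_pipeline[io_stage](zio).  The leading NULL (ZIO_STAGE_OPEN) and the
 * trailing NULL are placeholders that are never dispatched.
 */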

/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
void
zio_execute(zio_t *zio)
{
	while (zio->io_stage < ZIO_STAGE_DONE) {
		uint32_t pipeline = zio->io_pipeline;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));

		/*
		 * If an error occurred outside the vdev stack,
		 * just execute the interlock stages to clean up.
		 */
		if (zio->io_error &&
		    ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;

		while (((1U << ++zio->io_stage) & pipeline) == 0)
			continue;

		ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
		ASSERT(zio->io_stalled == 0);

		rv = zio_pipeline[zio->io_stage](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}
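
/*
 * zio_io_should_fail(n) returns B_TRUE on the first call and on every 2^n
 * calls thereafter; together with the zio_zil_fail_shift and
 * zio_io_fail_shift tunables above, it is used to inject periodic
 * allocation failures for testing.
 */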
static boolean_t
zio_io_should_fail(uint16_t range)
{
	static uint16_t allocs = 0;

	return (P2PHASE(allocs++, 1U << range) == 0);
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t txg)
{
	int error;

	spa_config_enter(spa, RW_READER, FTAG);

	if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) {
		spa_config_exit(spa, FTAG);
		return (ENOSPC);
	}

	/*
	 * We were passed the previous log block's DVA in bp->blk_dva[0].
	 * We use that as a hint for which vdev to allocate from next.
	 */
	error = metaslab_alloc(spa, spa->spa_log_class, size,
	    new_bp, 1, txg, old_bp, B_TRUE);

	if (error)
		error = metaslab_alloc(spa, spa->spa_normal_class, size,
		    new_bp, 1, txg, old_bp, B_TRUE);

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
		new_bp->blk_birth = txg;
	}

	spa_config_exit(spa, FTAG);

	return (error);
}

/*
 * Free an intent log block.  We know it can't be a gang block, so there's
 * nothing to do except metaslab_free() it.
 */
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
	ASSERT(!BP_IS_GANG(bp));

	spa_config_enter(spa, RW_READER, FTAG);

	metaslab_free(spa, bp, txg, B_FALSE);

	spa_config_exit(spa, FTAG);
}

/*
 * Start an async flush of the write cache for this vdev.
 */
void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
}