/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW */
	0,	/* ZIO_PRIORITY_SYNC_READ */
	0,	/* ZIO_PRIORITY_SYNC_WRITE */
	6,	/* ZIO_PRIORITY_ASYNC_READ */
	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
	4,	/* ZIO_PRIORITY_FREE */
	0,	/* ZIO_PRIORITY_CACHE_FILL */
	0,	/* ZIO_PRIORITY_LOG_WRITE */
	10,	/* ZIO_PRIORITY_RESILVER */
	20,	/* ZIO_PRIORITY_SCRUB */
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"null", "read", "write", "free", "claim", "ioctl" };

/* Force an allocation failure when non-zero */
uint16_t zio_zil_fail_shift = 0;
uint16_t zio_io_fail_shift = 0;

/* Enable/disable the write-retry logic */
int zio_write_retry = 1;

/* Taskq to handle reissuing of I/Os */
taskq_t *zio_taskq;
int zio_resume_threads = 4;

typedef struct zio_sync_pass {
	int	zp_defer_free;		/* defer frees after this pass */
	int	zp_dontcompress;	/* don't compress after this pass */
	int	zp_rewrite;		/* rewrite new bps after this pass */
} zio_sync_pass_t;

zio_sync_pass_t zio_sync_pass = {
	1,	/* zp_defer_free */
	4,	/* zp_dontcompress */
	1,	/* zp_rewrite */
};

static boolean_t zio_io_should_fail(uint16_t);

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * Determine if we are allowed to issue the IO based on the
 * pool state.  If we must wait then block until we are told
 * that we may continue.
 */
#define	ZIO_ENTER(spa) {						\
	if (spa->spa_state == POOL_STATE_IO_FAILURE) {			\
		mutex_enter(&spa->spa_zio_lock);			\
		while (spa->spa_state == POOL_STATE_IO_FAILURE)		\
			cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock);	\
		mutex_exit(&spa->spa_zio_lock);				\
	}								\
}

/*
 * An allocation zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) \
	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif

	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (P2PHASE(size, PAGESIZE) == 0) {
			align = PAGESIZE;
		} else if (P2PHASE(size, p2 >> 2) == 0) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);

			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    KMC_NODEBUG);

		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	taskq_destroy(zio_taskq);

	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing the
 * amount of kernel heap dumped to disk when the kernel panics)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_data = data;
	zt->zt_size = size;
	zt->zt_bufsize = bufsize;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
{
	zio_transform_t *zt = zio->io_transform_stack;

	*data = zt->zt_data;
	*size = zt->zt_size;
	*bufsize = zt->zt_bufsize;

	zio->io_transform_stack = zt->zt_next;
	kmem_free(zt, sizeof (zio_transform_t));

	if ((zt = zio->io_transform_stack) != NULL) {
		zio->io_data = zt->zt_data;
		zio->io_size = zt->zt_size;
	}
}

static void
zio_clear_transform_stack(zio_t *zio)
{
	void *data;
	uint64_t size, bufsize;

	ASSERT(zio->io_transform_stack != NULL);

	zio_pop_transform(zio, &data, &size, &bufsize);
	while (zio->io_transform_stack != NULL) {
		zio_buf_free(data, bufsize);
		zio_pop_transform(zio, &data, &size, &bufsize);
	}
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));
	zio->io_parent = pio;
	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_flags = flags;
	if (bp != NULL) {
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
	}
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_stage = stage;
	zio->io_pipeline = pipeline;
	zio->io_timestamp = lbolt64;
	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
	zio_push_transform(zio, data, size, size);

	/*
	 * Note on config lock:
	 *
	 * If CONFIG_HELD is set, then the caller already has the config
	 * lock, so we don't need it for this io.
	 *
	 * We set CONFIG_GRABBED to indicate that we have grabbed the
	 * config lock on behalf of this io, so it should be released
	 * in zio_done.
	 *
	 * Unless CONFIG_HELD is set, we will grab the config lock for
	 * any top-level (parent-less) io, *except* NULL top-level ios.
	 * The NULL top-level ios rarely have any children, so we delay
	 * grabbing the lock until the first child is added (but it is
	 * still grabbed on behalf of the top-level i/o, so additional
	 * children don't need to also grab it).  This greatly reduces
	 * contention on the config lock.
	 */
	if (pio == NULL) {
		if (type != ZIO_TYPE_NULL &&
		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
			spa_config_enter(spa, RW_READER, zio);
			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
		}
		zio->io_root = zio;
	} else {
		zio->io_root = pio->io_root;
		if (!(flags & ZIO_FLAG_NOBOOKMARK))
			zio->io_logical = pio->io_logical;
		mutex_enter(&pio->io_lock);
		if (pio->io_parent == NULL &&
		    pio->io_type == ZIO_TYPE_NULL &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
			spa_config_enter(spa, RW_READER, pio);
		}
		if (stage < ZIO_STAGE_READY)
			pio->io_children_notready++;
		pio->io_children_notdone++;
		zio->io_sibling_next = pio->io_child;
		zio->io_sibling_prev = NULL;
		if (pio->io_child != NULL)
			pio->io_child->io_sibling_prev = zio;
		pio->io_child = zio;
		zio->io_ndvas = pio->io_ndvas;
		mutex_exit(&pio->io_lock);
	}

	/*
	 * Save off the original state in case we need to retry later.
	 */
	zio->io_orig_stage = zio->io_stage;
	zio->io_orig_pipeline = zio->io_pipeline;
	zio->io_orig_flags = zio->io_flags;

	return (zio);
}

static void
zio_reset(zio_t *zio)
{
	zio_clear_transform_stack(zio);

	zio->io_flags = zio->io_orig_flags;
	zio->io_stage = zio->io_orig_stage;
	zio->io_pipeline = zio->io_orig_pipeline;
	zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    int flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
	return (zio_null(NULL, spa, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    int priority, int flags, zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT3U(size, ==, BP_GET_LSIZE(bp));

	/*
	 * If the user has specified that we allow I/Os to continue
	 * then attempt to satisfy the read.
	 */
	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
		ZIO_ENTER(spa);

	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
	zio->io_bookmark = *zb;

	zio->io_logical = zio;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
    int flags, zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
	    checksum < ZIO_CHECKSUM_FUNCTIONS);

	ASSERT(compress >= ZIO_COMPRESS_OFF &&
	    compress < ZIO_COMPRESS_FUNCTIONS);

	ZIO_ENTER(spa);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;

	zio->io_bookmark = *zb;

	zio->io_logical = zio;

	zio->io_checksum = checksum;
	zio->io_compress = compress;
	zio->io_ndvas = ncopies;

	if (bp->blk_birth != txg) {
		/* XXX the bp usually (always?) gets re-zeroed later */
		BP_ZERO(bp);
		BP_SET_LSIZE(bp, size);
		BP_SET_PSIZE(bp, size);
	} else {
		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, int checksum, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    int flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));

	zio->io_bookmark = *zb;
	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;

	if (pio != NULL)
		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));

	return (zio);
}

static void
zio_write_allocate_ready(zio_t *zio)
{
	/* Free up the previous block */
	if (!BP_IS_HOLE(&zio->io_bp_orig)) {
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    &zio->io_bp_orig, NULL, NULL));
	}
}

static zio_t *
zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;
	zio->io_ready = zio_write_allocate_ready;

	return (zio);
}

zio_t *
zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg == spa->spa_syncing_txg &&
	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
		return (zio_null(pio, spa, NULL, NULL, 0));
	}

	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT3U(spa_first_txg(spa), <=, txg);

	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_vd = vd;
		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, priority, flags));
	}

	return (zio);
}

static void
zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
    int checksum, boolean_t labels)
{
	ASSERT(vd->vdev_children == 0);

	ASSERT(size <= SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

#ifdef ZFS_DEBUG
	if (labels) {
		ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	}
#endif
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	BP_ZERO(bp);

	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	if (checksum != ZIO_CHECKSUM_OFF)
		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_t *zio;
	blkptr_t blk;

	ZIO_ENTER(vd->vdev_spa);

	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_block_tail_t *zbt;
	void *wbuf;
	zio_t *zio;
	blkptr_t blk;

	ZIO_ENTER(vd->vdev_spa);

	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	zio->io_bp = &zio->io_bp_copy;
	zio->io_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * one word of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places.
		 */
		wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size);

		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
		zbt->zbt_cksum = blk.blk_cksum;
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.  It has no associated bp.
 */
zio_t *
zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, int flags,
    zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *cio;

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);

	cio->io_vd = vd;
	cio->io_offset = offset;

	return (cio);
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
static void
zio_destroy(zio_t *zio)
{
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	if (zio->io_failed_vds != NULL) {
		kmem_free(zio->io_failed_vds,
		    zio->io_failed_vds_count * sizeof (vdev_t *));
		zio->io_failed_vds = NULL;
		zio->io_failed_vds_count = 0;
	}
	kmem_cache_free(zio_cache, zio);
}

int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_stalled != ZIO_STAGE_DONE)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	zio_execute(zio);
}

void
zio_interrupt(zio_t *zio)
{
	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
}

static int
zio_issue_async(zio_t *zio)
{
	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * I/O pipeline interlocks: parent/child dependency scoreboarding
 * ==========================================================================
 */
static int
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	int rv = ZIO_PIPELINE_CONTINUE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stalled == 0);
	if (*countp != 0) {
		zio->io_stalled = stage;
		rv = ZIO_PIPELINE_STOP;
	}
	mutex_exit(&zio->io_lock);

	return (rv);
}

static void
zio_add_failed_vdev(zio_t *pio, zio_t *zio)
{
	uint64_t oldcount = pio->io_failed_vds_count;
	vdev_t **new_vds;
	int i;

	ASSERT(MUTEX_HELD(&pio->io_lock));

	if (zio->io_vd == NULL)
		return;

	for (i = 0; i < oldcount; i++) {
		if (pio->io_failed_vds[i] == zio->io_vd)
			return;
	}

	new_vds = kmem_zalloc((oldcount + 1) * sizeof (vdev_t *), KM_SLEEP);
	if (pio->io_failed_vds != NULL) {
		bcopy(pio->io_failed_vds, new_vds,
		    oldcount * sizeof (vdev_t *));
		kmem_free(pio->io_failed_vds, oldcount * sizeof (vdev_t *));
	}
	pio->io_failed_vds = new_vds;
	pio->io_failed_vds[oldcount] = zio->io_vd;
	pio->io_failed_vds_count++;
}

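/*
 * Report a child's completion of the given stage to its parent: propagate
 * the child's error unless DONT_PROPAGATE is set, remember which vdev
 * failed, and when the last outstanding child checks in, restart the
 * parent if it stalled waiting on this count.
 */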
static void
zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	zio_t *pio = zio->io_parent;

	mutex_enter(&pio->io_lock);
	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) {
		pio->io_error = zio->io_error;
		if (zio->io_error && zio->io_error != ENOTSUP)
			zio_add_failed_vdev(pio, zio);
	}
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stalled == stage) {
		pio->io_stalled = 0;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

int
zio_wait_for_children_ready(zio_t *zio)
{
	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
	    &zio->io_children_notready));
}

int
zio_wait_for_children_done(zio_t *zio)
{
	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
	    &zio->io_children_notdone));
}

static int
zio_read_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		zio_push_transform(zio, cbuf, csize, csize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
	}

	if (BP_IS_GANG(bp)) {
		uint64_t gsize = SPA_GANGBLOCKSIZE;
		void *gbuf = zio_buf_alloc(gsize);

		zio_push_transform(zio, gbuf, gsize, gsize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
	}

	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_ready(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	if (zio->io_ready)
		zio->io_ready(zio);

	if (pio != NULL)
		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
		    &pio->io_children_notready);

	if (zio->io_bp)
		zio->io_bp_copy = *zio->io_bp;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_vdev_retry_io(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	/*
	 * Preserve the failed bp so that the io_ready() callback can
	 * update the accounting accordingly.  The callback will also be
	 * responsible for freeing the previously allocated block, if one
	 * exists.
	 */
	zio->io_bp_orig = *zio->io_bp;

	/*
	 * We must zero out the old DVA and blk_birth before reallocating
	 * the bp.
	 */
	BP_ZERO_DVAS(zio->io_bp);
	zio_reset(zio);

	if (pio) {
		/*
		 * Let the parent know that we will
		 * re-alloc the write (=> new bp info).
		 */
		mutex_enter(&pio->io_lock);
		pio->io_children_notready++;

		/*
		 * If the parent I/O is still in the open stage, then
		 * don't bother telling it to retry since it hasn't
		 * progressed far enough for it to care.
		 */
		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;

		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
		mutex_exit(&pio->io_lock);
	}

	/*
	 * We are getting ready to process the retry request so clear
	 * the flag and the zio's current error status.
	 */
	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
	zio->io_error = 0;

	return (ZIO_PIPELINE_CONTINUE);
}

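/*
 * Reissue the I/Os that were suspended when the pool entered the
 * POOL_STATE_IO_FAILURE state: probe the affected vdevs, clear their
 * error counts, requeue each suspended zio on the resume taskq, and
 * wake up anyone blocked in ZIO_ENTER().
 */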
int
zio_vdev_resume_io(spa_t *spa)
{
	zio_t *zio;

	mutex_enter(&spa->spa_zio_lock);

	/*
	 * Probe all of the vdevs that have experienced an I/O error.
	 * If we are still unable to verify the integrity of the vdev
	 * then we prevent the resume from proceeding.
	 */
	for (zio = list_head(&spa->spa_zio_list); zio != NULL;
	    zio = list_next(&spa->spa_zio_list, zio)) {
		int error = 0;

		/* We only care about I/Os that must succeed */
		if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
			continue;
		error = vdev_probe(zio->io_vd);
		if (error) {
			mutex_exit(&spa->spa_zio_lock);
			return (error);
		}
	}

	/*
	 * Clear the vdev stats so that I/O can flow.
	 */
	vdev_clear(spa, NULL, B_FALSE);

	spa->spa_state = POOL_STATE_ACTIVE;
	while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
		list_remove(&spa->spa_zio_list, zio);
		zio->io_error = 0;

		/*
		 * If we are resuming an allocating I/O then we force it
		 * to retry and let it resume operation where it left off.
		 * Otherwise, go back to the ready stage and pick up from
		 * there.
		 */
		if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
			zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
			zio->io_stage--;
		} else {
			zio->io_stage = ZIO_STAGE_READY;
		}

		(void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
		    zio, TQ_SLEEP);
	}
	mutex_exit(&spa->spa_zio_lock);

	/*
	 * Wait for the taskqs to finish and recheck the pool state since
	 * it's possible that a resumed I/O has failed again.
	 */
	taskq_wait(zio_taskq);
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	mutex_enter(&spa->spa_zio_lock);
	cv_broadcast(&spa->spa_zio_cv);
	mutex_exit(&spa->spa_zio_lock);

	return (0);
}

static int
zio_vdev_suspend_io(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	/*
	 * We've experienced an unrecoverable failure so
	 * set the pool state accordingly and queue all
	 * failed IOs.
	 */
	spa->spa_state = POOL_STATE_IO_FAILURE;

	mutex_enter(&spa->spa_zio_lock);
	list_insert_tail(&spa->spa_zio_list, zio);

#ifndef _KERNEL
	/* Used to notify ztest that the pool has suspended */
	cv_broadcast(&spa->spa_zio_cv);
#endif
	mutex_exit(&spa->spa_zio_lock);

	return (ZIO_PIPELINE_STOP);
}

static void
zio_handle_io_failure(zio_t *zio, vdev_t *vd)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	char *blkbuf;

#ifdef ZFS_DEBUG
	blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
	if (blkbuf) {
		sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
		    bp ? bp : &zio->io_bp_copy);
	}
	cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
	    zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
	    zio_type_name[zio->io_type], vdev_description(vd),
	    (u_longlong_t)zio->io_offset, (void *)zio,
	    blkbuf ? blkbuf : "", zio->io_error);
	if (blkbuf)
		kmem_free(blkbuf, BP_SPRINTF_LEN);
#endif

	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));
	}
	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
	vdev_set_state(vd, vd == spa->spa_root_vdev ? B_TRUE : B_FALSE,
	    VDEV_STATE_FAULTED, VDEV_AUX_IO_FAILURE);
}

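/*
 * Final assessment of a completed I/O: sanity-check the block pointer,
 * update vdev stats, post the appropriate ereports, and decide whether
 * to retry an allocating write on another vdev or, for I/Os that cannot
 * fail, suspend the pool according to its failmode property.
 */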
static int
zio_assess(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;

	ASSERT(zio->io_children_notready == 0);
	ASSERT(zio->io_children_notdone == 0);

	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bp->blk_pad[2] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			if (zio->io_ndvas != 0)
				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
	}

	/*
	 * Some child I/O has indicated that a retry is necessary, so
	 * we set an error on the I/O and let the logic below do the
	 * rest.
	 */
	if (zio->io_flags & ZIO_FLAG_WRITE_RETRY)
		zio->io_error = ERESTART;

	if (vd != NULL)
		vdev_stat_update(zio);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO ||
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
		    zio->io_logical == zio) {
			/*
			 * For root I/O requests, tell the SPA to log the error
			 * appropriately.  Also, generate a logical data
			 * ereport.
			 */
			spa_log_error(spa, zio);

			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}

		/*
		 * If we are an allocating I/O then we attempt to reissue
		 * the I/O on another vdev unless the pool is out of space.
		 * We handle this condition based on the spa's failmode
		 * property.
		 */
		if (zio_write_retry && zio->io_error != ENOSPC &&
		    IO_IS_ALLOCATING(zio))
			return (zio_vdev_retry_io(zio));

		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));

		/*
		 * For I/O requests that cannot fail, we carry out
		 * the requested behavior based on the failmode pool
		 * property.
		 *
		 * XXX - Need to differentiate between an ENOSPC as
		 * a result of vdev failures vs. a full pool.
		 */
		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			int i;

			for (i = 0; i < zio->io_failed_vds_count; i++) {
				zio_handle_io_failure(zio,
				    zio->io_failed_vds[i]);
			}
			if (zio->io_failed_vds_count == 0) {
				zio_handle_io_failure(zio,
				    vd ? vd : spa->spa_root_vdev);
			}
			if (zio->io_failed_vds != NULL) {
				kmem_free(zio->io_failed_vds,
				    zio->io_failed_vds_count *
				    sizeof (vdev_t *));
				zio->io_failed_vds = NULL;
				zio->io_failed_vds_count = 0;
			}
			return (zio_vdev_suspend_io(zio));
		}
	}
	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
	ASSERT(zio->io_children_notready == 0);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	spa_t *spa = zio->io_spa;

	ASSERT(zio->io_children_notready == 0);
	ASSERT(zio->io_children_notdone == 0);

	zio_clear_transform_stack(zio);

	if (zio->io_done)
		zio->io_done(zio);

	ASSERT(zio->io_delegate_list == NULL);
	ASSERT(zio->io_delegate_next == NULL);

	if (pio != NULL) {
		zio_t *next, *prev;

		mutex_enter(&pio->io_lock);
		next = zio->io_sibling_next;
		prev = zio->io_sibling_prev;
		if (next != NULL)
			next->io_sibling_prev = prev;
		if (prev != NULL)
			prev->io_sibling_next = next;
		if (pio->io_child == zio)
			pio->io_child = next;
		mutex_exit(&pio->io_lock);

		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
		    &pio->io_children_notdone);
	}

	/*
	 * Note: this I/O is now done, and will shortly be freed, so there is
	 * no need to clear this (or any other) flag.
	 */
	if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
		spa_config_exit(spa, zio);

	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio->io_stalled = zio->io_stage;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * Compression support
 * ==========================================================================
 */
static int
zio_write_compress(zio_t *zio)
{
	int compress = zio->io_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass;

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT(BP_IS_HOLE(bp));
		pass = 1;
	}

	if (compress != ZIO_COMPRESS_OFF)
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize))
			compress = ZIO_COMPRESS_OFF;

	if (compress != ZIO_COMPRESS_OFF && csize != 0)
		zio_push_transform(zio, cbuf, csize, cbufsize);

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to reallocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > zio_sync_pass.zp_rewrite) {
		ASSERT(csize != 0);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_COMPRESS(bp, compress);
		zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
	} else {
		if (bp->blk_birth == zio->io_txg)
			BP_ZERO(bp);
		if (csize == 0) {
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
		} else {
			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
			BP_SET_LSIZE(bp, lsize);
			BP_SET_PSIZE(bp, csize);
			BP_SET_COMPRESS(bp, compress);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_read_decompress(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	void *data;
	uint64_t size;
	uint64_t bufsize;
	int compress = BP_GET_COMPRESS(bp);

	ASSERT(compress != ZIO_COMPRESS_OFF);

	zio_pop_transform(zio, &data, &size, &bufsize);

	if (zio_decompress_data(compress, data, size,
	    zio->io_data, zio->io_size))
		zio->io_error = EIO;

	zio_buf_free(data, bufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Gang block support
 * ==========================================================================
 */
static void
zio_gang_byteswap(zio_t *zio)
{
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);

	if (BP_SHOULD_BYTESWAP(zio->io_bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);
}

static int
zio_get_gang_header(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	uint64_t gsize = SPA_GANGBLOCKSIZE;
	void *gbuf = zio_buf_alloc(gsize);

	ASSERT(BP_IS_GANG(bp));

	zio_push_transform(zio, gbuf, gsize, gsize);

	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));

	return (zio_wait_for_children_done(zio));
}

static int
zio_read_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_read(zio, zio->io_spa, gbp,
		    (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority,
		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
	}

	zio_buf_free(gbh, gbufsize);

	return (zio_wait_for_children_done(zio));
}

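/*
 * Rewrite each gang member in place using the block pointers recorded in
 * the gang header, then push the header back on the transform stack so
 * the header itself can be written once the members are ready.
 */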
static int
zio_rewrite_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	ASSERT(gsize == gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, gbp,
		    (char *)zio->io_data + loff, lsize, NULL, NULL,
		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
		    &zio->io_bookmark));
	}

	zio_push_transform(zio, gbh, gsize, gbufsize);

	return (zio_wait_for_children_ready(zio));
}

static int
zio_free_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_claim_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;
	int d;

	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

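/*
 * The full-size allocation failed, so split the write into a gang block:
 * allocate the gang header, then carve the data into smaller member
 * blocks (recursing if necessary) and issue a child write for each one.
 */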
static int
zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	spa_t *spa = zio->io_spa;
	zio_gbh_phys_t *gbh;
	uint64_t txg = zio->io_txg;
	uint64_t resid = zio->io_size;
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int ndvas = zio->io_ndvas;
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	int error;
	int i, d;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
	    B_FALSE);
	if (error) {
		zio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	for (d = 0; d < gbh_ndvas; d++)
		DVA_SET_GANG(&dva[d], 1);

	bp->blk_birth = txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = gbp->blk_dva;

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
			    txg, bp, B_FALSE);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			/* XXX - free up previous allocations? */
			if (maxalloc == SPA_MINBLOCKSIZE) {
				zio->io_error = error;
				return (ZIO_PIPELINE_CONTINUE);
			}
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = txg;
			zio_nowait(zio_rewrite(zio, spa, zio->io_checksum, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
			    &zio->io_bookmark));
		} else {
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);

	/*
	 * As much as we'd like this to be 'ready' instead of 'done',
	 * updating our ASIZE doesn't happen until the io_done callback,
	 * so we have to wait for that to finish in order for our BP
	 * to be stable.
	 */
	return (zio_wait_for_children_done(zio));
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa->spa_normal_class;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(BP_IS_HOLE(bp));
	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
	ASSERT3U(zio->io_ndvas, >, 0);
	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));

	/*
	 * For testing purposes, we force I/Os to retry.  We don't allow
	 * retries beyond the first pass since those I/Os are non-allocating
	 * writes.
	 */
	if (zio_io_fail_shift &&
	    spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite &&
	    zio_io_should_fail(zio_io_fail_shift))
		zio->io_flags |= ZIO_FLAG_WRITE_RETRY;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
	    zio->io_txg, NULL, B_FALSE);

	if (error == 0) {
		bp->blk_birth = zio->io_txg;
	} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
		return (zio_write_allocate_gang_members(zio, mc));
	} else {
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_free(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);

	BP_ZERO(bp);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_claim(zio_t *zio)
{
	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Read and write to physical devices
 * ==========================================================================
 */

static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd ? vd->vdev_top : NULL;
	blkptr_t *bp = zio->io_bp;
	uint64_t align;
	spa_t *spa = zio->io_spa;

	/*
	 * If the pool is already in a failure state then just suspend
	 * this I/O until the problem is resolved.  We will reissue it
	 * at that time.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
	    zio->io_type == ZIO_TYPE_WRITE)
		return (zio_vdev_suspend_io(zio));

	/*
	 * The mirror_ops handle multiple DVAs in a single BP
	 */
	if (vd == NULL)
		return (vdev_mirror_ops.vdev_op_io_start(zio));

	align = 1ULL << tvd->vdev_ashift;

	if (zio->io_retries == 0 && vd == tvd)
		zio->io_flags |= ZIO_FLAG_FAILFAST;

	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
		zio->io_flags |= ZIO_FLAG_PHYSICAL;
		zio->io_offset += VDEV_LABEL_START_SIZE;
	}

	if (P2PHASE(zio->io_size, align) != 0) {
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		char *abuf = zio_buf_alloc(asize);
		ASSERT(vd == tvd);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, asize);
		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
	}

	ASSERT(P2PHASE(zio->io_offset, align) == 0);
	ASSERT(P2PHASE(zio->io_size, align) == 0);
	ASSERT(bp == NULL ||
	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));

	return (vd->vdev_ops->vdev_op_io_start(zio));
}

static int
zio_vdev_io_done(zio_t *zio)
{
	if (zio->io_vd == NULL)
		return (vdev_mirror_ops.vdev_op_io_done(zio));

	return (zio->io_vd->vdev_ops->vdev_op_io_done(zio));
}

/* XXPOLICY */
boolean_t
zio_should_retry(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio->io_error == 0)
		return (B_FALSE);
	if (zio->io_delegate_list != NULL)
		return (B_FALSE);
	if (vd != NULL) {
		if (vd != vd->vdev_top)
			return (B_FALSE);
		if (vd->vdev_is_failing)
			return (B_FALSE);
	}
	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
		return (B_FALSE);
	if (zio->io_retries > 0)
		return (B_FALSE);

	return (B_TRUE);
}

static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd ? vd->vdev_top : NULL;

	ASSERT(zio->io_vsd == NULL);

	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
		void *abuf;
		uint64_t asize;
		ASSERT(vd == tvd);
		zio_pop_transform(zio, &abuf, &asize, &asize);
		if (zio->io_type == ZIO_TYPE_READ)
			bcopy(abuf, zio->io_data, zio->io_size);
		zio_buf_free(abuf, asize);
		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
	}

	if (zio_injection_enabled && !zio->io_error)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 */
	/* XXPOLICY */
	if (zio_should_retry(zio)) {
		ASSERT(tvd == vd);

		zio->io_retries++;
		zio->io_error = 0;
		zio->io_flags &= ZIO_FLAG_RETRY_INHERIT;
		/* XXPOLICY */
		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;

		return (ZIO_PIPELINE_CONTINUE);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage--;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage--;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
}

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static int
zio_checksum_generate(zio_t *zio)
{
	int checksum = zio->io_checksum;
	blkptr_t *bp = zio->io_bp;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_checksum_generate(zio_t *zio)
{
	zio_cksum_t zc;
	zio_gbh_phys_t *gbh = zio->io_data;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);

	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_checksum_verify(zio_t *zio)
{
	if (zio->io_bp != NULL) {
		zio->io_error = zio_checksum_error(zio);
		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
			    zio->io_spa, zio->io_vd, zio, 0, 0);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}

/*
 * Set the external verifier for a gang block based on stuff in the bp
 */
void
zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
{
	blkptr_t *bp = zio->io_bp;

	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
	zcp->zc_word[2] = bp->blk_birth;
	zcp->zc_word[3] = 0;
}

/*
 * ==========================================================================
 * Define the pipeline
 * ==========================================================================
 */
typedef int zio_pipe_stage_t(zio_t *zio);

zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	NULL,
	zio_wait_for_children_ready,
	zio_read_init,
	zio_issue_async,
	zio_write_compress,
	zio_checksum_generate,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_for_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_assess,
	zio_done,
	NULL
};

/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
void
zio_execute(zio_t *zio)
{
	while (zio->io_stage < ZIO_STAGE_DONE) {
		uint32_t pipeline = zio->io_pipeline;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));

		/*
		 * If an error occurred outside the vdev stack,
		 * just execute the interlock stages to clean up.
		 */
		if (zio->io_error &&
		    ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;

		while (((1U << ++zio->io_stage) & pipeline) == 0)
			continue;

		ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
		ASSERT(zio->io_stalled == 0);

		rv = zio_pipeline[zio->io_stage](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

static boolean_t
zio_io_should_fail(uint16_t range)
{
	static uint16_t allocs = 0;

	return (P2PHASE(allocs++, 1U << range) == 0);
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t txg)
{
	int error;

	spa_config_enter(spa, RW_READER, FTAG);

	if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) {
		spa_config_exit(spa, FTAG);
		return (ENOSPC);
	}

	/*
	 * We were passed the previous log block's DVA in bp->blk_dva[0].
	 * We use that as a hint for which vdev to allocate from next.
	 */
	error = metaslab_alloc(spa, spa->spa_log_class, size,
	    new_bp, 1, txg, old_bp, B_TRUE);

	if (error)
		error = metaslab_alloc(spa, spa->spa_normal_class, size,
		    new_bp, 1, txg, old_bp, B_TRUE);

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
		new_bp->blk_birth = txg;
	}

	spa_config_exit(spa, FTAG);

	return (error);
}

/*
 * Free an intent log block.  We know it can't be a gang block, so there's
 * nothing to do except metaslab_free() it.
 */
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
	ASSERT(!BP_IS_GANG(bp));

	spa_config_enter(spa, RW_READER, FTAG);

	metaslab_free(spa, bp, txg, B_FALSE);

	spa_config_exit(spa, FTAG);
}

/*
 * start an async flush of the write cache for this vdev
 */
void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
}