1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/fm/fs/zfs.h> 30 #include <sys/spa.h> 31 #include <sys/txg.h> 32 #include <sys/spa_impl.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/zio_impl.h> 35 #include <sys/zio_compress.h> 36 #include <sys/zio_checksum.h> 37 38 /* 39 * ========================================================================== 40 * I/O priority table 41 * ========================================================================== 42 */ 43 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44 0, /* ZIO_PRIORITY_NOW */ 45 0, /* ZIO_PRIORITY_SYNC_READ */ 46 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47 6, /* ZIO_PRIORITY_ASYNC_READ */ 48 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 49 4, /* ZIO_PRIORITY_FREE */ 50 0, /* ZIO_PRIORITY_CACHE_FILL */ 51 0, /* ZIO_PRIORITY_LOG_WRITE */ 52 10, /* ZIO_PRIORITY_RESILVER */ 53 20, /* ZIO_PRIORITY_SCRUB */ 54 }; 55 56 /* 57 * ========================================================================== 58 * I/O type descriptions 59 * ========================================================================== 60 */ 61 char *zio_type_name[ZIO_TYPES] = { 62 "null", "read", "write", "free", "claim", "ioctl" }; 63 64 /* Force an allocation failure when non-zero */ 65 uint16_t zio_zil_fail_shift = 0; 66 uint16_t zio_io_fail_shift = 0; 67 68 /* Enable/disable the write-retry logic */ 69 int zio_write_retry = 1; 70 71 /* Taskq to handle reissuing of I/Os */ 72 taskq_t *zio_taskq; 73 int zio_resume_threads = 4; 74 75 typedef struct zio_sync_pass { 76 int zp_defer_free; /* defer frees after this pass */ 77 int zp_dontcompress; /* don't compress after this pass */ 78 int zp_rewrite; /* rewrite new bps after this pass */ 79 } zio_sync_pass_t; 80 81 zio_sync_pass_t zio_sync_pass = { 82 1, /* zp_defer_free */ 83 4, /* zp_dontcompress */ 84 1, /* zp_rewrite */ 85 }; 86 87 static boolean_t zio_io_should_fail(uint16_t); 88 89 /* 90 * ========================================================================== 91 * I/O kmem caches 92 * ========================================================================== 93 */ 94 kmem_cache_t *zio_cache; 95 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 96 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 97 98 #ifdef _KERNEL 99 extern vmem_t *zio_alloc_arena; 100 #endif 101 102 /* 103 * Determine if we are allowed to issue the IO based on the 104 * pool state. If we must wait then block until we are told 105 * that we may continue. 106 */ 107 #define ZIO_ENTER(spa) { \ 108 if (spa->spa_state == POOL_STATE_IO_FAILURE) { \ 109 mutex_enter(&spa->spa_zio_lock); \ 110 while (spa->spa_state == POOL_STATE_IO_FAILURE) \ 111 cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock); \ 112 mutex_exit(&spa->spa_zio_lock); \ 113 } \ 114 } 115 116 /* 117 * An allocation zio is one that either currently has the DVA allocate 118 * stage set or will have it later in it's lifetime. 119 */ 120 #define IO_IS_ALLOCATING(zio) \ 121 ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) 122 123 void 124 zio_init(void) 125 { 126 size_t c; 127 vmem_t *data_alloc_arena = NULL; 128 129 #ifdef _KERNEL 130 data_alloc_arena = zio_alloc_arena; 131 #endif 132 133 zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, 134 NULL, NULL, NULL, NULL, NULL, 0); 135 136 /* 137 * For small buffers, we want a cache for each multiple of 138 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 139 * for each quarter-power of 2. For large buffers, we want 140 * a cache for each multiple of PAGESIZE. 141 */ 142 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 143 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 144 size_t p2 = size; 145 size_t align = 0; 146 147 while (p2 & (p2 - 1)) 148 p2 &= p2 - 1; 149 150 if (size <= 4 * SPA_MINBLOCKSIZE) { 151 align = SPA_MINBLOCKSIZE; 152 } else if (P2PHASE(size, PAGESIZE) == 0) { 153 align = PAGESIZE; 154 } else if (P2PHASE(size, p2 >> 2) == 0) { 155 align = p2 >> 2; 156 } 157 158 if (align != 0) { 159 char name[36]; 160 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 161 zio_buf_cache[c] = kmem_cache_create(name, size, 162 align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 163 164 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 165 zio_data_buf_cache[c] = kmem_cache_create(name, size, 166 align, NULL, NULL, NULL, NULL, data_alloc_arena, 167 KMC_NODEBUG); 168 169 } 170 } 171 172 while (--c != 0) { 173 ASSERT(zio_buf_cache[c] != NULL); 174 if (zio_buf_cache[c - 1] == NULL) 175 zio_buf_cache[c - 1] = zio_buf_cache[c]; 176 177 ASSERT(zio_data_buf_cache[c] != NULL); 178 if (zio_data_buf_cache[c - 1] == NULL) 179 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 180 } 181 182 zio_taskq = taskq_create("zio_taskq", zio_resume_threads, 183 maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 184 185 zio_inject_init(); 186 } 187 188 void 189 zio_fini(void) 190 { 191 size_t c; 192 kmem_cache_t *last_cache = NULL; 193 kmem_cache_t *last_data_cache = NULL; 194 195 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 196 if (zio_buf_cache[c] != last_cache) { 197 last_cache = zio_buf_cache[c]; 198 kmem_cache_destroy(zio_buf_cache[c]); 199 } 200 zio_buf_cache[c] = NULL; 201 202 if (zio_data_buf_cache[c] != last_data_cache) { 203 last_data_cache = zio_data_buf_cache[c]; 204 kmem_cache_destroy(zio_data_buf_cache[c]); 205 } 206 zio_data_buf_cache[c] = NULL; 207 } 208 209 taskq_destroy(zio_taskq); 210 211 kmem_cache_destroy(zio_cache); 212 213 zio_inject_fini(); 214 } 215 216 /* 217 * ========================================================================== 218 * Allocate and free I/O buffers 219 * ========================================================================== 220 */ 221 222 /* 223 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 224 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 225 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 226 * excess / transient data in-core during a crashdump. 227 */ 228 void * 229 zio_buf_alloc(size_t size) 230 { 231 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 232 233 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 234 235 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 236 } 237 238 /* 239 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 240 * crashdump if the kernel panics. This exists so that we will limit the amount 241 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 242 * of kernel heap dumped to disk when the kernel panics) 243 */ 244 void * 245 zio_data_buf_alloc(size_t size) 246 { 247 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 248 249 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 250 251 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 252 } 253 254 void 255 zio_buf_free(void *buf, size_t size) 256 { 257 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 258 259 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 260 261 kmem_cache_free(zio_buf_cache[c], buf); 262 } 263 264 void 265 zio_data_buf_free(void *buf, size_t size) 266 { 267 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 268 269 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 270 271 kmem_cache_free(zio_data_buf_cache[c], buf); 272 } 273 274 /* 275 * ========================================================================== 276 * Push and pop I/O transform buffers 277 * ========================================================================== 278 */ 279 static void 280 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize) 281 { 282 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 283 284 zt->zt_data = data; 285 zt->zt_size = size; 286 zt->zt_bufsize = bufsize; 287 288 zt->zt_next = zio->io_transform_stack; 289 zio->io_transform_stack = zt; 290 291 zio->io_data = data; 292 zio->io_size = size; 293 } 294 295 static void 296 zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize) 297 { 298 zio_transform_t *zt = zio->io_transform_stack; 299 300 *data = zt->zt_data; 301 *size = zt->zt_size; 302 *bufsize = zt->zt_bufsize; 303 304 zio->io_transform_stack = zt->zt_next; 305 kmem_free(zt, sizeof (zio_transform_t)); 306 307 if ((zt = zio->io_transform_stack) != NULL) { 308 zio->io_data = zt->zt_data; 309 zio->io_size = zt->zt_size; 310 } 311 } 312 313 static void 314 zio_clear_transform_stack(zio_t *zio) 315 { 316 void *data; 317 uint64_t size, bufsize; 318 319 ASSERT(zio->io_transform_stack != NULL); 320 321 zio_pop_transform(zio, &data, &size, &bufsize); 322 while (zio->io_transform_stack != NULL) { 323 zio_buf_free(data, bufsize); 324 zio_pop_transform(zio, &data, &size, &bufsize); 325 } 326 } 327 328 /* 329 * ========================================================================== 330 * Create the various types of I/O (read, write, free) 331 * ========================================================================== 332 */ 333 static zio_t * 334 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 335 void *data, uint64_t size, zio_done_func_t *done, void *private, 336 zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline) 337 { 338 zio_t *zio; 339 340 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 341 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 342 343 /* Only we should set CONFIG_GRABBED */ 344 ASSERT(!(flags & ZIO_FLAG_CONFIG_GRABBED)); 345 346 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 347 bzero(zio, sizeof (zio_t)); 348 zio->io_parent = pio; 349 zio->io_spa = spa; 350 zio->io_txg = txg; 351 zio->io_flags = flags; 352 if (bp != NULL) { 353 zio->io_bp = bp; 354 zio->io_bp_copy = *bp; 355 zio->io_bp_orig = *bp; 356 } 357 zio->io_done = done; 358 zio->io_private = private; 359 zio->io_type = type; 360 zio->io_priority = priority; 361 zio->io_stage = stage; 362 zio->io_pipeline = pipeline; 363 zio->io_timestamp = lbolt64; 364 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 365 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 366 zio_push_transform(zio, data, size, size); 367 368 /* 369 * Note on config lock: 370 * 371 * If CONFIG_HELD is set, then the caller already has the config 372 * lock, so we don't need it for this io. 373 * 374 * We set CONFIG_GRABBED to indicate that we have grabbed the 375 * config lock on behalf of this io, so it should be released 376 * in zio_done. 377 * 378 * Unless CONFIG_HELD is set, we will grab the config lock for 379 * any top-level (parent-less) io, *except* NULL top-level ios. 380 * The NULL top-level ios rarely have any children, so we delay 381 * grabbing the lock until the first child is added (but it is 382 * still grabbed on behalf of the top-level i/o, so additional 383 * children don't need to also grab it). This greatly reduces 384 * contention on the config lock. 385 */ 386 if (pio == NULL) { 387 if (type != ZIO_TYPE_NULL && 388 !(flags & ZIO_FLAG_CONFIG_HELD)) { 389 spa_config_enter(spa, RW_READER, zio); 390 zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 391 } 392 zio->io_root = zio; 393 } else { 394 zio->io_root = pio->io_root; 395 if (!(flags & ZIO_FLAG_NOBOOKMARK)) 396 zio->io_logical = pio->io_logical; 397 mutex_enter(&pio->io_lock); 398 if (pio->io_parent == NULL && 399 pio->io_type == ZIO_TYPE_NULL && 400 !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) && 401 !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) { 402 pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED; 403 spa_config_enter(spa, RW_READER, pio); 404 } 405 if (stage < ZIO_STAGE_READY) 406 pio->io_children_notready++; 407 pio->io_children_notdone++; 408 zio->io_sibling_next = pio->io_child; 409 zio->io_sibling_prev = NULL; 410 if (pio->io_child != NULL) 411 pio->io_child->io_sibling_prev = zio; 412 pio->io_child = zio; 413 zio->io_ndvas = pio->io_ndvas; 414 mutex_exit(&pio->io_lock); 415 } 416 417 /* 418 * Save off the original state incase we need to retry later. 419 */ 420 zio->io_orig_stage = zio->io_stage; 421 zio->io_orig_pipeline = zio->io_pipeline; 422 zio->io_orig_flags = zio->io_flags; 423 424 /* 425 * If this is not a null zio, and config is not already held, 426 * then the root zio should have grabbed the config lock. 427 * If this is not a root zio, it should not have grabbed the 428 * config lock. 429 */ 430 ASSERT((zio->io_root->io_flags & ZIO_FLAG_CONFIG_HELD) || 431 zio->io_type == ZIO_TYPE_NULL || 432 (zio->io_root->io_flags & ZIO_FLAG_CONFIG_GRABBED)); 433 ASSERT(zio->io_root == zio || 434 !(zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)); 435 436 return (zio); 437 } 438 439 static void 440 zio_reset(zio_t *zio) 441 { 442 zio_clear_transform_stack(zio); 443 444 zio->io_flags = zio->io_orig_flags; 445 zio->io_stage = zio->io_orig_stage; 446 zio->io_pipeline = zio->io_orig_pipeline; 447 zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size); 448 } 449 450 zio_t * 451 zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, 452 int flags) 453 { 454 zio_t *zio; 455 456 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 457 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN, 458 ZIO_WAIT_FOR_CHILDREN_PIPELINE); 459 460 return (zio); 461 } 462 463 zio_t * 464 zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) 465 { 466 return (zio_null(NULL, spa, done, private, flags)); 467 } 468 469 zio_t * 470 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, 471 uint64_t size, zio_done_func_t *done, void *private, 472 int priority, int flags, const zbookmark_t *zb) 473 { 474 zio_t *zio; 475 476 ASSERT3U(size, ==, BP_GET_LSIZE(bp)); 477 478 /* 479 * If the user has specified that we allow I/Os to continue 480 * then attempt to satisfy the read. 481 */ 482 if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 483 ZIO_ENTER(spa); 484 485 zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, 486 data, size, done, private, 487 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER, 488 ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); 489 zio->io_bookmark = *zb; 490 491 zio->io_logical = zio; 492 493 /* 494 * Work off our copy of the bp so the caller can free it. 495 */ 496 zio->io_bp = &zio->io_bp_copy; 497 498 return (zio); 499 } 500 501 zio_t * 502 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 503 uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 504 zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority, 505 int flags, const zbookmark_t *zb) 506 { 507 zio_t *zio; 508 509 ASSERT(checksum >= ZIO_CHECKSUM_OFF && 510 checksum < ZIO_CHECKSUM_FUNCTIONS); 511 512 ASSERT(compress >= ZIO_COMPRESS_OFF && 513 compress < ZIO_COMPRESS_FUNCTIONS); 514 515 ZIO_ENTER(spa); 516 517 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 518 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 519 ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); 520 521 zio->io_ready = ready; 522 523 zio->io_bookmark = *zb; 524 525 zio->io_logical = zio; 526 527 zio->io_checksum = checksum; 528 zio->io_compress = compress; 529 zio->io_ndvas = ncopies; 530 531 if (bp->blk_birth != txg) { 532 /* XXX the bp usually (always?) gets re-zeroed later */ 533 BP_ZERO(bp); 534 BP_SET_LSIZE(bp, size); 535 BP_SET_PSIZE(bp, size); 536 } else { 537 /* Make sure someone doesn't change their mind on overwrites */ 538 ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp), 539 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 540 } 541 542 return (zio); 543 } 544 545 zio_t * 546 zio_rewrite(zio_t *pio, spa_t *spa, int checksum, blkptr_t *bp, void *data, 547 uint64_t size, zio_done_func_t *done, void *private, int priority, 548 int flags, zbookmark_t *zb) 549 { 550 zio_t *zio; 551 552 zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, 553 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER, 554 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp)); 555 556 zio->io_bookmark = *zb; 557 zio->io_checksum = checksum; 558 zio->io_compress = ZIO_COMPRESS_OFF; 559 560 if (pio != NULL) 561 ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 562 563 return (zio); 564 } 565 566 static void 567 zio_write_allocate_ready(zio_t *zio) 568 { 569 /* Free up the previous block */ 570 if (!BP_IS_HOLE(&zio->io_bp_orig)) { 571 zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 572 &zio->io_bp_orig, NULL, NULL)); 573 } 574 } 575 576 static zio_t * 577 zio_write_allocate(zio_t *pio, spa_t *spa, int checksum, 578 uint64_t txg, blkptr_t *bp, void *data, uint64_t size, 579 zio_done_func_t *done, void *private, int priority, int flags) 580 { 581 zio_t *zio; 582 583 BP_ZERO(bp); 584 BP_SET_LSIZE(bp, size); 585 BP_SET_PSIZE(bp, size); 586 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 587 588 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 589 ZIO_TYPE_WRITE, priority, flags, 590 ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE); 591 592 zio->io_checksum = checksum; 593 zio->io_compress = ZIO_COMPRESS_OFF; 594 zio->io_ready = zio_write_allocate_ready; 595 596 return (zio); 597 } 598 599 zio_t * 600 zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 601 zio_done_func_t *done, void *private) 602 { 603 zio_t *zio; 604 605 ASSERT(!BP_IS_HOLE(bp)); 606 607 if (txg == spa->spa_syncing_txg && 608 spa->spa_sync_pass > zio_sync_pass.zp_defer_free) { 609 bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); 610 return (zio_null(pio, spa, NULL, NULL, 0)); 611 } 612 613 zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 614 ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER, 615 ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp)); 616 617 zio->io_bp = &zio->io_bp_copy; 618 619 return (zio); 620 } 621 622 zio_t * 623 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 624 zio_done_func_t *done, void *private) 625 { 626 zio_t *zio; 627 628 /* 629 * A claim is an allocation of a specific block. Claims are needed 630 * to support immediate writes in the intent log. The issue is that 631 * immediate writes contain committed data, but in a txg that was 632 * *not* committed. Upon opening the pool after an unclean shutdown, 633 * the intent log claims all blocks that contain immediate write data 634 * so that the SPA knows they're in use. 635 * 636 * All claims *must* be resolved in the first txg -- before the SPA 637 * starts allocating blocks -- so that nothing is allocated twice. 638 */ 639 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 640 ASSERT3U(spa_first_txg(spa), <=, txg); 641 642 zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private, 643 ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0, 644 ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp)); 645 646 zio->io_bp = &zio->io_bp_copy; 647 648 return (zio); 649 } 650 651 zio_t * 652 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 653 zio_done_func_t *done, void *private, int priority, int flags) 654 { 655 zio_t *zio; 656 int c; 657 658 if (vd->vdev_children == 0) { 659 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 660 ZIO_TYPE_IOCTL, priority, flags, 661 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 662 663 zio->io_vd = vd; 664 zio->io_cmd = cmd; 665 } else { 666 zio = zio_null(pio, spa, NULL, NULL, flags); 667 668 for (c = 0; c < vd->vdev_children; c++) 669 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 670 done, private, priority, flags)); 671 } 672 673 return (zio); 674 } 675 676 static void 677 zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, 678 int checksum, boolean_t labels) 679 { 680 ASSERT(vd->vdev_children == 0); 681 682 ASSERT(size <= SPA_MAXBLOCKSIZE); 683 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 684 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 685 686 #ifdef ZFS_DEBUG 687 if (labels) { 688 ASSERT(offset + size <= VDEV_LABEL_START_SIZE || 689 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 690 } 691 #endif 692 ASSERT3U(offset + size, <=, vd->vdev_psize); 693 694 BP_ZERO(bp); 695 696 BP_SET_LSIZE(bp, size); 697 BP_SET_PSIZE(bp, size); 698 699 BP_SET_CHECKSUM(bp, checksum); 700 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 701 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 702 703 if (checksum != ZIO_CHECKSUM_OFF) 704 ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0); 705 } 706 707 zio_t * 708 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 709 void *data, int checksum, zio_done_func_t *done, void *private, 710 int priority, int flags, boolean_t labels) 711 { 712 zio_t *zio; 713 blkptr_t blk; 714 715 ZIO_ENTER(vd->vdev_spa); 716 717 zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 718 719 zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 720 ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, 721 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 722 723 zio->io_vd = vd; 724 zio->io_offset = offset; 725 726 /* 727 * Work off our copy of the bp so the caller can free it. 728 */ 729 zio->io_bp = &zio->io_bp_copy; 730 731 return (zio); 732 } 733 734 zio_t * 735 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 736 void *data, int checksum, zio_done_func_t *done, void *private, 737 int priority, int flags, boolean_t labels) 738 { 739 zio_block_tail_t *zbt; 740 void *wbuf; 741 zio_t *zio; 742 blkptr_t blk; 743 744 ZIO_ENTER(vd->vdev_spa); 745 746 zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); 747 748 zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, 749 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, 750 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 751 752 zio->io_vd = vd; 753 zio->io_offset = offset; 754 755 zio->io_bp = &zio->io_bp_copy; 756 zio->io_checksum = checksum; 757 758 if (zio_checksum_table[checksum].ci_zbt) { 759 /* 760 * zbt checksums are necessarily destructive -- they modify 761 * one word of the write buffer to hold the verifier/checksum. 762 * Therefore, we must make a local copy in case the data is 763 * being written to multiple places. 764 */ 765 wbuf = zio_buf_alloc(size); 766 bcopy(data, wbuf, size); 767 zio_push_transform(zio, wbuf, size, size); 768 769 zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1; 770 zbt->zbt_cksum = blk.blk_cksum; 771 } 772 773 return (zio); 774 } 775 776 /* 777 * Create a child I/O to do some work for us. It has no associated bp. 778 */ 779 zio_t * 780 zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 781 void *data, uint64_t size, int type, int priority, int flags, 782 zio_done_func_t *done, void *private) 783 { 784 uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; 785 zio_t *cio; 786 787 if (type == ZIO_TYPE_READ && bp != NULL) { 788 /* 789 * If we have the bp, then the child should perform the 790 * checksum and the parent need not. This pushes error 791 * detection as close to the leaves as possible and 792 * eliminates redundant checksums in the interior nodes. 793 */ 794 pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; 795 zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 796 } 797 798 cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size, 799 done, private, type, priority, 800 (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags, 801 ZIO_STAGE_VDEV_IO_START - 1, pipeline); 802 803 cio->io_vd = vd; 804 cio->io_offset = offset; 805 806 return (cio); 807 } 808 809 /* 810 * ========================================================================== 811 * Initiate I/O, either sync or async 812 * ========================================================================== 813 */ 814 static void 815 zio_destroy(zio_t *zio) 816 { 817 mutex_destroy(&zio->io_lock); 818 cv_destroy(&zio->io_cv); 819 if (zio->io_failed_vds != NULL) { 820 kmem_free(zio->io_failed_vds, 821 zio->io_failed_vds_count * sizeof (vdev_t *)); 822 zio->io_failed_vds = NULL; 823 zio->io_failed_vds_count = 0; 824 } 825 kmem_cache_free(zio_cache, zio); 826 } 827 828 int 829 zio_wait(zio_t *zio) 830 { 831 int error; 832 833 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 834 835 zio->io_waiter = curthread; 836 837 zio_execute(zio); 838 839 mutex_enter(&zio->io_lock); 840 while (zio->io_stalled != ZIO_STAGE_DONE) 841 cv_wait(&zio->io_cv, &zio->io_lock); 842 mutex_exit(&zio->io_lock); 843 844 error = zio->io_error; 845 zio_destroy(zio); 846 847 return (error); 848 } 849 850 void 851 zio_nowait(zio_t *zio) 852 { 853 zio_execute(zio); 854 } 855 856 void 857 zio_interrupt(zio_t *zio) 858 { 859 (void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type], 860 (task_func_t *)zio_execute, zio, TQ_SLEEP); 861 } 862 863 static int 864 zio_issue_async(zio_t *zio) 865 { 866 (void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type], 867 (task_func_t *)zio_execute, zio, TQ_SLEEP); 868 869 return (ZIO_PIPELINE_STOP); 870 } 871 872 /* 873 * ========================================================================== 874 * I/O pipeline interlocks: parent/child dependency scoreboarding 875 * ========================================================================== 876 */ 877 static int 878 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) 879 { 880 int rv = ZIO_PIPELINE_CONTINUE; 881 882 mutex_enter(&zio->io_lock); 883 ASSERT(zio->io_stalled == 0); 884 if (*countp != 0) { 885 zio->io_stalled = stage; 886 rv = ZIO_PIPELINE_STOP; 887 } 888 mutex_exit(&zio->io_lock); 889 890 return (rv); 891 } 892 893 static void 894 zio_add_failed_vdev(zio_t *pio, zio_t *zio) 895 { 896 uint64_t oldcount = pio->io_failed_vds_count; 897 vdev_t **new_vds; 898 int i; 899 900 ASSERT(MUTEX_HELD(&pio->io_lock)); 901 902 if (zio->io_vd == NULL) 903 return; 904 905 for (i = 0; i < oldcount; i++) { 906 if (pio->io_failed_vds[i] == zio->io_vd) 907 return; 908 } 909 910 new_vds = kmem_zalloc((oldcount + 1) * sizeof (vdev_t *), KM_SLEEP); 911 if (pio->io_failed_vds != NULL) { 912 bcopy(pio->io_failed_vds, new_vds, 913 oldcount * sizeof (vdev_t *)); 914 kmem_free(pio->io_failed_vds, oldcount * sizeof (vdev_t *)); 915 } 916 pio->io_failed_vds = new_vds; 917 pio->io_failed_vds[oldcount] = zio->io_vd; 918 pio->io_failed_vds_count++; 919 } 920 921 static void 922 zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) 923 { 924 zio_t *pio = zio->io_parent; 925 926 mutex_enter(&pio->io_lock); 927 if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) { 928 pio->io_error = zio->io_error; 929 if (zio->io_error && zio->io_error != ENOTSUP) 930 zio_add_failed_vdev(pio, zio); 931 } 932 ASSERT3U(*countp, >, 0); 933 if (--*countp == 0 && pio->io_stalled == stage) { 934 pio->io_stalled = 0; 935 mutex_exit(&pio->io_lock); 936 zio_execute(pio); 937 } else { 938 mutex_exit(&pio->io_lock); 939 } 940 } 941 942 int 943 zio_wait_for_children_ready(zio_t *zio) 944 { 945 return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, 946 &zio->io_children_notready)); 947 } 948 949 int 950 zio_wait_for_children_done(zio_t *zio) 951 { 952 return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, 953 &zio->io_children_notdone)); 954 } 955 956 static int 957 zio_read_init(zio_t *zio) 958 { 959 blkptr_t *bp = zio->io_bp; 960 961 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { 962 uint64_t csize = BP_GET_PSIZE(bp); 963 void *cbuf = zio_buf_alloc(csize); 964 965 zio_push_transform(zio, cbuf, csize, csize); 966 zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS; 967 } 968 969 if (BP_IS_GANG(bp)) { 970 uint64_t gsize = SPA_GANGBLOCKSIZE; 971 void *gbuf = zio_buf_alloc(gsize); 972 973 zio_push_transform(zio, gbuf, gsize, gsize); 974 zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS; 975 } 976 977 if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) 978 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 979 980 return (ZIO_PIPELINE_CONTINUE); 981 } 982 983 static int 984 zio_ready(zio_t *zio) 985 { 986 zio_t *pio = zio->io_parent; 987 988 if (zio->io_ready) 989 zio->io_ready(zio); 990 991 if (pio != NULL) 992 zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY, 993 &pio->io_children_notready); 994 995 if (zio->io_bp) 996 zio->io_bp_copy = *zio->io_bp; 997 998 return (ZIO_PIPELINE_CONTINUE); 999 } 1000 1001 static int 1002 zio_vdev_retry_io(zio_t *zio) 1003 { 1004 zio_t *pio = zio->io_parent; 1005 1006 /* 1007 * Preserve the failed bp so that the io_ready() callback can 1008 * update the accounting accordingly. The callback will also be 1009 * responsible for freeing the previously allocated block, if one 1010 * exists. 1011 */ 1012 zio->io_bp_orig = *zio->io_bp; 1013 1014 /* 1015 * We must zero out the old DVA and blk_birth before reallocating 1016 * the bp. 1017 */ 1018 BP_ZERO_DVAS(zio->io_bp); 1019 zio_reset(zio); 1020 1021 if (pio) { 1022 /* 1023 * Let the parent know that we will 1024 * re-alloc the write (=> new bp info). 1025 */ 1026 mutex_enter(&pio->io_lock); 1027 pio->io_children_notready++; 1028 1029 /* 1030 * If the parent I/O is still in the open stage, then 1031 * don't bother telling it to retry since it hasn't 1032 * progressed far enough for it to care. 1033 */ 1034 if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio)) 1035 pio->io_flags |= ZIO_FLAG_WRITE_RETRY; 1036 1037 ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE); 1038 mutex_exit(&pio->io_lock); 1039 } 1040 1041 /* 1042 * We are getting ready to process the retry request so clear 1043 * the flag and the zio's current error status. 1044 */ 1045 zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY; 1046 zio->io_error = 0; 1047 1048 return (ZIO_PIPELINE_CONTINUE); 1049 } 1050 1051 int 1052 zio_vdev_resume_io(spa_t *spa) 1053 { 1054 zio_t *zio; 1055 1056 mutex_enter(&spa->spa_zio_lock); 1057 1058 /* 1059 * Probe all of vdevs that have experienced an I/O error. 1060 * If we are still unable to verify the integrity of the vdev 1061 * then we prevent the resume from proceeeding. 1062 */ 1063 for (zio = list_head(&spa->spa_zio_list); zio != NULL; 1064 zio = list_next(&spa->spa_zio_list, zio)) { 1065 int error = 0; 1066 1067 /* We only care about I/Os that must succeed */ 1068 if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL) 1069 continue; 1070 error = vdev_probe(zio->io_vd); 1071 if (error) { 1072 mutex_exit(&spa->spa_zio_lock); 1073 return (error); 1074 } 1075 } 1076 1077 /* 1078 * Clear the vdev stats so that I/O can flow. 1079 */ 1080 vdev_clear(spa, NULL, B_FALSE); 1081 1082 spa->spa_state = POOL_STATE_ACTIVE; 1083 while ((zio = list_head(&spa->spa_zio_list)) != NULL) { 1084 list_remove(&spa->spa_zio_list, zio); 1085 zio->io_error = 0; 1086 1087 /* 1088 * If we are resuming an allocating I/O then we force it 1089 * to retry and let it resume operation where it left off. 1090 * Otherwise, go back to the ready stage and pick up from 1091 * there. 1092 */ 1093 if (zio_write_retry && IO_IS_ALLOCATING(zio)) { 1094 zio->io_flags |= ZIO_FLAG_WRITE_RETRY; 1095 zio->io_stage--; 1096 } else { 1097 zio->io_stage = ZIO_STAGE_READY; 1098 } 1099 1100 (void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute, 1101 zio, TQ_SLEEP); 1102 } 1103 mutex_exit(&spa->spa_zio_lock); 1104 1105 /* 1106 * Wait for the taskqs to finish and recheck the pool state since 1107 * it's possible that a resumed I/O has failed again. 1108 */ 1109 taskq_wait(zio_taskq); 1110 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 1111 return (EIO); 1112 1113 mutex_enter(&spa->spa_zio_lock); 1114 cv_broadcast(&spa->spa_zio_cv); 1115 mutex_exit(&spa->spa_zio_lock); 1116 1117 return (0); 1118 } 1119 1120 static int 1121 zio_vdev_suspend_io(zio_t *zio) 1122 { 1123 spa_t *spa = zio->io_spa; 1124 1125 /* 1126 * We've experienced an unrecoverable failure so 1127 * set the pool state accordingly and queue all 1128 * failed IOs. 1129 */ 1130 spa->spa_state = POOL_STATE_IO_FAILURE; 1131 1132 mutex_enter(&spa->spa_zio_lock); 1133 list_insert_tail(&spa->spa_zio_list, zio); 1134 1135 #ifndef _KERNEL 1136 /* Used to notify ztest that the pool has suspended */ 1137 cv_broadcast(&spa->spa_zio_cv); 1138 #endif 1139 mutex_exit(&spa->spa_zio_lock); 1140 1141 return (ZIO_PIPELINE_STOP); 1142 } 1143 1144 static void 1145 zio_handle_io_failure(zio_t *zio, vdev_t *vd) 1146 { 1147 spa_t *spa = zio->io_spa; 1148 blkptr_t *bp = zio->io_bp; 1149 char *blkbuf; 1150 1151 #ifdef ZFS_DEBUG 1152 blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP); 1153 if (blkbuf) { 1154 sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, 1155 bp ? bp : &zio->io_bp_copy); 1156 } 1157 cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p %s): error %d", 1158 zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", 1159 zio_type_name[zio->io_type], vdev_description(vd), 1160 (u_longlong_t)zio->io_offset, (void *)zio, 1161 blkbuf ? blkbuf : "", zio->io_error); 1162 if (blkbuf) 1163 kmem_free(blkbuf, BP_SPRINTF_LEN); 1164 #endif 1165 1166 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) { 1167 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1168 "failure and the failure mode property for this pool " 1169 "is set to panic.", spa_name(spa)); 1170 } 1171 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1172 vdev_set_state(vd, vd == spa->spa_root_vdev ? B_TRUE : B_FALSE, 1173 VDEV_STATE_FAULTED, VDEV_AUX_IO_FAILURE); 1174 } 1175 1176 static int 1177 zio_assess(zio_t *zio) 1178 { 1179 spa_t *spa = zio->io_spa; 1180 blkptr_t *bp = zio->io_bp; 1181 vdev_t *vd = zio->io_vd; 1182 1183 ASSERT(zio->io_children_notready == 0); 1184 ASSERT(zio->io_children_notdone == 0); 1185 1186 if (bp != NULL) { 1187 ASSERT(bp->blk_pad[0] == 0); 1188 ASSERT(bp->blk_pad[1] == 0); 1189 ASSERT(bp->blk_pad[2] == 0); 1190 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0); 1191 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 1192 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 1193 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 1194 if (zio->io_ndvas != 0) 1195 ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp)); 1196 ASSERT(BP_COUNT_GANG(bp) == 0 || 1197 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 1198 } 1199 } 1200 1201 /* 1202 * Some child I/O has indicated that a retry is necessary, so 1203 * we set an error on the I/O and let the logic below do the 1204 * rest. 1205 */ 1206 if (zio->io_flags & ZIO_FLAG_WRITE_RETRY) 1207 zio->io_error = ERESTART; 1208 1209 if (vd != NULL) 1210 vdev_stat_update(zio); 1211 1212 if (zio->io_error) { 1213 /* 1214 * If this I/O is attached to a particular vdev, 1215 * generate an error message describing the I/O failure 1216 * at the block level. We ignore these errors if the 1217 * device is currently unavailable. 1218 */ 1219 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 1220 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 1221 1222 if ((zio->io_error == EIO || 1223 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && 1224 zio->io_logical == zio) { 1225 /* 1226 * For root I/O requests, tell the SPA to log the error 1227 * appropriately. Also, generate a logical data 1228 * ereport. 1229 */ 1230 spa_log_error(spa, zio); 1231 1232 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 1233 0, 0); 1234 } 1235 1236 /* 1237 * If we are an allocating I/O then we attempt to reissue 1238 * the I/O on another vdev unless the pool is out of space. 1239 * We handle this condition based on the spa's failmode 1240 * property. 1241 */ 1242 if (zio_write_retry && zio->io_error != ENOSPC && 1243 IO_IS_ALLOCATING(zio)) 1244 return (zio_vdev_retry_io(zio)); 1245 1246 ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); 1247 1248 /* 1249 * For I/O requests that cannot fail, we carry out 1250 * the requested behavior based on the failmode pool 1251 * property. 1252 * 1253 * XXX - Need to differentiate between an ENOSPC as 1254 * a result of vdev failures vs. a full pool. 1255 */ 1256 if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { 1257 int i; 1258 1259 for (i = 0; i < zio->io_failed_vds_count; i++) { 1260 zio_handle_io_failure(zio, 1261 zio->io_failed_vds[i]); 1262 } 1263 if (zio->io_failed_vds_count == 0) { 1264 zio_handle_io_failure(zio, 1265 vd ? vd : spa->spa_root_vdev); 1266 } 1267 if (zio->io_failed_vds != NULL) { 1268 kmem_free(zio->io_failed_vds, 1269 zio->io_failed_vds_count * 1270 sizeof (vdev_t *)); 1271 zio->io_failed_vds = NULL; 1272 zio->io_failed_vds_count = 0; 1273 } 1274 return (zio_vdev_suspend_io(zio)); 1275 } 1276 } 1277 ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY)); 1278 ASSERT(zio->io_children_notready == 0); 1279 1280 return (ZIO_PIPELINE_CONTINUE); 1281 } 1282 1283 static int 1284 zio_done(zio_t *zio) 1285 { 1286 zio_t *pio = zio->io_parent; 1287 spa_t *spa = zio->io_spa; 1288 1289 ASSERT(zio->io_children_notready == 0); 1290 ASSERT(zio->io_children_notdone == 0); 1291 1292 zio_clear_transform_stack(zio); 1293 1294 if (zio->io_done) 1295 zio->io_done(zio); 1296 1297 ASSERT(zio->io_delegate_list == NULL); 1298 ASSERT(zio->io_delegate_next == NULL); 1299 1300 if (pio != NULL) { 1301 zio_t *next, *prev; 1302 1303 mutex_enter(&pio->io_lock); 1304 next = zio->io_sibling_next; 1305 prev = zio->io_sibling_prev; 1306 if (next != NULL) 1307 next->io_sibling_prev = prev; 1308 if (prev != NULL) 1309 prev->io_sibling_next = next; 1310 if (pio->io_child == zio) 1311 pio->io_child = next; 1312 mutex_exit(&pio->io_lock); 1313 1314 zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE, 1315 &pio->io_children_notdone); 1316 } 1317 1318 /* 1319 * Note: this I/O is now done, and will shortly be freed, so there is no 1320 * need to clear this (or any other) flag. 1321 */ 1322 if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED) 1323 spa_config_exit(spa, zio); 1324 1325 if (zio->io_waiter != NULL) { 1326 mutex_enter(&zio->io_lock); 1327 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1328 zio->io_stalled = zio->io_stage; 1329 cv_broadcast(&zio->io_cv); 1330 mutex_exit(&zio->io_lock); 1331 } else { 1332 zio_destroy(zio); 1333 } 1334 1335 return (ZIO_PIPELINE_STOP); 1336 } 1337 1338 /* 1339 * ========================================================================== 1340 * Compression support 1341 * ========================================================================== 1342 */ 1343 static int 1344 zio_write_compress(zio_t *zio) 1345 { 1346 int compress = zio->io_compress; 1347 blkptr_t *bp = zio->io_bp; 1348 void *cbuf; 1349 uint64_t lsize = zio->io_size; 1350 uint64_t csize = lsize; 1351 uint64_t cbufsize = 0; 1352 int pass; 1353 1354 if (bp->blk_birth == zio->io_txg) { 1355 /* 1356 * We're rewriting an existing block, which means we're 1357 * working on behalf of spa_sync(). For spa_sync() to 1358 * converge, it must eventually be the case that we don't 1359 * have to allocate new blocks. But compression changes 1360 * the blocksize, which forces a reallocate, and makes 1361 * convergence take longer. Therefore, after the first 1362 * few passes, stop compressing to ensure convergence. 1363 */ 1364 pass = spa_sync_pass(zio->io_spa); 1365 if (pass > zio_sync_pass.zp_dontcompress) 1366 compress = ZIO_COMPRESS_OFF; 1367 } else { 1368 ASSERT(BP_IS_HOLE(bp)); 1369 pass = 1; 1370 } 1371 1372 if (compress != ZIO_COMPRESS_OFF) 1373 if (!zio_compress_data(compress, zio->io_data, zio->io_size, 1374 &cbuf, &csize, &cbufsize)) 1375 compress = ZIO_COMPRESS_OFF; 1376 1377 if (compress != ZIO_COMPRESS_OFF && csize != 0) 1378 zio_push_transform(zio, cbuf, csize, cbufsize); 1379 1380 /* 1381 * The final pass of spa_sync() must be all rewrites, but the first 1382 * few passes offer a trade-off: allocating blocks defers convergence, 1383 * but newly allocated blocks are sequential, so they can be written 1384 * to disk faster. Therefore, we allow the first few passes of 1385 * spa_sync() to reallocate new blocks, but force rewrites after that. 1386 * There should only be a handful of blocks after pass 1 in any case. 1387 */ 1388 if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && 1389 pass > zio_sync_pass.zp_rewrite) { 1390 ASSERT(csize != 0); 1391 BP_SET_LSIZE(bp, lsize); 1392 BP_SET_COMPRESS(bp, compress); 1393 zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp); 1394 } else { 1395 if (bp->blk_birth == zio->io_txg) 1396 BP_ZERO(bp); 1397 if (csize == 0) { 1398 BP_ZERO(bp); 1399 zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE; 1400 } else { 1401 ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 1402 BP_SET_LSIZE(bp, lsize); 1403 BP_SET_PSIZE(bp, csize); 1404 BP_SET_COMPRESS(bp, compress); 1405 } 1406 } 1407 1408 return (ZIO_PIPELINE_CONTINUE); 1409 } 1410 1411 static int 1412 zio_read_decompress(zio_t *zio) 1413 { 1414 blkptr_t *bp = zio->io_bp; 1415 void *data; 1416 uint64_t size; 1417 uint64_t bufsize; 1418 int compress = BP_GET_COMPRESS(bp); 1419 1420 ASSERT(compress != ZIO_COMPRESS_OFF); 1421 1422 zio_pop_transform(zio, &data, &size, &bufsize); 1423 1424 if (zio_decompress_data(compress, data, size, 1425 zio->io_data, zio->io_size)) 1426 zio->io_error = EIO; 1427 1428 zio_buf_free(data, bufsize); 1429 1430 return (ZIO_PIPELINE_CONTINUE); 1431 } 1432 1433 /* 1434 * ========================================================================== 1435 * Gang block support 1436 * ========================================================================== 1437 */ 1438 static void 1439 zio_gang_byteswap(zio_t *zio) 1440 { 1441 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1442 1443 if (BP_SHOULD_BYTESWAP(zio->io_bp)) 1444 byteswap_uint64_array(zio->io_data, zio->io_size); 1445 } 1446 1447 static int 1448 zio_get_gang_header(zio_t *zio) 1449 { 1450 blkptr_t *bp = zio->io_bp; 1451 uint64_t gsize = SPA_GANGBLOCKSIZE; 1452 void *gbuf = zio_buf_alloc(gsize); 1453 1454 ASSERT(BP_IS_GANG(bp)); 1455 1456 zio_push_transform(zio, gbuf, gsize, gsize); 1457 1458 zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize, 1459 NULL, NULL, ZIO_TYPE_READ, zio->io_priority, 1460 zio->io_flags & ZIO_FLAG_GANG_INHERIT, 1461 ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE)); 1462 1463 return (zio_wait_for_children_done(zio)); 1464 } 1465 1466 static int 1467 zio_read_gang_members(zio_t *zio) 1468 { 1469 zio_gbh_phys_t *gbh; 1470 uint64_t gsize, gbufsize, loff, lsize; 1471 int i; 1472 1473 ASSERT(BP_IS_GANG(zio->io_bp)); 1474 1475 zio_gang_byteswap(zio); 1476 zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1477 1478 for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1479 blkptr_t *gbp = &gbh->zg_blkptr[i]; 1480 lsize = BP_GET_PSIZE(gbp); 1481 1482 ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1483 ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1484 ASSERT3U(loff + lsize, <=, zio->io_size); 1485 ASSERT(i < SPA_GBH_NBLKPTRS); 1486 ASSERT(!BP_IS_HOLE(gbp)); 1487 1488 zio_nowait(zio_read(zio, zio->io_spa, gbp, 1489 (char *)zio->io_data + loff, lsize, 1490 NULL, NULL, zio->io_priority, 1491 zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark)); 1492 } 1493 1494 zio_buf_free(gbh, gbufsize); 1495 1496 return (zio_wait_for_children_done(zio)); 1497 } 1498 1499 static int 1500 zio_rewrite_gang_members(zio_t *zio) 1501 { 1502 zio_gbh_phys_t *gbh; 1503 uint64_t gsize, gbufsize, loff, lsize; 1504 int i; 1505 1506 ASSERT(BP_IS_GANG(zio->io_bp)); 1507 ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1508 1509 zio_gang_byteswap(zio); 1510 zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1511 1512 ASSERT(gsize == gbufsize); 1513 1514 for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) { 1515 blkptr_t *gbp = &gbh->zg_blkptr[i]; 1516 lsize = BP_GET_PSIZE(gbp); 1517 1518 ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF); 1519 ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp)); 1520 ASSERT3U(loff + lsize, <=, zio->io_size); 1521 ASSERT(i < SPA_GBH_NBLKPTRS); 1522 ASSERT(!BP_IS_HOLE(gbp)); 1523 1524 zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, gbp, 1525 (char *)zio->io_data + loff, lsize, NULL, NULL, 1526 zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, 1527 &zio->io_bookmark)); 1528 } 1529 1530 zio_push_transform(zio, gbh, gsize, gbufsize); 1531 1532 return (zio_wait_for_children_ready(zio)); 1533 } 1534 1535 static int 1536 zio_free_gang_members(zio_t *zio) 1537 { 1538 zio_gbh_phys_t *gbh; 1539 uint64_t gsize, gbufsize; 1540 int i; 1541 1542 ASSERT(BP_IS_GANG(zio->io_bp)); 1543 1544 zio_gang_byteswap(zio); 1545 zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1546 1547 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1548 blkptr_t *gbp = &gbh->zg_blkptr[i]; 1549 1550 if (BP_IS_HOLE(gbp)) 1551 continue; 1552 zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 1553 gbp, NULL, NULL)); 1554 } 1555 1556 zio_buf_free(gbh, gbufsize); 1557 1558 return (ZIO_PIPELINE_CONTINUE); 1559 } 1560 1561 static int 1562 zio_claim_gang_members(zio_t *zio) 1563 { 1564 zio_gbh_phys_t *gbh; 1565 uint64_t gsize, gbufsize; 1566 int i; 1567 1568 ASSERT(BP_IS_GANG(zio->io_bp)); 1569 1570 zio_gang_byteswap(zio); 1571 zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize); 1572 1573 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1574 blkptr_t *gbp = &gbh->zg_blkptr[i]; 1575 if (BP_IS_HOLE(gbp)) 1576 continue; 1577 zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg, 1578 gbp, NULL, NULL)); 1579 } 1580 1581 zio_buf_free(gbh, gbufsize); 1582 1583 return (ZIO_PIPELINE_CONTINUE); 1584 } 1585 1586 static void 1587 zio_write_allocate_gang_member_done(zio_t *zio) 1588 { 1589 zio_t *pio = zio->io_parent; 1590 dva_t *cdva = zio->io_bp->blk_dva; 1591 dva_t *pdva = pio->io_bp->blk_dva; 1592 uint64_t asize; 1593 int d; 1594 1595 ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas); 1596 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1597 ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); 1598 ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); 1599 1600 mutex_enter(&pio->io_lock); 1601 for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) { 1602 ASSERT(DVA_GET_GANG(&pdva[d])); 1603 asize = DVA_GET_ASIZE(&pdva[d]); 1604 asize += DVA_GET_ASIZE(&cdva[d]); 1605 DVA_SET_ASIZE(&pdva[d], asize); 1606 } 1607 mutex_exit(&pio->io_lock); 1608 } 1609 1610 static int 1611 zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc) 1612 { 1613 blkptr_t *bp = zio->io_bp; 1614 dva_t *dva = bp->blk_dva; 1615 spa_t *spa = zio->io_spa; 1616 zio_gbh_phys_t *gbh; 1617 uint64_t txg = zio->io_txg; 1618 uint64_t resid = zio->io_size; 1619 uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE); 1620 uint64_t gsize, loff, lsize; 1621 uint32_t gbps_left; 1622 int ndvas = zio->io_ndvas; 1623 int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); 1624 int error; 1625 int i, d; 1626 1627 gsize = SPA_GANGBLOCKSIZE; 1628 gbps_left = SPA_GBH_NBLKPTRS; 1629 1630 error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL, 1631 B_FALSE); 1632 if (error) { 1633 zio->io_error = error; 1634 return (ZIO_PIPELINE_CONTINUE); 1635 } 1636 1637 for (d = 0; d < gbh_ndvas; d++) 1638 DVA_SET_GANG(&dva[d], 1); 1639 1640 bp->blk_birth = txg; 1641 1642 gbh = zio_buf_alloc(gsize); 1643 bzero(gbh, gsize); 1644 1645 for (loff = 0, i = 0; loff != zio->io_size; 1646 loff += lsize, resid -= lsize, gbps_left--, i++) { 1647 blkptr_t *gbp = &gbh->zg_blkptr[i]; 1648 dva = gbp->blk_dva; 1649 1650 ASSERT(gbps_left != 0); 1651 maxalloc = MIN(maxalloc, resid); 1652 1653 while (resid <= maxalloc * gbps_left) { 1654 error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas, 1655 txg, bp, B_FALSE); 1656 if (error == 0) 1657 break; 1658 ASSERT3U(error, ==, ENOSPC); 1659 /* XXX - free up previous allocations? */ 1660 if (maxalloc == SPA_MINBLOCKSIZE) { 1661 zio->io_error = error; 1662 return (ZIO_PIPELINE_CONTINUE); 1663 } 1664 maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE); 1665 } 1666 1667 if (resid <= maxalloc * gbps_left) { 1668 lsize = maxalloc; 1669 BP_SET_LSIZE(gbp, lsize); 1670 BP_SET_PSIZE(gbp, lsize); 1671 BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF); 1672 gbp->blk_birth = txg; 1673 zio_nowait(zio_rewrite(zio, spa, zio->io_checksum, gbp, 1674 (char *)zio->io_data + loff, lsize, 1675 zio_write_allocate_gang_member_done, NULL, 1676 zio->io_priority, 1677 zio->io_flags & ZIO_FLAG_GANG_INHERIT, 1678 &zio->io_bookmark)); 1679 } else { 1680 lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); 1681 ASSERT(lsize != SPA_MINBLOCKSIZE); 1682 zio_nowait(zio_write_allocate(zio, spa, 1683 zio->io_checksum, txg, gbp, 1684 (char *)zio->io_data + loff, lsize, 1685 zio_write_allocate_gang_member_done, NULL, 1686 zio->io_priority, 1687 zio->io_flags & ZIO_FLAG_GANG_INHERIT)); 1688 } 1689 } 1690 1691 ASSERT(resid == 0 && loff == zio->io_size); 1692 1693 zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE; 1694 1695 zio_push_transform(zio, gbh, gsize, gsize); 1696 1697 /* 1698 * As much as we'd like this to be 'ready' instead of 'done', 1699 * updating our ASIZE doesn't happen until the io_done callback, 1700 * so we have to wait for that to finish in order for our BP 1701 * to be stable. 1702 */ 1703 return (zio_wait_for_children_done(zio)); 1704 } 1705 1706 /* 1707 * ========================================================================== 1708 * Allocate and free blocks 1709 * ========================================================================== 1710 */ 1711 static int 1712 zio_dva_allocate(zio_t *zio) 1713 { 1714 spa_t *spa = zio->io_spa; 1715 metaslab_class_t *mc = spa->spa_normal_class; 1716 blkptr_t *bp = zio->io_bp; 1717 int error; 1718 1719 ASSERT(BP_IS_HOLE(bp)); 1720 ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 1721 ASSERT3U(zio->io_ndvas, >, 0); 1722 ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa)); 1723 1724 /* 1725 * For testing purposes, we force I/Os to retry. We don't allow 1726 * retries beyond the first pass since those I/Os are non-allocating 1727 * writes. 1728 */ 1729 if (zio_io_fail_shift && 1730 spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite && 1731 zio_io_should_fail(zio_io_fail_shift)) 1732 zio->io_flags |= ZIO_FLAG_WRITE_RETRY; 1733 1734 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1735 1736 error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas, 1737 zio->io_txg, NULL, B_FALSE); 1738 1739 if (error == 0) { 1740 bp->blk_birth = zio->io_txg; 1741 } else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) { 1742 return (zio_write_allocate_gang_members(zio, mc)); 1743 } else { 1744 zio->io_error = error; 1745 } 1746 1747 return (ZIO_PIPELINE_CONTINUE); 1748 } 1749 1750 static int 1751 zio_dva_free(zio_t *zio) 1752 { 1753 blkptr_t *bp = zio->io_bp; 1754 1755 metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE); 1756 1757 BP_ZERO(bp); 1758 1759 return (ZIO_PIPELINE_CONTINUE); 1760 } 1761 1762 static int 1763 zio_dva_claim(zio_t *zio) 1764 { 1765 zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 1766 1767 return (ZIO_PIPELINE_CONTINUE); 1768 } 1769 1770 /* 1771 * ========================================================================== 1772 * Read and write to physical devices 1773 * ========================================================================== 1774 */ 1775 1776 static int 1777 zio_vdev_io_start(zio_t *zio) 1778 { 1779 vdev_t *vd = zio->io_vd; 1780 vdev_t *tvd = vd ? vd->vdev_top : NULL; 1781 blkptr_t *bp = zio->io_bp; 1782 uint64_t align; 1783 spa_t *spa = zio->io_spa; 1784 1785 /* 1786 * If the pool is already in a failure state then just suspend 1787 * this IO until the problem is resolved. We will reissue them 1788 * at that time. 1789 */ 1790 if (spa_state(spa) == POOL_STATE_IO_FAILURE && 1791 zio->io_type == ZIO_TYPE_WRITE) 1792 return (zio_vdev_suspend_io(zio)); 1793 1794 /* 1795 * The mirror_ops handle multiple DVAs in a single BP 1796 */ 1797 if (vd == NULL) 1798 return (vdev_mirror_ops.vdev_op_io_start(zio)); 1799 1800 align = 1ULL << tvd->vdev_ashift; 1801 1802 if (zio->io_retries == 0 && vd == tvd) 1803 zio->io_flags |= ZIO_FLAG_FAILFAST; 1804 1805 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) { 1806 zio->io_flags |= ZIO_FLAG_PHYSICAL; 1807 zio->io_offset += VDEV_LABEL_START_SIZE; 1808 } 1809 1810 if (P2PHASE(zio->io_size, align) != 0) { 1811 uint64_t asize = P2ROUNDUP(zio->io_size, align); 1812 char *abuf = zio_buf_alloc(asize); 1813 ASSERT(vd == tvd); 1814 if (zio->io_type == ZIO_TYPE_WRITE) { 1815 bcopy(zio->io_data, abuf, zio->io_size); 1816 bzero(abuf + zio->io_size, asize - zio->io_size); 1817 } 1818 zio_push_transform(zio, abuf, asize, asize); 1819 ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK)); 1820 zio->io_flags |= ZIO_FLAG_SUBBLOCK; 1821 } 1822 1823 ASSERT(P2PHASE(zio->io_offset, align) == 0); 1824 ASSERT(P2PHASE(zio->io_size, align) == 0); 1825 ASSERT(bp == NULL || 1826 P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size); 1827 ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); 1828 1829 return (vd->vdev_ops->vdev_op_io_start(zio)); 1830 } 1831 1832 static int 1833 zio_vdev_io_done(zio_t *zio) 1834 { 1835 if (zio->io_vd == NULL) 1836 return (vdev_mirror_ops.vdev_op_io_done(zio)); 1837 1838 return (zio->io_vd->vdev_ops->vdev_op_io_done(zio)); 1839 } 1840 1841 /* XXPOLICY */ 1842 boolean_t 1843 zio_should_retry(zio_t *zio) 1844 { 1845 vdev_t *vd = zio->io_vd; 1846 1847 if (zio->io_error == 0) 1848 return (B_FALSE); 1849 if (zio->io_delegate_list != NULL) 1850 return (B_FALSE); 1851 if (vd != NULL) { 1852 if (vd != vd->vdev_top) 1853 return (B_FALSE); 1854 if (vd->vdev_is_failing) 1855 return (B_FALSE); 1856 } 1857 if (zio->io_flags & ZIO_FLAG_DONT_RETRY) 1858 return (B_FALSE); 1859 if (zio->io_retries > 0) 1860 return (B_FALSE); 1861 1862 return (B_TRUE); 1863 } 1864 1865 static int 1866 zio_vdev_io_assess(zio_t *zio) 1867 { 1868 vdev_t *vd = zio->io_vd; 1869 vdev_t *tvd = vd ? vd->vdev_top : NULL; 1870 1871 ASSERT(zio->io_vsd == NULL); 1872 1873 if (zio->io_flags & ZIO_FLAG_SUBBLOCK) { 1874 void *abuf; 1875 uint64_t asize; 1876 ASSERT(vd == tvd); 1877 zio_pop_transform(zio, &abuf, &asize, &asize); 1878 if (zio->io_type == ZIO_TYPE_READ) 1879 bcopy(abuf, zio->io_data, zio->io_size); 1880 zio_buf_free(abuf, asize); 1881 zio->io_flags &= ~ZIO_FLAG_SUBBLOCK; 1882 } 1883 1884 if (zio_injection_enabled && !zio->io_error) 1885 zio->io_error = zio_handle_fault_injection(zio, EIO); 1886 1887 /* 1888 * If the I/O failed, determine whether we should attempt to retry it. 1889 */ 1890 /* XXPOLICY */ 1891 if (zio_should_retry(zio)) { 1892 ASSERT(tvd == vd); 1893 1894 zio->io_retries++; 1895 zio->io_error = 0; 1896 zio->io_flags &= ZIO_FLAG_RETRY_INHERIT; 1897 /* XXPOLICY */ 1898 zio->io_flags &= ~ZIO_FLAG_FAILFAST; 1899 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 1900 zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; 1901 1902 return (ZIO_PIPELINE_CONTINUE); 1903 } 1904 1905 return (ZIO_PIPELINE_CONTINUE); 1906 } 1907 1908 void 1909 zio_vdev_io_reissue(zio_t *zio) 1910 { 1911 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1912 ASSERT(zio->io_error == 0); 1913 1914 zio->io_stage--; 1915 } 1916 1917 void 1918 zio_vdev_io_redone(zio_t *zio) 1919 { 1920 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 1921 1922 zio->io_stage--; 1923 } 1924 1925 void 1926 zio_vdev_io_bypass(zio_t *zio) 1927 { 1928 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 1929 ASSERT(zio->io_error == 0); 1930 1931 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 1932 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; 1933 } 1934 1935 /* 1936 * ========================================================================== 1937 * Generate and verify checksums 1938 * ========================================================================== 1939 */ 1940 static int 1941 zio_checksum_generate(zio_t *zio) 1942 { 1943 int checksum = zio->io_checksum; 1944 blkptr_t *bp = zio->io_bp; 1945 1946 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 1947 1948 BP_SET_CHECKSUM(bp, checksum); 1949 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1950 1951 zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size); 1952 1953 return (ZIO_PIPELINE_CONTINUE); 1954 } 1955 1956 static int 1957 zio_gang_checksum_generate(zio_t *zio) 1958 { 1959 zio_cksum_t zc; 1960 zio_gbh_phys_t *gbh = zio->io_data; 1961 1962 ASSERT(BP_IS_GANG(zio->io_bp)); 1963 ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE); 1964 1965 zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum); 1966 1967 zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size); 1968 1969 return (ZIO_PIPELINE_CONTINUE); 1970 } 1971 1972 static int 1973 zio_checksum_verify(zio_t *zio) 1974 { 1975 if (zio->io_bp != NULL) { 1976 zio->io_error = zio_checksum_error(zio); 1977 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 1978 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 1979 zio->io_spa, zio->io_vd, zio, 0, 0); 1980 } 1981 1982 return (ZIO_PIPELINE_CONTINUE); 1983 } 1984 1985 /* 1986 * Called by RAID-Z to ensure we don't compute the checksum twice. 1987 */ 1988 void 1989 zio_checksum_verified(zio_t *zio) 1990 { 1991 zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); 1992 } 1993 1994 /* 1995 * Set the external verifier for a gang block based on stuff in the bp 1996 */ 1997 void 1998 zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp) 1999 { 2000 blkptr_t *bp = zio->io_bp; 2001 2002 zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp)); 2003 zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp)); 2004 zcp->zc_word[2] = bp->blk_birth; 2005 zcp->zc_word[3] = 0; 2006 } 2007 2008 /* 2009 * ========================================================================== 2010 * Define the pipeline 2011 * ========================================================================== 2012 */ 2013 typedef int zio_pipe_stage_t(zio_t *zio); 2014 2015 zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = { 2016 NULL, 2017 zio_wait_for_children_ready, 2018 zio_read_init, 2019 zio_issue_async, 2020 zio_write_compress, 2021 zio_checksum_generate, 2022 zio_get_gang_header, 2023 zio_rewrite_gang_members, 2024 zio_free_gang_members, 2025 zio_claim_gang_members, 2026 zio_dva_allocate, 2027 zio_dva_free, 2028 zio_dva_claim, 2029 zio_gang_checksum_generate, 2030 zio_ready, 2031 zio_vdev_io_start, 2032 zio_vdev_io_done, 2033 zio_vdev_io_assess, 2034 zio_wait_for_children_done, 2035 zio_checksum_verify, 2036 zio_read_gang_members, 2037 zio_read_decompress, 2038 zio_assess, 2039 zio_done, 2040 NULL 2041 }; 2042 2043 /* 2044 * Execute the I/O pipeline until one of the following occurs: 2045 * (1) the I/O completes; (2) the pipeline stalls waiting for 2046 * dependent child I/Os; (3) the I/O issues, so we're waiting 2047 * for an I/O completion interrupt; (4) the I/O is delegated by 2048 * vdev-level caching or aggregation; (5) the I/O is deferred 2049 * due to vdev-level queueing; (6) the I/O is handed off to 2050 * another thread. In all cases, the pipeline stops whenever 2051 * there's no CPU work; it never burns a thread in cv_wait(). 2052 * 2053 * There's no locking on io_stage because there's no legitimate way 2054 * for multiple threads to be attempting to process the same I/O. 2055 */ 2056 void 2057 zio_execute(zio_t *zio) 2058 { 2059 while (zio->io_stage < ZIO_STAGE_DONE) { 2060 uint32_t pipeline = zio->io_pipeline; 2061 int rv; 2062 2063 ASSERT(!MUTEX_HELD(&zio->io_lock)); 2064 2065 /* 2066 * If an error occurred outside the vdev stack, 2067 * just execute the interlock stages to clean up. 2068 */ 2069 if (zio->io_error && 2070 ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0) 2071 pipeline &= ZIO_ERROR_PIPELINE_MASK; 2072 2073 while (((1U << ++zio->io_stage) & pipeline) == 0) 2074 continue; 2075 2076 ASSERT(zio->io_stage <= ZIO_STAGE_DONE); 2077 ASSERT(zio->io_stalled == 0); 2078 2079 rv = zio_pipeline[zio->io_stage](zio); 2080 2081 if (rv == ZIO_PIPELINE_STOP) 2082 return; 2083 2084 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 2085 } 2086 } 2087 2088 static boolean_t 2089 zio_io_should_fail(uint16_t range) 2090 { 2091 static uint16_t allocs = 0; 2092 2093 return (P2PHASE(allocs++, 1U<<range) == 0); 2094 } 2095 2096 /* 2097 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2098 */ 2099 int 2100 zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, 2101 uint64_t txg) 2102 { 2103 int error; 2104 2105 spa_config_enter(spa, RW_READER, FTAG); 2106 2107 if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) { 2108 spa_config_exit(spa, FTAG); 2109 return (ENOSPC); 2110 } 2111 2112 /* 2113 * We were passed the previous log block's DVA in bp->blk_dva[0]. 2114 * We use that as a hint for which vdev to allocate from next. 2115 */ 2116 error = metaslab_alloc(spa, spa->spa_log_class, size, 2117 new_bp, 1, txg, old_bp, B_TRUE); 2118 2119 if (error) 2120 error = metaslab_alloc(spa, spa->spa_normal_class, size, 2121 new_bp, 1, txg, old_bp, B_TRUE); 2122 2123 if (error == 0) { 2124 BP_SET_LSIZE(new_bp, size); 2125 BP_SET_PSIZE(new_bp, size); 2126 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2127 BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 2128 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2129 BP_SET_LEVEL(new_bp, 0); 2130 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2131 new_bp->blk_birth = txg; 2132 } 2133 2134 spa_config_exit(spa, FTAG); 2135 2136 return (error); 2137 } 2138 2139 /* 2140 * Free an intent log block. We know it can't be a gang block, so there's 2141 * nothing to do except metaslab_free() it. 2142 */ 2143 void 2144 zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) 2145 { 2146 ASSERT(!BP_IS_GANG(bp)); 2147 2148 spa_config_enter(spa, RW_READER, FTAG); 2149 2150 metaslab_free(spa, bp, txg, B_FALSE); 2151 2152 spa_config_exit(spa, FTAG); 2153 } 2154 2155 /* 2156 * start an async flush of the write cache for this vdev 2157 */ 2158 void 2159 zio_flush(zio_t *zio, vdev_t *vd) 2160 { 2161 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 2162 NULL, NULL, ZIO_PRIORITY_NOW, 2163 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY)); 2164 } 2165