/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW		*/
	0,	/* ZIO_PRIORITY_SYNC_READ	*/
	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
	4,	/* ZIO_PRIORITY_FREE		*/
	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
	10,	/* ZIO_PRIORITY_RESILVER	*/
	20,	/* ZIO_PRIORITY_SCRUB		*/
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"null", "read", "write", "free", "claim", "ioctl" };

/* Force an allocation failure when non-zero */
uint16_t zio_zil_fail_shift = 0;
uint16_t zio_io_fail_shift = 0;

/* Enable/disable the write-retry logic */
int zio_write_retry = 1;

/* Taskq to handle reissuing of I/Os */
taskq_t *zio_taskq;
int zio_resume_threads = 4;

typedef struct zio_sync_pass {
	int	zp_defer_free;		/* defer frees after this pass */
	int	zp_dontcompress;	/* don't compress after this pass */
	int	zp_rewrite;		/* rewrite new bps after this pass */
} zio_sync_pass_t;

zio_sync_pass_t zio_sync_pass = {
	1,	/* zp_defer_free */
	4,	/* zp_dontcompress */
	1,	/* zp_rewrite */
};

static boolean_t zio_io_should_fail(uint16_t);

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * Determine if we are allowed to issue the IO based on the
 * pool state.  If we must wait then block until we are told
 * that we may continue.
 */
#define	ZIO_ENTER(spa) {						\
	if (spa->spa_state == POOL_STATE_IO_FAILURE) {			\
		mutex_enter(&spa->spa_zio_lock);			\
		while (spa->spa_state == POOL_STATE_IO_FAILURE)		\
			cv_wait(&spa->spa_zio_cv, &spa->spa_zio_lock);	\
		mutex_exit(&spa->spa_zio_lock);				\
	}								\
}
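
/*
 * Note: the cv_wait() above is satisfied by the cv_broadcast() issued from
 * zio_vdev_resume_io() once the pool has left POOL_STATE_IO_FAILURE, so new
 * I/Os simply stall here while the pool is suspended.
 */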

/*
 * An allocation zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) \
	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif

	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (P2PHASE(size, PAGESIZE) == 0) {
			align = PAGESIZE;
		} else if (P2PHASE(size, p2 >> 2) == 0) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);

			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    KMC_NODEBUG);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_taskq = taskq_create("zio_taskq", zio_resume_threads,
	    maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	taskq_destroy(zio_taskq);

	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */
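
/*
 * The allocators below map a request to a cache index via
 * (size - 1) >> SPA_MINBLOCKSHIFT.  For example, assuming the usual
 * 512-byte SPA_MINBLOCKSIZE, a 4K request computes (4096 - 1) >> 9 == 7
 * and is served from the "zio_buf_4096" cache created in zio_init().
 */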

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing
 * the amount of kernel heap dumped to disk when the kernel panics)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_data = data;
	zt->zt_size = size;
	zt->zt_bufsize = bufsize;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transform(zio_t *zio, void **data, uint64_t *size, uint64_t *bufsize)
{
	zio_transform_t *zt = zio->io_transform_stack;

	*data = zt->zt_data;
	*size = zt->zt_size;
	*bufsize = zt->zt_bufsize;

	zio->io_transform_stack = zt->zt_next;
	kmem_free(zt, sizeof (zio_transform_t));

	if ((zt = zio->io_transform_stack) != NULL) {
		zio->io_data = zt->zt_data;
		zio->io_size = zt->zt_size;
	}
}

static void
zio_clear_transform_stack(zio_t *zio)
{
	void *data;
	uint64_t size, bufsize;

	ASSERT(zio->io_transform_stack != NULL);

	zio_pop_transform(zio, &data, &size, &bufsize);
	while (zio->io_transform_stack != NULL) {
		zio_buf_free(data, bufsize);
		zio_pop_transform(zio, &data, &size, &bufsize);
	}
}
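
/*
 * A note on the transform stack above: io_data/io_size always track the
 * most recently pushed buffer, and zio_clear_transform_stack() frees every
 * pushed buffer except the bottom-most one, which is the caller's original
 * buffer and is never owned by the zio.
 */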

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));
	zio->io_parent = pio;
	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_flags = flags;
	if (bp != NULL) {
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
	}
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_stage = stage;
	zio->io_pipeline = pipeline;
	zio->io_timestamp = lbolt64;
	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
	zio_push_transform(zio, data, size, size);

	/*
	 * Note on config lock:
	 *
	 * If CONFIG_HELD is set, then the caller already has the config
	 * lock, so we don't need it for this io.
	 *
	 * We set CONFIG_GRABBED to indicate that we have grabbed the
	 * config lock on behalf of this io, so it should be released
	 * in zio_done.
	 *
	 * Unless CONFIG_HELD is set, we will grab the config lock for
	 * any top-level (parent-less) io, *except* NULL top-level ios.
	 * The NULL top-level ios rarely have any children, so we delay
	 * grabbing the lock until the first child is added (but it is
	 * still grabbed on behalf of the top-level i/o, so additional
	 * children don't need to also grab it).  This greatly reduces
	 * contention on the config lock.
	 */
	if (pio == NULL) {
		if (type != ZIO_TYPE_NULL &&
		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
			spa_config_enter(spa, RW_READER, zio);
			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
		}
		zio->io_root = zio;
	} else {
		zio->io_root = pio->io_root;
		if (!(flags & ZIO_FLAG_NOBOOKMARK))
			zio->io_logical = pio->io_logical;
		mutex_enter(&pio->io_lock);
		if (pio->io_parent == NULL &&
		    pio->io_type == ZIO_TYPE_NULL &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
			spa_config_enter(spa, RW_READER, pio);
		}
		if (stage < ZIO_STAGE_READY)
			pio->io_children_notready++;
		pio->io_children_notdone++;
		zio->io_sibling_next = pio->io_child;
		zio->io_sibling_prev = NULL;
		if (pio->io_child != NULL)
			pio->io_child->io_sibling_prev = zio;
		pio->io_child = zio;
		zio->io_ndvas = pio->io_ndvas;
		mutex_exit(&pio->io_lock);
	}

	/*
	 * Save off the original state in case we need to retry later.
	 */
	zio->io_orig_stage = zio->io_stage;
	zio->io_orig_pipeline = zio->io_pipeline;
	zio->io_orig_flags = zio->io_flags;

	return (zio);
}

static void
zio_reset(zio_t *zio)
{
	zio_clear_transform_stack(zio);

	zio->io_flags = zio->io_orig_flags;
	zio->io_stage = zio->io_orig_stage;
	zio->io_pipeline = zio->io_orig_pipeline;
	zio_push_transform(zio, zio->io_data, zio->io_size, zio->io_size);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
    int flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, ZIO_STAGE_OPEN,
	    ZIO_WAIT_FOR_CHILDREN_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
	return (zio_null(NULL, spa, done, private, flags));
}
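
/*
 * A typical consumer pattern (a sketch, not lifted from any one caller):
 *
 *	zio_t *root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	zio_nowait(zio_read(root, spa, bp, buf, size, done, arg,
 *	    ZIO_PRIORITY_SYNC_READ, flags, &zb));
 *	error = zio_wait(root);
 *
 * The root (null) zio exists only to gather the status of its children.
 */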

zio_t *
zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    int priority, int flags, zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT3U(size, ==, BP_GET_LSIZE(bp));

	/*
	 * If the user has specified that we allow I/Os to continue
	 * then attempt to satisfy the read.
	 */
	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
		ZIO_ENTER(spa);

	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
	zio->io_bookmark = *zb;

	zio->io_logical = zio;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
    int flags, zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(checksum >= ZIO_CHECKSUM_OFF &&
	    checksum < ZIO_CHECKSUM_FUNCTIONS);

	ASSERT(compress >= ZIO_COMPRESS_OFF &&
	    compress < ZIO_COMPRESS_FUNCTIONS);

	ZIO_ENTER(spa);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;

	zio->io_bookmark = *zb;

	zio->io_logical = zio;

	zio->io_checksum = checksum;
	zio->io_compress = compress;
	zio->io_ndvas = ncopies;

	if (bp->blk_birth != txg) {
		/* XXX the bp usually (always?) gets re-zeroed later */
		BP_ZERO(bp);
		BP_SET_LSIZE(bp, size);
		BP_SET_PSIZE(bp, size);
	} else {
		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags,
    zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));

	zio->io_bookmark = *zb;
	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;

	if (pio != NULL)
		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));

	return (zio);
}

static void
zio_write_allocate_ready(zio_t *zio)
{
	/* Free up the previous block */
	if (!BP_IS_HOLE(&zio->io_bp_orig)) {
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    &zio->io_bp_orig, NULL, NULL));
	}
}

static zio_t *
zio_write_allocate(zio_t *pio, spa_t *spa, int checksum,
    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags,
	    ZIO_STAGE_OPEN, ZIO_WRITE_ALLOCATE_PIPELINE);

	zio->io_checksum = checksum;
	zio->io_compress = ZIO_COMPRESS_OFF;
	zio->io_ready = zio_write_allocate_ready;

	return (zio);
}

zio_t *
zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(!BP_IS_HOLE(bp));

	if (txg == spa->spa_syncing_txg &&
	    spa->spa_sync_pass > zio_sync_pass.zp_defer_free) {
		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
		return (zio_null(pio, spa, NULL, NULL, 0));
	}

	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT3U(spa_first_txg(spa), <=, txg);

	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));

	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, int flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_vd = vd;
		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, priority, flags));
	}

	return (zio);
}
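
/*
 * One consumer worth noting is zio_flush() at the bottom of this file: it
 * issues a DKIOCFLUSHWRITECACHE ioctl against a vdev, and for interior
 * vdevs the loop above fans that ioctl out to every child.
 */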

static void
zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
    int checksum, boolean_t labels)
{
	ASSERT(vd->vdev_children == 0);

	ASSERT(size <= SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

#ifdef ZFS_DEBUG
	if (labels) {
		ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	}
#endif
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	BP_ZERO(bp);

	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	if (checksum != ZIO_CHECKSUM_OFF)
		ZIO_SET_CHECKSUM(&bp->blk_cksum, offset, 0, 0, 0);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_t *zio;
	blkptr_t blk;

	ZIO_ENTER(vd->vdev_spa);

	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	/*
	 * Work off our copy of the bp so the caller can free it.
	 */
	zio->io_bp = &zio->io_bp_copy;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_block_tail_t *zbt;
	void *wbuf;
	zio_t *zio;
	blkptr_t blk;

	ZIO_ENTER(vd->vdev_spa);

	zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);

	zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_vd = vd;
	zio->io_offset = offset;

	zio->io_bp = &zio->io_bp_copy;
	zio->io_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * one word of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places.
		 */
		wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size);

		zbt = (zio_block_tail_t *)((char *)wbuf + size) - 1;
		zbt->zbt_cksum = blk.blk_cksum;
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.  It has no associated bp.
 */
zio_t *
zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, int flags,
    zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *cio;

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);

	cio->io_vd = vd;
	cio->io_offset = offset;

	return (cio);
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_stalled != ZIO_STAGE_DONE)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	zio_execute(zio);
}

void
zio_interrupt(zio_t *zio)
{
	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
}

static int
zio_issue_async(zio_t *zio)
{
	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * I/O pipeline interlocks: parent/child dependency scoreboarding
 * ==========================================================================
 */
static int
zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	int rv = ZIO_PIPELINE_CONTINUE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stalled == 0);
	if (*countp != 0) {
		zio->io_stalled = stage;
		rv = ZIO_PIPELINE_STOP;
	}
	mutex_exit(&zio->io_lock);

	return (rv);
}

static void
zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
{
	zio_t *pio = zio->io_parent;

	mutex_enter(&pio->io_lock);
	if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		pio->io_error = zio->io_error;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stalled == stage) {
		pio->io_stalled = 0;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

int
zio_wait_for_children_ready(zio_t *zio)
{
	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
	    &zio->io_children_notready));
}

int
zio_wait_for_children_done(zio_t *zio)
{
	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
	    &zio->io_children_notdone));
}
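
/*
 * To recap the interlock above: a zio that still has outstanding children
 * parks itself by recording the stage it stalled in (io_stalled) and
 * returning ZIO_PIPELINE_STOP; the last child to complete decrements the
 * parent's counter in zio_notify_parent() and restarts the parent via
 * zio_execute(), so no thread ever blocks waiting for children here.
 */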

static int
zio_read_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		zio_push_transform(zio, cbuf, csize, csize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
	}

	if (BP_IS_GANG(bp)) {
		uint64_t gsize = SPA_GANGBLOCKSIZE;
		void *gbuf = zio_buf_alloc(gsize);

		zio_push_transform(zio, gbuf, gsize, gsize);
		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
	}

	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_ready(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	if (zio->io_ready)
		zio->io_ready(zio);

	if (pio != NULL)
		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
		    &pio->io_children_notready);

	if (zio->io_bp)
		zio->io_bp_copy = *zio->io_bp;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_vdev_retry_io(zio_t *zio)
{
	zio_t *pio = zio->io_parent;

	/*
	 * Preserve the failed bp so that the io_ready() callback can
	 * update the accounting accordingly.  The callback will also be
	 * responsible for freeing the previously allocated block, if one
	 * exists.
	 */
	zio->io_bp_orig = *zio->io_bp;

	/*
	 * We must zero out the old DVA and blk_birth before reallocating
	 * the bp.
	 */
	BP_ZERO_DVAS(zio->io_bp);
	zio_reset(zio);

	if (pio) {
		/*
		 * Let the parent know that we will
		 * re-alloc the write (=> new bp info).
		 */
		mutex_enter(&pio->io_lock);
		pio->io_children_notready++;

		/*
		 * If the parent I/O is still in the open stage, then
		 * don't bother telling it to retry since it hasn't
		 * progressed far enough for it to care.
		 */
		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;

		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
		mutex_exit(&pio->io_lock);
	}

	/*
	 * We are getting ready to process the retry request so clear
	 * the flag and the zio's current error status.
	 */
	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
	zio->io_error = 0;

	return (ZIO_PIPELINE_CONTINUE);
}
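
/*
 * The retry path above is driven from zio_assess(): when an allocating
 * write fails (and zio_write_retry is enabled), zio_assess() returns
 * through zio_vdev_retry_io() so the zio is reset to its original stage
 * and pipeline and the block is reallocated on its next pass.
 */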

int
zio_vdev_resume_io(spa_t *spa)
{
	zio_t *zio;

	mutex_enter(&spa->spa_zio_lock);

	/*
	 * Probe all of the vdevs that have experienced an I/O error.
	 * If we are still unable to verify the integrity of the vdev
	 * then we prevent the resume from proceeding.
	 */
	for (zio = list_head(&spa->spa_zio_list); zio != NULL;
	    zio = list_next(&spa->spa_zio_list, zio)) {
		int error = 0;

		/* We only care about I/Os that must succeed */
		if (zio->io_vd == NULL || zio->io_flags & ZIO_FLAG_CANFAIL)
			continue;
		error = vdev_probe(zio->io_vd);
		if (error) {
			mutex_exit(&spa->spa_zio_lock);
			return (error);
		}
	}

	/*
	 * Clear the vdev stats so that I/O can flow.
	 */
	vdev_clear(spa, NULL, B_FALSE);

	spa->spa_state = POOL_STATE_ACTIVE;
	while ((zio = list_head(&spa->spa_zio_list)) != NULL) {
		list_remove(&spa->spa_zio_list, zio);
		zio->io_error = 0;

		/*
		 * If we are resuming an allocating I/O then we force it
		 * to retry and let it resume operation where it left off.
		 * Otherwise, go back to the ready stage and pick up from
		 * there.
		 */
		if (zio_write_retry && IO_IS_ALLOCATING(zio)) {
			zio->io_flags |= ZIO_FLAG_WRITE_RETRY;
			zio->io_stage--;
		} else {
			zio->io_stage = ZIO_STAGE_READY;
		}

		(void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
		    zio, TQ_SLEEP);
	}
	mutex_exit(&spa->spa_zio_lock);

	/*
	 * Wait for the taskqs to finish and recheck the pool state since
	 * it's possible that a resumed I/O has failed again.
	 */
	taskq_wait(zio_taskq);
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	mutex_enter(&spa->spa_zio_lock);
	cv_broadcast(&spa->spa_zio_cv);
	mutex_exit(&spa->spa_zio_lock);

	return (0);
}

static int
zio_vdev_suspend_io(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	/*
	 * We've experienced an unrecoverable failure so
	 * set the pool state accordingly and queue all
	 * failed IOs.
	 */
	spa->spa_state = POOL_STATE_IO_FAILURE;

	mutex_enter(&spa->spa_zio_lock);
	list_insert_tail(&spa->spa_zio_list, zio);

#ifndef _KERNEL
	/* Used to notify ztest that the pool has suspended */
	cv_broadcast(&spa->spa_zio_cv);
#endif
	mutex_exit(&spa->spa_zio_lock);

	return (ZIO_PIPELINE_STOP);
}
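
/*
 * Suspend and resume form a pair: zio_vdev_suspend_io() parks a failed,
 * must-succeed zio on spa_zio_list and stops its pipeline, and
 * zio_vdev_resume_io() later re-dispatches everything on that list through
 * zio_taskq once the underlying vdevs respond to a probe again.
 */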

static int
zio_assess(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;

	ASSERT(zio->io_children_notready == 0);
	ASSERT(zio->io_children_notdone == 0);

	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bp->blk_pad[2] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			if (zio->io_ndvas != 0)
				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
	}

	/*
	 * Some child I/O has indicated that a retry is necessary, so
	 * we set an error on the I/O and let the logic below do the
	 * rest.
	 */
	if (zio->io_flags & ZIO_FLAG_WRITE_RETRY)
		zio->io_error = ERESTART;

	if (vd != NULL)
		vdev_stat_update(zio);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO ||
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
		    zio->io_logical == zio) {
			/*
			 * For root I/O requests, tell the SPA to log the error
			 * appropriately.  Also, generate a logical data
			 * ereport.
			 */
			spa_log_error(spa, zio);

			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}

		/*
		 * If we are an allocating I/O then we attempt to reissue
		 * the I/O on another vdev unless the pool is out of space.
		 * We handle this condition based on the spa's failmode
		 * property.
		 */
		if (zio_write_retry && zio->io_error != ENOSPC &&
		    IO_IS_ALLOCATING(zio))
			return (zio_vdev_retry_io(zio));

		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));

		/*
		 * For I/O requests that cannot fail, we carry out
		 * the requested behavior based on the failmode pool
		 * property.
		 *
		 * XXX - Need to differentiate between an ENOSPC as
		 * a result of vdev failures vs. a full pool.
		 */
		if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			char *blkbuf;

#ifdef ZFS_DEBUG
			blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_NOSLEEP);
			if (blkbuf) {
				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
				    bp ? bp : &zio->io_bp_copy);
			}
			cmn_err(CE_WARN, "ZFS: %s (%s on %s off %llx: zio %p "
			    "%s): error %d", zio->io_error == ECKSUM ?
			    "bad checksum" : "I/O failure",
			    zio_type_name[zio->io_type],
			    vdev_description(vd),
			    (u_longlong_t)zio->io_offset,
			    (void *)zio, blkbuf ? blkbuf : "", zio->io_error);
#endif

			if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) {
				fm_panic("Pool '%s' has encountered an "
				    "uncorrectable I/O failure and the "
				    "failure mode property for this pool "
				    "is set to panic.", spa_name(spa));
			}
			cmn_err(CE_WARN, "Pool '%s' has encountered "
			    "an uncorrectable I/O error. "
			    "Manual intervention is required.", spa_name(spa));
			return (zio_vdev_suspend_io(zio));
		}
	}
	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
	ASSERT(zio->io_children_notready == 0);

	return (ZIO_PIPELINE_CONTINUE);
}
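
/*
 * In short, zio_assess() resolves a failed I/O in one of three ways: retry
 * it through zio_vdev_retry_io() if it is an allocating write, suspend the
 * pool through zio_vdev_suspend_io() if the I/O cannot fail (or panic, per
 * the failmode property), or simply hand the error back to the caller.
 */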

static int
zio_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	spa_t *spa = zio->io_spa;

	ASSERT(zio->io_children_notready == 0);
	ASSERT(zio->io_children_notdone == 0);

	zio_clear_transform_stack(zio);

	if (zio->io_done)
		zio->io_done(zio);

	ASSERT(zio->io_delegate_list == NULL);
	ASSERT(zio->io_delegate_next == NULL);

	if (pio != NULL) {
		zio_t *next, *prev;

		mutex_enter(&pio->io_lock);
		next = zio->io_sibling_next;
		prev = zio->io_sibling_prev;
		if (next != NULL)
			next->io_sibling_prev = prev;
		if (prev != NULL)
			prev->io_sibling_next = next;
		if (pio->io_child == zio)
			pio->io_child = next;
		mutex_exit(&pio->io_lock);

		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
		    &pio->io_children_notdone);
	}

	/*
	 * Note: this I/O is now done, and will shortly be freed, so there is
	 * no need to clear this (or any other) flag.
	 */
	if (zio->io_flags & ZIO_FLAG_CONFIG_GRABBED)
		spa_config_exit(spa, zio);

	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio->io_stalled = zio->io_stage;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		mutex_destroy(&zio->io_lock);
		cv_destroy(&zio->io_cv);
		kmem_cache_free(zio_cache, zio);
	}

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * Compression support
 * ==========================================================================
 */
static int
zio_write_compress(zio_t *zio)
{
	int compress = zio->io_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass;

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);
		if (pass > zio_sync_pass.zp_dontcompress)
			compress = ZIO_COMPRESS_OFF;
	} else {
		ASSERT(BP_IS_HOLE(bp));
		pass = 1;
	}

	if (compress != ZIO_COMPRESS_OFF)
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize))
			compress = ZIO_COMPRESS_OFF;

	if (compress != ZIO_COMPRESS_OFF && csize != 0)
		zio_push_transform(zio, cbuf, csize, cbufsize);

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to reallocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > zio_sync_pass.zp_rewrite) {
		ASSERT(csize != 0);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_COMPRESS(bp, compress);
		zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
	} else {
		if (bp->blk_birth == zio->io_txg)
			BP_ZERO(bp);
		if (csize == 0) {
			BP_ZERO(bp);
			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
		} else {
			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
			BP_SET_LSIZE(bp, lsize);
			BP_SET_PSIZE(bp, csize);
			BP_SET_COMPRESS(bp, compress);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
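
/*
 * To make the pass logic above concrete with the defaults from
 * zio_sync_pass at the top of this file (zp_rewrite == 1,
 * zp_dontcompress == 4): pass 1 may allocate new (possibly compressed)
 * blocks; passes 2 through 4 rewrite a block in place whenever the
 * compressed size still matches the existing physical size; and from
 * pass 5 onward compression is disabled entirely so that spa_sync()
 * converges.
 */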

static int
zio_read_decompress(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	void *data;
	uint64_t size;
	uint64_t bufsize;
	int compress = BP_GET_COMPRESS(bp);

	ASSERT(compress != ZIO_COMPRESS_OFF);

	zio_pop_transform(zio, &data, &size, &bufsize);

	if (zio_decompress_data(compress, data, size,
	    zio->io_data, zio->io_size))
		zio->io_error = EIO;

	zio_buf_free(data, bufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Gang block support
 * ==========================================================================
 */
static void
zio_gang_byteswap(zio_t *zio)
{
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);

	if (BP_SHOULD_BYTESWAP(zio->io_bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);
}

static int
zio_get_gang_header(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	uint64_t gsize = SPA_GANGBLOCKSIZE;
	void *gbuf = zio_buf_alloc(gsize);

	ASSERT(BP_IS_GANG(bp));

	zio_push_transform(zio, gbuf, gsize, gsize);

	zio_nowait(zio_create(zio, zio->io_spa, bp->blk_birth, bp, gbuf, gsize,
	    NULL, NULL, ZIO_TYPE_READ, zio->io_priority,
	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));

	return (zio_wait_for_children_done(zio));
}

static int
zio_read_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_read(zio, zio->io_spa, gbp,
		    (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority,
		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
	}

	zio_buf_free(gbh, gbufsize);

	return (zio_wait_for_children_done(zio));
}

static int
zio_rewrite_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize, loff, lsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	ASSERT(gsize == gbufsize);

	for (loff = 0, i = 0; loff != zio->io_size; loff += lsize, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		lsize = BP_GET_PSIZE(gbp);

		ASSERT(BP_GET_COMPRESS(gbp) == ZIO_COMPRESS_OFF);
		ASSERT3U(lsize, ==, BP_GET_LSIZE(gbp));
		ASSERT3U(loff + lsize, <=, zio->io_size);
		ASSERT(i < SPA_GBH_NBLKPTRS);
		ASSERT(!BP_IS_HOLE(gbp));

		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
		    NULL, NULL, zio->io_priority,
		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
	}

	zio_push_transform(zio, gbh, gsize, gbufsize);

	return (zio_wait_for_children_ready(zio));
}

static int
zio_free_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_claim_gang_members(zio_t *zio)
{
	zio_gbh_phys_t *gbh;
	uint64_t gsize, gbufsize;
	int i;

	ASSERT(BP_IS_GANG(zio->io_bp));

	zio_gang_byteswap(zio);
	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);

	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		if (BP_IS_HOLE(gbp))
			continue;
		zio_nowait(zio_claim(zio, zio->io_spa, zio->io_txg,
		    gbp, NULL, NULL));
	}

	zio_buf_free(gbh, gbufsize);

	return (ZIO_PIPELINE_CONTINUE);
}
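
/*
 * The two functions below handle gang allocation on the write path: when
 * zio_dva_allocate() gets ENOSPC for a full-sized block, it falls back to
 * zio_write_allocate_gang_members(), which allocates a gang header and
 * splits the write into progressively smaller chunks that the allocator
 * can still satisfy.
 */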

static void
zio_write_allocate_gang_member_done(zio_t *zio)
{
	zio_t *pio = zio->io_parent;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;
	int d;

	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

static int
zio_write_allocate_gang_members(zio_t *zio, metaslab_class_t *mc)
{
	blkptr_t *bp = zio->io_bp;
	dva_t *dva = bp->blk_dva;
	spa_t *spa = zio->io_spa;
	zio_gbh_phys_t *gbh;
	uint64_t txg = zio->io_txg;
	uint64_t resid = zio->io_size;
	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
	uint64_t gsize, loff, lsize;
	uint32_t gbps_left;
	int ndvas = zio->io_ndvas;
	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
	int error;
	int i, d;

	gsize = SPA_GANGBLOCKSIZE;
	gbps_left = SPA_GBH_NBLKPTRS;

	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
	    B_FALSE);
	if (error) {
		zio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	for (d = 0; d < gbh_ndvas; d++)
		DVA_SET_GANG(&dva[d], 1);

	bp->blk_birth = txg;

	gbh = zio_buf_alloc(gsize);
	bzero(gbh, gsize);

	for (loff = 0, i = 0; loff != zio->io_size;
	    loff += lsize, resid -= lsize, gbps_left--, i++) {
		blkptr_t *gbp = &gbh->zg_blkptr[i];
		dva = gbp->blk_dva;

		ASSERT(gbps_left != 0);
		maxalloc = MIN(maxalloc, resid);

		while (resid <= maxalloc * gbps_left) {
			error = metaslab_alloc(spa, mc, maxalloc, gbp, ndvas,
			    txg, bp, B_FALSE);
			if (error == 0)
				break;
			ASSERT3U(error, ==, ENOSPC);
			/* XXX - free up previous allocations? */
			if (maxalloc == SPA_MINBLOCKSIZE) {
				zio->io_error = error;
				return (ZIO_PIPELINE_CONTINUE);
			}
			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
		}

		if (resid <= maxalloc * gbps_left) {
			lsize = maxalloc;
			BP_SET_LSIZE(gbp, lsize);
			BP_SET_PSIZE(gbp, lsize);
			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
			gbp->blk_birth = txg;
			zio_nowait(zio_rewrite(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
			    &zio->io_bookmark));
		} else {
			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
			ASSERT(lsize != SPA_MINBLOCKSIZE);
			zio_nowait(zio_write_allocate(zio, spa,
			    zio->io_checksum, txg, gbp,
			    (char *)zio->io_data + loff, lsize,
			    zio_write_allocate_gang_member_done, NULL,
			    zio->io_priority,
			    zio->io_flags & ZIO_FLAG_GANG_INHERIT));
		}
	}

	ASSERT(resid == 0 && loff == zio->io_size);

	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;

	zio_push_transform(zio, gbh, gsize, gsize);

	/*
	 * As much as we'd like this to be 'ready' instead of 'done',
	 * updating our ASIZE doesn't happen until the io_done callback,
	 * so we have to wait for that to finish in order for our BP
	 * to be stable.
	 */
	return (zio_wait_for_children_done(zio));
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa->spa_normal_class;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(BP_IS_HOLE(bp));
	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
	ASSERT3U(zio->io_ndvas, >, 0);
	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));

	/*
	 * For testing purposes, we force I/Os to retry.  We don't allow
	 * retries beyond the first pass since those I/Os are non-allocating
	 * writes.
	 */
	if (zio_io_fail_shift &&
	    spa_sync_pass(zio->io_spa) <= zio_sync_pass.zp_rewrite &&
	    zio_io_should_fail(zio_io_fail_shift))
		zio->io_flags |= ZIO_FLAG_WRITE_RETRY;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	error = metaslab_alloc(spa, mc, zio->io_size, bp, zio->io_ndvas,
	    zio->io_txg, NULL, B_FALSE);

	if (error == 0) {
		bp->blk_birth = zio->io_txg;
	} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
		return (zio_write_allocate_gang_members(zio, mc));
	} else {
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_free(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	metaslab_free(zio->io_spa, bp, zio->io_txg, B_FALSE);

	BP_ZERO(bp);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_claim(zio_t *zio)
{
	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Read and write to physical devices
 * ==========================================================================
 */
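
/*
 * A note on alignment for the vdev I/O functions below: an I/O whose size
 * is not a multiple of the top-level vdev's ashift is padded out to the
 * aligned size through a SUBBLOCK transform in zio_vdev_io_start(); the
 * padded buffer is copied back into the caller's buffer (for reads) and
 * freed in zio_vdev_io_assess().
 */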

static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd ? vd->vdev_top : NULL;
	blkptr_t *bp = zio->io_bp;
	uint64_t align;
	spa_t *spa = zio->io_spa;

	/*
	 * If the pool is already in a failure state then just suspend
	 * this IO until the problem is resolved.  We will reissue them
	 * at that time.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
	    zio->io_type == ZIO_TYPE_WRITE)
		return (zio_vdev_suspend_io(zio));

	/*
	 * The mirror_ops handle multiple DVAs in a single BP
	 */
	if (vd == NULL)
		return (vdev_mirror_ops.vdev_op_io_start(zio));

	align = 1ULL << tvd->vdev_ashift;

	if (zio->io_retries == 0 && vd == tvd)
		zio->io_flags |= ZIO_FLAG_FAILFAST;

	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
		zio->io_flags |= ZIO_FLAG_PHYSICAL;
		zio->io_offset += VDEV_LABEL_START_SIZE;
	}

	if (P2PHASE(zio->io_size, align) != 0) {
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		char *abuf = zio_buf_alloc(asize);
		ASSERT(vd == tvd);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			bcopy(zio->io_data, abuf, zio->io_size);
			bzero(abuf + zio->io_size, asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, asize);
		ASSERT(!(zio->io_flags & ZIO_FLAG_SUBBLOCK));
		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
	}

	ASSERT(P2PHASE(zio->io_offset, align) == 0);
	ASSERT(P2PHASE(zio->io_size, align) == 0);
	ASSERT(bp == NULL ||
	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));

	return (vd->vdev_ops->vdev_op_io_start(zio));
}

static int
zio_vdev_io_done(zio_t *zio)
{
	if (zio->io_vd == NULL)
		return (vdev_mirror_ops.vdev_op_io_done(zio));

	return (zio->io_vd->vdev_ops->vdev_op_io_done(zio));
}

/* XXPOLICY */
boolean_t
zio_should_retry(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio->io_error == 0)
		return (B_FALSE);
	if (zio->io_delegate_list != NULL)
		return (B_FALSE);
	if (vd && vd != vd->vdev_top)
		return (B_FALSE);
	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
		return (B_FALSE);
	if (zio->io_retries > 0)
		return (B_FALSE);

	return (B_TRUE);
}

static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd ? vd->vdev_top : NULL;

	ASSERT(zio->io_vsd == NULL);

	if (zio->io_flags & ZIO_FLAG_SUBBLOCK) {
		void *abuf;
		uint64_t asize;
		ASSERT(vd == tvd);
		zio_pop_transform(zio, &abuf, &asize, &asize);
		if (zio->io_type == ZIO_TYPE_READ)
			bcopy(abuf, zio->io_data, zio->io_size);
		zio_buf_free(abuf, asize);
		zio->io_flags &= ~ZIO_FLAG_SUBBLOCK;
	}

	if (zio_injection_enabled && !zio->io_error)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 */
	/* XXPOLICY */
	if (zio_should_retry(zio)) {
		ASSERT(tvd == vd);

		zio->io_retries++;
		zio->io_error = 0;
		zio->io_flags &= ZIO_FLAG_RETRY_INHERIT;
		/* XXPOLICY */
		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;

		return (ZIO_PIPELINE_CONTINUE);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage--;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage--;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
}

/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static int
zio_checksum_generate(zio_t *zio)
{
	int checksum = zio->io_checksum;
	blkptr_t *bp = zio->io_bp;

	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_checksum_generate(zio_t *zio)
{
	zio_cksum_t zc;
	zio_gbh_phys_t *gbh = zio->io_data;

	ASSERT(BP_IS_GANG(zio->io_bp));
	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);

	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);

	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_checksum_verify(zio_t *zio)
{
	if (zio->io_bp != NULL) {
		zio->io_error = zio_checksum_error(zio);
		if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
			    zio->io_spa, zio->io_vd, zio, 0, 0);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
}

/*
 * Set the external verifier for a gang block based on stuff in the bp
 */
void
zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
{
	blkptr_t *bp = zio->io_bp;

	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
	zcp->zc_word[2] = bp->blk_birth;
	zcp->zc_word[3] = 0;
}

/*
 * ==========================================================================
 * Define the pipeline
 * ==========================================================================
 */
typedef int zio_pipe_stage_t(zio_t *zio);

zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
	NULL,
	zio_wait_for_children_ready,
	zio_read_init,
	zio_issue_async,
	zio_write_compress,
	zio_checksum_generate,
	zio_get_gang_header,
	zio_rewrite_gang_members,
	zio_free_gang_members,
	zio_claim_gang_members,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_gang_checksum_generate,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_wait_for_children_done,
	zio_checksum_verify,
	zio_read_gang_members,
	zio_read_decompress,
	zio_assess,
	zio_done,
	NULL
};

/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
void
zio_execute(zio_t *zio)
{
	while (zio->io_stage < ZIO_STAGE_DONE) {
		uint32_t pipeline = zio->io_pipeline;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));

		/*
		 * If an error occurred outside the vdev stack,
		 * just execute the interlock stages to clean up.
		 */
		if (zio->io_error &&
		    ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
			pipeline &= ZIO_ERROR_PIPELINE_MASK;

		while (((1U << ++zio->io_stage) & pipeline) == 0)
			continue;

		ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
		ASSERT(zio->io_stalled == 0);

		rv = zio_pipeline[zio->io_stage](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

static boolean_t
zio_io_should_fail(uint16_t range)
{
	static uint16_t allocs = 0;

	return (P2PHASE(allocs++, 1U << range) == 0);
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t txg)
{
	int error;

	spa_config_enter(spa, RW_READER, FTAG);

	if (zio_zil_fail_shift && zio_io_should_fail(zio_zil_fail_shift)) {
		spa_config_exit(spa, FTAG);
		return (ENOSPC);
	}

	/*
	 * We were passed the previous log block's DVA in bp->blk_dva[0].
	 * We use that as a hint for which vdev to allocate from next.
	 */
	error = metaslab_alloc(spa, spa->spa_log_class, size,
	    new_bp, 1, txg, old_bp, B_TRUE);

	if (error)
		error = metaslab_alloc(spa, spa->spa_normal_class, size,
		    new_bp, 1, txg, old_bp, B_TRUE);

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
		new_bp->blk_birth = txg;
	}

	spa_config_exit(spa, FTAG);

	return (error);
}

/*
 * Free an intent log block.  We know it can't be a gang block, so there's
 * nothing to do except metaslab_free() it.
 */
void
zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
{
	ASSERT(!BP_IS_GANG(bp));

	spa_config_enter(spa, RW_READER, FTAG);

	metaslab_free(spa, bp, txg, B_FALSE);

	spa_config_exit(spa, FTAG);
}

/*
 * start an async flush of the write cache for this vdev
 */
void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
}