1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/fm/fs/zfs.h> 28 #include <sys/spa.h> 29 #include <sys/txg.h> 30 #include <sys/spa_impl.h> 31 #include <sys/vdev_impl.h> 32 #include <sys/zio_impl.h> 33 #include <sys/zio_compress.h> 34 #include <sys/zio_checksum.h> 35 #include <sys/dmu_objset.h> 36 #include <sys/arc.h> 37 #include <sys/ddt.h> 38 39 /* 40 * ========================================================================== 41 * I/O priority table 42 * ========================================================================== 43 */ 44 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 45 0, /* ZIO_PRIORITY_NOW */ 46 0, /* ZIO_PRIORITY_SYNC_READ */ 47 0, /* ZIO_PRIORITY_SYNC_WRITE */ 48 0, /* ZIO_PRIORITY_LOG_WRITE */ 49 1, /* ZIO_PRIORITY_CACHE_FILL */ 50 1, /* ZIO_PRIORITY_AGG */ 51 4, /* ZIO_PRIORITY_FREE */ 52 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 53 6, /* ZIO_PRIORITY_ASYNC_READ */ 54 10, /* ZIO_PRIORITY_RESILVER */ 55 20, /* ZIO_PRIORITY_SCRUB */ 56 }; 57 58 /* 59 * 
========================================================================== 60 * I/O type descriptions 61 * ========================================================================== 62 */ 63 char *zio_type_name[ZIO_TYPES] = { 64 "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 65 "zio_ioctl" 66 }; 67 68 /* 69 * ========================================================================== 70 * I/O kmem caches 71 * ========================================================================== 72 */ 73 kmem_cache_t *zio_cache; 74 kmem_cache_t *zio_link_cache; 75 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 76 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 77 78 #ifdef _KERNEL 79 extern vmem_t *zio_alloc_arena; 80 #endif 81 82 /* 83 * An allocating zio is one that either currently has the DVA allocate 84 * stage set or will have it later in its lifetime. 85 */ 86 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 87 88 #ifdef ZFS_DEBUG 89 int zio_buf_debug_limit = 16384; 90 #else 91 int zio_buf_debug_limit = 0; 92 #endif 93 94 void 95 zio_init(void) 96 { 97 size_t c; 98 vmem_t *data_alloc_arena = NULL; 99 100 #ifdef _KERNEL 101 data_alloc_arena = zio_alloc_arena; 102 #endif 103 zio_cache = kmem_cache_create("zio_cache", 104 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 105 zio_link_cache = kmem_cache_create("zio_link_cache", 106 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 107 108 /* 109 * For small buffers, we want a cache for each multiple of 110 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 111 * for each quarter-power of 2. For large buffers, we want 112 * a cache for each multiple of PAGESIZE. 
113 */ 114 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 115 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 116 size_t p2 = size; 117 size_t align = 0; 118 119 while (p2 & (p2 - 1)) 120 p2 &= p2 - 1; 121 122 if (size <= 4 * SPA_MINBLOCKSIZE) { 123 align = SPA_MINBLOCKSIZE; 124 } else if (P2PHASE(size, PAGESIZE) == 0) { 125 align = PAGESIZE; 126 } else if (P2PHASE(size, p2 >> 2) == 0) { 127 align = p2 >> 2; 128 } 129 130 if (align != 0) { 131 char name[36]; 132 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 133 zio_buf_cache[c] = kmem_cache_create(name, size, 134 align, NULL, NULL, NULL, NULL, NULL, 135 size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 136 137 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 138 zio_data_buf_cache[c] = kmem_cache_create(name, size, 139 align, NULL, NULL, NULL, NULL, data_alloc_arena, 140 size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 141 } 142 } 143 144 while (--c != 0) { 145 ASSERT(zio_buf_cache[c] != NULL); 146 if (zio_buf_cache[c - 1] == NULL) 147 zio_buf_cache[c - 1] = zio_buf_cache[c]; 148 149 ASSERT(zio_data_buf_cache[c] != NULL); 150 if (zio_data_buf_cache[c - 1] == NULL) 151 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 152 } 153 154 zio_inject_init(); 155 } 156 157 void 158 zio_fini(void) 159 { 160 size_t c; 161 kmem_cache_t *last_cache = NULL; 162 kmem_cache_t *last_data_cache = NULL; 163 164 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 165 if (zio_buf_cache[c] != last_cache) { 166 last_cache = zio_buf_cache[c]; 167 kmem_cache_destroy(zio_buf_cache[c]); 168 } 169 zio_buf_cache[c] = NULL; 170 171 if (zio_data_buf_cache[c] != last_data_cache) { 172 last_data_cache = zio_data_buf_cache[c]; 173 kmem_cache_destroy(zio_data_buf_cache[c]); 174 } 175 zio_data_buf_cache[c] = NULL; 176 } 177 178 kmem_cache_destroy(zio_link_cache); 179 kmem_cache_destroy(zio_cache); 180 181 zio_inject_fini(); 182 } 183 184 /* 185 * ========================================================================== 
186 * Allocate and free I/O buffers 187 * ========================================================================== 188 */ 189 190 /* 191 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 192 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 193 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 194 * excess / transient data in-core during a crashdump. 195 */ 196 void * 197 zio_buf_alloc(size_t size) 198 { 199 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 200 201 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 202 203 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 204 } 205 206 /* 207 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 208 * crashdump if the kernel panics. This exists so that we will limit the amount 209 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 210 * of kernel heap dumped to disk when the kernel panics) 211 */ 212 void * 213 zio_data_buf_alloc(size_t size) 214 { 215 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 216 217 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 218 219 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 220 } 221 222 void 223 zio_buf_free(void *buf, size_t size) 224 { 225 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 226 227 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 228 229 kmem_cache_free(zio_buf_cache[c], buf); 230 } 231 232 void 233 zio_data_buf_free(void *buf, size_t size) 234 { 235 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 236 237 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 238 239 kmem_cache_free(zio_data_buf_cache[c], buf); 240 } 241 242 /* 243 * ========================================================================== 244 * Push and pop I/O transform buffers 245 * ========================================================================== 246 */ 247 static void 248 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 249 
zio_transform_func_t *transform) 250 { 251 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 252 253 zt->zt_orig_data = zio->io_data; 254 zt->zt_orig_size = zio->io_size; 255 zt->zt_bufsize = bufsize; 256 zt->zt_transform = transform; 257 258 zt->zt_next = zio->io_transform_stack; 259 zio->io_transform_stack = zt; 260 261 zio->io_data = data; 262 zio->io_size = size; 263 } 264 265 static void 266 zio_pop_transforms(zio_t *zio) 267 { 268 zio_transform_t *zt; 269 270 while ((zt = zio->io_transform_stack) != NULL) { 271 if (zt->zt_transform != NULL) 272 zt->zt_transform(zio, 273 zt->zt_orig_data, zt->zt_orig_size); 274 275 if (zt->zt_bufsize != 0) 276 zio_buf_free(zio->io_data, zt->zt_bufsize); 277 278 zio->io_data = zt->zt_orig_data; 279 zio->io_size = zt->zt_orig_size; 280 zio->io_transform_stack = zt->zt_next; 281 282 kmem_free(zt, sizeof (zio_transform_t)); 283 } 284 } 285 286 /* 287 * ========================================================================== 288 * I/O transform callbacks for subblocks and decompression 289 * ========================================================================== 290 */ 291 static void 292 zio_subblock(zio_t *zio, void *data, uint64_t size) 293 { 294 ASSERT(zio->io_size > size); 295 296 if (zio->io_type == ZIO_TYPE_READ) 297 bcopy(zio->io_data, data, size); 298 } 299 300 static void 301 zio_decompress(zio_t *zio, void *data, uint64_t size) 302 { 303 if (zio->io_error == 0 && 304 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 305 zio->io_data, data, zio->io_size, size) != 0) 306 zio->io_error = EIO; 307 } 308 309 /* 310 * ========================================================================== 311 * I/O parent/child relationships and pipeline interlocks 312 * ========================================================================== 313 */ 314 /* 315 * NOTE - Callers to zio_walk_parents() and zio_walk_children must 316 * continue calling these functions until they return NULL. 
* Otherwise, the next caller will pick up the list walk in
 * some indeterminate state. (Otherwise every caller would
 * have to pass in a cookie to keep the state represented by
 * io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	/* A NULL cursor starts a new walk from the head of the list. */
	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

/*
 * Child-list counterpart of zio_walk_parents(): step pio->io_walk_link
 * through pio's child list and return the next child, or NULL once the
 * list is exhausted (which leaves the cursor NULL for the next walk).
 * The same io_walk_link field is used as the cursor for both walks.
 */
zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

/*
 * Return cio's first parent, or NULL if it has none.  VERIFYs that the
 * next walk step returns NULL, i.e. that cio has at most one parent; that
 * second step also leaves the walk cursor reset.
 */
zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

/*
 * Make 'cio' a child of 'pio'.
 *
 * A zio_link_t is allocated to tie the two together.  With both io_locks
 * held (child first, then parent), pio's per-child-type counters are
 * bumped for every wait type 'cio' has not yet reached, the link is
 * inserted at the head of both lists, and both link counts are incremented.
 */
void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	/* io_state[w] is 0 until the child reaches w, so !state adds 1. */
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

/*
 * Undo zio_add_child() for the link 'zl' between 'pio' and 'cio': unlink
 * it from both lists, decrement both link counts (under the same
 * child-then-parent lock order) and free the link.  The io_children[]
 * wait counters are not touched here.
 */
static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

/*
 * Check whether 'zio' still has outstanding children of type 'child' for
 * wait type 'wait'.
 *
 * If the counter is non-zero, io_stage is moved back by one bit and the
 * counter's address is recorded in io_stall; zio_notify_parent() calls
 * zio_execute() on this zio when that counter drops to zero.  Returns
 * B_TRUE if the zio must wait, B_FALSE otherwise.
 */
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

/*
 * Account for child 'zio' against parent 'pio' for wait type 'wait'.
 *
 * Under pio's io_lock: fold the child's error into pio's per-child-type
 * error (unless the child has ZIO_FLAG_DONT_PROPAGATE), OR in the child's
 * io_reexecute bits, and decrement the matching io_children[] counter.
 * If the counter reaches zero and it is the one pio is stalled on, the
 * stall is cleared and pio is run via zio_execute() after the lock is
 * dropped.
 */
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

/*
 * If 'zio' has no error of its own, adopt the error recorded for its
 * children of type 'c' (if any).
 */
static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */

/*
 * Common constructor used by all of the zio_*() creation routines below.
 *
 * Allocates a zeroed zio from zio_cache, initializes its lock, cv and
 * parent/child lists, and derives io_child_type from 'vd' and 'flags'.
 * If 'bp' is given it is recorded (see the comments in the body).  The
 * remaining arguments are stored in the zio; data, size, flags, stage and
 * pipeline are additionally saved in the matching io_orig_* fields.
 * io_state[] is preset from the starting 'stage'.  If 'pio' is non-NULL
 * the new zio is linked under it with zio_add_child().
 *
 * Returns the new zio.
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	/* A vdev takes precedence over the gang/DDT child flags. */
	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		/*
		 * Keep two private copies of the bp.  Only writes that are
		 * not DDT children keep pointing at the caller's bp; every
		 * other zio is redirected to its own copy.
		 */
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	/* A zio created at or past a wait stage counts as having reached it. */
	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		/* Inherit the logical zio / gang leader from the parent. */
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

/*
 * Tear down what zio_create() set up (lists, lock, cv) and return the zio
 * to zio_cache.
 */
static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

/*
 * Create a ZIO_TYPE_NULL zio: no bp, no data, priority ZIO_PRIORITY_NOW,
 * running ZIO_INTERLOCK_PIPELINE.  'vd' may be NULL.
 */
zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

/*
 * Create a null zio with no parent and no vdev.
 */
zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

/*
 * Create a read of the block described by 'bp' into 'data'.  The zio's txg
 * is BP_PHYSICAL_BIRTH(bp).  ZIO_FLAG_DDT_CHILD selects
 * ZIO_DDT_CHILD_READ_PIPELINE instead of ZIO_READ_PIPELINE.
 */
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

/*
 * Create a write of 'data' in 'txg' with the properties in 'zp'.
 *
 * The ASSERT range-checks every field of 'zp'.  The zio keeps its own copy
 * of *zp in io_prop and stores 'ready' in io_ready.  ZIO_FLAG_DDT_CHILD
 * selects ZIO_DDT_CHILD_WRITE_PIPELINE instead of ZIO_WRITE_PIPELINE.
 */
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    zp->zp_type < DMU_OT_NUMTYPES &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa) &&
	    zp->zp_dedup <= 1 &&
	    zp->zp_dedup_verify <= 1);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;

	return (zio);
}

/*
 * Create a ZIO_TYPE_WRITE zio that runs ZIO_REWRITE_PIPELINE.  Unlike
 * zio_write(), no zio_prop_t or ready callback is supplied.
 */
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

/*
 * Record an override bp and copy count on a write zio.  The ASSERTs
 * require a logical write that is still in ZIO_STAGE_OPEN and belongs to
 * the currently syncing txg.  The override is consumed by
 * zio_write_bp_init().
 */
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

/*
 * Queue 'bp' on the spa's free bplist for 'txg' via
 * bplist_enqueue_deferred(); no zio is created here.
 */
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp);
}

/*
 * Create a ZIO_TYPE_FREE zio for 'bp' running ZIO_FREE_PIPELINE.  The
 * ASSERTs require a non-hole bp, that 'txg' is the syncing txg, and that
 * the sync pass is no later than SYNC_PASS_DEFERRED_FREE.
 */
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
	zio_t *zio;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

	return (zio);
}

/*
 * Create a ZIO_TYPE_CLAIM zio for 'bp' running ZIO_CLAIM_PIPELINE; see the
 * comment in the body for what a claim is and when it is legal.
 */
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block. Claims are needed
	 * to support immediate writes in the intent log. The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed. Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

/*
 * Create an ioctl ('cmd') against 'vd'.
 *
 * A vdev with no children gets a single ZIO_TYPE_IOCTL zio.  Otherwise a
 * null zio is created under 'pio' and an ioctl for each child vdev is
 * created beneath it (recursively) and started with zio_nowait(); in that
 * case 'done' and 'private' are passed to the per-child ioctls, not to the
 * returned null zio.
 */
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, priority, flags));
	}

	return (zio);
}

/*
 * Create a read of 'size' bytes at 'offset' on the childless vdev 'vd',
 * with no bp, running ZIO_READ_PHYS_PIPELINE.  'checksum' is stored in
 * io_prop.zp_checksum.  When 'labels' is set, the ASSERT requires the
 * range to end within the first VDEV_LABEL_START_SIZE bytes or to start
 * within the last VDEV_LABEL_END_SIZE bytes of the vdev.
 */
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

/*
 * Write counterpart of zio_read_phys(), running ZIO_WRITE_PHYS_PIPELINE.
 * For checksums flagged ci_zbt the data is first copied into a private
 * buffer (see the comment in the body); that buffer is pushed as a
 * transform so it is freed by zio_pop_transforms().
 */
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 *
 * The child targets 'vd', inherits pio's spa, txg and bookmark, always
 * carries ZIO_FLAG_DONT_PROPAGATE plus ZIO_VDEV_CHILD_FLAGS(pio), and
 * starts one stage before ZIO_STAGE_VDEV_IO_START.  For a vdev with no
 * children, 'offset' is advanced by VDEV_LABEL_START_SIZE.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
	void *data, uint64_t size, int type, int priority, enum zio_flag flags,
	zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not. This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	return (zio);
}

/*
 * Create a parentless vdev I/O against the leaf vdev 'vd'.  The zio has no
 * bp, always gets ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, and starts one
 * stage before ZIO_STAGE_VDEV_IO_START on ZIO_VDEV_CHILD_PIPELINE.  Unlike
 * zio_vdev_child_io(), 'offset' is used as given.
 */
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
	int type, int priority, enum zio_flag flags,
	zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

/*
 * Start (without waiting) a DKIOCFLUSHWRITECACHE ioctl on 'vd' as a child
 * of 'zio'.  The ioctl may fail, is not retried, and does not propagate
 * its error to the parent.
 */
void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

/*
 * Set up a read from its bp.  (Presumably installed as a stage in the
 * zio_pipeline[] table, which is not visible here.)
 *
 * - For a compressed block read by a logical zio without ZIO_FLAG_RAW, a
 *   psize-sized buffer is pushed with zio_decompress as its transform.
 * - Level-0 blocks of non-metadata types, and DMU_OT_DDT_ZAP blocks, get
 *   ZIO_FLAG_DONT_CACHE.
 * - A logical read of a dedup bp switches to ZIO_DDT_READ_PIPELINE.
 *
 * Always returns ZIO_PIPELINE_CONTINUE.
 */
static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Set up a write: wait for children, apply any override bp, compress the
 * data, choose between the rewrite / write / DDT-write / interlock
 * pipelines, and fill in the bp's properties.  (Presumably installed as a
 * stage in the zio_pipeline[] table, which is not visible here.)
 *
 * Returns ZIO_PIPELINE_STOP if the zio must wait for gang or logical
 * children to become ready, ZIO_PIPELINE_CONTINUE otherwise.
 */
static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		/* Adopt the override bp; nothing is left to write. */
		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		/*
		 * The override's checksum differs from the one requested:
		 * drop the override and fall through to a normal write.
		 */
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync(). For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks. But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer. Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass > SYNC_PASS_DONT_COMPRESS)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			/* Nothing gained: keep writing the original data. */
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			/* bufsize is lsize: that is what cbuf was sized at. */
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster. Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass > SYNC_PASS_REWRITE) {
		ASSERT(psize != 0);
		/* Keep whatever gang stages the pipeline already had. */
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		/* psize is 0 (see zio_compress_data above): no data to write. */
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Set up a free.  For a logical zio: a dedup bp switches the zio to
 * ZIO_DDT_FREE_PIPELINE; any other bp is passed to arc_free().  Non-logical
 * zios are left untouched.  Always returns ZIO_PIPELINE_CONTINUE.
 */
static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
		else
			arc_free(zio->io_spa, bp);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

/*
 * Dispatch zio_execute(zio) to one of the spa's zio taskqs.
 *
 * The taskq is spa_zio_taskq[t][q], where 't' starts as the zio's I/O type
 * and 'q' is the requested queue type; both may be adjusted as described
 * in the body.  The dispatch uses TQ_SLEEP.
 */
static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1] != NULL)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
	(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
}

/*
 * Return B_TRUE if the thread recorded in zio->io_executor is a member of
 * the type-'q' taskq of any I/O type on this spa, B_FALSE otherwise.
 */
static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
			return (B_TRUE);

	return (B_FALSE);
}

/*
 * Hand the zio to an issue taskq and stop the pipeline in the calling
 * thread.
 */
static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);

	return (ZIO_PIPELINE_STOP);
}

/*
 * Hand the zio to an interrupt taskq, where zio_execute() is run on it.
 */
void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread. In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
1097 */ 1098 static zio_pipe_stage_t *zio_pipeline[]; 1099 1100 void 1101 zio_execute(zio_t *zio) 1102 { 1103 zio->io_executor = curthread; 1104 1105 while (zio->io_stage < ZIO_STAGE_DONE) { 1106 enum zio_stage pipeline = zio->io_pipeline; 1107 enum zio_stage stage = zio->io_stage; 1108 int rv; 1109 1110 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1111 ASSERT(ISP2(stage)); 1112 ASSERT(zio->io_stall == NULL); 1113 1114 do { 1115 stage <<= 1; 1116 } while ((stage & pipeline) == 0); 1117 1118 ASSERT(stage <= ZIO_STAGE_DONE); 1119 1120 /* 1121 * If we are in interrupt context and this pipeline stage 1122 * will grab a config lock that is held across I/O, 1123 * or may wait for an I/O that needs an interrupt thread 1124 * to complete, issue async to avoid deadlock. 1125 */ 1126 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1127 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1128 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 1129 return; 1130 } 1131 1132 zio->io_stage = stage; 1133 rv = zio_pipeline[highbit(stage) - 1](zio); 1134 1135 if (rv == ZIO_PIPELINE_STOP) 1136 return; 1137 1138 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1139 } 1140 } 1141 1142 /* 1143 * ========================================================================== 1144 * Initiate I/O, either sync or async 1145 * ========================================================================== 1146 */ 1147 int 1148 zio_wait(zio_t *zio) 1149 { 1150 int error; 1151 1152 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1153 ASSERT(zio->io_executor == NULL); 1154 1155 zio->io_waiter = curthread; 1156 1157 zio_execute(zio); 1158 1159 mutex_enter(&zio->io_lock); 1160 while (zio->io_executor != NULL) 1161 cv_wait(&zio->io_cv, &zio->io_lock); 1162 mutex_exit(&zio->io_lock); 1163 1164 error = zio->io_error; 1165 zio_destroy(zio); 1166 1167 return (error); 1168 } 1169 1170 void 1171 zio_nowait(zio_t *zio) 1172 { 1173 ASSERT(zio->io_executor == NULL); 1174 1175 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1176 
zio_unique_parent(zio) == NULL) { 1177 /* 1178 * This is a logical async I/O with no parent to wait for it. 1179 * We add it to the spa_async_root_zio "Godfather" I/O which 1180 * will ensure they complete prior to unloading the pool. 1181 */ 1182 spa_t *spa = zio->io_spa; 1183 1184 zio_add_child(spa->spa_async_zio_root, zio); 1185 } 1186 1187 zio_execute(zio); 1188 } 1189 1190 /* 1191 * ========================================================================== 1192 * Reexecute or suspend/resume failed I/O 1193 * ========================================================================== 1194 */ 1195 1196 static void 1197 zio_reexecute(zio_t *pio) 1198 { 1199 zio_t *cio, *cio_next; 1200 1201 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1202 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1203 ASSERT(pio->io_gang_leader == NULL); 1204 ASSERT(pio->io_gang_tree == NULL); 1205 1206 pio->io_flags = pio->io_orig_flags; 1207 pio->io_stage = pio->io_orig_stage; 1208 pio->io_pipeline = pio->io_orig_pipeline; 1209 pio->io_reexecute = 0; 1210 pio->io_error = 0; 1211 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1212 pio->io_state[w] = 0; 1213 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1214 pio->io_child_error[c] = 0; 1215 1216 if (IO_IS_ALLOCATING(pio)) 1217 BP_ZERO(pio->io_bp); 1218 1219 /* 1220 * As we reexecute pio's children, new children could be created. 1221 * New children go to the head of pio's io_child_list, however, 1222 * so we will (correctly) not reexecute them. The key is that 1223 * the remainder of pio's io_child_list, from 'cio_next' onward, 1224 * cannot be affected by any side effects of reexecuting 'cio'. 
1225 */ 1226 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 1227 cio_next = zio_walk_children(pio); 1228 mutex_enter(&pio->io_lock); 1229 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1230 pio->io_children[cio->io_child_type][w]++; 1231 mutex_exit(&pio->io_lock); 1232 zio_reexecute(cio); 1233 } 1234 1235 /* 1236 * Now that all children have been reexecuted, execute the parent. 1237 * We don't reexecute "The Godfather" I/O here as it's the 1238 * responsibility of the caller to wait on him. 1239 */ 1240 if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 1241 zio_execute(pio); 1242 } 1243 1244 void 1245 zio_suspend(spa_t *spa, zio_t *zio) 1246 { 1247 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1248 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1249 "failure and the failure mode property for this pool " 1250 "is set to panic.", spa_name(spa)); 1251 1252 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1253 1254 mutex_enter(&spa->spa_suspend_lock); 1255 1256 if (spa->spa_suspend_zio_root == NULL) 1257 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1258 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 1259 ZIO_FLAG_GODFATHER); 1260 1261 spa->spa_suspended = B_TRUE; 1262 1263 if (zio != NULL) { 1264 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1265 ASSERT(zio != spa->spa_suspend_zio_root); 1266 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1267 ASSERT(zio_unique_parent(zio) == NULL); 1268 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1269 zio_add_child(spa->spa_suspend_zio_root, zio); 1270 } 1271 1272 mutex_exit(&spa->spa_suspend_lock); 1273 } 1274 1275 int 1276 zio_resume(spa_t *spa) 1277 { 1278 zio_t *pio; 1279 1280 /* 1281 * Reexecute all previously suspended i/o. 
1282 */ 1283 mutex_enter(&spa->spa_suspend_lock); 1284 spa->spa_suspended = B_FALSE; 1285 cv_broadcast(&spa->spa_suspend_cv); 1286 pio = spa->spa_suspend_zio_root; 1287 spa->spa_suspend_zio_root = NULL; 1288 mutex_exit(&spa->spa_suspend_lock); 1289 1290 if (pio == NULL) 1291 return (0); 1292 1293 zio_reexecute(pio); 1294 return (zio_wait(pio)); 1295 } 1296 1297 void 1298 zio_resume_wait(spa_t *spa) 1299 { 1300 mutex_enter(&spa->spa_suspend_lock); 1301 while (spa_suspended(spa)) 1302 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1303 mutex_exit(&spa->spa_suspend_lock); 1304 } 1305 1306 /* 1307 * ========================================================================== 1308 * Gang blocks. 1309 * 1310 * A gang block is a collection of small blocks that looks to the DMU 1311 * like one large block. When zio_dva_allocate() cannot find a block 1312 * of the requested size, due to either severe fragmentation or the pool 1313 * being nearly full, it calls zio_write_gang_block() to construct the 1314 * block from smaller fragments. 1315 * 1316 * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1317 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1318 * an indirect block: it's an array of block pointers. It consumes 1319 * only one sector and hence is allocatable regardless of fragmentation. 1320 * The gang header's bps point to its gang members, which hold the data. 1321 * 1322 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1323 * as the verifier to ensure uniqueness of the SHA256 checksum. 1324 * Critically, the gang block bp's blk_cksum is the checksum of the data, 1325 * not the gang header. This ensures that data block signatures (needed for 1326 * deduplication) are independent of how the block is physically stored. 1327 * 1328 * Gang blocks can be nested: a gang member may itself be a gang block. 
1329 * Thus every gang block is a tree in which root and all interior nodes are 1330 * gang headers, and the leaves are normal blocks that contain user data. 1331 * The root of the gang tree is called the gang leader. 1332 * 1333 * To perform any operation (read, rewrite, free, claim) on a gang block, 1334 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1335 * in the io_gang_tree field of the original logical i/o by recursively 1336 * reading the gang leader and all gang headers below it. This yields 1337 * an in-core tree containing the contents of every gang header and the 1338 * bps for every constituent of the gang block. 1339 * 1340 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1341 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1342 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1343 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1344 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1345 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1346 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1347 * of the gang header plus zio_checksum_compute() of the data to update the 1348 * gang header's blk_cksum as described above. 1349 * 1350 * The two-phase assemble/issue model solves the problem of partial failure -- 1351 * what if you'd freed part of a gang block but then couldn't read the 1352 * gang header for another part? Assembling the entire gang tree first 1353 * ensures that all the necessary gang header I/O has succeeded before 1354 * starting the actual work of free, claim, or write. Once the gang tree 1355 * is assembled, free and claim are in-memory operations that cannot fail. 1356 * 1357 * In the event that a gang write fails, zio_dva_unallocate() walks the 1358 * gang tree to immediately free (i.e. 
insert back into the space map) 1359 * everything we've allocated. This ensures that we don't get ENOSPC 1360 * errors during repeated suspend/resume cycles due to a flaky device. 1361 * 1362 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1363 * the gang tree, we won't modify the block, so we can safely defer the free 1364 * (knowing that the block is still intact). If we *can* assemble the gang 1365 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1366 * each constituent bp and we can allocate a new block on the next sync pass. 1367 * 1368 * In all cases, the gang tree allows complete recovery from partial failure. 1369 * ========================================================================== 1370 */ 1371 1372 static zio_t * 1373 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1374 { 1375 if (gn != NULL) 1376 return (pio); 1377 1378 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1379 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1380 &pio->io_bookmark)); 1381 } 1382 1383 zio_t * 1384 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1385 { 1386 zio_t *zio; 1387 1388 if (gn != NULL) { 1389 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1390 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1391 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1392 /* 1393 * As we rewrite each gang header, the pipeline will compute 1394 * a new gang block header checksum for it; but no one will 1395 * compute a new data checksum, so we do that here. The one 1396 * exception is the gang leader: the pipeline already computed 1397 * its data checksum because that stage precedes gang assembly. 1398 * (Presently, nothing actually uses interior data checksums; 1399 * this is just good hygiene.) 
1400 */ 1401 if (gn != pio->io_gang_leader->io_gang_tree) { 1402 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1403 data, BP_GET_PSIZE(bp)); 1404 } 1405 /* 1406 * If we are here to damage data for testing purposes, 1407 * leave the GBH alone so that we can detect the damage. 1408 */ 1409 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1410 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1411 } else { 1412 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1413 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1414 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1415 } 1416 1417 return (zio); 1418 } 1419 1420 /* ARGSUSED */ 1421 zio_t * 1422 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1423 { 1424 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1425 ZIO_GANG_CHILD_FLAGS(pio))); 1426 } 1427 1428 /* ARGSUSED */ 1429 zio_t * 1430 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1431 { 1432 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1433 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1434 } 1435 1436 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1437 NULL, 1438 zio_read_gang, 1439 zio_rewrite_gang, 1440 zio_free_gang, 1441 zio_claim_gang, 1442 NULL 1443 }; 1444 1445 static void zio_gang_tree_assemble_done(zio_t *zio); 1446 1447 static zio_gang_node_t * 1448 zio_gang_node_alloc(zio_gang_node_t **gnpp) 1449 { 1450 zio_gang_node_t *gn; 1451 1452 ASSERT(*gnpp == NULL); 1453 1454 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1455 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1456 *gnpp = gn; 1457 1458 return (gn); 1459 } 1460 1461 static void 1462 zio_gang_node_free(zio_gang_node_t **gnpp) 1463 { 1464 zio_gang_node_t *gn = *gnpp; 1465 1466 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1467 ASSERT(gn->gn_child[g] == NULL); 1468 1469 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1470 kmem_free(gn, sizeof (*gn)); 1471 *gnpp = NULL; 1472 } 1473 1474 static void 1475 
zio_gang_tree_free(zio_gang_node_t **gnpp) 1476 { 1477 zio_gang_node_t *gn = *gnpp; 1478 1479 if (gn == NULL) 1480 return; 1481 1482 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1483 zio_gang_tree_free(&gn->gn_child[g]); 1484 1485 zio_gang_node_free(gnpp); 1486 } 1487 1488 static void 1489 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1490 { 1491 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1492 1493 ASSERT(gio->io_gang_leader == gio); 1494 ASSERT(BP_IS_GANG(bp)); 1495 1496 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1497 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1498 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1499 } 1500 1501 static void 1502 zio_gang_tree_assemble_done(zio_t *zio) 1503 { 1504 zio_t *gio = zio->io_gang_leader; 1505 zio_gang_node_t *gn = zio->io_private; 1506 blkptr_t *bp = zio->io_bp; 1507 1508 ASSERT(gio == zio_unique_parent(zio)); 1509 ASSERT(zio->io_child_count == 0); 1510 1511 if (zio->io_error) 1512 return; 1513 1514 if (BP_SHOULD_BYTESWAP(bp)) 1515 byteswap_uint64_array(zio->io_data, zio->io_size); 1516 1517 ASSERT(zio->io_data == gn->gn_gbh); 1518 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1519 ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 1520 1521 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1522 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1523 if (!BP_IS_GANG(gbp)) 1524 continue; 1525 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1526 } 1527 } 1528 1529 static void 1530 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1531 { 1532 zio_t *gio = pio->io_gang_leader; 1533 zio_t *zio; 1534 1535 ASSERT(BP_IS_GANG(bp) == !!gn); 1536 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1537 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1538 1539 /* 1540 * If you're a gang header, your data is in gn->gn_gbh. 1541 * If you're a gang member, your data is in 'data' and gn == NULL. 
1542 */ 1543 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1544 1545 if (gn != NULL) { 1546 ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 1547 1548 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1549 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1550 if (BP_IS_HOLE(gbp)) 1551 continue; 1552 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1553 data = (char *)data + BP_GET_PSIZE(gbp); 1554 } 1555 } 1556 1557 if (gn == gio->io_gang_tree) 1558 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1559 1560 if (zio != pio) 1561 zio_nowait(zio); 1562 } 1563 1564 static int 1565 zio_gang_assemble(zio_t *zio) 1566 { 1567 blkptr_t *bp = zio->io_bp; 1568 1569 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1570 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1571 1572 zio->io_gang_leader = zio; 1573 1574 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1575 1576 return (ZIO_PIPELINE_CONTINUE); 1577 } 1578 1579 static int 1580 zio_gang_issue(zio_t *zio) 1581 { 1582 blkptr_t *bp = zio->io_bp; 1583 1584 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1585 return (ZIO_PIPELINE_STOP); 1586 1587 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1588 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1589 1590 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1591 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1592 else 1593 zio_gang_tree_free(&zio->io_gang_tree); 1594 1595 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1596 1597 return (ZIO_PIPELINE_CONTINUE); 1598 } 1599 1600 static void 1601 zio_write_gang_member_ready(zio_t *zio) 1602 { 1603 zio_t *pio = zio_unique_parent(zio); 1604 zio_t *gio = zio->io_gang_leader; 1605 dva_t *cdva = zio->io_bp->blk_dva; 1606 dva_t *pdva = pio->io_bp->blk_dva; 1607 uint64_t asize; 1608 1609 if (BP_IS_HOLE(zio->io_bp)) 1610 return; 1611 1612 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1613 1614 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1615 ASSERT3U(zio->io_prop.zp_copies, ==, 
gio->io_prop.zp_copies); 1616 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1617 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1618 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1619 1620 mutex_enter(&pio->io_lock); 1621 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1622 ASSERT(DVA_GET_GANG(&pdva[d])); 1623 asize = DVA_GET_ASIZE(&pdva[d]); 1624 asize += DVA_GET_ASIZE(&cdva[d]); 1625 DVA_SET_ASIZE(&pdva[d], asize); 1626 } 1627 mutex_exit(&pio->io_lock); 1628 } 1629 1630 static int 1631 zio_write_gang_block(zio_t *pio) 1632 { 1633 spa_t *spa = pio->io_spa; 1634 blkptr_t *bp = pio->io_bp; 1635 zio_t *gio = pio->io_gang_leader; 1636 zio_t *zio; 1637 zio_gang_node_t *gn, **gnpp; 1638 zio_gbh_phys_t *gbh; 1639 uint64_t txg = pio->io_txg; 1640 uint64_t resid = pio->io_size; 1641 uint64_t lsize; 1642 int copies = gio->io_prop.zp_copies; 1643 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 1644 zio_prop_t zp; 1645 int error; 1646 1647 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1648 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1649 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1650 if (error) { 1651 pio->io_error = error; 1652 return (ZIO_PIPELINE_CONTINUE); 1653 } 1654 1655 if (pio == gio) { 1656 gnpp = &gio->io_gang_tree; 1657 } else { 1658 gnpp = pio->io_private; 1659 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1660 } 1661 1662 gn = zio_gang_node_alloc(gnpp); 1663 gbh = gn->gn_gbh; 1664 bzero(gbh, SPA_GANGBLOCKSIZE); 1665 1666 /* 1667 * Create the gang header. 1668 */ 1669 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1670 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1671 1672 /* 1673 * Create and nowait the gang children. 
1674 */ 1675 for (int g = 0; resid != 0; resid -= lsize, g++) { 1676 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1677 SPA_MINBLOCKSIZE); 1678 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1679 1680 zp.zp_checksum = gio->io_prop.zp_checksum; 1681 zp.zp_compress = ZIO_COMPRESS_OFF; 1682 zp.zp_type = DMU_OT_NONE; 1683 zp.zp_level = 0; 1684 zp.zp_copies = gio->io_prop.zp_copies; 1685 zp.zp_dedup = 0; 1686 zp.zp_dedup_verify = 0; 1687 1688 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1689 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1690 zio_write_gang_member_ready, NULL, &gn->gn_child[g], 1691 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1692 &pio->io_bookmark)); 1693 } 1694 1695 /* 1696 * Set pio's pipeline to just wait for zio to finish. 1697 */ 1698 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1699 1700 zio_nowait(zio); 1701 1702 return (ZIO_PIPELINE_CONTINUE); 1703 } 1704 1705 /* 1706 * ========================================================================== 1707 * Dedup 1708 * ========================================================================== 1709 */ 1710 static void 1711 zio_ddt_child_read_done(zio_t *zio) 1712 { 1713 blkptr_t *bp = zio->io_bp; 1714 ddt_entry_t *dde = zio->io_private; 1715 ddt_phys_t *ddp; 1716 zio_t *pio = zio_unique_parent(zio); 1717 1718 mutex_enter(&pio->io_lock); 1719 ddp = ddt_phys_select(dde, bp); 1720 if (zio->io_error == 0) 1721 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1722 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1723 dde->dde_repair_data = zio->io_data; 1724 else 1725 zio_buf_free(zio->io_data, zio->io_size); 1726 mutex_exit(&pio->io_lock); 1727 } 1728 1729 static int 1730 zio_ddt_read_start(zio_t *zio) 1731 { 1732 blkptr_t *bp = zio->io_bp; 1733 1734 ASSERT(BP_GET_DEDUP(bp)); 1735 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1736 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1737 1738 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1739 ddt_t *ddt = 
ddt_select(zio->io_spa, bp); 1740 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 1741 ddt_phys_t *ddp = dde->dde_phys; 1742 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 1743 blkptr_t blk; 1744 1745 ASSERT(zio->io_vsd == NULL); 1746 zio->io_vsd = dde; 1747 1748 if (ddp_self == NULL) 1749 return (ZIO_PIPELINE_CONTINUE); 1750 1751 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1752 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 1753 continue; 1754 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 1755 &blk); 1756 zio_nowait(zio_read(zio, zio->io_spa, &blk, 1757 zio_buf_alloc(zio->io_size), zio->io_size, 1758 zio_ddt_child_read_done, dde, zio->io_priority, 1759 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 1760 &zio->io_bookmark)); 1761 } 1762 return (ZIO_PIPELINE_CONTINUE); 1763 } 1764 1765 zio_nowait(zio_read(zio, zio->io_spa, bp, 1766 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 1767 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 1768 1769 return (ZIO_PIPELINE_CONTINUE); 1770 } 1771 1772 static int 1773 zio_ddt_read_done(zio_t *zio) 1774 { 1775 blkptr_t *bp = zio->io_bp; 1776 1777 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 1778 return (ZIO_PIPELINE_STOP); 1779 1780 ASSERT(BP_GET_DEDUP(bp)); 1781 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1782 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1783 1784 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1785 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1786 ddt_entry_t *dde = zio->io_vsd; 1787 if (ddt == NULL) { 1788 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 1789 return (ZIO_PIPELINE_CONTINUE); 1790 } 1791 if (dde == NULL) { 1792 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 1793 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 1794 return (ZIO_PIPELINE_STOP); 1795 } 1796 if (dde->dde_repair_data != NULL) { 1797 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 1798 zio->io_child_error[ZIO_CHILD_DDT] = 0; 1799 } 1800 ddt_repair_done(ddt, dde); 1801 zio->io_vsd = NULL; 
1802 } 1803 1804 ASSERT(zio->io_vsd == NULL); 1805 1806 return (ZIO_PIPELINE_CONTINUE); 1807 } 1808 1809 static boolean_t 1810 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 1811 { 1812 spa_t *spa = zio->io_spa; 1813 1814 /* 1815 * Note: we compare the original data, not the transformed data, 1816 * because when zio->io_bp is an override bp, we will not have 1817 * pushed the I/O transforms. That's an important optimization 1818 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 1819 */ 1820 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 1821 zio_t *lio = dde->dde_lead_zio[p]; 1822 1823 if (lio != NULL) { 1824 return (lio->io_orig_size != zio->io_orig_size || 1825 bcmp(zio->io_orig_data, lio->io_orig_data, 1826 zio->io_orig_size) != 0); 1827 } 1828 } 1829 1830 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 1831 ddt_phys_t *ddp = &dde->dde_phys[p]; 1832 1833 if (ddp->ddp_phys_birth != 0) { 1834 arc_buf_t *abuf = NULL; 1835 uint32_t aflags = ARC_WAIT; 1836 blkptr_t blk = *zio->io_bp; 1837 int error; 1838 1839 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 1840 1841 ddt_exit(ddt); 1842 1843 error = arc_read_nolock(NULL, spa, &blk, 1844 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 1845 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1846 &aflags, &zio->io_bookmark); 1847 1848 if (error == 0) { 1849 if (arc_buf_size(abuf) != zio->io_orig_size || 1850 bcmp(abuf->b_data, zio->io_orig_data, 1851 zio->io_orig_size) != 0) 1852 error = EEXIST; 1853 VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); 1854 } 1855 1856 ddt_enter(ddt); 1857 return (error != 0); 1858 } 1859 } 1860 1861 return (B_FALSE); 1862 } 1863 1864 static void 1865 zio_ddt_child_write_ready(zio_t *zio) 1866 { 1867 int p = zio->io_prop.zp_copies; 1868 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 1869 ddt_entry_t *dde = zio->io_private; 1870 ddt_phys_t *ddp = &dde->dde_phys[p]; 1871 zio_t *pio; 1872 1873 if (zio->io_error) 1874 return; 1875 1876 ddt_enter(ddt); 
1877 1878 ASSERT(dde->dde_lead_zio[p] == zio); 1879 1880 ddt_phys_fill(ddp, zio->io_bp); 1881 1882 while ((pio = zio_walk_parents(zio)) != NULL) 1883 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 1884 1885 ddt_exit(ddt); 1886 } 1887 1888 static void 1889 zio_ddt_child_write_done(zio_t *zio) 1890 { 1891 int p = zio->io_prop.zp_copies; 1892 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 1893 ddt_entry_t *dde = zio->io_private; 1894 ddt_phys_t *ddp = &dde->dde_phys[p]; 1895 1896 ddt_enter(ddt); 1897 1898 ASSERT(ddp->ddp_refcnt == 0); 1899 ASSERT(dde->dde_lead_zio[p] == zio); 1900 dde->dde_lead_zio[p] = NULL; 1901 1902 if (zio->io_error == 0) { 1903 while (zio_walk_parents(zio) != NULL) 1904 ddt_phys_addref(ddp); 1905 } else { 1906 ddt_phys_clear(ddp); 1907 } 1908 1909 ddt_exit(ddt); 1910 } 1911 1912 static void 1913 zio_ddt_ditto_write_done(zio_t *zio) 1914 { 1915 int p = DDT_PHYS_DITTO; 1916 zio_prop_t *zp = &zio->io_prop; 1917 blkptr_t *bp = zio->io_bp; 1918 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1919 ddt_entry_t *dde = zio->io_private; 1920 ddt_phys_t *ddp = &dde->dde_phys[p]; 1921 ddt_key_t *ddk = &dde->dde_key; 1922 1923 ddt_enter(ddt); 1924 1925 ASSERT(ddp->ddp_refcnt == 0); 1926 ASSERT(dde->dde_lead_zio[p] == zio); 1927 dde->dde_lead_zio[p] = NULL; 1928 1929 if (zio->io_error == 0) { 1930 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 1931 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 1932 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 1933 if (ddp->ddp_phys_birth != 0) 1934 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 1935 ddt_phys_fill(ddp, bp); 1936 } 1937 1938 ddt_exit(ddt); 1939 } 1940 1941 static int 1942 zio_ddt_write(zio_t *zio) 1943 { 1944 spa_t *spa = zio->io_spa; 1945 blkptr_t *bp = zio->io_bp; 1946 uint64_t txg = zio->io_txg; 1947 zio_prop_t *zp = &zio->io_prop; 1948 int p = zp->zp_copies; 1949 int ditto_copies; 1950 zio_t *cio = NULL; 1951 zio_t *dio = NULL; 1952 ddt_t *ddt = ddt_select(spa, bp); 1953 ddt_entry_t *dde; 1954 
ddt_phys_t *ddp; 1955 1956 ASSERT(BP_GET_DEDUP(bp)); 1957 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 1958 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 1959 1960 ddt_enter(ddt); 1961 dde = ddt_lookup(ddt, bp, B_TRUE); 1962 ddp = &dde->dde_phys[p]; 1963 1964 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 1965 /* 1966 * If we're using a weak checksum, upgrade to a strong checksum 1967 * and try again. If we're already using a strong checksum, 1968 * we can't resolve it, so just convert to an ordinary write. 1969 * (And automatically e-mail a paper to Nature?) 1970 */ 1971 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 1972 zp->zp_checksum = spa_dedup_checksum(spa); 1973 zio_pop_transforms(zio); 1974 zio->io_stage = ZIO_STAGE_OPEN; 1975 BP_ZERO(bp); 1976 } else { 1977 zp->zp_dedup = 0; 1978 } 1979 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1980 ddt_exit(ddt); 1981 return (ZIO_PIPELINE_CONTINUE); 1982 } 1983 1984 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 1985 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 1986 1987 if (ditto_copies > ddt_ditto_copies_present(dde) && 1988 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 1989 zio_prop_t czp = *zp; 1990 1991 czp.zp_copies = ditto_copies; 1992 1993 /* 1994 * If we arrived here with an override bp, we won't have run 1995 * the transform stack, so we won't have the data we need to 1996 * generate a child i/o. So, toss the override bp and restart. 1997 * This is safe, because using the override bp is just an 1998 * optimization; and it's rare, so the cost doesn't matter. 
1999 */ 2000 if (zio->io_bp_override) { 2001 zio_pop_transforms(zio); 2002 zio->io_stage = ZIO_STAGE_OPEN; 2003 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2004 zio->io_bp_override = NULL; 2005 BP_ZERO(bp); 2006 ddt_exit(ddt); 2007 return (ZIO_PIPELINE_CONTINUE); 2008 } 2009 2010 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2011 zio->io_orig_size, &czp, NULL, 2012 zio_ddt_ditto_write_done, dde, zio->io_priority, 2013 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2014 2015 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2016 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2017 } 2018 2019 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2020 if (ddp->ddp_phys_birth != 0) 2021 ddt_bp_fill(ddp, bp, txg); 2022 if (dde->dde_lead_zio[p] != NULL) 2023 zio_add_child(zio, dde->dde_lead_zio[p]); 2024 else 2025 ddt_phys_addref(ddp); 2026 } else if (zio->io_bp_override) { 2027 ASSERT(bp->blk_birth == txg); 2028 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2029 ddt_phys_fill(ddp, bp); 2030 ddt_phys_addref(ddp); 2031 } else { 2032 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2033 zio->io_orig_size, zp, zio_ddt_child_write_ready, 2034 zio_ddt_child_write_done, dde, zio->io_priority, 2035 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2036 2037 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2038 dde->dde_lead_zio[p] = cio; 2039 } 2040 2041 ddt_exit(ddt); 2042 2043 if (cio) 2044 zio_nowait(cio); 2045 if (dio) 2046 zio_nowait(dio); 2047 2048 return (ZIO_PIPELINE_CONTINUE); 2049 } 2050 2051 static int 2052 zio_ddt_free(zio_t *zio) 2053 { 2054 spa_t *spa = zio->io_spa; 2055 blkptr_t *bp = zio->io_bp; 2056 ddt_t *ddt = ddt_select(spa, bp); 2057 ddt_entry_t *dde; 2058 ddt_phys_t *ddp; 2059 2060 ASSERT(BP_GET_DEDUP(bp)); 2061 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2062 2063 ddt_enter(ddt); 2064 dde = ddt_lookup(ddt, bp, B_TRUE); 2065 ddp = ddt_phys_select(dde, bp); 2066 ddt_phys_decref(ddp); 2067 ddt_exit(ddt); 2068 2069 return 
(ZIO_PIPELINE_CONTINUE); 2070 } 2071 2072 /* 2073 * ========================================================================== 2074 * Allocate and free blocks 2075 * ========================================================================== 2076 */ 2077 static int 2078 zio_dva_allocate(zio_t *zio) 2079 { 2080 spa_t *spa = zio->io_spa; 2081 metaslab_class_t *mc = spa_normal_class(spa); 2082 blkptr_t *bp = zio->io_bp; 2083 int error; 2084 2085 if (zio->io_gang_leader == NULL) { 2086 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2087 zio->io_gang_leader = zio; 2088 } 2089 2090 ASSERT(BP_IS_HOLE(bp)); 2091 ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 2092 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2093 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2094 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2095 2096 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2097 zio->io_prop.zp_copies, zio->io_txg, NULL, 0); 2098 2099 if (error) { 2100 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2101 return (zio_write_gang_block(zio)); 2102 zio->io_error = error; 2103 } 2104 2105 return (ZIO_PIPELINE_CONTINUE); 2106 } 2107 2108 static int 2109 zio_dva_free(zio_t *zio) 2110 { 2111 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2112 2113 return (ZIO_PIPELINE_CONTINUE); 2114 } 2115 2116 static int 2117 zio_dva_claim(zio_t *zio) 2118 { 2119 int error; 2120 2121 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2122 if (error) 2123 zio->io_error = error; 2124 2125 return (ZIO_PIPELINE_CONTINUE); 2126 } 2127 2128 /* 2129 * Undo an allocation. This is used by zio_done() when an I/O fails 2130 * and we want to give back the block we just allocated. 2131 * This handles both normal blocks and gang blocks. 
2132 */ 2133 static void 2134 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2135 { 2136 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2137 ASSERT(zio->io_bp_override == NULL); 2138 2139 if (!BP_IS_HOLE(bp)) 2140 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2141 2142 if (gn != NULL) { 2143 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2144 zio_dva_unallocate(zio, gn->gn_child[g], 2145 &gn->gn_gbh->zg_blkptr[g]); 2146 } 2147 } 2148 } 2149 2150 /* 2151 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2152 */ 2153 int 2154 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2155 uint64_t size, boolean_t use_slog) 2156 { 2157 int error = 1; 2158 2159 ASSERT(txg > spa_syncing_txg(spa)); 2160 2161 if (use_slog) 2162 error = metaslab_alloc(spa, spa_log_class(spa), size, 2163 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 2164 2165 if (error) 2166 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2167 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 2168 2169 if (error == 0) { 2170 BP_SET_LSIZE(new_bp, size); 2171 BP_SET_PSIZE(new_bp, size); 2172 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2173 BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 2174 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2175 BP_SET_LEVEL(new_bp, 0); 2176 BP_SET_DEDUP(new_bp, 0); 2177 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2178 } 2179 2180 return (error); 2181 } 2182 2183 /* 2184 * Free an intent log block. 
2185 */ 2186 void 2187 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2188 { 2189 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2190 ASSERT(!BP_IS_GANG(bp)); 2191 2192 zio_free(spa, txg, bp); 2193 } 2194 2195 /* 2196 * ========================================================================== 2197 * Read and write to physical devices 2198 * ========================================================================== 2199 */ 2200 static int 2201 zio_vdev_io_start(zio_t *zio) 2202 { 2203 vdev_t *vd = zio->io_vd; 2204 uint64_t align; 2205 spa_t *spa = zio->io_spa; 2206 2207 ASSERT(zio->io_error == 0); 2208 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2209 2210 if (vd == NULL) { 2211 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2212 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2213 2214 /* 2215 * The mirror_ops handle multiple DVAs in a single BP. 2216 */ 2217 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2218 } 2219 2220 align = 1ULL << vd->vdev_top->vdev_ashift; 2221 2222 if (P2PHASE(zio->io_size, align) != 0) { 2223 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2224 char *abuf = zio_buf_alloc(asize); 2225 ASSERT(vd == vd->vdev_top); 2226 if (zio->io_type == ZIO_TYPE_WRITE) { 2227 bcopy(zio->io_data, abuf, zio->io_size); 2228 bzero(abuf + zio->io_size, asize - zio->io_size); 2229 } 2230 zio_push_transform(zio, abuf, asize, asize, zio_subblock); 2231 } 2232 2233 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2234 ASSERT(P2PHASE(zio->io_size, align) == 0); 2235 ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 2236 2237 /* 2238 * If this is a repair I/O, and there's no self-healing involved -- 2239 * that is, we're just resilvering what we expect to resilver -- 2240 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2241 * This prevents spurious resilvering with nested replication. 
2242 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2243 * A is out of date, we'll read from C+D, then use the data to 2244 * resilver A+B -- but we don't actually want to resilver B, just A. 2245 * The top-level mirror has no way to know this, so instead we just 2246 * discard unnecessary repairs as we work our way down the vdev tree. 2247 * The same logic applies to any form of nested replication: 2248 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2249 */ 2250 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2251 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2252 zio->io_txg != 0 && /* not a delegated i/o */ 2253 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2254 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2255 zio_vdev_io_bypass(zio); 2256 return (ZIO_PIPELINE_CONTINUE); 2257 } 2258 2259 if (vd->vdev_ops->vdev_op_leaf && 2260 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2261 2262 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2263 return (ZIO_PIPELINE_CONTINUE); 2264 2265 if ((zio = vdev_queue_io(zio)) == NULL) 2266 return (ZIO_PIPELINE_STOP); 2267 2268 if (!vdev_accessible(vd, zio)) { 2269 zio->io_error = ENXIO; 2270 zio_interrupt(zio); 2271 return (ZIO_PIPELINE_STOP); 2272 } 2273 } 2274 2275 return (vd->vdev_ops->vdev_op_io_start(zio)); 2276 } 2277 2278 static int 2279 zio_vdev_io_done(zio_t *zio) 2280 { 2281 vdev_t *vd = zio->io_vd; 2282 vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 2283 boolean_t unexpected_error = B_FALSE; 2284 2285 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2286 return (ZIO_PIPELINE_STOP); 2287 2288 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2289 2290 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 2291 2292 vdev_queue_io_done(zio); 2293 2294 if (zio->io_type == ZIO_TYPE_WRITE) 2295 vdev_cache_write(zio); 2296 2297 if (zio_injection_enabled && zio->io_error == 0) 2298 zio->io_error = zio_handle_device_injection(vd, 2299 zio, EIO); 2300 2301 if (zio_injection_enabled && zio->io_error == 0) 2302 zio->io_error = zio_handle_label_injection(zio, EIO); 2303 2304 if (zio->io_error) { 2305 if (!vdev_accessible(vd, zio)) { 2306 zio->io_error = ENXIO; 2307 } else { 2308 unexpected_error = B_TRUE; 2309 } 2310 } 2311 } 2312 2313 ops->vdev_op_io_done(zio); 2314 2315 if (unexpected_error) 2316 VERIFY(vdev_probe(vd, zio) == NULL); 2317 2318 return (ZIO_PIPELINE_CONTINUE); 2319 } 2320 2321 /* 2322 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2323 * disk, and use that to finish the checksum ereport later. 
2324 */ 2325 static void 2326 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2327 const void *good_buf) 2328 { 2329 /* no processing needed */ 2330 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2331 } 2332 2333 /*ARGSUSED*/ 2334 void 2335 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2336 { 2337 void *buf = zio_buf_alloc(zio->io_size); 2338 2339 bcopy(zio->io_data, buf, zio->io_size); 2340 2341 zcr->zcr_cbinfo = zio->io_size; 2342 zcr->zcr_cbdata = buf; 2343 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2344 zcr->zcr_free = zio_buf_free; 2345 } 2346 2347 static int 2348 zio_vdev_io_assess(zio_t *zio) 2349 { 2350 vdev_t *vd = zio->io_vd; 2351 2352 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2353 return (ZIO_PIPELINE_STOP); 2354 2355 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2356 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2357 2358 if (zio->io_vsd != NULL) { 2359 zio->io_vsd_ops->vsd_free(zio); 2360 zio->io_vsd = NULL; 2361 } 2362 2363 if (zio_injection_enabled && zio->io_error == 0) 2364 zio->io_error = zio_handle_fault_injection(zio, EIO); 2365 2366 /* 2367 * If the I/O failed, determine whether we should attempt to retry it. 2368 */ 2369 if (zio->io_error && vd == NULL && 2370 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2371 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2372 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2373 zio->io_error = 0; 2374 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2375 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2376 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2377 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 2378 return (ZIO_PIPELINE_STOP); 2379 } 2380 2381 /* 2382 * If we got an error on a leaf device, convert it to ENXIO 2383 * if the device is not accessible at all. 
2384 */ 2385 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2386 !vdev_accessible(vd, zio)) 2387 zio->io_error = ENXIO; 2388 2389 /* 2390 * If we can't write to an interior vdev (mirror or RAID-Z), 2391 * set vdev_cant_write so that we stop trying to allocate from it. 2392 */ 2393 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2394 vd != NULL && !vd->vdev_ops->vdev_op_leaf) 2395 vd->vdev_cant_write = B_TRUE; 2396 2397 if (zio->io_error) 2398 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2399 2400 return (ZIO_PIPELINE_CONTINUE); 2401 } 2402 2403 void 2404 zio_vdev_io_reissue(zio_t *zio) 2405 { 2406 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2407 ASSERT(zio->io_error == 0); 2408 2409 zio->io_stage >>= 1; 2410 } 2411 2412 void 2413 zio_vdev_io_redone(zio_t *zio) 2414 { 2415 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2416 2417 zio->io_stage >>= 1; 2418 } 2419 2420 void 2421 zio_vdev_io_bypass(zio_t *zio) 2422 { 2423 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2424 ASSERT(zio->io_error == 0); 2425 2426 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2427 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2428 } 2429 2430 /* 2431 * ========================================================================== 2432 * Generate and verify checksums 2433 * ========================================================================== 2434 */ 2435 static int 2436 zio_checksum_generate(zio_t *zio) 2437 { 2438 blkptr_t *bp = zio->io_bp; 2439 enum zio_checksum checksum; 2440 2441 if (bp == NULL) { 2442 /* 2443 * This is zio_write_phys(). 2444 * We're either generating a label checksum, or none at all. 
2445 */ 2446 checksum = zio->io_prop.zp_checksum; 2447 2448 if (checksum == ZIO_CHECKSUM_OFF) 2449 return (ZIO_PIPELINE_CONTINUE); 2450 2451 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2452 } else { 2453 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2454 ASSERT(!IO_IS_ALLOCATING(zio)); 2455 checksum = ZIO_CHECKSUM_GANG_HEADER; 2456 } else { 2457 checksum = BP_GET_CHECKSUM(bp); 2458 } 2459 } 2460 2461 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2462 2463 return (ZIO_PIPELINE_CONTINUE); 2464 } 2465 2466 static int 2467 zio_checksum_verify(zio_t *zio) 2468 { 2469 zio_bad_cksum_t info; 2470 blkptr_t *bp = zio->io_bp; 2471 int error; 2472 2473 ASSERT(zio->io_vd != NULL); 2474 2475 if (bp == NULL) { 2476 /* 2477 * This is zio_read_phys(). 2478 * We're either verifying a label checksum, or nothing at all. 2479 */ 2480 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2481 return (ZIO_PIPELINE_CONTINUE); 2482 2483 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2484 } 2485 2486 if ((error = zio_checksum_error(zio, &info)) != 0) { 2487 zio->io_error = error; 2488 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2489 zfs_ereport_start_checksum(zio->io_spa, 2490 zio->io_vd, zio, zio->io_offset, 2491 zio->io_size, NULL, &info); 2492 } 2493 } 2494 2495 return (ZIO_PIPELINE_CONTINUE); 2496 } 2497 2498 /* 2499 * Called by RAID-Z to ensure we don't compute the checksum twice. 2500 */ 2501 void 2502 zio_checksum_verified(zio_t *zio) 2503 { 2504 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2505 } 2506 2507 /* 2508 * ========================================================================== 2509 * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2510 * An error of 0 indictes success. ENXIO indicates whole-device failure, 2511 * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 2512 * indicate errors that are specific to one I/O, and most likely permanent. 
2513 * Any other error is presumed to be worse because we weren't expecting it. 2514 * ========================================================================== 2515 */ 2516 int 2517 zio_worst_error(int e1, int e2) 2518 { 2519 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2520 int r1, r2; 2521 2522 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2523 if (e1 == zio_error_rank[r1]) 2524 break; 2525 2526 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2527 if (e2 == zio_error_rank[r2]) 2528 break; 2529 2530 return (r1 > r2 ? e1 : e2); 2531 } 2532 2533 /* 2534 * ========================================================================== 2535 * I/O completion 2536 * ========================================================================== 2537 */ 2538 static int 2539 zio_ready(zio_t *zio) 2540 { 2541 blkptr_t *bp = zio->io_bp; 2542 zio_t *pio, *pio_next; 2543 2544 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2545 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2546 return (ZIO_PIPELINE_STOP); 2547 2548 if (zio->io_ready) { 2549 ASSERT(IO_IS_ALLOCATING(zio)); 2550 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2551 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2552 2553 zio->io_ready(zio); 2554 } 2555 2556 if (bp != NULL && bp != &zio->io_bp_copy) 2557 zio->io_bp_copy = *bp; 2558 2559 if (zio->io_error) 2560 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2561 2562 mutex_enter(&zio->io_lock); 2563 zio->io_state[ZIO_WAIT_READY] = 1; 2564 pio = zio_walk_parents(zio); 2565 mutex_exit(&zio->io_lock); 2566 2567 /* 2568 * As we notify zio's parents, new parents could be added. 2569 * New parents go to the head of zio's io_parent_list, however, 2570 * so we will (correctly) not notify them. The remainder of zio's 2571 * io_parent_list, from 'pio_next' onward, cannot change because 2572 * all parents must wait for us to be done before they can be done. 
2573 */ 2574 for (; pio != NULL; pio = pio_next) { 2575 pio_next = zio_walk_parents(zio); 2576 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2577 } 2578 2579 if (zio->io_flags & ZIO_FLAG_NODATA) { 2580 if (BP_IS_GANG(bp)) { 2581 zio->io_flags &= ~ZIO_FLAG_NODATA; 2582 } else { 2583 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2584 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2585 } 2586 } 2587 2588 if (zio_injection_enabled && 2589 zio->io_spa->spa_syncing_txg == zio->io_txg) 2590 zio_handle_ignored_writes(zio); 2591 2592 return (ZIO_PIPELINE_CONTINUE); 2593 } 2594 2595 static int 2596 zio_done(zio_t *zio) 2597 { 2598 spa_t *spa = zio->io_spa; 2599 zio_t *lio = zio->io_logical; 2600 blkptr_t *bp = zio->io_bp; 2601 vdev_t *vd = zio->io_vd; 2602 uint64_t psize = zio->io_size; 2603 zio_t *pio, *pio_next; 2604 2605 /* 2606 * If our children haven't all completed, 2607 * wait for them and then repeat this pipeline stage. 2608 */ 2609 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2610 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2611 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2612 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2613 return (ZIO_PIPELINE_STOP); 2614 2615 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2616 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2617 ASSERT(zio->io_children[c][w] == 0); 2618 2619 if (bp != NULL) { 2620 ASSERT(bp->blk_pad[0] == 0); 2621 ASSERT(bp->blk_pad[1] == 0); 2622 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2623 (bp == zio_unique_parent(zio)->io_bp)); 2624 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2625 zio->io_bp_override == NULL && 2626 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2627 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2628 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2629 ASSERT(BP_COUNT_GANG(bp) == 0 || 2630 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2631 } 2632 } 2633 2634 /* 2635 * If there were child vdev/gang/ddt errors, they 
apply to us now. 2636 */ 2637 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2638 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2639 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2640 2641 /* 2642 * If the I/O on the transformed data was successful, generate any 2643 * checksum reports now while we still have the transformed data. 2644 */ 2645 if (zio->io_error == 0) { 2646 while (zio->io_cksum_report != NULL) { 2647 zio_cksum_report_t *zcr = zio->io_cksum_report; 2648 uint64_t align = zcr->zcr_align; 2649 uint64_t asize = P2ROUNDUP(psize, align); 2650 char *abuf = zio->io_data; 2651 2652 if (asize != psize) { 2653 abuf = zio_buf_alloc(asize); 2654 bcopy(zio->io_data, abuf, psize); 2655 bzero(abuf + psize, asize - psize); 2656 } 2657 2658 zio->io_cksum_report = zcr->zcr_next; 2659 zcr->zcr_next = NULL; 2660 zcr->zcr_finish(zcr, abuf); 2661 zfs_ereport_free_checksum(zcr); 2662 2663 if (asize != psize) 2664 zio_buf_free(abuf, asize); 2665 } 2666 } 2667 2668 zio_pop_transforms(zio); /* note: may set zio->io_error */ 2669 2670 vdev_stat_update(zio, psize); 2671 2672 if (zio->io_error) { 2673 /* 2674 * If this I/O is attached to a particular vdev, 2675 * generate an error message describing the I/O failure 2676 * at the block level. We ignore these errors if the 2677 * device is currently unavailable. 2678 */ 2679 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 2680 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 2681 2682 if ((zio->io_error == EIO || !(zio->io_flags & 2683 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 2684 zio == lio) { 2685 /* 2686 * For logical I/O requests, tell the SPA to log the 2687 * error and generate a logical data ereport. 2688 */ 2689 spa_log_error(spa, zio); 2690 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 2691 0, 0); 2692 } 2693 } 2694 2695 if (zio->io_error && zio == lio) { 2696 /* 2697 * Determine whether zio should be reexecuted. 
This will 2698 * propagate all the way to the root via zio_notify_parent(). 2699 */ 2700 ASSERT(vd == NULL && bp != NULL); 2701 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2702 2703 if (IO_IS_ALLOCATING(zio) && 2704 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 2705 if (zio->io_error != ENOSPC) 2706 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 2707 else 2708 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2709 } 2710 2711 if ((zio->io_type == ZIO_TYPE_READ || 2712 zio->io_type == ZIO_TYPE_FREE) && 2713 zio->io_error == ENXIO && 2714 spa_load_state(spa) == SPA_LOAD_NONE && 2715 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 2716 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2717 2718 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 2719 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2720 2721 /* 2722 * Here is a possibly good place to attempt to do 2723 * either combinatorial reconstruction or error correction 2724 * based on checksums. It also might be a good place 2725 * to send out preliminary ereports before we suspend 2726 * processing. 2727 */ 2728 } 2729 2730 /* 2731 * If there were logical child errors, they apply to us now. 2732 * We defer this until now to avoid conflating logical child 2733 * errors with errors that happened to the zio itself when 2734 * updating vdev stats and reporting FMA events above. 2735 */ 2736 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 2737 2738 if ((zio->io_error || zio->io_reexecute) && 2739 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 2740 !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) 2741 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 2742 2743 zio_gang_tree_free(&zio->io_gang_tree); 2744 2745 /* 2746 * Godfather I/Os should never suspend. 2747 */ 2748 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 2749 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 2750 zio->io_reexecute = 0; 2751 2752 if (zio->io_reexecute) { 2753 /* 2754 * This is a logical I/O that wants to reexecute. 2755 * 2756 * Reexecute is top-down. 
When an i/o fails, if it's not 2757 * the root, it simply notifies its parent and sticks around. 2758 * The parent, seeing that it still has children in zio_done(), 2759 * does the same. This percolates all the way up to the root. 2760 * The root i/o will reexecute or suspend the entire tree. 2761 * 2762 * This approach ensures that zio_reexecute() honors 2763 * all the original i/o dependency relationships, e.g. 2764 * parents not executing until children are ready. 2765 */ 2766 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2767 2768 zio->io_gang_leader = NULL; 2769 2770 mutex_enter(&zio->io_lock); 2771 zio->io_state[ZIO_WAIT_DONE] = 1; 2772 mutex_exit(&zio->io_lock); 2773 2774 /* 2775 * "The Godfather" I/O monitors its children but is 2776 * not a true parent to them. It will track them through 2777 * the pipeline but severs its ties whenever they get into 2778 * trouble (e.g. suspended). This allows "The Godfather" 2779 * I/O to return status without blocking. 2780 */ 2781 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 2782 zio_link_t *zl = zio->io_walk_link; 2783 pio_next = zio_walk_parents(zio); 2784 2785 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 2786 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 2787 zio_remove_child(pio, zio, zl); 2788 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2789 } 2790 } 2791 2792 if ((pio = zio_unique_parent(zio)) != NULL) { 2793 /* 2794 * We're not a root i/o, so there's nothing to do 2795 * but notify our parent. Don't propagate errors 2796 * upward since we haven't permanently failed yet. 2797 */ 2798 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 2799 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 2800 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2801 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 2802 /* 2803 * We'd fail again if we reexecuted now, so suspend 2804 * until conditions improve (e.g. device comes online). 
2805 */ 2806 zio_suspend(spa, zio); 2807 } else { 2808 /* 2809 * Reexecution is potentially a huge amount of work. 2810 * Hand it off to the otherwise-unused claim taskq. 2811 */ 2812 (void) taskq_dispatch( 2813 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 2814 (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 2815 } 2816 return (ZIO_PIPELINE_STOP); 2817 } 2818 2819 ASSERT(zio->io_child_count == 0); 2820 ASSERT(zio->io_reexecute == 0); 2821 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 2822 2823 /* 2824 * Report any checksum errors, since the I/O is complete. 2825 */ 2826 while (zio->io_cksum_report != NULL) { 2827 zio_cksum_report_t *zcr = zio->io_cksum_report; 2828 zio->io_cksum_report = zcr->zcr_next; 2829 zcr->zcr_next = NULL; 2830 zcr->zcr_finish(zcr, NULL); 2831 zfs_ereport_free_checksum(zcr); 2832 } 2833 2834 /* 2835 * It is the responsibility of the done callback to ensure that this 2836 * particular zio is no longer discoverable for adoption, and as 2837 * such, cannot acquire any new parents. 
2838 */ 2839 if (zio->io_done) 2840 zio->io_done(zio); 2841 2842 mutex_enter(&zio->io_lock); 2843 zio->io_state[ZIO_WAIT_DONE] = 1; 2844 mutex_exit(&zio->io_lock); 2845 2846 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 2847 zio_link_t *zl = zio->io_walk_link; 2848 pio_next = zio_walk_parents(zio); 2849 zio_remove_child(pio, zio, zl); 2850 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2851 } 2852 2853 if (zio->io_waiter != NULL) { 2854 mutex_enter(&zio->io_lock); 2855 zio->io_executor = NULL; 2856 cv_broadcast(&zio->io_cv); 2857 mutex_exit(&zio->io_lock); 2858 } else { 2859 zio_destroy(zio); 2860 } 2861 2862 return (ZIO_PIPELINE_STOP); 2863 } 2864 2865 /* 2866 * ========================================================================== 2867 * I/O pipeline definition 2868 * ========================================================================== 2869 */ 2870 static zio_pipe_stage_t *zio_pipeline[] = { 2871 NULL, 2872 zio_read_bp_init, 2873 zio_free_bp_init, 2874 zio_issue_async, 2875 zio_write_bp_init, 2876 zio_checksum_generate, 2877 zio_ddt_read_start, 2878 zio_ddt_read_done, 2879 zio_ddt_write, 2880 zio_ddt_free, 2881 zio_gang_assemble, 2882 zio_gang_issue, 2883 zio_dva_allocate, 2884 zio_dva_free, 2885 zio_dva_claim, 2886 zio_ready, 2887 zio_vdev_io_start, 2888 zio_vdev_io_done, 2889 zio_vdev_io_assess, 2890 zio_checksum_verify, 2891 zio_done 2892 }; 2893