1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/fm/fs/zfs.h> 28 #include <sys/spa.h> 29 #include <sys/txg.h> 30 #include <sys/spa_impl.h> 31 #include <sys/vdev_impl.h> 32 #include <sys/zio_impl.h> 33 #include <sys/zio_compress.h> 34 #include <sys/zio_checksum.h> 35 #include <sys/dmu_objset.h> 36 #include <sys/arc.h> 37 #include <sys/ddt.h> 38 39 /* 40 * ========================================================================== 41 * I/O priority table 42 * ========================================================================== 43 */ 44 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 45 0, /* ZIO_PRIORITY_NOW */ 46 0, /* ZIO_PRIORITY_SYNC_READ */ 47 0, /* ZIO_PRIORITY_SYNC_WRITE */ 48 6, /* ZIO_PRIORITY_ASYNC_READ */ 49 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 50 4, /* ZIO_PRIORITY_FREE */ 51 0, /* ZIO_PRIORITY_CACHE_FILL */ 52 0, /* ZIO_PRIORITY_LOG_WRITE */ 53 10, /* ZIO_PRIORITY_RESILVER */ 54 20, /* ZIO_PRIORITY_SCRUB */ 55 }; 56 57 /* 58 * ========================================================================== 59 * I/O type descriptions 60 * ========================================================================== 61 */ 62 char *zio_type_name[ZIO_TYPES] = { 63 "null", "read", "write", "free", "claim", "ioctl" }; 64 65 /* 66 * ========================================================================== 67 * I/O kmem caches 68 * ========================================================================== 69 */ 70 kmem_cache_t *zio_cache; 71 kmem_cache_t *zio_link_cache; 72 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 73 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 74 75 #ifdef _KERNEL 76 extern vmem_t *zio_alloc_arena; 77 #endif 78 79 /* 80 * An allocating zio is one that either currently has the DVA allocate 81 * stage set or will have it later in its lifetime. 82 */ 83 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 84 85 #ifdef ZFS_DEBUG 86 int zio_buf_debug_limit = 16384; 87 #else 88 int zio_buf_debug_limit = 0; 89 #endif 90 91 void 92 zio_init(void) 93 { 94 size_t c; 95 vmem_t *data_alloc_arena = NULL; 96 97 #ifdef _KERNEL 98 data_alloc_arena = zio_alloc_arena; 99 #endif 100 zio_cache = kmem_cache_create("zio_cache", 101 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 102 zio_link_cache = kmem_cache_create("zio_link_cache", 103 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 104 105 /* 106 * For small buffers, we want a cache for each multiple of 107 * SPA_MINBLOCKSIZE. 
For medium-size buffers, we want a cache 108 * for each quarter-power of 2. For large buffers, we want 109 * a cache for each multiple of PAGESIZE. 110 */ 111 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 112 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 113 size_t p2 = size; 114 size_t align = 0; 115 116 while (p2 & (p2 - 1)) 117 p2 &= p2 - 1; 118 119 if (size <= 4 * SPA_MINBLOCKSIZE) { 120 align = SPA_MINBLOCKSIZE; 121 } else if (P2PHASE(size, PAGESIZE) == 0) { 122 align = PAGESIZE; 123 } else if (P2PHASE(size, p2 >> 2) == 0) { 124 align = p2 >> 2; 125 } 126 127 if (align != 0) { 128 char name[36]; 129 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 130 zio_buf_cache[c] = kmem_cache_create(name, size, 131 align, NULL, NULL, NULL, NULL, NULL, 132 size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 133 134 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 135 zio_data_buf_cache[c] = kmem_cache_create(name, size, 136 align, NULL, NULL, NULL, NULL, data_alloc_arena, 137 size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 138 } 139 } 140 141 while (--c != 0) { 142 ASSERT(zio_buf_cache[c] != NULL); 143 if (zio_buf_cache[c - 1] == NULL) 144 zio_buf_cache[c - 1] = zio_buf_cache[c]; 145 146 ASSERT(zio_data_buf_cache[c] != NULL); 147 if (zio_data_buf_cache[c - 1] == NULL) 148 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 149 } 150 151 zio_inject_init(); 152 } 153 154 void 155 zio_fini(void) 156 { 157 size_t c; 158 kmem_cache_t *last_cache = NULL; 159 kmem_cache_t *last_data_cache = NULL; 160 161 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 162 if (zio_buf_cache[c] != last_cache) { 163 last_cache = zio_buf_cache[c]; 164 kmem_cache_destroy(zio_buf_cache[c]); 165 } 166 zio_buf_cache[c] = NULL; 167 168 if (zio_data_buf_cache[c] != last_data_cache) { 169 last_data_cache = zio_data_buf_cache[c]; 170 kmem_cache_destroy(zio_data_buf_cache[c]); 171 } 172 zio_data_buf_cache[c] = NULL; 173 } 174 175 kmem_cache_destroy(zio_link_cache); 176 kmem_cache_destroy(zio_cache); 177 178 zio_inject_fini(); 179 } 180 181 /* 182 * ========================================================================== 183 * Allocate and free I/O buffers 184 * ========================================================================== 185 */ 186 187 /* 188 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 189 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 190 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 191 * excess / transient data in-core during a crashdump. 192 */ 193 void * 194 zio_buf_alloc(size_t size) 195 { 196 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 197 198 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 199 200 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 201 } 202 203 /* 204 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 205 * crashdump if the kernel panics. This exists so that we will limit the amount 206 * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount 207 * of kernel heap dumped to disk when the kernel panics) 208 */ 209 void * 210 zio_data_buf_alloc(size_t size) 211 { 212 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 213 214 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 215 216 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 217 } 218 219 void 220 zio_buf_free(void *buf, size_t size) 221 { 222 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 223 224 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 225 226 kmem_cache_free(zio_buf_cache[c], buf); 227 } 228 229 void 230 zio_data_buf_free(void *buf, size_t size) 231 { 232 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 233 234 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 235 236 kmem_cache_free(zio_data_buf_cache[c], buf); 237 } 238 239 /* 240 * ========================================================================== 241 * Push and pop I/O transform buffers 242 * ========================================================================== 243 */ 244 static void 245 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 246 zio_transform_func_t *transform) 247 { 248 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 249 250 zt->zt_orig_data = zio->io_data; 251 zt->zt_orig_size = zio->io_size; 252 zt->zt_bufsize = bufsize; 253 zt->zt_transform = transform; 254 255 zt->zt_next = zio->io_transform_stack; 256 zio->io_transform_stack = zt; 257 258 zio->io_data = data; 259 zio->io_size = size; 260 } 261 262 static void 263 zio_pop_transforms(zio_t *zio) 264 { 265 zio_transform_t *zt; 266 267 while ((zt = zio->io_transform_stack) != NULL) { 268 if (zt->zt_transform != NULL) 269 zt->zt_transform(zio, 270 zt->zt_orig_data, zt->zt_orig_size); 271 272 if (zt->zt_bufsize != 0) 273 zio_buf_free(zio->io_data, zt->zt_bufsize); 274 275 zio->io_data = zt->zt_orig_data; 276 zio->io_size = zt->zt_orig_size; 277 zio->io_transform_stack = zt->zt_next; 278 279 kmem_free(zt, sizeof (zio_transform_t)); 280 } 281 } 282 283 /* 284 * ========================================================================== 285 * I/O transform callbacks for subblocks and decompression 286 * ========================================================================== 287 */ 288 static void 289 zio_subblock(zio_t *zio, void *data, uint64_t size) 290 { 291 ASSERT(zio->io_size > size); 292 293 if (zio->io_type == ZIO_TYPE_READ) 294 bcopy(zio->io_data, data, size); 295 } 296 297 static void 298 zio_decompress(zio_t *zio, void *data, uint64_t size) 299 { 300 if (zio->io_error == 0 && 301 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 302 zio->io_data, data, zio->io_size, size) != 0) 303 zio->io_error = EIO; 304 } 305 306 /* 307 * ========================================================================== 308 * I/O parent/child relationships and pipeline interlocks 309 * ========================================================================== 310 */ 311 /* 312 * NOTE - Callers to zio_walk_parents() and zio_walk_children must 313 * continue calling these functions until they return NULL. 314 * Otherwise, the next caller will pick up the list walk in 315 * some indeterminate state. (Otherwise every caller would 316 * have to pass in a cookie to keep the state represented by 317 * io_walk_link, which gets annoying.) 318 */ 319 zio_t * 320 zio_walk_parents(zio_t *cio) 321 { 322 zio_link_t *zl = cio->io_walk_link; 323 list_t *pl = &cio->io_parent_list; 324 325 zl = (zl == NULL) ? 
list_head(pl) : list_next(pl, zl); 326 cio->io_walk_link = zl; 327 328 if (zl == NULL) 329 return (NULL); 330 331 ASSERT(zl->zl_child == cio); 332 return (zl->zl_parent); 333 } 334 335 zio_t * 336 zio_walk_children(zio_t *pio) 337 { 338 zio_link_t *zl = pio->io_walk_link; 339 list_t *cl = &pio->io_child_list; 340 341 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 342 pio->io_walk_link = zl; 343 344 if (zl == NULL) 345 return (NULL); 346 347 ASSERT(zl->zl_parent == pio); 348 return (zl->zl_child); 349 } 350 351 zio_t * 352 zio_unique_parent(zio_t *cio) 353 { 354 zio_t *pio = zio_walk_parents(cio); 355 356 VERIFY(zio_walk_parents(cio) == NULL); 357 return (pio); 358 } 359 360 void 361 zio_add_child(zio_t *pio, zio_t *cio) 362 { 363 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 364 365 /* 366 * Logical I/Os can have logical, gang, or vdev children. 367 * Gang I/Os can have gang or vdev children. 368 * Vdev I/Os can only have vdev children. 369 * The following ASSERT captures all of these constraints. 370 */ 371 ASSERT(cio->io_child_type <= pio->io_child_type); 372 373 zl->zl_parent = pio; 374 zl->zl_child = cio; 375 376 mutex_enter(&cio->io_lock); 377 mutex_enter(&pio->io_lock); 378 379 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 380 381 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 382 pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 383 384 list_insert_head(&pio->io_child_list, zl); 385 list_insert_head(&cio->io_parent_list, zl); 386 387 pio->io_child_count++; 388 cio->io_parent_count++; 389 390 mutex_exit(&pio->io_lock); 391 mutex_exit(&cio->io_lock); 392 } 393 394 static void 395 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 396 { 397 ASSERT(zl->zl_parent == pio); 398 ASSERT(zl->zl_child == cio); 399 400 mutex_enter(&cio->io_lock); 401 mutex_enter(&pio->io_lock); 402 403 list_remove(&pio->io_child_list, zl); 404 list_remove(&cio->io_parent_list, zl); 405 406 pio->io_child_count--; 407 cio->io_parent_count--; 408 409 mutex_exit(&pio->io_lock); 410 mutex_exit(&cio->io_lock); 411 412 kmem_cache_free(zio_link_cache, zl); 413 } 414 415 static boolean_t 416 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 417 { 418 uint64_t *countp = &zio->io_children[child][wait]; 419 boolean_t waiting = B_FALSE; 420 421 mutex_enter(&zio->io_lock); 422 ASSERT(zio->io_stall == NULL); 423 if (*countp != 0) { 424 zio->io_stage >>= 1; 425 zio->io_stall = countp; 426 waiting = B_TRUE; 427 } 428 mutex_exit(&zio->io_lock); 429 430 return (waiting); 431 } 432 433 static void 434 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 435 { 436 uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 437 int *errorp = &pio->io_child_error[zio->io_child_type]; 438 439 mutex_enter(&pio->io_lock); 440 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 441 *errorp = zio_worst_error(*errorp, zio->io_error); 442 pio->io_reexecute |= zio->io_reexecute; 443 ASSERT3U(*countp, >, 0); 444 if (--*countp == 0 && pio->io_stall == countp) { 445 pio->io_stall = NULL; 446 mutex_exit(&pio->io_lock); 447 zio_execute(pio); 448 } else { 449 mutex_exit(&pio->io_lock); 450 } 451 } 452 453 static void 454 zio_inherit_child_errors(zio_t *zio, enum zio_child c) 455 { 456 if (zio->io_child_error[c] != 0 && zio->io_error == 0) 457 zio->io_error = zio->io_child_error[c]; 458 } 459 460 /* 461 * ========================================================================== 462 * Create the various types of I/O (read, write, free, etc) 463 * 
========================================================================== 464 */ 465 static zio_t * 466 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 467 void *data, uint64_t size, zio_done_func_t *done, void *private, 468 zio_type_t type, int priority, enum zio_flag flags, 469 vdev_t *vd, uint64_t offset, const zbookmark_t *zb, 470 enum zio_stage stage, enum zio_stage pipeline) 471 { 472 zio_t *zio; 473 474 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 475 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 476 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 477 478 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 479 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 480 ASSERT(vd || stage == ZIO_STAGE_OPEN); 481 482 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 483 bzero(zio, sizeof (zio_t)); 484 485 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 486 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 487 488 list_create(&zio->io_parent_list, sizeof (zio_link_t), 489 offsetof(zio_link_t, zl_parent_node)); 490 list_create(&zio->io_child_list, sizeof (zio_link_t), 491 offsetof(zio_link_t, zl_child_node)); 492 493 if (vd != NULL) 494 zio->io_child_type = ZIO_CHILD_VDEV; 495 else if (flags & ZIO_FLAG_GANG_CHILD) 496 zio->io_child_type = ZIO_CHILD_GANG; 497 else if (flags & ZIO_FLAG_DDT_CHILD) 498 zio->io_child_type = ZIO_CHILD_DDT; 499 else 500 zio->io_child_type = ZIO_CHILD_LOGICAL; 501 502 if (bp != NULL) { 503 zio->io_bp = (blkptr_t *)bp; 504 zio->io_bp_copy = *bp; 505 zio->io_bp_orig = *bp; 506 if (type != ZIO_TYPE_WRITE || 507 zio->io_child_type == ZIO_CHILD_DDT) 508 zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 509 if (zio->io_child_type == ZIO_CHILD_LOGICAL) 510 zio->io_logical = zio; 511 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 512 pipeline |= ZIO_GANG_STAGES; 513 } 514 515 zio->io_spa = spa; 516 zio->io_txg = txg; 517 zio->io_done = done; 518 zio->io_private = private; 519 zio->io_type = type; 520 zio->io_priority = priority; 521 zio->io_vd = vd; 522 zio->io_offset = offset; 523 zio->io_orig_data = zio->io_data = data; 524 zio->io_orig_size = zio->io_size = size; 525 zio->io_orig_flags = zio->io_flags = flags; 526 zio->io_orig_stage = zio->io_stage = stage; 527 zio->io_orig_pipeline = zio->io_pipeline = pipeline; 528 529 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 530 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 531 532 if (zb != NULL) 533 zio->io_bookmark = *zb; 534 535 if (pio != NULL) { 536 if (zio->io_logical == NULL) 537 zio->io_logical = pio->io_logical; 538 if (zio->io_child_type == ZIO_CHILD_GANG) 539 zio->io_gang_leader = pio->io_gang_leader; 540 zio_add_child(pio, zio); 541 } 542 543 return (zio); 544 } 545 546 static void 547 zio_destroy(zio_t *zio) 548 { 549 list_destroy(&zio->io_parent_list); 550 list_destroy(&zio->io_child_list); 551 mutex_destroy(&zio->io_lock); 552 cv_destroy(&zio->io_cv); 553 kmem_cache_free(zio_cache, zio); 554 } 555 556 zio_t * 557 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 558 void *private, enum zio_flag flags) 559 { 560 zio_t *zio; 561 562 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 563 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 564 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 565 566 return (zio); 567 } 568 569 zio_t * 570 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 571 { 572 return (zio_null(NULL, spa, NULL, done, private, flags)); 573 } 574 575 zio_t * 576 zio_read(zio_t *pio, 
spa_t *spa, const blkptr_t *bp, 577 void *data, uint64_t size, zio_done_func_t *done, void *private, 578 int priority, enum zio_flag flags, const zbookmark_t *zb) 579 { 580 zio_t *zio; 581 582 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 583 data, size, done, private, 584 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 585 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 586 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 587 588 return (zio); 589 } 590 591 zio_t * 592 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 593 void *data, uint64_t size, const zio_prop_t *zp, 594 zio_done_func_t *ready, zio_done_func_t *done, void *private, 595 int priority, enum zio_flag flags, const zbookmark_t *zb) 596 { 597 zio_t *zio; 598 599 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 600 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 601 zp->zp_compress >= ZIO_COMPRESS_OFF && 602 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 603 zp->zp_type < DMU_OT_NUMTYPES && 604 zp->zp_level < 32 && 605 zp->zp_copies > 0 && 606 zp->zp_copies <= spa_max_replication(spa) && 607 zp->zp_dedup <= 1 && 608 zp->zp_dedup_verify <= 1); 609 610 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 611 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 612 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 613 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 614 615 zio->io_ready = ready; 616 zio->io_prop = *zp; 617 618 return (zio); 619 } 620 621 zio_t * 622 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 623 uint64_t size, zio_done_func_t *done, void *private, int priority, 624 enum zio_flag flags, zbookmark_t *zb) 625 { 626 zio_t *zio; 627 628 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 629 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 630 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 631 632 return (zio); 633 } 634 635 void 636 zio_write_override(zio_t *zio, blkptr_t *bp, int copies) 637 { 638 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 639 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 640 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 641 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 642 643 zio->io_prop.zp_copies = copies; 644 zio->io_bp_override = bp; 645 } 646 647 void 648 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 649 { 650 bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp); 651 } 652 653 zio_t * 654 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 655 enum zio_flag flags) 656 { 657 zio_t *zio; 658 659 ASSERT(!BP_IS_HOLE(bp)); 660 ASSERT(spa_syncing_txg(spa) == txg); 661 ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); 662 663 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 664 NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, 665 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 666 667 return (zio); 668 } 669 670 zio_t * 671 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 672 zio_done_func_t *done, void *private, enum zio_flag flags) 673 { 674 zio_t *zio; 675 676 /* 677 * A claim is an allocation of a specific block. Claims are needed 678 * to support immediate writes in the intent log. The issue is that 679 * immediate writes contain committed data, but in a txg that was 680 * *not* committed. Upon opening the pool after an unclean shutdown, 681 * the intent log claims all blocks that contain immediate write data 682 * so that the SPA knows they're in use. 
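 *
 * As a purely illustrative sketch (this is not the actual intent-log
 * claim code; the loop below is assumed, not quoted), the pool-open
 * path that scans the log would drive this interface roughly as:
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, 0);
 *	for each log block bp found by the scan:
 *		zio_nowait(zio_claim(rio, spa, spa_first_txg(spa), bp,
 *		    NULL, NULL, 0));
 *	(void) zio_wait(rio);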
683 * 684 * All claims *must* be resolved in the first txg -- before the SPA 685 * starts allocating blocks -- so that nothing is allocated twice. 686 * If txg == 0 we just verify that the block is claimable. 687 */ 688 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 689 ASSERT(txg == spa_first_txg(spa) || txg == 0); 690 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 691 692 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 693 done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 694 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 695 696 return (zio); 697 } 698 699 zio_t * 700 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 701 zio_done_func_t *done, void *private, int priority, enum zio_flag flags) 702 { 703 zio_t *zio; 704 int c; 705 706 if (vd->vdev_children == 0) { 707 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 708 ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, 709 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 710 711 zio->io_cmd = cmd; 712 } else { 713 zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 714 715 for (c = 0; c < vd->vdev_children; c++) 716 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 717 done, private, priority, flags)); 718 } 719 720 return (zio); 721 } 722 723 zio_t * 724 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 725 void *data, int checksum, zio_done_func_t *done, void *private, 726 int priority, enum zio_flag flags, boolean_t labels) 727 { 728 zio_t *zio; 729 730 ASSERT(vd->vdev_children == 0); 731 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 732 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 733 ASSERT3U(offset + size, <=, vd->vdev_psize); 734 735 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 736 ZIO_TYPE_READ, priority, flags, vd, offset, NULL, 737 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 738 739 zio->io_prop.zp_checksum = checksum; 740 741 return (zio); 742 } 743 744 zio_t * 745 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 746 void *data, int checksum, zio_done_func_t *done, void *private, 747 int priority, enum zio_flag flags, boolean_t labels) 748 { 749 zio_t *zio; 750 751 ASSERT(vd->vdev_children == 0); 752 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 753 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 754 ASSERT3U(offset + size, <=, vd->vdev_psize); 755 756 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 757 ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, 758 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 759 760 zio->io_prop.zp_checksum = checksum; 761 762 if (zio_checksum_table[checksum].ci_zbt) { 763 /* 764 * zbt checksums are necessarily destructive -- they modify 765 * the end of the write buffer to hold the verifier/checksum. 766 * Therefore, we must make a local copy in case the data is 767 * being written to multiple places in parallel. 768 */ 769 void *wbuf = zio_buf_alloc(size); 770 bcopy(data, wbuf, size); 771 zio_push_transform(zio, wbuf, size, size, NULL); 772 } 773 774 return (zio); 775 } 776 777 /* 778 * Create a child I/O to do some work for us. 779 */ 780 zio_t * 781 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 782 void *data, uint64_t size, int type, int priority, enum zio_flag flags, 783 zio_done_func_t *done, void *private) 784 { 785 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 786 zio_t *zio; 787 788 ASSERT(vd->vdev_parent == 789 (pio->io_vd ? 
pio->io_vd : pio->io_spa->spa_root_vdev)); 790 791 if (type == ZIO_TYPE_READ && bp != NULL) { 792 /* 793 * If we have the bp, then the child should perform the 794 * checksum and the parent need not. This pushes error 795 * detection as close to the leaves as possible and 796 * eliminates redundant checksums in the interior nodes. 797 */ 798 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 799 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 800 } 801 802 if (vd->vdev_children == 0) 803 offset += VDEV_LABEL_START_SIZE; 804 805 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 806 807 /* 808 * If we've decided to do a repair, the write is not speculative -- 809 * even if the original read was. 810 */ 811 if (flags & ZIO_FLAG_IO_REPAIR) 812 flags &= ~ZIO_FLAG_SPECULATIVE; 813 814 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 815 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 816 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 817 818 return (zio); 819 } 820 821 zio_t * 822 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 823 int type, int priority, enum zio_flag flags, 824 zio_done_func_t *done, void *private) 825 { 826 zio_t *zio; 827 828 ASSERT(vd->vdev_ops->vdev_op_leaf); 829 830 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 831 data, size, done, private, type, priority, 832 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, 833 vd, offset, NULL, 834 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 835 836 return (zio); 837 } 838 839 void 840 zio_flush(zio_t *zio, vdev_t *vd) 841 { 842 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 843 NULL, NULL, ZIO_PRIORITY_NOW, 844 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 845 } 846 847 /* 848 * ========================================================================== 849 * Prepare to read and write logical blocks 850 * ========================================================================== 851 */ 852 853 static int 854 zio_read_bp_init(zio_t *zio) 855 { 856 blkptr_t *bp = zio->io_bp; 857 858 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 859 zio->io_child_type == ZIO_CHILD_LOGICAL && 860 !(zio->io_flags & ZIO_FLAG_RAW)) { 861 uint64_t psize = BP_GET_PSIZE(bp); 862 void *cbuf = zio_buf_alloc(psize); 863 864 zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 865 } 866 867 if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) 868 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 869 870 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 871 zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 872 873 return (ZIO_PIPELINE_CONTINUE); 874 } 875 876 static int 877 zio_write_bp_init(zio_t *zio) 878 { 879 spa_t *spa = zio->io_spa; 880 zio_prop_t *zp = &zio->io_prop; 881 enum zio_compress compress = zp->zp_compress; 882 blkptr_t *bp = zio->io_bp; 883 uint64_t lsize = zio->io_size; 884 uint64_t psize = lsize; 885 int pass = 1; 886 887 /* 888 * If our children haven't all reached the ready stage, 889 * wait for them and then repeat this pipeline stage. 
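 * (Mechanically: zio_wait_for_children() backs io_stage up by one bit
 * and records the pending-child counter in io_stall; when the last
 * child's zio_notify_parent() drops that counter to zero it calls
 * zio_execute() on us, which advances the stage right back to here.)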
890 */ 891 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 892 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 893 return (ZIO_PIPELINE_STOP); 894 895 if (!IO_IS_ALLOCATING(zio)) 896 return (ZIO_PIPELINE_CONTINUE); 897 898 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 899 900 if (zio->io_bp_override) { 901 ASSERT(bp->blk_birth != zio->io_txg); 902 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 903 904 *bp = *zio->io_bp_override; 905 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 906 907 if (BP_IS_HOLE(bp) || !zp->zp_dedup) 908 return (ZIO_PIPELINE_CONTINUE); 909 910 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 911 zp->zp_dedup_verify); 912 913 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 914 BP_SET_DEDUP(bp, 1); 915 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 916 return (ZIO_PIPELINE_CONTINUE); 917 } 918 zio->io_bp_override = NULL; 919 BP_ZERO(bp); 920 } 921 922 if (bp->blk_birth == zio->io_txg) { 923 /* 924 * We're rewriting an existing block, which means we're 925 * working on behalf of spa_sync(). For spa_sync() to 926 * converge, it must eventually be the case that we don't 927 * have to allocate new blocks. But compression changes 928 * the blocksize, which forces a reallocate, and makes 929 * convergence take longer. Therefore, after the first 930 * few passes, stop compressing to ensure convergence. 931 */ 932 pass = spa_sync_pass(spa); 933 934 ASSERT(zio->io_txg == spa_syncing_txg(spa)); 935 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 936 ASSERT(!BP_GET_DEDUP(bp)); 937 938 if (pass > SYNC_PASS_DONT_COMPRESS) 939 compress = ZIO_COMPRESS_OFF; 940 941 /* Make sure someone doesn't change their mind on overwrites */ 942 ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), 943 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 944 } 945 946 if (compress != ZIO_COMPRESS_OFF) { 947 void *cbuf = zio_buf_alloc(lsize); 948 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 949 if (psize == 0 || psize == lsize) { 950 compress = ZIO_COMPRESS_OFF; 951 zio_buf_free(cbuf, lsize); 952 } else { 953 ASSERT(psize < lsize); 954 zio_push_transform(zio, cbuf, psize, lsize, NULL); 955 } 956 } 957 958 /* 959 * The final pass of spa_sync() must be all rewrites, but the first 960 * few passes offer a trade-off: allocating blocks defers convergence, 961 * but newly allocated blocks are sequential, so they can be written 962 * to disk faster. Therefore, we allow the first few passes of 963 * spa_sync() to allocate new blocks, but force rewrites after that. 964 * There should only be a handful of blocks after pass 1 in any case. 
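 * (Concretely, the test below keeps the existing BP only for a same-txg
 * rewrite whose compressed size still matches BP_GET_PSIZE() once the
 * sync pass exceeds SYNC_PASS_REWRITE; anything else is BP_ZERO()ed and
 * routed back through the allocating write pipeline.)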
965 */ 966 if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && 967 pass > SYNC_PASS_REWRITE) { 968 ASSERT(psize != 0); 969 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 970 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 971 zio->io_flags |= ZIO_FLAG_IO_REWRITE; 972 } else { 973 BP_ZERO(bp); 974 zio->io_pipeline = ZIO_WRITE_PIPELINE; 975 } 976 977 if (psize == 0) { 978 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 979 } else { 980 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 981 BP_SET_LSIZE(bp, lsize); 982 BP_SET_PSIZE(bp, psize); 983 BP_SET_COMPRESS(bp, compress); 984 BP_SET_CHECKSUM(bp, zp->zp_checksum); 985 BP_SET_TYPE(bp, zp->zp_type); 986 BP_SET_LEVEL(bp, zp->zp_level); 987 BP_SET_DEDUP(bp, zp->zp_dedup); 988 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 989 if (zp->zp_dedup) { 990 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 991 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 992 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 993 } 994 } 995 996 return (ZIO_PIPELINE_CONTINUE); 997 } 998 999 static int 1000 zio_free_bp_init(zio_t *zio) 1001 { 1002 blkptr_t *bp = zio->io_bp; 1003 1004 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1005 if (BP_GET_DEDUP(bp)) 1006 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1007 else 1008 arc_free(zio->io_spa, bp); 1009 } 1010 1011 return (ZIO_PIPELINE_CONTINUE); 1012 } 1013 1014 /* 1015 * ========================================================================== 1016 * Execute the I/O pipeline 1017 * ========================================================================== 1018 */ 1019 1020 static void 1021 zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) 1022 { 1023 zio_type_t t = zio->io_type; 1024 1025 /* 1026 * If we're a config writer or a probe, the normal issue and 1027 * interrupt threads may all be blocked waiting for the config lock. 1028 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1029 */ 1030 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1031 t = ZIO_TYPE_NULL; 1032 1033 /* 1034 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1035 */ 1036 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1037 t = ZIO_TYPE_NULL; 1038 1039 (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q], 1040 (task_func_t *)zio_execute, zio, TQ_SLEEP); 1041 } 1042 1043 static boolean_t 1044 zio_taskq_member(zio_t *zio, enum zio_taskq_type q) 1045 { 1046 kthread_t *executor = zio->io_executor; 1047 spa_t *spa = zio->io_spa; 1048 1049 for (zio_type_t t = 0; t < ZIO_TYPES; t++) 1050 if (taskq_member(spa->spa_zio_taskq[t][q], executor)) 1051 return (B_TRUE); 1052 1053 return (B_FALSE); 1054 } 1055 1056 static int 1057 zio_issue_async(zio_t *zio) 1058 { 1059 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 1060 1061 return (ZIO_PIPELINE_STOP); 1062 } 1063 1064 void 1065 zio_interrupt(zio_t *zio) 1066 { 1067 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT); 1068 } 1069 1070 /* 1071 * Execute the I/O pipeline until one of the following occurs: 1072 * (1) the I/O completes; (2) the pipeline stalls waiting for 1073 * dependent child I/Os; (3) the I/O issues, so we're waiting 1074 * for an I/O completion interrupt; (4) the I/O is delegated by 1075 * vdev-level caching or aggregation; (5) the I/O is deferred 1076 * due to vdev-level queueing; (6) the I/O is handed off to 1077 * another thread. In all cases, the pipeline stops whenever 1078 * there's no CPU work; it never burns a thread in cv_wait(). 
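 *
 * Callers normally drive the pipeline through zio_wait() (synchronous)
 * or zio_nowait() (fire-and-forget) rather than calling zio_execute()
 * directly.  A minimal sketch of a synchronous logical read -- purely
 * illustrative; the buffer, block pointer and bookmark are assumed to
 * exist -- looks like:
 *
 *	void *buf = zio_buf_alloc(BP_GET_LSIZE(bp));
 *	error = zio_wait(zio_read(NULL, spa, bp, buf, BP_GET_LSIZE(bp),
 *	    NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, zb));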
1079 * 1080 * There's no locking on io_stage because there's no legitimate way 1081 * for multiple threads to be attempting to process the same I/O. 1082 */ 1083 static zio_pipe_stage_t *zio_pipeline[]; 1084 1085 void 1086 zio_execute(zio_t *zio) 1087 { 1088 zio->io_executor = curthread; 1089 1090 while (zio->io_stage < ZIO_STAGE_DONE) { 1091 enum zio_stage pipeline = zio->io_pipeline; 1092 enum zio_stage stage = zio->io_stage; 1093 int rv; 1094 1095 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1096 ASSERT(ISP2(stage)); 1097 ASSERT(zio->io_stall == NULL); 1098 1099 do { 1100 stage <<= 1; 1101 } while ((stage & pipeline) == 0); 1102 1103 ASSERT(stage <= ZIO_STAGE_DONE); 1104 1105 /* 1106 * If we are in interrupt context and this pipeline stage 1107 * will grab a config lock that is held across I/O, 1108 * or may wait for an I/O that needs an interrupt thread 1109 * to complete, issue async to avoid deadlock. 1110 */ 1111 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1112 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1113 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 1114 return; 1115 } 1116 1117 zio->io_stage = stage; 1118 rv = zio_pipeline[highbit(stage) - 1](zio); 1119 1120 if (rv == ZIO_PIPELINE_STOP) 1121 return; 1122 1123 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1124 } 1125 } 1126 1127 /* 1128 * ========================================================================== 1129 * Initiate I/O, either sync or async 1130 * ========================================================================== 1131 */ 1132 int 1133 zio_wait(zio_t *zio) 1134 { 1135 int error; 1136 1137 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1138 ASSERT(zio->io_executor == NULL); 1139 1140 zio->io_waiter = curthread; 1141 1142 zio_execute(zio); 1143 1144 mutex_enter(&zio->io_lock); 1145 while (zio->io_executor != NULL) 1146 cv_wait(&zio->io_cv, &zio->io_lock); 1147 mutex_exit(&zio->io_lock); 1148 1149 error = zio->io_error; 1150 zio_destroy(zio); 1151 1152 return (error); 1153 } 1154 1155 void 1156 zio_nowait(zio_t *zio) 1157 { 1158 ASSERT(zio->io_executor == NULL); 1159 1160 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1161 zio_unique_parent(zio) == NULL) { 1162 /* 1163 * This is a logical async I/O with no parent to wait for it. 1164 * We add it to the spa_async_root_zio "Godfather" I/O which 1165 * will ensure they complete prior to unloading the pool. 1166 */ 1167 spa_t *spa = zio->io_spa; 1168 1169 zio_add_child(spa->spa_async_zio_root, zio); 1170 } 1171 1172 zio_execute(zio); 1173 } 1174 1175 /* 1176 * ========================================================================== 1177 * Reexecute or suspend/resume failed I/O 1178 * ========================================================================== 1179 */ 1180 1181 static void 1182 zio_reexecute(zio_t *pio) 1183 { 1184 zio_t *cio, *cio_next; 1185 1186 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1187 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1188 ASSERT(pio->io_gang_leader == NULL); 1189 ASSERT(pio->io_gang_tree == NULL); 1190 1191 pio->io_flags = pio->io_orig_flags; 1192 pio->io_stage = pio->io_orig_stage; 1193 pio->io_pipeline = pio->io_orig_pipeline; 1194 pio->io_reexecute = 0; 1195 pio->io_error = 0; 1196 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1197 pio->io_state[w] = 0; 1198 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1199 pio->io_child_error[c] = 0; 1200 1201 if (IO_IS_ALLOCATING(pio)) 1202 BP_ZERO(pio->io_bp); 1203 1204 /* 1205 * As we reexecute pio's children, new children could be created. 
1206 * New children go to the head of pio's io_child_list, however, 1207 * so we will (correctly) not reexecute them. The key is that 1208 * the remainder of pio's io_child_list, from 'cio_next' onward, 1209 * cannot be affected by any side effects of reexecuting 'cio'. 1210 */ 1211 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 1212 cio_next = zio_walk_children(pio); 1213 mutex_enter(&pio->io_lock); 1214 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1215 pio->io_children[cio->io_child_type][w]++; 1216 mutex_exit(&pio->io_lock); 1217 zio_reexecute(cio); 1218 } 1219 1220 /* 1221 * Now that all children have been reexecuted, execute the parent. 1222 * We don't reexecute "The Godfather" I/O here as it's the 1223 * responsibility of the caller to wait on him. 1224 */ 1225 if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 1226 zio_execute(pio); 1227 } 1228 1229 void 1230 zio_suspend(spa_t *spa, zio_t *zio) 1231 { 1232 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1233 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1234 "failure and the failure mode property for this pool " 1235 "is set to panic.", spa_name(spa)); 1236 1237 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1238 1239 mutex_enter(&spa->spa_suspend_lock); 1240 1241 if (spa->spa_suspend_zio_root == NULL) 1242 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1243 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 1244 ZIO_FLAG_GODFATHER); 1245 1246 spa->spa_suspended = B_TRUE; 1247 1248 if (zio != NULL) { 1249 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1250 ASSERT(zio != spa->spa_suspend_zio_root); 1251 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1252 ASSERT(zio_unique_parent(zio) == NULL); 1253 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1254 zio_add_child(spa->spa_suspend_zio_root, zio); 1255 } 1256 1257 mutex_exit(&spa->spa_suspend_lock); 1258 } 1259 1260 int 1261 zio_resume(spa_t *spa) 1262 { 1263 zio_t *pio; 1264 1265 /* 1266 * Reexecute all previously suspended i/o. 1267 */ 1268 mutex_enter(&spa->spa_suspend_lock); 1269 spa->spa_suspended = B_FALSE; 1270 cv_broadcast(&spa->spa_suspend_cv); 1271 pio = spa->spa_suspend_zio_root; 1272 spa->spa_suspend_zio_root = NULL; 1273 mutex_exit(&spa->spa_suspend_lock); 1274 1275 if (pio == NULL) 1276 return (0); 1277 1278 zio_reexecute(pio); 1279 return (zio_wait(pio)); 1280 } 1281 1282 void 1283 zio_resume_wait(spa_t *spa) 1284 { 1285 mutex_enter(&spa->spa_suspend_lock); 1286 while (spa_suspended(spa)) 1287 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1288 mutex_exit(&spa->spa_suspend_lock); 1289 } 1290 1291 /* 1292 * ========================================================================== 1293 * Gang blocks. 1294 * 1295 * A gang block is a collection of small blocks that looks to the DMU 1296 * like one large block. When zio_dva_allocate() cannot find a block 1297 * of the requested size, due to either severe fragmentation or the pool 1298 * being nearly full, it calls zio_write_gang_block() to construct the 1299 * block from smaller fragments. 1300 * 1301 * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1302 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1303 * an indirect block: it's an array of block pointers. It consumes 1304 * only one sector and hence is allocatable regardless of fragmentation. 1305 * The gang header's bps point to its gang members, which hold the data. 
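 *
 * (For example -- the sizes here are only illustrative -- a 96K write
 * that cannot find a contiguous 96K segment may be stored as a
 * one-sector gang header whose three bps point at three 32K gang
 * members; zio_write_gang_block() below carves the data into at most
 * SPA_GBH_NBLKPTRS pieces, each rounded up to SPA_MINBLOCKSIZE.)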
1306 * 1307 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1308 * as the verifier to ensure uniqueness of the SHA256 checksum. 1309 * Critically, the gang block bp's blk_cksum is the checksum of the data, 1310 * not the gang header. This ensures that data block signatures (needed for 1311 * deduplication) are independent of how the block is physically stored. 1312 * 1313 * Gang blocks can be nested: a gang member may itself be a gang block. 1314 * Thus every gang block is a tree in which root and all interior nodes are 1315 * gang headers, and the leaves are normal blocks that contain user data. 1316 * The root of the gang tree is called the gang leader. 1317 * 1318 * To perform any operation (read, rewrite, free, claim) on a gang block, 1319 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1320 * in the io_gang_tree field of the original logical i/o by recursively 1321 * reading the gang leader and all gang headers below it. This yields 1322 * an in-core tree containing the contents of every gang header and the 1323 * bps for every constituent of the gang block. 1324 * 1325 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1326 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1327 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1328 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1329 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1330 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1331 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1332 * of the gang header plus zio_checksum_compute() of the data to update the 1333 * gang header's blk_cksum as described above. 1334 * 1335 * The two-phase assemble/issue model solves the problem of partial failure -- 1336 * what if you'd freed part of a gang block but then couldn't read the 1337 * gang header for another part? Assembling the entire gang tree first 1338 * ensures that all the necessary gang header I/O has succeeded before 1339 * starting the actual work of free, claim, or write. Once the gang tree 1340 * is assembled, free and claim are in-memory operations that cannot fail. 1341 * 1342 * In the event that a gang write fails, zio_dva_unallocate() walks the 1343 * gang tree to immediately free (i.e. insert back into the space map) 1344 * everything we've allocated. This ensures that we don't get ENOSPC 1345 * errors during repeated suspend/resume cycles due to a flaky device. 1346 * 1347 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1348 * the gang tree, we won't modify the block, so we can safely defer the free 1349 * (knowing that the block is still intact). If we *can* assemble the gang 1350 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1351 * each constituent bp and we can allocate a new block on the next sync pass. 1352 * 1353 * In all cases, the gang tree allows complete recovery from partial failure. 
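 *
 * (In core, the assembled tree is built from zio_gang_node_t nodes,
 * each holding one gang header buffer (gn_gbh) and up to
 * SPA_GBH_NBLKPTRS child pointers; zio_gang_tree_assemble() and
 * zio_gang_tree_free() below build and tear down that structure.)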
1354 * ========================================================================== 1355 */ 1356 1357 static zio_t * 1358 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1359 { 1360 if (gn != NULL) 1361 return (pio); 1362 1363 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1364 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1365 &pio->io_bookmark)); 1366 } 1367 1368 zio_t * 1369 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1370 { 1371 zio_t *zio; 1372 1373 if (gn != NULL) { 1374 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1375 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1376 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1377 /* 1378 * As we rewrite each gang header, the pipeline will compute 1379 * a new gang block header checksum for it; but no one will 1380 * compute a new data checksum, so we do that here. The one 1381 * exception is the gang leader: the pipeline already computed 1382 * its data checksum because that stage precedes gang assembly. 1383 * (Presently, nothing actually uses interior data checksums; 1384 * this is just good hygiene.) 1385 */ 1386 if (gn != pio->io_gang_leader->io_gang_tree) { 1387 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1388 data, BP_GET_PSIZE(bp)); 1389 } 1390 /* 1391 * If we are here to damage data for testing purposes, 1392 * leave the GBH alone so that we can detect the damage. 1393 */ 1394 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1395 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1396 } else { 1397 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1398 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1399 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1400 } 1401 1402 return (zio); 1403 } 1404 1405 /* ARGSUSED */ 1406 zio_t * 1407 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1408 { 1409 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1410 ZIO_GANG_CHILD_FLAGS(pio))); 1411 } 1412 1413 /* ARGSUSED */ 1414 zio_t * 1415 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1416 { 1417 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1418 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1419 } 1420 1421 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1422 NULL, 1423 zio_read_gang, 1424 zio_rewrite_gang, 1425 zio_free_gang, 1426 zio_claim_gang, 1427 NULL 1428 }; 1429 1430 static void zio_gang_tree_assemble_done(zio_t *zio); 1431 1432 static zio_gang_node_t * 1433 zio_gang_node_alloc(zio_gang_node_t **gnpp) 1434 { 1435 zio_gang_node_t *gn; 1436 1437 ASSERT(*gnpp == NULL); 1438 1439 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1440 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1441 *gnpp = gn; 1442 1443 return (gn); 1444 } 1445 1446 static void 1447 zio_gang_node_free(zio_gang_node_t **gnpp) 1448 { 1449 zio_gang_node_t *gn = *gnpp; 1450 1451 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1452 ASSERT(gn->gn_child[g] == NULL); 1453 1454 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1455 kmem_free(gn, sizeof (*gn)); 1456 *gnpp = NULL; 1457 } 1458 1459 static void 1460 zio_gang_tree_free(zio_gang_node_t **gnpp) 1461 { 1462 zio_gang_node_t *gn = *gnpp; 1463 1464 if (gn == NULL) 1465 return; 1466 1467 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1468 zio_gang_tree_free(&gn->gn_child[g]); 1469 1470 zio_gang_node_free(gnpp); 1471 } 1472 1473 static void 1474 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1475 { 1476 zio_gang_node_t *gn = 
zio_gang_node_alloc(gnpp); 1477 1478 ASSERT(gio->io_gang_leader == gio); 1479 ASSERT(BP_IS_GANG(bp)); 1480 1481 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1482 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1483 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1484 } 1485 1486 static void 1487 zio_gang_tree_assemble_done(zio_t *zio) 1488 { 1489 zio_t *gio = zio->io_gang_leader; 1490 zio_gang_node_t *gn = zio->io_private; 1491 blkptr_t *bp = zio->io_bp; 1492 1493 ASSERT(gio == zio_unique_parent(zio)); 1494 ASSERT(zio->io_child_count == 0); 1495 1496 if (zio->io_error) 1497 return; 1498 1499 if (BP_SHOULD_BYTESWAP(bp)) 1500 byteswap_uint64_array(zio->io_data, zio->io_size); 1501 1502 ASSERT(zio->io_data == gn->gn_gbh); 1503 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1504 ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 1505 1506 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1507 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1508 if (!BP_IS_GANG(gbp)) 1509 continue; 1510 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1511 } 1512 } 1513 1514 static void 1515 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1516 { 1517 zio_t *gio = pio->io_gang_leader; 1518 zio_t *zio; 1519 1520 ASSERT(BP_IS_GANG(bp) == !!gn); 1521 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1522 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1523 1524 /* 1525 * If you're a gang header, your data is in gn->gn_gbh. 1526 * If you're a gang member, your data is in 'data' and gn == NULL. 1527 */ 1528 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1529 1530 if (gn != NULL) { 1531 ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); 1532 1533 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1534 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1535 if (BP_IS_HOLE(gbp)) 1536 continue; 1537 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1538 data = (char *)data + BP_GET_PSIZE(gbp); 1539 } 1540 } 1541 1542 if (gn == gio->io_gang_tree) 1543 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1544 1545 if (zio != pio) 1546 zio_nowait(zio); 1547 } 1548 1549 static int 1550 zio_gang_assemble(zio_t *zio) 1551 { 1552 blkptr_t *bp = zio->io_bp; 1553 1554 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1555 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1556 1557 zio->io_gang_leader = zio; 1558 1559 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1560 1561 return (ZIO_PIPELINE_CONTINUE); 1562 } 1563 1564 static int 1565 zio_gang_issue(zio_t *zio) 1566 { 1567 blkptr_t *bp = zio->io_bp; 1568 1569 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1570 return (ZIO_PIPELINE_STOP); 1571 1572 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1573 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1574 1575 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1576 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1577 else 1578 zio_gang_tree_free(&zio->io_gang_tree); 1579 1580 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1581 1582 return (ZIO_PIPELINE_CONTINUE); 1583 } 1584 1585 static void 1586 zio_write_gang_member_ready(zio_t *zio) 1587 { 1588 zio_t *pio = zio_unique_parent(zio); 1589 zio_t *gio = zio->io_gang_leader; 1590 dva_t *cdva = zio->io_bp->blk_dva; 1591 dva_t *pdva = pio->io_bp->blk_dva; 1592 uint64_t asize; 1593 1594 if (BP_IS_HOLE(zio->io_bp)) 1595 return; 1596 1597 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1598 1599 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1600 
ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 1601 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1602 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1603 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1604 1605 mutex_enter(&pio->io_lock); 1606 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1607 ASSERT(DVA_GET_GANG(&pdva[d])); 1608 asize = DVA_GET_ASIZE(&pdva[d]); 1609 asize += DVA_GET_ASIZE(&cdva[d]); 1610 DVA_SET_ASIZE(&pdva[d], asize); 1611 } 1612 mutex_exit(&pio->io_lock); 1613 } 1614 1615 static int 1616 zio_write_gang_block(zio_t *pio) 1617 { 1618 spa_t *spa = pio->io_spa; 1619 blkptr_t *bp = pio->io_bp; 1620 zio_t *gio = pio->io_gang_leader; 1621 zio_t *zio; 1622 zio_gang_node_t *gn, **gnpp; 1623 zio_gbh_phys_t *gbh; 1624 uint64_t txg = pio->io_txg; 1625 uint64_t resid = pio->io_size; 1626 uint64_t lsize; 1627 int copies = gio->io_prop.zp_copies; 1628 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 1629 zio_prop_t zp; 1630 int error; 1631 1632 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1633 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1634 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1635 if (error) { 1636 pio->io_error = error; 1637 return (ZIO_PIPELINE_CONTINUE); 1638 } 1639 1640 if (pio == gio) { 1641 gnpp = &gio->io_gang_tree; 1642 } else { 1643 gnpp = pio->io_private; 1644 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1645 } 1646 1647 gn = zio_gang_node_alloc(gnpp); 1648 gbh = gn->gn_gbh; 1649 bzero(gbh, SPA_GANGBLOCKSIZE); 1650 1651 /* 1652 * Create the gang header. 1653 */ 1654 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1655 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1656 1657 /* 1658 * Create and nowait the gang children. 1659 */ 1660 for (int g = 0; resid != 0; resid -= lsize, g++) { 1661 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1662 SPA_MINBLOCKSIZE); 1663 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1664 1665 zp.zp_checksum = gio->io_prop.zp_checksum; 1666 zp.zp_compress = ZIO_COMPRESS_OFF; 1667 zp.zp_type = DMU_OT_NONE; 1668 zp.zp_level = 0; 1669 zp.zp_copies = gio->io_prop.zp_copies; 1670 zp.zp_dedup = 0; 1671 zp.zp_dedup_verify = 0; 1672 1673 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1674 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1675 zio_write_gang_member_ready, NULL, &gn->gn_child[g], 1676 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1677 &pio->io_bookmark)); 1678 } 1679 1680 /* 1681 * Set pio's pipeline to just wait for zio to finish. 
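 * (The gang members created above are children of the gang header zio,
 * so waiting for zio transitively waits for all of them as well.)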
1682 */ 1683 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1684 1685 zio_nowait(zio); 1686 1687 return (ZIO_PIPELINE_CONTINUE); 1688 } 1689 1690 /* 1691 * ========================================================================== 1692 * Dedup 1693 * ========================================================================== 1694 */ 1695 static void 1696 zio_ddt_child_read_done(zio_t *zio) 1697 { 1698 blkptr_t *bp = zio->io_bp; 1699 ddt_entry_t *dde = zio->io_private; 1700 ddt_phys_t *ddp; 1701 zio_t *pio = zio_unique_parent(zio); 1702 1703 mutex_enter(&pio->io_lock); 1704 ddp = ddt_phys_select(dde, bp); 1705 if (zio->io_error == 0) 1706 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1707 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1708 dde->dde_repair_data = zio->io_data; 1709 else 1710 zio_buf_free(zio->io_data, zio->io_size); 1711 mutex_exit(&pio->io_lock); 1712 } 1713 1714 static int 1715 zio_ddt_read_start(zio_t *zio) 1716 { 1717 blkptr_t *bp = zio->io_bp; 1718 1719 ASSERT(BP_GET_DEDUP(bp)); 1720 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1721 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1722 1723 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1724 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1725 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 1726 ddt_phys_t *ddp = dde->dde_phys; 1727 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 1728 blkptr_t blk; 1729 1730 ASSERT(zio->io_vsd == NULL); 1731 zio->io_vsd = dde; 1732 1733 if (ddp_self == NULL) 1734 return (ZIO_PIPELINE_CONTINUE); 1735 1736 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1737 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 1738 continue; 1739 ddt_bp_create(ddt, &dde->dde_key, ddp, &blk); 1740 zio_nowait(zio_read(zio, zio->io_spa, &blk, 1741 zio_buf_alloc(zio->io_size), zio->io_size, 1742 zio_ddt_child_read_done, dde, zio->io_priority, 1743 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 1744 &zio->io_bookmark)); 1745 } 1746 return (ZIO_PIPELINE_CONTINUE); 1747 } 1748 1749 zio_nowait(zio_read(zio, zio->io_spa, bp, 1750 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 1751 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 1752 1753 return (ZIO_PIPELINE_CONTINUE); 1754 } 1755 1756 static int 1757 zio_ddt_read_done(zio_t *zio) 1758 { 1759 blkptr_t *bp = zio->io_bp; 1760 1761 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 1762 return (ZIO_PIPELINE_STOP); 1763 1764 ASSERT(BP_GET_DEDUP(bp)); 1765 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1766 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1767 1768 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1769 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1770 ddt_entry_t *dde = zio->io_vsd; 1771 if (ddt == NULL) { 1772 ASSERT(zio->io_spa->spa_load_state != SPA_LOAD_NONE); 1773 return (ZIO_PIPELINE_CONTINUE); 1774 } 1775 if (dde == NULL) { 1776 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 1777 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 1778 return (ZIO_PIPELINE_STOP); 1779 } 1780 if (dde->dde_repair_data != NULL) { 1781 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 1782 zio->io_child_error[ZIO_CHILD_DDT] = 0; 1783 } 1784 ddt_repair_done(ddt, dde); 1785 zio->io_vsd = NULL; 1786 } 1787 1788 ASSERT(zio->io_vsd == NULL); 1789 1790 return (ZIO_PIPELINE_CONTINUE); 1791 } 1792 1793 static boolean_t 1794 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 1795 { 1796 spa_t *spa = zio->io_spa; 1797 1798 /* 1799 * Note: we compare the original data, not the transformed data, 1800 * because when zio->io_bp is an override bp, we will not have 
1801 * pushed the I/O transforms. That's an important optimization 1802 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 1803 */ 1804 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 1805 zio_t *lio = dde->dde_lead_zio[p]; 1806 1807 if (lio != NULL) { 1808 return (lio->io_orig_size != zio->io_orig_size || 1809 bcmp(zio->io_orig_data, lio->io_orig_data, 1810 zio->io_orig_size) != 0); 1811 } 1812 } 1813 1814 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 1815 ddt_phys_t *ddp = &dde->dde_phys[p]; 1816 1817 if (ddp->ddp_phys_birth != 0) { 1818 arc_buf_t *abuf = NULL; 1819 uint32_t aflags = ARC_WAIT; 1820 blkptr_t blk = *zio->io_bp; 1821 int error; 1822 1823 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 1824 1825 ddt_exit(ddt); 1826 1827 error = arc_read_nolock(NULL, spa, &blk, 1828 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 1829 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1830 &aflags, &zio->io_bookmark); 1831 1832 if (error == 0) { 1833 if (arc_buf_size(abuf) != zio->io_orig_size || 1834 bcmp(abuf->b_data, zio->io_orig_data, 1835 zio->io_orig_size) != 0) 1836 error = EEXIST; 1837 VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); 1838 } 1839 1840 ddt_enter(ddt); 1841 return (error != 0); 1842 } 1843 } 1844 1845 return (B_FALSE); 1846 } 1847 1848 static void 1849 zio_ddt_child_write_ready(zio_t *zio) 1850 { 1851 int p = zio->io_prop.zp_copies; 1852 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 1853 ddt_entry_t *dde = zio->io_private; 1854 ddt_phys_t *ddp = &dde->dde_phys[p]; 1855 zio_t *pio; 1856 1857 if (zio->io_error) 1858 return; 1859 1860 ddt_enter(ddt); 1861 1862 ASSERT(dde->dde_lead_zio[p] == zio); 1863 1864 ddt_phys_fill(ddp, zio->io_bp); 1865 1866 while ((pio = zio_walk_parents(zio)) != NULL) 1867 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 1868 1869 ddt_exit(ddt); 1870 } 1871 1872 static void 1873 zio_ddt_child_write_done(zio_t *zio) 1874 { 1875 int p = zio->io_prop.zp_copies; 1876 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 1877 ddt_entry_t *dde = zio->io_private; 1878 ddt_phys_t *ddp = &dde->dde_phys[p]; 1879 1880 ddt_enter(ddt); 1881 1882 ASSERT(ddp->ddp_refcnt == 0); 1883 ASSERT(dde->dde_lead_zio[p] == zio); 1884 dde->dde_lead_zio[p] = NULL; 1885 1886 if (zio->io_error == 0) { 1887 while (zio_walk_parents(zio) != NULL) 1888 ddt_phys_addref(ddp); 1889 } else { 1890 ddt_phys_clear(ddp); 1891 } 1892 1893 ddt_exit(ddt); 1894 } 1895 1896 static void 1897 zio_ddt_ditto_write_done(zio_t *zio) 1898 { 1899 int p = DDT_PHYS_DITTO; 1900 zio_prop_t *zp = &zio->io_prop; 1901 blkptr_t *bp = zio->io_bp; 1902 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1903 ddt_entry_t *dde = zio->io_private; 1904 ddt_phys_t *ddp = &dde->dde_phys[p]; 1905 ddt_key_t *ddk = &dde->dde_key; 1906 1907 ddt_enter(ddt); 1908 1909 ASSERT(ddp->ddp_refcnt == 0); 1910 ASSERT(dde->dde_lead_zio[p] == zio); 1911 dde->dde_lead_zio[p] = NULL; 1912 1913 if (zio->io_error == 0) { 1914 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 1915 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 1916 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 1917 if (ddp->ddp_phys_birth != 0) 1918 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 1919 ddt_phys_fill(ddp, bp); 1920 } 1921 1922 ddt_exit(ddt); 1923 } 1924 1925 static int 1926 zio_ddt_write(zio_t *zio) 1927 { 1928 spa_t *spa = zio->io_spa; 1929 blkptr_t *bp = zio->io_bp; 1930 uint64_t txg = zio->io_txg; 1931 zio_prop_t *zp = &zio->io_prop; 1932 int p = zp->zp_copies; 1933 int ditto_copies; 1934 zio_t *cio = NULL; 1935 zio_t *dio = NULL; 
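	/*
	 * Note: cio, if created below, is the child write that physically
	 * stores the data when no existing on-disk copy can simply be
	 * referenced; dio, if created, is the extra "ditto" write issued
	 * when ddt_ditto_copies_needed() says the entry deserves more
	 * copies than are currently present.
	 */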
1936 ddt_t *ddt = ddt_select(spa, bp); 1937 ddt_entry_t *dde; 1938 ddt_phys_t *ddp; 1939 1940 ASSERT(BP_GET_DEDUP(bp)); 1941 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 1942 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 1943 1944 ddt_enter(ddt); 1945 dde = ddt_lookup(ddt, bp, B_TRUE); 1946 ddp = &dde->dde_phys[p]; 1947 1948 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 1949 /* 1950 * If we're using a weak checksum, upgrade to a strong checksum 1951 * and try again. If we're already using a strong checksum, 1952 * we can't resolve it, so just convert to an ordinary write. 1953 * (And automatically e-mail a paper to Nature?) 1954 */ 1955 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 1956 zp->zp_checksum = spa_dedup_checksum(spa); 1957 zio_pop_transforms(zio); 1958 zio->io_stage = ZIO_STAGE_OPEN; 1959 BP_ZERO(bp); 1960 } else { 1961 zp->zp_dedup = 0; 1962 } 1963 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1964 ddt_exit(ddt); 1965 return (ZIO_PIPELINE_CONTINUE); 1966 } 1967 1968 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 1969 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 1970 1971 if (ditto_copies > ddt_ditto_copies_present(dde) && 1972 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 1973 zio_prop_t czp = *zp; 1974 1975 czp.zp_copies = ditto_copies; 1976 1977 /* 1978 * If we arrived here with an override bp, we won't have run 1979 * the transform stack, so we won't have the data we need to 1980 * generate a child i/o. So, toss the override bp and restart. 1981 * This is safe, because using the override bp is just an 1982 * optimization; and it's rare, so the cost doesn't matter. 1983 */ 1984 if (zio->io_bp_override) { 1985 zio_pop_transforms(zio); 1986 zio->io_stage = ZIO_STAGE_OPEN; 1987 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1988 zio->io_bp_override = NULL; 1989 BP_ZERO(bp); 1990 ddt_exit(ddt); 1991 return (ZIO_PIPELINE_CONTINUE); 1992 } 1993 1994 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 1995 zio->io_orig_size, &czp, NULL, 1996 zio_ddt_ditto_write_done, dde, zio->io_priority, 1997 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 1998 1999 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2000 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2001 } 2002 2003 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2004 if (ddp->ddp_phys_birth != 0) 2005 ddt_bp_fill(ddp, bp, txg); 2006 if (dde->dde_lead_zio[p] != NULL) 2007 zio_add_child(zio, dde->dde_lead_zio[p]); 2008 else 2009 ddt_phys_addref(ddp); 2010 } else if (zio->io_bp_override) { 2011 ASSERT(bp->blk_birth == txg); 2012 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2013 ddt_phys_fill(ddp, bp); 2014 ddt_phys_addref(ddp); 2015 } else { 2016 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2017 zio->io_orig_size, zp, zio_ddt_child_write_ready, 2018 zio_ddt_child_write_done, dde, zio->io_priority, 2019 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2020 2021 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2022 dde->dde_lead_zio[p] = cio; 2023 } 2024 2025 ddt_exit(ddt); 2026 2027 if (cio) 2028 zio_nowait(cio); 2029 if (dio) 2030 zio_nowait(dio); 2031 2032 return (ZIO_PIPELINE_CONTINUE); 2033 } 2034 2035 static int 2036 zio_ddt_free(zio_t *zio) 2037 { 2038 spa_t *spa = zio->io_spa; 2039 blkptr_t *bp = zio->io_bp; 2040 ddt_t *ddt = ddt_select(spa, bp); 2041 ddt_entry_t *dde; 2042 ddt_phys_t *ddp; 2043 2044 ASSERT(BP_GET_DEDUP(bp)); 2045 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2046 2047 ddt_enter(ddt); 2048 dde = ddt_lookup(ddt, bp, B_TRUE); 2049 ddp = 
ddt_phys_select(dde, bp); 2050 ddt_phys_decref(ddp); 2051 ddt_exit(ddt); 2052 2053 return (ZIO_PIPELINE_CONTINUE); 2054 } 2055 2056 /* 2057 * ========================================================================== 2058 * Allocate and free blocks 2059 * ========================================================================== 2060 */ 2061 static int 2062 zio_dva_allocate(zio_t *zio) 2063 { 2064 spa_t *spa = zio->io_spa; 2065 metaslab_class_t *mc = spa_normal_class(spa); 2066 blkptr_t *bp = zio->io_bp; 2067 int error; 2068 2069 if (zio->io_gang_leader == NULL) { 2070 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2071 zio->io_gang_leader = zio; 2072 } 2073 2074 ASSERT(BP_IS_HOLE(bp)); 2075 ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 2076 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2077 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2078 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2079 2080 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2081 zio->io_prop.zp_copies, zio->io_txg, NULL, 0); 2082 2083 if (error) { 2084 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2085 return (zio_write_gang_block(zio)); 2086 zio->io_error = error; 2087 } 2088 2089 return (ZIO_PIPELINE_CONTINUE); 2090 } 2091 2092 static int 2093 zio_dva_free(zio_t *zio) 2094 { 2095 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2096 2097 return (ZIO_PIPELINE_CONTINUE); 2098 } 2099 2100 static int 2101 zio_dva_claim(zio_t *zio) 2102 { 2103 int error; 2104 2105 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2106 if (error) 2107 zio->io_error = error; 2108 2109 return (ZIO_PIPELINE_CONTINUE); 2110 } 2111 2112 /* 2113 * Undo an allocation. This is used by zio_done() when an I/O fails 2114 * and we want to give back the block we just allocated. 2115 * This handles both normal blocks and gang blocks. 2116 */ 2117 static void 2118 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2119 { 2120 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2121 ASSERT(zio->io_bp_override == NULL); 2122 2123 if (!BP_IS_HOLE(bp)) 2124 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2125 2126 if (gn != NULL) { 2127 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2128 zio_dva_unallocate(zio, gn->gn_child[g], 2129 &gn->gn_gbh->zg_blkptr[g]); 2130 } 2131 } 2132 } 2133 2134 /* 2135 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2136 */ 2137 int 2138 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2139 uint64_t size, boolean_t use_slog) 2140 { 2141 int error = 1; 2142 2143 ASSERT(txg > spa_syncing_txg(spa)); 2144 2145 if (use_slog) 2146 error = metaslab_alloc(spa, spa_log_class(spa), size, 2147 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 2148 2149 if (error) 2150 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2151 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 2152 2153 if (error == 0) { 2154 BP_SET_LSIZE(new_bp, size); 2155 BP_SET_PSIZE(new_bp, size); 2156 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2157 BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); 2158 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2159 BP_SET_LEVEL(new_bp, 0); 2160 BP_SET_DEDUP(new_bp, 0); 2161 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2162 } 2163 2164 return (error); 2165 } 2166 2167 /* 2168 * Free an intent log block. 
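 * These are the blocks handed out by zio_alloc_zil() above; they are always
 * plain, non-gang intent-log blocks, which the asserts below verify.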
2169 */ 2170 void 2171 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2172 { 2173 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2174 ASSERT(!BP_IS_GANG(bp)); 2175 2176 zio_free(spa, txg, bp); 2177 } 2178 2179 /* 2180 * ========================================================================== 2181 * Read and write to physical devices 2182 * ========================================================================== 2183 */ 2184 static int 2185 zio_vdev_io_start(zio_t *zio) 2186 { 2187 vdev_t *vd = zio->io_vd; 2188 uint64_t align; 2189 spa_t *spa = zio->io_spa; 2190 2191 ASSERT(zio->io_error == 0); 2192 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2193 2194 if (vd == NULL) { 2195 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2196 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2197 2198 /* 2199 * The mirror_ops handle multiple DVAs in a single BP. 2200 */ 2201 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2202 } 2203 2204 align = 1ULL << vd->vdev_top->vdev_ashift; 2205 2206 if (P2PHASE(zio->io_size, align) != 0) { 2207 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2208 char *abuf = zio_buf_alloc(asize); 2209 ASSERT(vd == vd->vdev_top); 2210 if (zio->io_type == ZIO_TYPE_WRITE) { 2211 bcopy(zio->io_data, abuf, zio->io_size); 2212 bzero(abuf + zio->io_size, asize - zio->io_size); 2213 } 2214 zio_push_transform(zio, abuf, asize, asize, zio_subblock); 2215 } 2216 2217 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2218 ASSERT(P2PHASE(zio->io_size, align) == 0); 2219 ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 2220 2221 /* 2222 * If this is a repair I/O, and there's no self-healing involved -- 2223 * that is, we're just resilvering what we expect to resilver -- 2224 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2225 * This prevents spurious resilvering with nested replication. 2226 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2227 * A is out of date, we'll read from C+D, then use the data to 2228 * resilver A+B -- but we don't actually want to resilver B, just A. 2229 * The top-level mirror has no way to know this, so instead we just 2230 * discard unnecessary repairs as we work our way down the vdev tree. 2231 * The same logic applies to any form of nested replication: 2232 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2233 */ 2234 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2235 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2236 zio->io_txg != 0 && /* not a delegated i/o */ 2237 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2238 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2239 zio_vdev_io_bypass(zio); 2240 return (ZIO_PIPELINE_CONTINUE); 2241 } 2242 2243 if (vd->vdev_ops->vdev_op_leaf && 2244 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2245 2246 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2247 return (ZIO_PIPELINE_CONTINUE); 2248 2249 if ((zio = vdev_queue_io(zio)) == NULL) 2250 return (ZIO_PIPELINE_STOP); 2251 2252 if (!vdev_accessible(vd, zio)) { 2253 zio->io_error = ENXIO; 2254 zio_interrupt(zio); 2255 return (ZIO_PIPELINE_STOP); 2256 } 2257 } 2258 2259 return (vd->vdev_ops->vdev_op_io_start(zio)); 2260 } 2261 2262 static int 2263 zio_vdev_io_done(zio_t *zio) 2264 { 2265 vdev_t *vd = zio->io_vd; 2266 vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 2267 boolean_t unexpected_error = B_FALSE; 2268 2269 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2270 return (ZIO_PIPELINE_STOP); 2271 2272 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2273 2274 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 2275 2276 vdev_queue_io_done(zio); 2277 2278 if (zio->io_type == ZIO_TYPE_WRITE) 2279 vdev_cache_write(zio); 2280 2281 if (zio_injection_enabled && zio->io_error == 0) 2282 zio->io_error = zio_handle_device_injection(vd, 2283 zio, EIO); 2284 2285 if (zio_injection_enabled && zio->io_error == 0) 2286 zio->io_error = zio_handle_label_injection(zio, EIO); 2287 2288 if (zio->io_error) { 2289 if (!vdev_accessible(vd, zio)) { 2290 zio->io_error = ENXIO; 2291 } else { 2292 unexpected_error = B_TRUE; 2293 } 2294 } 2295 } 2296 2297 ops->vdev_op_io_done(zio); 2298 2299 if (unexpected_error) 2300 VERIFY(vdev_probe(vd, zio) == NULL); 2301 2302 return (ZIO_PIPELINE_CONTINUE); 2303 } 2304 2305 /* 2306 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2307 * disk, and use that to finish the checksum ereport later. 2308 */ 2309 static void 2310 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2311 const void *good_buf) 2312 { 2313 /* no processing needed */ 2314 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2315 } 2316 2317 /*ARGSUSED*/ 2318 void 2319 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2320 { 2321 void *buf = zio_buf_alloc(zio->io_size); 2322 2323 bcopy(zio->io_data, buf, zio->io_size); 2324 2325 zcr->zcr_cbinfo = zio->io_size; 2326 zcr->zcr_cbdata = buf; 2327 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2328 zcr->zcr_free = zio_buf_free; 2329 } 2330 2331 static int 2332 zio_vdev_io_assess(zio_t *zio) 2333 { 2334 vdev_t *vd = zio->io_vd; 2335 2336 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2337 return (ZIO_PIPELINE_STOP); 2338 2339 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2340 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2341 2342 if (zio->io_vsd != NULL) { 2343 zio->io_vsd_ops->vsd_free(zio); 2344 zio->io_vsd = NULL; 2345 } 2346 2347 if (zio_injection_enabled && zio->io_error == 0) 2348 zio->io_error = zio_handle_fault_injection(zio, EIO); 2349 2350 /* 2351 * If the I/O failed, determine whether we should attempt to retry it. 2352 */ 2353 if (zio->io_error && vd == NULL && 2354 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2355 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2356 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2357 zio->io_error = 0; 2358 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2359 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2360 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2361 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); 2362 return (ZIO_PIPELINE_STOP); 2363 } 2364 2365 /* 2366 * If we got an error on a leaf device, convert it to ENXIO 2367 * if the device is not accessible at all. 2368 */ 2369 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2370 !vdev_accessible(vd, zio)) 2371 zio->io_error = ENXIO; 2372 2373 /* 2374 * If we can't write to an interior vdev (mirror or RAID-Z), 2375 * set vdev_cant_write so that we stop trying to allocate from it. 
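 * (Leaf-device errors take a different path: an unexpected error on a leaf
 * vdev has already kicked off a vdev_probe() in zio_vdev_io_done() above.)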
2376 */ 2377 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2378 vd != NULL && !vd->vdev_ops->vdev_op_leaf) 2379 vd->vdev_cant_write = B_TRUE; 2380 2381 if (zio->io_error) 2382 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2383 2384 return (ZIO_PIPELINE_CONTINUE); 2385 } 2386 2387 void 2388 zio_vdev_io_reissue(zio_t *zio) 2389 { 2390 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2391 ASSERT(zio->io_error == 0); 2392 2393 zio->io_stage >>= 1; 2394 } 2395 2396 void 2397 zio_vdev_io_redone(zio_t *zio) 2398 { 2399 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2400 2401 zio->io_stage >>= 1; 2402 } 2403 2404 void 2405 zio_vdev_io_bypass(zio_t *zio) 2406 { 2407 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2408 ASSERT(zio->io_error == 0); 2409 2410 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2411 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2412 } 2413 2414 /* 2415 * ========================================================================== 2416 * Generate and verify checksums 2417 * ========================================================================== 2418 */ 2419 static int 2420 zio_checksum_generate(zio_t *zio) 2421 { 2422 blkptr_t *bp = zio->io_bp; 2423 enum zio_checksum checksum; 2424 2425 if (bp == NULL) { 2426 /* 2427 * This is zio_write_phys(). 2428 * We're either generating a label checksum, or none at all. 2429 */ 2430 checksum = zio->io_prop.zp_checksum; 2431 2432 if (checksum == ZIO_CHECKSUM_OFF) 2433 return (ZIO_PIPELINE_CONTINUE); 2434 2435 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2436 } else { 2437 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2438 ASSERT(!IO_IS_ALLOCATING(zio)); 2439 checksum = ZIO_CHECKSUM_GANG_HEADER; 2440 } else { 2441 checksum = BP_GET_CHECKSUM(bp); 2442 } 2443 } 2444 2445 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2446 2447 return (ZIO_PIPELINE_CONTINUE); 2448 } 2449 2450 static int 2451 zio_checksum_verify(zio_t *zio) 2452 { 2453 zio_bad_cksum_t info; 2454 blkptr_t *bp = zio->io_bp; 2455 int error; 2456 2457 ASSERT(zio->io_vd != NULL); 2458 2459 if (bp == NULL) { 2460 /* 2461 * This is zio_read_phys(). 2462 * We're either verifying a label checksum, or nothing at all. 2463 */ 2464 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2465 return (ZIO_PIPELINE_CONTINUE); 2466 2467 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2468 } 2469 2470 if ((error = zio_checksum_error(zio, &info)) != 0) { 2471 zio->io_error = error; 2472 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2473 zfs_ereport_start_checksum(zio->io_spa, 2474 zio->io_vd, zio, zio->io_offset, 2475 zio->io_size, NULL, &info); 2476 } 2477 } 2478 2479 return (ZIO_PIPELINE_CONTINUE); 2480 } 2481 2482 /* 2483 * Called by RAID-Z to ensure we don't compute the checksum twice. 2484 */ 2485 void 2486 zio_checksum_verified(zio_t *zio) 2487 { 2488 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2489 } 2490 2491 /* 2492 * ========================================================================== 2493 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2494 * An error of 0 indicates success. ENXIO indicates whole-device failure, 2495 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO 2496 * indicate errors that are specific to one I/O, and most likely permanent. 2497 * Any other error is presumed to be worse because we weren't expecting it.
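 * For example, zio_worst_error(ENXIO, ECKSUM) returns ECKSUM, and an
 * unexpected error such as ENOSPC outranks all four listed values.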
2498 * ========================================================================== 2499 */ 2500 int 2501 zio_worst_error(int e1, int e2) 2502 { 2503 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2504 int r1, r2; 2505 2506 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2507 if (e1 == zio_error_rank[r1]) 2508 break; 2509 2510 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2511 if (e2 == zio_error_rank[r2]) 2512 break; 2513 2514 return (r1 > r2 ? e1 : e2); 2515 } 2516 2517 /* 2518 * ========================================================================== 2519 * I/O completion 2520 * ========================================================================== 2521 */ 2522 static int 2523 zio_ready(zio_t *zio) 2524 { 2525 blkptr_t *bp = zio->io_bp; 2526 zio_t *pio, *pio_next; 2527 2528 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2529 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2530 return (ZIO_PIPELINE_STOP); 2531 2532 if (zio->io_ready) { 2533 ASSERT(IO_IS_ALLOCATING(zio)); 2534 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2535 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2536 2537 zio->io_ready(zio); 2538 } 2539 2540 if (bp != NULL && bp != &zio->io_bp_copy) 2541 zio->io_bp_copy = *bp; 2542 2543 if (zio->io_error) 2544 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2545 2546 mutex_enter(&zio->io_lock); 2547 zio->io_state[ZIO_WAIT_READY] = 1; 2548 pio = zio_walk_parents(zio); 2549 mutex_exit(&zio->io_lock); 2550 2551 /* 2552 * As we notify zio's parents, new parents could be added. 2553 * New parents go to the head of zio's io_parent_list, however, 2554 * so we will (correctly) not notify them. The remainder of zio's 2555 * io_parent_list, from 'pio_next' onward, cannot change because 2556 * all parents must wait for us to be done before they can be done. 2557 */ 2558 for (; pio != NULL; pio = pio_next) { 2559 pio_next = zio_walk_parents(zio); 2560 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2561 } 2562 2563 if (zio->io_flags & ZIO_FLAG_NODATA) { 2564 if (BP_IS_GANG(bp)) { 2565 zio->io_flags &= ~ZIO_FLAG_NODATA; 2566 } else { 2567 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2568 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2569 } 2570 } 2571 2572 if (zio_injection_enabled && 2573 zio->io_spa->spa_syncing_txg == zio->io_txg) 2574 zio_handle_ignored_writes(zio); 2575 2576 return (ZIO_PIPELINE_CONTINUE); 2577 } 2578 2579 static int 2580 zio_done(zio_t *zio) 2581 { 2582 spa_t *spa = zio->io_spa; 2583 zio_t *lio = zio->io_logical; 2584 blkptr_t *bp = zio->io_bp; 2585 vdev_t *vd = zio->io_vd; 2586 uint64_t psize = zio->io_size; 2587 zio_t *pio, *pio_next; 2588 2589 /* 2590 * If our children haven't all completed, 2591 * wait for them and then repeat this pipeline stage. 
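 * (zio_wait_for_children() records the child count we are stalled on and
 * winds io_stage back one step, so the last child to complete re-dispatches
 * this zio and the stage is simply retried.)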
2592 */ 2593 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2594 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2595 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2596 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2597 return (ZIO_PIPELINE_STOP); 2598 2599 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2600 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2601 ASSERT(zio->io_children[c][w] == 0); 2602 2603 if (bp != NULL) { 2604 ASSERT(bp->blk_pad[0] == 0); 2605 ASSERT(bp->blk_pad[1] == 0); 2606 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2607 (bp == zio_unique_parent(zio)->io_bp)); 2608 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2609 zio->io_bp_override == NULL && 2610 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2611 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2612 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2613 ASSERT(BP_COUNT_GANG(bp) == 0 || 2614 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2615 } 2616 } 2617 2618 /* 2619 * If there were child vdev/gang/ddt errors, they apply to us now. 2620 */ 2621 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2622 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2623 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2624 2625 /* 2626 * If the I/O on the transformed data was successful, generate any 2627 * checksum reports now while we still have the transformed data. 2628 */ 2629 if (zio->io_error == 0) { 2630 while (zio->io_cksum_report != NULL) { 2631 zio_cksum_report_t *zcr = zio->io_cksum_report; 2632 uint64_t align = zcr->zcr_align; 2633 uint64_t asize = P2ROUNDUP(psize, align); 2634 char *abuf = zio->io_data; 2635 2636 if (asize != psize) { 2637 abuf = zio_buf_alloc(asize); 2638 bcopy(zio->io_data, abuf, psize); 2639 bzero(abuf + psize, asize - psize); 2640 } 2641 2642 zio->io_cksum_report = zcr->zcr_next; 2643 zcr->zcr_next = NULL; 2644 zcr->zcr_finish(zcr, abuf); 2645 zfs_ereport_free_checksum(zcr); 2646 2647 if (asize != psize) 2648 zio_buf_free(abuf, asize); 2649 } 2650 } 2651 2652 zio_pop_transforms(zio); /* note: may set zio->io_error */ 2653 2654 vdev_stat_update(zio, psize); 2655 2656 if (zio->io_error) { 2657 /* 2658 * If this I/O is attached to a particular vdev, 2659 * generate an error message describing the I/O failure 2660 * at the block level. We ignore these errors if the 2661 * device is currently unavailable. 2662 */ 2663 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 2664 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 2665 2666 if ((zio->io_error == EIO || !(zio->io_flags & 2667 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 2668 zio == lio) { 2669 /* 2670 * For logical I/O requests, tell the SPA to log the 2671 * error and generate a logical data ereport. 2672 */ 2673 spa_log_error(spa, zio); 2674 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 2675 0, 0); 2676 } 2677 } 2678 2679 if (zio->io_error && zio == lio) { 2680 /* 2681 * Determine whether zio should be reexecuted. This will 2682 * propagate all the way to the root via zio_notify_parent(). 
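 * ZIO_REEXECUTE_NOW asks the root to reexecute the tree right away (it is
 * handed to the claim taskq at the bottom of this function), while
 * ZIO_REEXECUTE_SUSPEND parks the tree via zio_suspend() until the pool
 * is resumed.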
2683 */ 2684 ASSERT(vd == NULL && bp != NULL); 2685 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2686 2687 if (IO_IS_ALLOCATING(zio) && 2688 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 2689 if (zio->io_error != ENOSPC) 2690 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 2691 else 2692 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2693 } 2694 2695 if ((zio->io_type == ZIO_TYPE_READ || 2696 zio->io_type == ZIO_TYPE_FREE) && 2697 zio->io_error == ENXIO && 2698 spa->spa_load_state == SPA_LOAD_NONE && 2699 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 2700 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2701 2702 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 2703 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2704 2705 /* 2706 * Here is a possibly good place to attempt to do 2707 * either combinatorial reconstruction or error correction 2708 * based on checksums. It also might be a good place 2709 * to send out preliminary ereports before we suspend 2710 * processing. 2711 */ 2712 } 2713 2714 /* 2715 * If there were logical child errors, they apply to us now. 2716 * We defer this until now to avoid conflating logical child 2717 * errors with errors that happened to the zio itself when 2718 * updating vdev stats and reporting FMA events above. 2719 */ 2720 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 2721 2722 if ((zio->io_error || zio->io_reexecute) && 2723 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 2724 !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) 2725 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 2726 2727 zio_gang_tree_free(&zio->io_gang_tree); 2728 2729 /* 2730 * Godfather I/Os should never suspend. 2731 */ 2732 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 2733 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 2734 zio->io_reexecute = 0; 2735 2736 if (zio->io_reexecute) { 2737 /* 2738 * This is a logical I/O that wants to reexecute. 2739 * 2740 * Reexecute is top-down. When an i/o fails, if it's not 2741 * the root, it simply notifies its parent and sticks around. 2742 * The parent, seeing that it still has children in zio_done(), 2743 * does the same. This percolates all the way up to the root. 2744 * The root i/o will reexecute or suspend the entire tree. 2745 * 2746 * This approach ensures that zio_reexecute() honors 2747 * all the original i/o dependency relationships, e.g. 2748 * parents not executing until children are ready. 2749 */ 2750 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2751 2752 zio->io_gang_leader = NULL; 2753 2754 mutex_enter(&zio->io_lock); 2755 zio->io_state[ZIO_WAIT_DONE] = 1; 2756 mutex_exit(&zio->io_lock); 2757 2758 /* 2759 * "The Godfather" I/O monitors its children but is 2760 * not a true parent to them. It will track them through 2761 * the pipeline but severs its ties whenever they get into 2762 * trouble (e.g. suspended). This allows "The Godfather" 2763 * I/O to return status without blocking. 2764 */ 2765 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 2766 zio_link_t *zl = zio->io_walk_link; 2767 pio_next = zio_walk_parents(zio); 2768 2769 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 2770 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 2771 zio_remove_child(pio, zio, zl); 2772 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2773 } 2774 } 2775 2776 if ((pio = zio_unique_parent(zio)) != NULL) { 2777 /* 2778 * We're not a root i/o, so there's nothing to do 2779 * but notify our parent. Don't propagate errors 2780 * upward since we haven't permanently failed yet. 
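 * (Setting ZIO_FLAG_DONT_PROPAGATE below is what keeps zio_notify_parent()
 * from folding our error into the parent's child-error state.)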
2781 */ 2782 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 2783 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 2784 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2785 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 2786 /* 2787 * We'd fail again if we reexecuted now, so suspend 2788 * until conditions improve (e.g. device comes online). 2789 */ 2790 zio_suspend(spa, zio); 2791 } else { 2792 /* 2793 * Reexecution is potentially a huge amount of work. 2794 * Hand it off to the otherwise-unused claim taskq. 2795 */ 2796 (void) taskq_dispatch( 2797 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 2798 (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 2799 } 2800 return (ZIO_PIPELINE_STOP); 2801 } 2802 2803 ASSERT(zio->io_child_count == 0); 2804 ASSERT(zio->io_reexecute == 0); 2805 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 2806 2807 /* 2808 * Report any checksum errors, since the I/O is complete. 2809 */ 2810 while (zio->io_cksum_report != NULL) { 2811 zio_cksum_report_t *zcr = zio->io_cksum_report; 2812 zio->io_cksum_report = zcr->zcr_next; 2813 zcr->zcr_next = NULL; 2814 zcr->zcr_finish(zcr, NULL); 2815 zfs_ereport_free_checksum(zcr); 2816 } 2817 2818 /* 2819 * It is the responsibility of the done callback to ensure that this 2820 * particular zio is no longer discoverable for adoption, and as 2821 * such, cannot acquire any new parents. 2822 */ 2823 if (zio->io_done) 2824 zio->io_done(zio); 2825 2826 mutex_enter(&zio->io_lock); 2827 zio->io_state[ZIO_WAIT_DONE] = 1; 2828 mutex_exit(&zio->io_lock); 2829 2830 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 2831 zio_link_t *zl = zio->io_walk_link; 2832 pio_next = zio_walk_parents(zio); 2833 zio_remove_child(pio, zio, zl); 2834 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2835 } 2836 2837 if (zio->io_waiter != NULL) { 2838 mutex_enter(&zio->io_lock); 2839 zio->io_executor = NULL; 2840 cv_broadcast(&zio->io_cv); 2841 mutex_exit(&zio->io_lock); 2842 } else { 2843 zio_destroy(zio); 2844 } 2845 2846 return (ZIO_PIPELINE_STOP); 2847 } 2848 2849 /* 2850 * ========================================================================== 2851 * I/O pipeline definition 2852 * ========================================================================== 2853 */ 2854 static zio_pipe_stage_t *zio_pipeline[] = { 2855 NULL, 2856 zio_read_bp_init, 2857 zio_free_bp_init, 2858 zio_issue_async, 2859 zio_write_bp_init, 2860 zio_checksum_generate, 2861 zio_ddt_read_start, 2862 zio_ddt_read_done, 2863 zio_ddt_write, 2864 zio_ddt_free, 2865 zio_gang_assemble, 2866 zio_gang_issue, 2867 zio_dva_allocate, 2868 zio_dva_free, 2869 zio_dva_claim, 2870 zio_ready, 2871 zio_vdev_io_start, 2872 zio_vdev_io_done, 2873 zio_vdev_io_assess, 2874 zio_checksum_verify, 2875 zio_done 2876 }; 2877
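/*
 * Note on the table above: zio_pipeline[] is indexed by stage number, so its
 * order must match enum zio_stage in zio_impl.h; the leading NULL entry
 * corresponds to ZIO_STAGE_OPEN, which is never dispatched.  As a rough
 * sketch (not verbatim from zio_execute(), defined earlier in this file),
 * each pass of the executor advances io_stage to the next bit set in
 * io_pipeline and calls the matching entry:
 *
 *	enum zio_stage stage = zio->io_stage;
 *	do {
 *		stage <<= 1;
 *	} while ((stage & zio->io_pipeline) == 0);
 *	zio->io_stage = stage;
 *	rv = zio_pipeline[highbit(stage) - 1](zio);
 */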