1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 #include <sys/zfs_context.h> 26 #include <sys/fm/fs/zfs.h> 27 #include <sys/spa.h> 28 #include <sys/txg.h> 29 #include <sys/spa_impl.h> 30 #include <sys/vdev_impl.h> 31 #include <sys/zio_impl.h> 32 #include <sys/zio_compress.h> 33 #include <sys/zio_checksum.h> 34 #include <sys/dmu_objset.h> 35 #include <sys/arc.h> 36 #include <sys/ddt.h> 37 38 /* 39 * ========================================================================== 40 * I/O priority table 41 * ========================================================================== 42 */ 43 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 44 0, /* ZIO_PRIORITY_NOW */ 45 0, /* ZIO_PRIORITY_SYNC_READ */ 46 0, /* ZIO_PRIORITY_SYNC_WRITE */ 47 0, /* ZIO_PRIORITY_LOG_WRITE */ 48 1, /* ZIO_PRIORITY_CACHE_FILL */ 49 1, /* ZIO_PRIORITY_AGG */ 50 4, /* ZIO_PRIORITY_FREE */ 51 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 52 6, /* ZIO_PRIORITY_ASYNC_READ */ 53 10, /* ZIO_PRIORITY_RESILVER */ 54 20, /* ZIO_PRIORITY_SCRUB */ 55 }; 56 57 /* 58 * ========================================================================== 59 * I/O type descriptions 60 * ========================================================================== 61 */ 62 char *zio_type_name[ZIO_TYPES] = { 63 "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 64 "zio_ioctl" 65 }; 66 67 /* 68 * ========================================================================== 69 * I/O kmem caches 70 * ========================================================================== 71 */ 72 kmem_cache_t *zio_cache; 73 kmem_cache_t *zio_link_cache; 74 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 75 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 76 77 #ifdef _KERNEL 78 extern vmem_t *zio_alloc_arena; 79 #endif 80 81 /* 82 * An allocating zio is one that either currently has the DVA allocate 83 * stage set or will have it later in its lifetime. 84 */ 85 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 86 87 boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 88 89 #ifdef ZFS_DEBUG 90 int zio_buf_debug_limit = 16384; 91 #else 92 int zio_buf_debug_limit = 0; 93 #endif 94 95 void 96 zio_init(void) 97 { 98 size_t c; 99 vmem_t *data_alloc_arena = NULL; 100 101 #ifdef _KERNEL 102 data_alloc_arena = zio_alloc_arena; 103 #endif 104 zio_cache = kmem_cache_create("zio_cache", 105 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 106 zio_link_cache = kmem_cache_create("zio_link_cache", 107 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 108 109 /* 110 * For small buffers, we want a cache for each multiple of 111 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 112 * for each quarter-power of 2. For large buffers, we want 113 * a cache for each multiple of PAGESIZE. 114 */ 115 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 116 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 117 size_t p2 = size; 118 size_t align = 0; 119 120 while (p2 & (p2 - 1)) 121 p2 &= p2 - 1; 122 123 if (size <= 4 * SPA_MINBLOCKSIZE) { 124 align = SPA_MINBLOCKSIZE; 125 } else if (P2PHASE(size, PAGESIZE) == 0) { 126 align = PAGESIZE; 127 } else if (P2PHASE(size, p2 >> 2) == 0) { 128 align = p2 >> 2; 129 } 130 131 if (align != 0) { 132 char name[36]; 133 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 134 zio_buf_cache[c] = kmem_cache_create(name, size, 135 align, NULL, NULL, NULL, NULL, NULL, 136 size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 137 138 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 139 zio_data_buf_cache[c] = kmem_cache_create(name, size, 140 align, NULL, NULL, NULL, NULL, data_alloc_arena, 141 size > zio_buf_debug_limit ? KMC_NODEBUG : 0); 142 } 143 } 144 145 while (--c != 0) { 146 ASSERT(zio_buf_cache[c] != NULL); 147 if (zio_buf_cache[c - 1] == NULL) 148 zio_buf_cache[c - 1] = zio_buf_cache[c]; 149 150 ASSERT(zio_data_buf_cache[c] != NULL); 151 if (zio_data_buf_cache[c - 1] == NULL) 152 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 153 } 154 155 zio_inject_init(); 156 } 157 158 void 159 zio_fini(void) 160 { 161 size_t c; 162 kmem_cache_t *last_cache = NULL; 163 kmem_cache_t *last_data_cache = NULL; 164 165 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 166 if (zio_buf_cache[c] != last_cache) { 167 last_cache = zio_buf_cache[c]; 168 kmem_cache_destroy(zio_buf_cache[c]); 169 } 170 zio_buf_cache[c] = NULL; 171 172 if (zio_data_buf_cache[c] != last_data_cache) { 173 last_data_cache = zio_data_buf_cache[c]; 174 kmem_cache_destroy(zio_data_buf_cache[c]); 175 } 176 zio_data_buf_cache[c] = NULL; 177 } 178 179 kmem_cache_destroy(zio_link_cache); 180 kmem_cache_destroy(zio_cache); 181 182 zio_inject_fini(); 183 } 184 185 /* 186 * ========================================================================== 187 * Allocate and free I/O buffers 188 * ========================================================================== 189 */ 190 191 /* 192 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 193 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 194 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 195 * excess / transient data in-core during a crashdump. 196 */ 197 void * 198 zio_buf_alloc(size_t size) 199 { 200 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 201 202 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 203 204 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 205 } 206 207 /* 208 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 209 * crashdump if the kernel panics. This exists so that we will limit the amount 210 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 211 * of kernel heap dumped to disk when the kernel panics) 212 */ 213 void * 214 zio_data_buf_alloc(size_t size) 215 { 216 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 217 218 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 219 220 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 221 } 222 223 void 224 zio_buf_free(void *buf, size_t size) 225 { 226 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 227 228 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 229 230 kmem_cache_free(zio_buf_cache[c], buf); 231 } 232 233 void 234 zio_data_buf_free(void *buf, size_t size) 235 { 236 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 237 238 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 239 240 kmem_cache_free(zio_data_buf_cache[c], buf); 241 } 242 243 /* 244 * ========================================================================== 245 * Push and pop I/O transform buffers 246 * ========================================================================== 247 */ 248 static void 249 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 250 zio_transform_func_t *transform) 251 { 252 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 253 254 zt->zt_orig_data = zio->io_data; 255 zt->zt_orig_size = zio->io_size; 256 zt->zt_bufsize = bufsize; 257 zt->zt_transform = transform; 258 259 zt->zt_next = zio->io_transform_stack; 260 zio->io_transform_stack = zt; 261 262 zio->io_data = data; 263 zio->io_size = size; 264 } 265 266 static void 267 zio_pop_transforms(zio_t *zio) 268 { 269 zio_transform_t *zt; 270 271 while ((zt = zio->io_transform_stack) != NULL) { 272 if (zt->zt_transform != NULL) 273 zt->zt_transform(zio, 274 zt->zt_orig_data, zt->zt_orig_size); 275 276 if (zt->zt_bufsize != 0) 277 zio_buf_free(zio->io_data, zt->zt_bufsize); 278 279 zio->io_data = zt->zt_orig_data; 280 zio->io_size = zt->zt_orig_size; 281 zio->io_transform_stack = zt->zt_next; 282 283 kmem_free(zt, sizeof (zio_transform_t)); 284 } 285 } 286 287 /* 288 * ========================================================================== 289 * I/O transform callbacks for subblocks and decompression 290 * ========================================================================== 291 */ 292 static void 293 zio_subblock(zio_t *zio, void *data, uint64_t size) 294 { 295 ASSERT(zio->io_size > size); 296 297 if (zio->io_type == ZIO_TYPE_READ) 298 bcopy(zio->io_data, data, size); 299 } 300 301 static void 302 zio_decompress(zio_t *zio, void *data, uint64_t size) 303 { 304 if (zio->io_error == 0 && 305 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 306 zio->io_data, data, zio->io_size, size) != 0) 307 zio->io_error = EIO; 308 } 309 310 /* 311 * ========================================================================== 312 * I/O parent/child relationships and pipeline interlocks 313 * ========================================================================== 314 */ 315 /* 316 * NOTE - Callers to zio_walk_parents() and zio_walk_children must 317 * continue calling these functions until they return NULL. 318 * Otherwise, the next caller will pick up the list walk in 319 * some indeterminate state. (Otherwise every caller would 320 * have to pass in a cookie to keep the state represented by 321 * io_walk_link, which gets annoying.) 322 */ 323 zio_t * 324 zio_walk_parents(zio_t *cio) 325 { 326 zio_link_t *zl = cio->io_walk_link; 327 list_t *pl = &cio->io_parent_list; 328 329 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 330 cio->io_walk_link = zl; 331 332 if (zl == NULL) 333 return (NULL); 334 335 ASSERT(zl->zl_child == cio); 336 return (zl->zl_parent); 337 } 338 339 zio_t * 340 zio_walk_children(zio_t *pio) 341 { 342 zio_link_t *zl = pio->io_walk_link; 343 list_t *cl = &pio->io_child_list; 344 345 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 346 pio->io_walk_link = zl; 347 348 if (zl == NULL) 349 return (NULL); 350 351 ASSERT(zl->zl_parent == pio); 352 return (zl->zl_child); 353 } 354 355 zio_t * 356 zio_unique_parent(zio_t *cio) 357 { 358 zio_t *pio = zio_walk_parents(cio); 359 360 VERIFY(zio_walk_parents(cio) == NULL); 361 return (pio); 362 } 363 364 void 365 zio_add_child(zio_t *pio, zio_t *cio) 366 { 367 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 368 369 /* 370 * Logical I/Os can have logical, gang, or vdev children. 371 * Gang I/Os can have gang or vdev children. 372 * Vdev I/Os can only have vdev children. 373 * The following ASSERT captures all of these constraints. 374 */ 375 ASSERT(cio->io_child_type <= pio->io_child_type); 376 377 zl->zl_parent = pio; 378 zl->zl_child = cio; 379 380 mutex_enter(&cio->io_lock); 381 mutex_enter(&pio->io_lock); 382 383 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 384 385 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 386 pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 387 388 list_insert_head(&pio->io_child_list, zl); 389 list_insert_head(&cio->io_parent_list, zl); 390 391 pio->io_child_count++; 392 cio->io_parent_count++; 393 394 mutex_exit(&pio->io_lock); 395 mutex_exit(&cio->io_lock); 396 } 397 398 static void 399 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 400 { 401 ASSERT(zl->zl_parent == pio); 402 ASSERT(zl->zl_child == cio); 403 404 mutex_enter(&cio->io_lock); 405 mutex_enter(&pio->io_lock); 406 407 list_remove(&pio->io_child_list, zl); 408 list_remove(&cio->io_parent_list, zl); 409 410 pio->io_child_count--; 411 cio->io_parent_count--; 412 413 mutex_exit(&pio->io_lock); 414 mutex_exit(&cio->io_lock); 415 416 kmem_cache_free(zio_link_cache, zl); 417 } 418 419 static boolean_t 420 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 421 { 422 uint64_t *countp = &zio->io_children[child][wait]; 423 boolean_t waiting = B_FALSE; 424 425 mutex_enter(&zio->io_lock); 426 ASSERT(zio->io_stall == NULL); 427 if (*countp != 0) { 428 zio->io_stage >>= 1; 429 zio->io_stall = countp; 430 waiting = B_TRUE; 431 } 432 mutex_exit(&zio->io_lock); 433 434 return (waiting); 435 } 436 437 static void 438 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 439 { 440 uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 441 int *errorp = &pio->io_child_error[zio->io_child_type]; 442 443 mutex_enter(&pio->io_lock); 444 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 445 *errorp = zio_worst_error(*errorp, zio->io_error); 446 pio->io_reexecute |= zio->io_reexecute; 447 ASSERT3U(*countp, >, 0); 448 if (--*countp == 0 && pio->io_stall == countp) { 449 pio->io_stall = NULL; 450 mutex_exit(&pio->io_lock); 451 zio_execute(pio); 452 } else { 453 mutex_exit(&pio->io_lock); 454 } 455 } 456 457 static void 458 zio_inherit_child_errors(zio_t *zio, enum zio_child c) 459 { 460 if (zio->io_child_error[c] != 0 && zio->io_error == 0) 461 zio->io_error = zio->io_child_error[c]; 462 } 463 464 /* 465 * ========================================================================== 466 * Create the various types of I/O (read, write, free, etc) 467 * ========================================================================== 468 */ 469 static zio_t * 470 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 471 void *data, uint64_t size, zio_done_func_t *done, void *private, 472 zio_type_t type, int priority, enum zio_flag flags, 473 vdev_t *vd, uint64_t offset, const zbookmark_t *zb, 474 enum zio_stage stage, enum zio_stage pipeline) 475 { 476 zio_t *zio; 477 478 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 479 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 480 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 481 482 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 483 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 484 ASSERT(vd || stage == ZIO_STAGE_OPEN); 485 486 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 487 bzero(zio, sizeof (zio_t)); 488 489 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 490 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 491 492 list_create(&zio->io_parent_list, sizeof (zio_link_t), 493 offsetof(zio_link_t, zl_parent_node)); 494 list_create(&zio->io_child_list, sizeof (zio_link_t), 495 offsetof(zio_link_t, zl_child_node)); 496 497 if (vd != NULL) 498 zio->io_child_type = ZIO_CHILD_VDEV; 499 else if (flags & ZIO_FLAG_GANG_CHILD) 500 zio->io_child_type = ZIO_CHILD_GANG; 501 else if (flags & ZIO_FLAG_DDT_CHILD) 502 zio->io_child_type = ZIO_CHILD_DDT; 503 else 504 zio->io_child_type = ZIO_CHILD_LOGICAL; 505 506 if (bp != NULL) { 507 zio->io_bp = (blkptr_t *)bp; 508 zio->io_bp_copy = *bp; 509 zio->io_bp_orig = *bp; 510 if (type != ZIO_TYPE_WRITE || 511 zio->io_child_type == ZIO_CHILD_DDT) 512 zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 513 if (zio->io_child_type == ZIO_CHILD_LOGICAL) 514 zio->io_logical = zio; 515 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 516 pipeline |= ZIO_GANG_STAGES; 517 } 518 519 zio->io_spa = spa; 520 zio->io_txg = txg; 521 zio->io_done = done; 522 zio->io_private = private; 523 zio->io_type = type; 524 zio->io_priority = priority; 525 zio->io_vd = vd; 526 zio->io_offset = offset; 527 zio->io_orig_data = zio->io_data = data; 528 zio->io_orig_size = zio->io_size = size; 529 zio->io_orig_flags = zio->io_flags = flags; 530 zio->io_orig_stage = zio->io_stage = stage; 531 zio->io_orig_pipeline = zio->io_pipeline = pipeline; 532 533 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 534 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 535 536 if (zb != NULL) 537 zio->io_bookmark = *zb; 538 539 if (pio != NULL) { 540 if (zio->io_logical == NULL) 541 zio->io_logical = pio->io_logical; 542 if (zio->io_child_type == ZIO_CHILD_GANG) 543 zio->io_gang_leader = pio->io_gang_leader; 544 zio_add_child(pio, zio); 545 } 546 547 return (zio); 548 } 549 550 static void 551 zio_destroy(zio_t *zio) 552 { 553 list_destroy(&zio->io_parent_list); 554 list_destroy(&zio->io_child_list); 555 mutex_destroy(&zio->io_lock); 556 cv_destroy(&zio->io_cv); 557 kmem_cache_free(zio_cache, zio); 558 } 559 560 zio_t * 561 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 562 void *private, enum zio_flag flags) 563 { 564 zio_t *zio; 565 566 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 567 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 568 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 569 570 return (zio); 571 } 572 573 zio_t * 574 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 575 { 576 return (zio_null(NULL, spa, NULL, done, private, flags)); 577 } 578 579 zio_t * 580 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 581 void *data, uint64_t size, zio_done_func_t *done, void *private, 582 int priority, enum zio_flag flags, const zbookmark_t *zb) 583 { 584 zio_t *zio; 585 586 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 587 data, size, done, private, 588 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 589 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 590 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 591 592 return (zio); 593 } 594 595 zio_t * 596 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 597 void *data, uint64_t size, const zio_prop_t *zp, 598 zio_done_func_t *ready, zio_done_func_t *done, void *private, 599 int priority, enum zio_flag flags, const zbookmark_t *zb) 600 { 601 zio_t *zio; 602 603 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 604 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 605 zp->zp_compress >= ZIO_COMPRESS_OFF && 606 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 607 zp->zp_type < DMU_OT_NUMTYPES && 608 zp->zp_level < 32 && 609 zp->zp_copies > 0 && 610 zp->zp_copies <= spa_max_replication(spa) && 611 zp->zp_dedup <= 1 && 612 zp->zp_dedup_verify <= 1); 613 614 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 615 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 616 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 617 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 618 619 zio->io_ready = ready; 620 zio->io_prop = *zp; 621 622 return (zio); 623 } 624 625 zio_t * 626 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 627 uint64_t size, zio_done_func_t *done, void *private, int priority, 628 enum zio_flag flags, zbookmark_t *zb) 629 { 630 zio_t *zio; 631 632 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 633 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 634 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 635 636 return (zio); 637 } 638 639 void 640 zio_write_override(zio_t *zio, blkptr_t *bp, int copies) 641 { 642 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 643 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 644 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 645 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 646 647 zio->io_prop.zp_copies = copies; 648 zio->io_bp_override = bp; 649 } 650 651 void 652 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 653 { 654 bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp); 655 } 656 657 zio_t * 658 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 659 enum zio_flag flags) 660 { 661 zio_t *zio; 662 663 dprintf_bp(bp, "freeing in txg %llu, pass %u", 664 (longlong_t)txg, spa->spa_sync_pass); 665 666 ASSERT(!BP_IS_HOLE(bp)); 667 ASSERT(spa_syncing_txg(spa) == txg); 668 ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); 669 670 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 671 NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, 672 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 673 674 return (zio); 675 } 676 677 zio_t * 678 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 679 zio_done_func_t *done, void *private, enum zio_flag flags) 680 { 681 zio_t *zio; 682 683 /* 684 * A claim is an allocation of a specific block. Claims are needed 685 * to support immediate writes in the intent log. The issue is that 686 * immediate writes contain committed data, but in a txg that was 687 * *not* committed. Upon opening the pool after an unclean shutdown, 688 * the intent log claims all blocks that contain immediate write data 689 * so that the SPA knows they're in use. 690 * 691 * All claims *must* be resolved in the first txg -- before the SPA 692 * starts allocating blocks -- so that nothing is allocated twice. 693 * If txg == 0 we just verify that the block is claimable. 694 */ 695 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 696 ASSERT(txg == spa_first_txg(spa) || txg == 0); 697 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 698 699 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 700 done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 701 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 702 703 return (zio); 704 } 705 706 zio_t * 707 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 708 zio_done_func_t *done, void *private, int priority, enum zio_flag flags) 709 { 710 zio_t *zio; 711 int c; 712 713 if (vd->vdev_children == 0) { 714 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 715 ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, 716 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 717 718 zio->io_cmd = cmd; 719 } else { 720 zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 721 722 for (c = 0; c < vd->vdev_children; c++) 723 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 724 done, private, priority, flags)); 725 } 726 727 return (zio); 728 } 729 730 zio_t * 731 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 732 void *data, int checksum, zio_done_func_t *done, void *private, 733 int priority, enum zio_flag flags, boolean_t labels) 734 { 735 zio_t *zio; 736 737 ASSERT(vd->vdev_children == 0); 738 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 739 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 740 ASSERT3U(offset + size, <=, vd->vdev_psize); 741 742 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 743 ZIO_TYPE_READ, priority, flags, vd, offset, NULL, 744 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 745 746 zio->io_prop.zp_checksum = checksum; 747 748 return (zio); 749 } 750 751 zio_t * 752 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 753 void *data, int checksum, zio_done_func_t *done, void *private, 754 int priority, enum zio_flag flags, boolean_t labels) 755 { 756 zio_t *zio; 757 758 ASSERT(vd->vdev_children == 0); 759 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 760 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 761 ASSERT3U(offset + size, <=, vd->vdev_psize); 762 763 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 764 ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, 765 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 766 767 zio->io_prop.zp_checksum = checksum; 768 769 if (zio_checksum_table[checksum].ci_eck) { 770 /* 771 * zec checksums are necessarily destructive -- they modify 772 * the end of the write buffer to hold the verifier/checksum. 773 * Therefore, we must make a local copy in case the data is 774 * being written to multiple places in parallel. 775 */ 776 void *wbuf = zio_buf_alloc(size); 777 bcopy(data, wbuf, size); 778 zio_push_transform(zio, wbuf, size, size, NULL); 779 } 780 781 return (zio); 782 } 783 784 /* 785 * Create a child I/O to do some work for us. 786 */ 787 zio_t * 788 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 789 void *data, uint64_t size, int type, int priority, enum zio_flag flags, 790 zio_done_func_t *done, void *private) 791 { 792 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 793 zio_t *zio; 794 795 ASSERT(vd->vdev_parent == 796 (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); 797 798 if (type == ZIO_TYPE_READ && bp != NULL) { 799 /* 800 * If we have the bp, then the child should perform the 801 * checksum and the parent need not. This pushes error 802 * detection as close to the leaves as possible and 803 * eliminates redundant checksums in the interior nodes. 804 */ 805 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 806 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 807 } 808 809 if (vd->vdev_children == 0) 810 offset += VDEV_LABEL_START_SIZE; 811 812 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 813 814 /* 815 * If we've decided to do a repair, the write is not speculative -- 816 * even if the original read was. 817 */ 818 if (flags & ZIO_FLAG_IO_REPAIR) 819 flags &= ~ZIO_FLAG_SPECULATIVE; 820 821 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 822 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 823 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 824 825 return (zio); 826 } 827 828 zio_t * 829 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 830 int type, int priority, enum zio_flag flags, 831 zio_done_func_t *done, void *private) 832 { 833 zio_t *zio; 834 835 ASSERT(vd->vdev_ops->vdev_op_leaf); 836 837 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 838 data, size, done, private, type, priority, 839 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, 840 vd, offset, NULL, 841 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 842 843 return (zio); 844 } 845 846 void 847 zio_flush(zio_t *zio, vdev_t *vd) 848 { 849 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 850 NULL, NULL, ZIO_PRIORITY_NOW, 851 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 852 } 853 854 void 855 zio_shrink(zio_t *zio, uint64_t size) 856 { 857 ASSERT(zio->io_executor == NULL); 858 ASSERT(zio->io_orig_size == zio->io_size); 859 ASSERT(size <= zio->io_size); 860 861 /* 862 * We don't shrink for raidz because of problems with the 863 * reconstruction when reading back less than the block size. 864 * Note, BP_IS_RAIDZ() assumes no compression. 865 */ 866 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 867 if (!BP_IS_RAIDZ(zio->io_bp)) 868 zio->io_orig_size = zio->io_size = size; 869 } 870 871 /* 872 * ========================================================================== 873 * Prepare to read and write logical blocks 874 * ========================================================================== 875 */ 876 877 static int 878 zio_read_bp_init(zio_t *zio) 879 { 880 blkptr_t *bp = zio->io_bp; 881 882 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 883 zio->io_child_type == ZIO_CHILD_LOGICAL && 884 !(zio->io_flags & ZIO_FLAG_RAW)) { 885 uint64_t psize = BP_GET_PSIZE(bp); 886 void *cbuf = zio_buf_alloc(psize); 887 888 zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 889 } 890 891 if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) 892 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 893 894 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 895 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 896 897 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 898 zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 899 900 return (ZIO_PIPELINE_CONTINUE); 901 } 902 903 static int 904 zio_write_bp_init(zio_t *zio) 905 { 906 spa_t *spa = zio->io_spa; 907 zio_prop_t *zp = &zio->io_prop; 908 enum zio_compress compress = zp->zp_compress; 909 blkptr_t *bp = zio->io_bp; 910 uint64_t lsize = zio->io_size; 911 uint64_t psize = lsize; 912 int pass = 1; 913 914 /* 915 * If our children haven't all reached the ready stage, 916 * wait for them and then repeat this pipeline stage. 917 */ 918 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 919 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 920 return (ZIO_PIPELINE_STOP); 921 922 if (!IO_IS_ALLOCATING(zio)) 923 return (ZIO_PIPELINE_CONTINUE); 924 925 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 926 927 if (zio->io_bp_override) { 928 ASSERT(bp->blk_birth != zio->io_txg); 929 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 930 931 *bp = *zio->io_bp_override; 932 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 933 934 if (BP_IS_HOLE(bp) || !zp->zp_dedup) 935 return (ZIO_PIPELINE_CONTINUE); 936 937 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 938 zp->zp_dedup_verify); 939 940 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 941 BP_SET_DEDUP(bp, 1); 942 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 943 return (ZIO_PIPELINE_CONTINUE); 944 } 945 zio->io_bp_override = NULL; 946 BP_ZERO(bp); 947 } 948 949 if (bp->blk_birth == zio->io_txg) { 950 /* 951 * We're rewriting an existing block, which means we're 952 * working on behalf of spa_sync(). For spa_sync() to 953 * converge, it must eventually be the case that we don't 954 * have to allocate new blocks. But compression changes 955 * the blocksize, which forces a reallocate, and makes 956 * convergence take longer. Therefore, after the first 957 * few passes, stop compressing to ensure convergence. 958 */ 959 pass = spa_sync_pass(spa); 960 961 ASSERT(zio->io_txg == spa_syncing_txg(spa)); 962 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 963 ASSERT(!BP_GET_DEDUP(bp)); 964 965 if (pass > SYNC_PASS_DONT_COMPRESS) 966 compress = ZIO_COMPRESS_OFF; 967 968 /* Make sure someone doesn't change their mind on overwrites */ 969 ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), 970 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 971 } 972 973 if (compress != ZIO_COMPRESS_OFF) { 974 void *cbuf = zio_buf_alloc(lsize); 975 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 976 if (psize == 0 || psize == lsize) { 977 compress = ZIO_COMPRESS_OFF; 978 zio_buf_free(cbuf, lsize); 979 } else { 980 ASSERT(psize < lsize); 981 zio_push_transform(zio, cbuf, psize, lsize, NULL); 982 } 983 } 984 985 /* 986 * The final pass of spa_sync() must be all rewrites, but the first 987 * few passes offer a trade-off: allocating blocks defers convergence, 988 * but newly allocated blocks are sequential, so they can be written 989 * to disk faster. Therefore, we allow the first few passes of 990 * spa_sync() to allocate new blocks, but force rewrites after that. 991 * There should only be a handful of blocks after pass 1 in any case. 992 */ 993 if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && 994 pass > SYNC_PASS_REWRITE) { 995 ASSERT(psize != 0); 996 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 997 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 998 zio->io_flags |= ZIO_FLAG_IO_REWRITE; 999 } else { 1000 BP_ZERO(bp); 1001 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1002 } 1003 1004 if (psize == 0) { 1005 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1006 } else { 1007 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1008 BP_SET_LSIZE(bp, lsize); 1009 BP_SET_PSIZE(bp, psize); 1010 BP_SET_COMPRESS(bp, compress); 1011 BP_SET_CHECKSUM(bp, zp->zp_checksum); 1012 BP_SET_TYPE(bp, zp->zp_type); 1013 BP_SET_LEVEL(bp, zp->zp_level); 1014 BP_SET_DEDUP(bp, zp->zp_dedup); 1015 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1016 if (zp->zp_dedup) { 1017 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1018 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1019 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1020 } 1021 } 1022 1023 return (ZIO_PIPELINE_CONTINUE); 1024 } 1025 1026 static int 1027 zio_free_bp_init(zio_t *zio) 1028 { 1029 blkptr_t *bp = zio->io_bp; 1030 1031 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1032 if (BP_GET_DEDUP(bp)) 1033 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1034 else 1035 arc_free(zio->io_spa, bp); 1036 } 1037 1038 return (ZIO_PIPELINE_CONTINUE); 1039 } 1040 1041 /* 1042 * ========================================================================== 1043 * Execute the I/O pipeline 1044 * ========================================================================== 1045 */ 1046 1047 static void 1048 zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) 1049 { 1050 spa_t *spa = zio->io_spa; 1051 zio_type_t t = zio->io_type; 1052 int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0); 1053 1054 /* 1055 * If we're a config writer or a probe, the normal issue and 1056 * interrupt threads may all be blocked waiting for the config lock. 1057 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 1058 */ 1059 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1060 t = ZIO_TYPE_NULL; 1061 1062 /* 1063 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1064 */ 1065 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1066 t = ZIO_TYPE_NULL; 1067 1068 /* 1069 * If this is a high priority I/O, then use the high priority taskq. 1070 */ 1071 if (zio->io_priority == ZIO_PRIORITY_NOW && 1072 spa->spa_zio_taskq[t][q + 1] != NULL) 1073 q++; 1074 1075 ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1076 (void) taskq_dispatch(spa->spa_zio_taskq[t][q], 1077 (task_func_t *)zio_execute, zio, flags); 1078 } 1079 1080 static boolean_t 1081 zio_taskq_member(zio_t *zio, enum zio_taskq_type q) 1082 { 1083 kthread_t *executor = zio->io_executor; 1084 spa_t *spa = zio->io_spa; 1085 1086 for (zio_type_t t = 0; t < ZIO_TYPES; t++) 1087 if (taskq_member(spa->spa_zio_taskq[t][q], executor)) 1088 return (B_TRUE); 1089 1090 return (B_FALSE); 1091 } 1092 1093 static int 1094 zio_issue_async(zio_t *zio) 1095 { 1096 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1097 1098 return (ZIO_PIPELINE_STOP); 1099 } 1100 1101 void 1102 zio_interrupt(zio_t *zio) 1103 { 1104 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1105 } 1106 1107 /* 1108 * Execute the I/O pipeline until one of the following occurs: 1109 * (1) the I/O completes; (2) the pipeline stalls waiting for 1110 * dependent child I/Os; (3) the I/O issues, so we're waiting 1111 * for an I/O completion interrupt; (4) the I/O is delegated by 1112 * vdev-level caching or aggregation; (5) the I/O is deferred 1113 * due to vdev-level queueing; (6) the I/O is handed off to 1114 * another thread. In all cases, the pipeline stops whenever 1115 * there's no CPU work; it never burns a thread in cv_wait(). 1116 * 1117 * There's no locking on io_stage because there's no legitimate way 1118 * for multiple threads to be attempting to process the same I/O. 1119 */ 1120 static zio_pipe_stage_t *zio_pipeline[]; 1121 1122 void 1123 zio_execute(zio_t *zio) 1124 { 1125 zio->io_executor = curthread; 1126 1127 while (zio->io_stage < ZIO_STAGE_DONE) { 1128 enum zio_stage pipeline = zio->io_pipeline; 1129 enum zio_stage stage = zio->io_stage; 1130 int rv; 1131 1132 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1133 ASSERT(ISP2(stage)); 1134 ASSERT(zio->io_stall == NULL); 1135 1136 do { 1137 stage <<= 1; 1138 } while ((stage & pipeline) == 0); 1139 1140 ASSERT(stage <= ZIO_STAGE_DONE); 1141 1142 /* 1143 * If we are in interrupt context and this pipeline stage 1144 * will grab a config lock that is held across I/O, 1145 * or may wait for an I/O that needs an interrupt thread 1146 * to complete, issue async to avoid deadlock. 1147 * 1148 * For VDEV_IO_START, we cut in line so that the io will 1149 * be sent to disk promptly. 1150 */ 1151 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1152 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1153 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 1154 zio_requeue_io_start_cut_in_line : B_FALSE; 1155 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1156 return; 1157 } 1158 1159 zio->io_stage = stage; 1160 rv = zio_pipeline[highbit(stage) - 1](zio); 1161 1162 if (rv == ZIO_PIPELINE_STOP) 1163 return; 1164 1165 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1166 } 1167 } 1168 1169 /* 1170 * ========================================================================== 1171 * Initiate I/O, either sync or async 1172 * ========================================================================== 1173 */ 1174 int 1175 zio_wait(zio_t *zio) 1176 { 1177 int error; 1178 1179 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1180 ASSERT(zio->io_executor == NULL); 1181 1182 zio->io_waiter = curthread; 1183 1184 zio_execute(zio); 1185 1186 mutex_enter(&zio->io_lock); 1187 while (zio->io_executor != NULL) 1188 cv_wait(&zio->io_cv, &zio->io_lock); 1189 mutex_exit(&zio->io_lock); 1190 1191 error = zio->io_error; 1192 zio_destroy(zio); 1193 1194 return (error); 1195 } 1196 1197 void 1198 zio_nowait(zio_t *zio) 1199 { 1200 ASSERT(zio->io_executor == NULL); 1201 1202 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1203 zio_unique_parent(zio) == NULL) { 1204 /* 1205 * This is a logical async I/O with no parent to wait for it. 1206 * We add it to the spa_async_root_zio "Godfather" I/O which 1207 * will ensure they complete prior to unloading the pool. 1208 */ 1209 spa_t *spa = zio->io_spa; 1210 1211 zio_add_child(spa->spa_async_zio_root, zio); 1212 } 1213 1214 zio_execute(zio); 1215 } 1216 1217 /* 1218 * ========================================================================== 1219 * Reexecute or suspend/resume failed I/O 1220 * ========================================================================== 1221 */ 1222 1223 static void 1224 zio_reexecute(zio_t *pio) 1225 { 1226 zio_t *cio, *cio_next; 1227 1228 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1229 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1230 ASSERT(pio->io_gang_leader == NULL); 1231 ASSERT(pio->io_gang_tree == NULL); 1232 1233 pio->io_flags = pio->io_orig_flags; 1234 pio->io_stage = pio->io_orig_stage; 1235 pio->io_pipeline = pio->io_orig_pipeline; 1236 pio->io_reexecute = 0; 1237 pio->io_error = 0; 1238 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1239 pio->io_state[w] = 0; 1240 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1241 pio->io_child_error[c] = 0; 1242 1243 if (IO_IS_ALLOCATING(pio)) 1244 BP_ZERO(pio->io_bp); 1245 1246 /* 1247 * As we reexecute pio's children, new children could be created. 1248 * New children go to the head of pio's io_child_list, however, 1249 * so we will (correctly) not reexecute them. The key is that 1250 * the remainder of pio's io_child_list, from 'cio_next' onward, 1251 * cannot be affected by any side effects of reexecuting 'cio'. 1252 */ 1253 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 1254 cio_next = zio_walk_children(pio); 1255 mutex_enter(&pio->io_lock); 1256 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1257 pio->io_children[cio->io_child_type][w]++; 1258 mutex_exit(&pio->io_lock); 1259 zio_reexecute(cio); 1260 } 1261 1262 /* 1263 * Now that all children have been reexecuted, execute the parent. 1264 * We don't reexecute "The Godfather" I/O here as it's the 1265 * responsibility of the caller to wait on him. 1266 */ 1267 if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 1268 zio_execute(pio); 1269 } 1270 1271 void 1272 zio_suspend(spa_t *spa, zio_t *zio) 1273 { 1274 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1275 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1276 "failure and the failure mode property for this pool " 1277 "is set to panic.", spa_name(spa)); 1278 1279 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1280 1281 mutex_enter(&spa->spa_suspend_lock); 1282 1283 if (spa->spa_suspend_zio_root == NULL) 1284 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1285 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 1286 ZIO_FLAG_GODFATHER); 1287 1288 spa->spa_suspended = B_TRUE; 1289 1290 if (zio != NULL) { 1291 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1292 ASSERT(zio != spa->spa_suspend_zio_root); 1293 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1294 ASSERT(zio_unique_parent(zio) == NULL); 1295 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1296 zio_add_child(spa->spa_suspend_zio_root, zio); 1297 } 1298 1299 mutex_exit(&spa->spa_suspend_lock); 1300 } 1301 1302 int 1303 zio_resume(spa_t *spa) 1304 { 1305 zio_t *pio; 1306 1307 /* 1308 * Reexecute all previously suspended i/o. 1309 */ 1310 mutex_enter(&spa->spa_suspend_lock); 1311 spa->spa_suspended = B_FALSE; 1312 cv_broadcast(&spa->spa_suspend_cv); 1313 pio = spa->spa_suspend_zio_root; 1314 spa->spa_suspend_zio_root = NULL; 1315 mutex_exit(&spa->spa_suspend_lock); 1316 1317 if (pio == NULL) 1318 return (0); 1319 1320 zio_reexecute(pio); 1321 return (zio_wait(pio)); 1322 } 1323 1324 void 1325 zio_resume_wait(spa_t *spa) 1326 { 1327 mutex_enter(&spa->spa_suspend_lock); 1328 while (spa_suspended(spa)) 1329 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1330 mutex_exit(&spa->spa_suspend_lock); 1331 } 1332 1333 /* 1334 * ========================================================================== 1335 * Gang blocks. 1336 * 1337 * A gang block is a collection of small blocks that looks to the DMU 1338 * like one large block. When zio_dva_allocate() cannot find a block 1339 * of the requested size, due to either severe fragmentation or the pool 1340 * being nearly full, it calls zio_write_gang_block() to construct the 1341 * block from smaller fragments. 1342 * 1343 * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1344 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1345 * an indirect block: it's an array of block pointers. It consumes 1346 * only one sector and hence is allocatable regardless of fragmentation. 1347 * The gang header's bps point to its gang members, which hold the data. 1348 * 1349 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1350 * as the verifier to ensure uniqueness of the SHA256 checksum. 1351 * Critically, the gang block bp's blk_cksum is the checksum of the data, 1352 * not the gang header. This ensures that data block signatures (needed for 1353 * deduplication) are independent of how the block is physically stored. 1354 * 1355 * Gang blocks can be nested: a gang member may itself be a gang block. 1356 * Thus every gang block is a tree in which root and all interior nodes are 1357 * gang headers, and the leaves are normal blocks that contain user data. 1358 * The root of the gang tree is called the gang leader. 1359 * 1360 * To perform any operation (read, rewrite, free, claim) on a gang block, 1361 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1362 * in the io_gang_tree field of the original logical i/o by recursively 1363 * reading the gang leader and all gang headers below it. This yields 1364 * an in-core tree containing the contents of every gang header and the 1365 * bps for every constituent of the gang block. 1366 * 1367 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1368 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1369 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1370 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1371 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1372 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1373 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1374 * of the gang header plus zio_checksum_compute() of the data to update the 1375 * gang header's blk_cksum as described above. 1376 * 1377 * The two-phase assemble/issue model solves the problem of partial failure -- 1378 * what if you'd freed part of a gang block but then couldn't read the 1379 * gang header for another part? Assembling the entire gang tree first 1380 * ensures that all the necessary gang header I/O has succeeded before 1381 * starting the actual work of free, claim, or write. Once the gang tree 1382 * is assembled, free and claim are in-memory operations that cannot fail. 1383 * 1384 * In the event that a gang write fails, zio_dva_unallocate() walks the 1385 * gang tree to immediately free (i.e. insert back into the space map) 1386 * everything we've allocated. This ensures that we don't get ENOSPC 1387 * errors during repeated suspend/resume cycles due to a flaky device. 1388 * 1389 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1390 * the gang tree, we won't modify the block, so we can safely defer the free 1391 * (knowing that the block is still intact). If we *can* assemble the gang 1392 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1393 * each constituent bp and we can allocate a new block on the next sync pass. 1394 * 1395 * In all cases, the gang tree allows complete recovery from partial failure. 1396 * ========================================================================== 1397 */ 1398 1399 static zio_t * 1400 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1401 { 1402 if (gn != NULL) 1403 return (pio); 1404 1405 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1406 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1407 &pio->io_bookmark)); 1408 } 1409 1410 zio_t * 1411 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1412 { 1413 zio_t *zio; 1414 1415 if (gn != NULL) { 1416 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1417 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1418 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1419 /* 1420 * As we rewrite each gang header, the pipeline will compute 1421 * a new gang block header checksum for it; but no one will 1422 * compute a new data checksum, so we do that here. The one 1423 * exception is the gang leader: the pipeline already computed 1424 * its data checksum because that stage precedes gang assembly. 1425 * (Presently, nothing actually uses interior data checksums; 1426 * this is just good hygiene.) 1427 */ 1428 if (gn != pio->io_gang_leader->io_gang_tree) { 1429 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1430 data, BP_GET_PSIZE(bp)); 1431 } 1432 /* 1433 * If we are here to damage data for testing purposes, 1434 * leave the GBH alone so that we can detect the damage. 1435 */ 1436 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1437 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1438 } else { 1439 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1440 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1441 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1442 } 1443 1444 return (zio); 1445 } 1446 1447 /* ARGSUSED */ 1448 zio_t * 1449 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1450 { 1451 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1452 ZIO_GANG_CHILD_FLAGS(pio))); 1453 } 1454 1455 /* ARGSUSED */ 1456 zio_t * 1457 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1458 { 1459 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1460 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1461 } 1462 1463 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1464 NULL, 1465 zio_read_gang, 1466 zio_rewrite_gang, 1467 zio_free_gang, 1468 zio_claim_gang, 1469 NULL 1470 }; 1471 1472 static void zio_gang_tree_assemble_done(zio_t *zio); 1473 1474 static zio_gang_node_t * 1475 zio_gang_node_alloc(zio_gang_node_t **gnpp) 1476 { 1477 zio_gang_node_t *gn; 1478 1479 ASSERT(*gnpp == NULL); 1480 1481 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1482 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1483 *gnpp = gn; 1484 1485 return (gn); 1486 } 1487 1488 static void 1489 zio_gang_node_free(zio_gang_node_t **gnpp) 1490 { 1491 zio_gang_node_t *gn = *gnpp; 1492 1493 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1494 ASSERT(gn->gn_child[g] == NULL); 1495 1496 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1497 kmem_free(gn, sizeof (*gn)); 1498 *gnpp = NULL; 1499 } 1500 1501 static void 1502 zio_gang_tree_free(zio_gang_node_t **gnpp) 1503 { 1504 zio_gang_node_t *gn = *gnpp; 1505 1506 if (gn == NULL) 1507 return; 1508 1509 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1510 zio_gang_tree_free(&gn->gn_child[g]); 1511 1512 zio_gang_node_free(gnpp); 1513 } 1514 1515 static void 1516 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1517 { 1518 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1519 1520 ASSERT(gio->io_gang_leader == gio); 1521 ASSERT(BP_IS_GANG(bp)); 1522 1523 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1524 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1525 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1526 } 1527 1528 static void 1529 zio_gang_tree_assemble_done(zio_t *zio) 1530 { 1531 zio_t *gio = zio->io_gang_leader; 1532 zio_gang_node_t *gn = zio->io_private; 1533 blkptr_t *bp = zio->io_bp; 1534 1535 ASSERT(gio == zio_unique_parent(zio)); 1536 ASSERT(zio->io_child_count == 0); 1537 1538 if (zio->io_error) 1539 return; 1540 1541 if (BP_SHOULD_BYTESWAP(bp)) 1542 byteswap_uint64_array(zio->io_data, zio->io_size); 1543 1544 ASSERT(zio->io_data == gn->gn_gbh); 1545 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1546 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1547 1548 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1549 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1550 if (!BP_IS_GANG(gbp)) 1551 continue; 1552 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1553 } 1554 } 1555 1556 static void 1557 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1558 { 1559 zio_t *gio = pio->io_gang_leader; 1560 zio_t *zio; 1561 1562 ASSERT(BP_IS_GANG(bp) == !!gn); 1563 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1564 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1565 1566 /* 1567 * If you're a gang header, your data is in gn->gn_gbh. 1568 * If you're a gang member, your data is in 'data' and gn == NULL. 1569 */ 1570 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1571 1572 if (gn != NULL) { 1573 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1574 1575 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1576 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1577 if (BP_IS_HOLE(gbp)) 1578 continue; 1579 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1580 data = (char *)data + BP_GET_PSIZE(gbp); 1581 } 1582 } 1583 1584 if (gn == gio->io_gang_tree) 1585 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1586 1587 if (zio != pio) 1588 zio_nowait(zio); 1589 } 1590 1591 static int 1592 zio_gang_assemble(zio_t *zio) 1593 { 1594 blkptr_t *bp = zio->io_bp; 1595 1596 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1597 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1598 1599 zio->io_gang_leader = zio; 1600 1601 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1602 1603 return (ZIO_PIPELINE_CONTINUE); 1604 } 1605 1606 static int 1607 zio_gang_issue(zio_t *zio) 1608 { 1609 blkptr_t *bp = zio->io_bp; 1610 1611 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1612 return (ZIO_PIPELINE_STOP); 1613 1614 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1615 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1616 1617 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1618 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1619 else 1620 zio_gang_tree_free(&zio->io_gang_tree); 1621 1622 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1623 1624 return (ZIO_PIPELINE_CONTINUE); 1625 } 1626 1627 static void 1628 zio_write_gang_member_ready(zio_t *zio) 1629 { 1630 zio_t *pio = zio_unique_parent(zio); 1631 zio_t *gio = zio->io_gang_leader; 1632 dva_t *cdva = zio->io_bp->blk_dva; 1633 dva_t *pdva = pio->io_bp->blk_dva; 1634 uint64_t asize; 1635 1636 if (BP_IS_HOLE(zio->io_bp)) 1637 return; 1638 1639 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1640 1641 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1642 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 1643 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1644 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1645 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1646 1647 mutex_enter(&pio->io_lock); 1648 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1649 ASSERT(DVA_GET_GANG(&pdva[d])); 1650 asize = DVA_GET_ASIZE(&pdva[d]); 1651 asize += DVA_GET_ASIZE(&cdva[d]); 1652 DVA_SET_ASIZE(&pdva[d], asize); 1653 } 1654 mutex_exit(&pio->io_lock); 1655 } 1656 1657 static int 1658 zio_write_gang_block(zio_t *pio) 1659 { 1660 spa_t *spa = pio->io_spa; 1661 blkptr_t *bp = pio->io_bp; 1662 zio_t *gio = pio->io_gang_leader; 1663 zio_t *zio; 1664 zio_gang_node_t *gn, **gnpp; 1665 zio_gbh_phys_t *gbh; 1666 uint64_t txg = pio->io_txg; 1667 uint64_t resid = pio->io_size; 1668 uint64_t lsize; 1669 int copies = gio->io_prop.zp_copies; 1670 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 1671 zio_prop_t zp; 1672 int error; 1673 1674 error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, 1675 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1676 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1677 if (error) { 1678 pio->io_error = error; 1679 return (ZIO_PIPELINE_CONTINUE); 1680 } 1681 1682 if (pio == gio) { 1683 gnpp = &gio->io_gang_tree; 1684 } else { 1685 gnpp = pio->io_private; 1686 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1687 } 1688 1689 gn = zio_gang_node_alloc(gnpp); 1690 gbh = gn->gn_gbh; 1691 bzero(gbh, SPA_GANGBLOCKSIZE); 1692 1693 /* 1694 * Create the gang header. 1695 */ 1696 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1697 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1698 1699 /* 1700 * Create and nowait the gang children. 1701 */ 1702 for (int g = 0; resid != 0; resid -= lsize, g++) { 1703 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1704 SPA_MINBLOCKSIZE); 1705 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1706 1707 zp.zp_checksum = gio->io_prop.zp_checksum; 1708 zp.zp_compress = ZIO_COMPRESS_OFF; 1709 zp.zp_type = DMU_OT_NONE; 1710 zp.zp_level = 0; 1711 zp.zp_copies = gio->io_prop.zp_copies; 1712 zp.zp_dedup = 0; 1713 zp.zp_dedup_verify = 0; 1714 1715 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1716 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1717 zio_write_gang_member_ready, NULL, &gn->gn_child[g], 1718 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1719 &pio->io_bookmark)); 1720 } 1721 1722 /* 1723 * Set pio's pipeline to just wait for zio to finish. 1724 */ 1725 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1726 1727 zio_nowait(zio); 1728 1729 return (ZIO_PIPELINE_CONTINUE); 1730 } 1731 1732 /* 1733 * ========================================================================== 1734 * Dedup 1735 * ========================================================================== 1736 */ 1737 static void 1738 zio_ddt_child_read_done(zio_t *zio) 1739 { 1740 blkptr_t *bp = zio->io_bp; 1741 ddt_entry_t *dde = zio->io_private; 1742 ddt_phys_t *ddp; 1743 zio_t *pio = zio_unique_parent(zio); 1744 1745 mutex_enter(&pio->io_lock); 1746 ddp = ddt_phys_select(dde, bp); 1747 if (zio->io_error == 0) 1748 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1749 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1750 dde->dde_repair_data = zio->io_data; 1751 else 1752 zio_buf_free(zio->io_data, zio->io_size); 1753 mutex_exit(&pio->io_lock); 1754 } 1755 1756 static int 1757 zio_ddt_read_start(zio_t *zio) 1758 { 1759 blkptr_t *bp = zio->io_bp; 1760 1761 ASSERT(BP_GET_DEDUP(bp)); 1762 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1763 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1764 1765 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1766 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1767 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 1768 ddt_phys_t *ddp = dde->dde_phys; 1769 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 1770 blkptr_t blk; 1771 1772 ASSERT(zio->io_vsd == NULL); 1773 zio->io_vsd = dde; 1774 1775 if (ddp_self == NULL) 1776 return (ZIO_PIPELINE_CONTINUE); 1777 1778 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1779 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 1780 continue; 1781 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 1782 &blk); 1783 zio_nowait(zio_read(zio, zio->io_spa, &blk, 1784 zio_buf_alloc(zio->io_size), zio->io_size, 1785 zio_ddt_child_read_done, dde, zio->io_priority, 1786 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 1787 &zio->io_bookmark)); 1788 } 1789 return (ZIO_PIPELINE_CONTINUE); 1790 } 1791 1792 zio_nowait(zio_read(zio, zio->io_spa, bp, 1793 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 1794 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 1795 1796 return (ZIO_PIPELINE_CONTINUE); 1797 } 1798 1799 static int 1800 zio_ddt_read_done(zio_t *zio) 1801 { 1802 blkptr_t *bp = zio->io_bp; 1803 1804 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 1805 return (ZIO_PIPELINE_STOP); 1806 1807 ASSERT(BP_GET_DEDUP(bp)); 1808 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1809 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1810 1811 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1812 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1813 ddt_entry_t *dde = zio->io_vsd; 1814 if (ddt == NULL) { 1815 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 1816 return (ZIO_PIPELINE_CONTINUE); 1817 } 1818 if (dde == NULL) { 1819 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 1820 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1821 return (ZIO_PIPELINE_STOP); 1822 } 1823 if (dde->dde_repair_data != NULL) { 1824 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 1825 zio->io_child_error[ZIO_CHILD_DDT] = 0; 1826 } 1827 ddt_repair_done(ddt, dde); 1828 zio->io_vsd = NULL; 1829 } 1830 1831 ASSERT(zio->io_vsd == NULL); 1832 1833 return (ZIO_PIPELINE_CONTINUE); 1834 } 1835 1836 static boolean_t 1837 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 1838 { 1839 spa_t *spa = zio->io_spa; 1840 1841 /* 1842 * Note: we compare the original data, not the transformed data, 1843 * because when zio->io_bp is an override bp, we will not have 1844 * pushed the I/O transforms. That's an important optimization 1845 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 1846 */ 1847 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 1848 zio_t *lio = dde->dde_lead_zio[p]; 1849 1850 if (lio != NULL) { 1851 return (lio->io_orig_size != zio->io_orig_size || 1852 bcmp(zio->io_orig_data, lio->io_orig_data, 1853 zio->io_orig_size) != 0); 1854 } 1855 } 1856 1857 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 1858 ddt_phys_t *ddp = &dde->dde_phys[p]; 1859 1860 if (ddp->ddp_phys_birth != 0) { 1861 arc_buf_t *abuf = NULL; 1862 uint32_t aflags = ARC_WAIT; 1863 blkptr_t blk = *zio->io_bp; 1864 int error; 1865 1866 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 1867 1868 ddt_exit(ddt); 1869 1870 error = arc_read_nolock(NULL, spa, &blk, 1871 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 1872 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1873 &aflags, &zio->io_bookmark); 1874 1875 if (error == 0) { 1876 if (arc_buf_size(abuf) != zio->io_orig_size || 1877 bcmp(abuf->b_data, zio->io_orig_data, 1878 zio->io_orig_size) != 0) 1879 error = EEXIST; 1880 VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); 1881 } 1882 1883 ddt_enter(ddt); 1884 return (error != 0); 1885 } 1886 } 1887 1888 return (B_FALSE); 1889 } 1890 1891 static void 1892 zio_ddt_child_write_ready(zio_t *zio) 1893 { 1894 int p = zio->io_prop.zp_copies; 1895 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 1896 ddt_entry_t *dde = zio->io_private; 1897 ddt_phys_t *ddp = &dde->dde_phys[p]; 1898 zio_t *pio; 1899 1900 if (zio->io_error) 1901 return; 1902 1903 ddt_enter(ddt); 1904 1905 ASSERT(dde->dde_lead_zio[p] == zio); 1906 1907 ddt_phys_fill(ddp, zio->io_bp); 1908 1909 while ((pio = zio_walk_parents(zio)) != NULL) 1910 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 1911 1912 ddt_exit(ddt); 1913 } 1914 1915 static void 1916 zio_ddt_child_write_done(zio_t *zio) 1917 { 1918 int p = zio->io_prop.zp_copies; 1919 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 1920 ddt_entry_t *dde = zio->io_private; 1921 ddt_phys_t *ddp = &dde->dde_phys[p]; 1922 1923 ddt_enter(ddt); 1924 1925 ASSERT(ddp->ddp_refcnt == 0); 1926 ASSERT(dde->dde_lead_zio[p] == zio); 1927 dde->dde_lead_zio[p] = NULL; 1928 1929 if (zio->io_error == 0) { 1930 while (zio_walk_parents(zio) != NULL) 1931 ddt_phys_addref(ddp); 1932 } else { 1933 ddt_phys_clear(ddp); 1934 } 1935 1936 ddt_exit(ddt); 1937 } 1938 1939 static void 1940 zio_ddt_ditto_write_done(zio_t *zio) 1941 { 1942 int p = DDT_PHYS_DITTO; 1943 zio_prop_t *zp = &zio->io_prop; 1944 blkptr_t *bp = zio->io_bp; 1945 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1946 ddt_entry_t *dde = zio->io_private; 1947 ddt_phys_t *ddp = &dde->dde_phys[p]; 1948 ddt_key_t *ddk = &dde->dde_key; 1949 1950 ddt_enter(ddt); 1951 1952 ASSERT(ddp->ddp_refcnt == 0); 1953 ASSERT(dde->dde_lead_zio[p] == zio); 1954 dde->dde_lead_zio[p] = NULL; 1955 1956 if (zio->io_error == 0) { 1957 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 1958 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 1959 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 1960 if (ddp->ddp_phys_birth != 0) 1961 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 1962 ddt_phys_fill(ddp, bp); 1963 } 1964 1965 ddt_exit(ddt); 1966 } 1967 1968 static int 1969 zio_ddt_write(zio_t *zio) 1970 { 1971 spa_t *spa = zio->io_spa; 1972 blkptr_t *bp = zio->io_bp; 1973 uint64_t txg = zio->io_txg; 1974 zio_prop_t *zp = &zio->io_prop; 1975 int p = zp->zp_copies; 1976 int ditto_copies; 1977 zio_t *cio = NULL; 1978 zio_t *dio = NULL; 1979 ddt_t *ddt = ddt_select(spa, bp); 1980 ddt_entry_t *dde; 1981 ddt_phys_t *ddp; 1982 1983 ASSERT(BP_GET_DEDUP(bp)); 1984 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 1985 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 1986 1987 ddt_enter(ddt); 1988 dde = ddt_lookup(ddt, bp, B_TRUE); 1989 ddp = &dde->dde_phys[p]; 1990 1991 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 1992 /* 1993 * If we're using a weak checksum, upgrade to a strong checksum 1994 * and try again. If we're already using a strong checksum, 1995 * we can't resolve it, so just convert to an ordinary write. 1996 * (And automatically e-mail a paper to Nature?) 1997 */ 1998 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 1999 zp->zp_checksum = spa_dedup_checksum(spa); 2000 zio_pop_transforms(zio); 2001 zio->io_stage = ZIO_STAGE_OPEN; 2002 BP_ZERO(bp); 2003 } else { 2004 zp->zp_dedup = 0; 2005 } 2006 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2007 ddt_exit(ddt); 2008 return (ZIO_PIPELINE_CONTINUE); 2009 } 2010 2011 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2012 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2013 2014 if (ditto_copies > ddt_ditto_copies_present(dde) && 2015 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2016 zio_prop_t czp = *zp; 2017 2018 czp.zp_copies = ditto_copies; 2019 2020 /* 2021 * If we arrived here with an override bp, we won't have run 2022 * the transform stack, so we won't have the data we need to 2023 * generate a child i/o. So, toss the override bp and restart. 2024 * This is safe, because using the override bp is just an 2025 * optimization; and it's rare, so the cost doesn't matter. 2026 */ 2027 if (zio->io_bp_override) { 2028 zio_pop_transforms(zio); 2029 zio->io_stage = ZIO_STAGE_OPEN; 2030 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2031 zio->io_bp_override = NULL; 2032 BP_ZERO(bp); 2033 ddt_exit(ddt); 2034 return (ZIO_PIPELINE_CONTINUE); 2035 } 2036 2037 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2038 zio->io_orig_size, &czp, NULL, 2039 zio_ddt_ditto_write_done, dde, zio->io_priority, 2040 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2041 2042 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2043 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2044 } 2045 2046 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2047 if (ddp->ddp_phys_birth != 0) 2048 ddt_bp_fill(ddp, bp, txg); 2049 if (dde->dde_lead_zio[p] != NULL) 2050 zio_add_child(zio, dde->dde_lead_zio[p]); 2051 else 2052 ddt_phys_addref(ddp); 2053 } else if (zio->io_bp_override) { 2054 ASSERT(bp->blk_birth == txg); 2055 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2056 ddt_phys_fill(ddp, bp); 2057 ddt_phys_addref(ddp); 2058 } else { 2059 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2060 zio->io_orig_size, zp, zio_ddt_child_write_ready, 2061 zio_ddt_child_write_done, dde, zio->io_priority, 2062 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2063 2064 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2065 dde->dde_lead_zio[p] = cio; 2066 } 2067 2068 ddt_exit(ddt); 2069 2070 if (cio) 2071 zio_nowait(cio); 2072 if (dio) 2073 zio_nowait(dio); 2074 2075 return (ZIO_PIPELINE_CONTINUE); 2076 } 2077 2078 ddt_entry_t *freedde; /* for debugging */ 2079 2080 static int 2081 zio_ddt_free(zio_t *zio) 2082 { 2083 spa_t *spa = zio->io_spa; 2084 blkptr_t *bp = zio->io_bp; 2085 ddt_t *ddt = ddt_select(spa, bp); 2086 ddt_entry_t *dde; 2087 ddt_phys_t *ddp; 2088 2089 ASSERT(BP_GET_DEDUP(bp)); 2090 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2091 2092 ddt_enter(ddt); 2093 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2094 ddp = ddt_phys_select(dde, bp); 2095 ddt_phys_decref(ddp); 2096 ddt_exit(ddt); 2097 2098 return (ZIO_PIPELINE_CONTINUE); 2099 } 2100 2101 /* 2102 * ========================================================================== 2103 * Allocate and free blocks 2104 * ========================================================================== 2105 */ 2106 static int 2107 zio_dva_allocate(zio_t *zio) 2108 { 2109 spa_t *spa = zio->io_spa; 2110 metaslab_class_t *mc = spa_normal_class(spa); 2111 blkptr_t *bp = zio->io_bp; 2112 int error; 2113 2114 if (zio->io_gang_leader == NULL) { 2115 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2116 zio->io_gang_leader = zio; 2117 } 2118 2119 ASSERT(BP_IS_HOLE(bp)); 2120 ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 2121 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2122 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2123 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2124 2125 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2126 zio->io_prop.zp_copies, zio->io_txg, NULL, 0); 2127 2128 if (error) { 2129 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2130 return (zio_write_gang_block(zio)); 2131 zio->io_error = error; 2132 } 2133 2134 return (ZIO_PIPELINE_CONTINUE); 2135 } 2136 2137 static int 2138 zio_dva_free(zio_t *zio) 2139 { 2140 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2141 2142 return (ZIO_PIPELINE_CONTINUE); 2143 } 2144 2145 static int 2146 zio_dva_claim(zio_t *zio) 2147 { 2148 int error; 2149 2150 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2151 if (error) 2152 zio->io_error = error; 2153 2154 return (ZIO_PIPELINE_CONTINUE); 2155 } 2156 2157 /* 2158 * Undo an allocation. This is used by zio_done() when an I/O fails 2159 * and we want to give back the block we just allocated. 2160 * This handles both normal blocks and gang blocks. 2161 */ 2162 static void 2163 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2164 { 2165 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2166 ASSERT(zio->io_bp_override == NULL); 2167 2168 if (!BP_IS_HOLE(bp)) 2169 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2170 2171 if (gn != NULL) { 2172 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2173 zio_dva_unallocate(zio, gn->gn_child[g], 2174 &gn->gn_gbh->zg_blkptr[g]); 2175 } 2176 } 2177 } 2178 2179 /* 2180 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2181 */ 2182 int 2183 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2184 uint64_t size, boolean_t use_slog) 2185 { 2186 int error = 1; 2187 2188 ASSERT(txg > spa_syncing_txg(spa)); 2189 2190 if (use_slog) 2191 error = metaslab_alloc(spa, spa_log_class(spa), size, 2192 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 2193 2194 if (error) 2195 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2196 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 2197 2198 if (error == 0) { 2199 BP_SET_LSIZE(new_bp, size); 2200 BP_SET_PSIZE(new_bp, size); 2201 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2202 BP_SET_CHECKSUM(new_bp, 2203 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2204 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2205 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2206 BP_SET_LEVEL(new_bp, 0); 2207 BP_SET_DEDUP(new_bp, 0); 2208 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2209 } 2210 2211 return (error); 2212 } 2213 2214 /* 2215 * Free an intent log block. 2216 */ 2217 void 2218 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2219 { 2220 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2221 ASSERT(!BP_IS_GANG(bp)); 2222 2223 zio_free(spa, txg, bp); 2224 } 2225 2226 /* 2227 * ========================================================================== 2228 * Read and write to physical devices 2229 * ========================================================================== 2230 */ 2231 static int 2232 zio_vdev_io_start(zio_t *zio) 2233 { 2234 vdev_t *vd = zio->io_vd; 2235 uint64_t align; 2236 spa_t *spa = zio->io_spa; 2237 2238 ASSERT(zio->io_error == 0); 2239 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2240 2241 if (vd == NULL) { 2242 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2243 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2244 2245 /* 2246 * The mirror_ops handle multiple DVAs in a single BP. 2247 */ 2248 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2249 } 2250 2251 align = 1ULL << vd->vdev_top->vdev_ashift; 2252 2253 if (P2PHASE(zio->io_size, align) != 0) { 2254 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2255 char *abuf = zio_buf_alloc(asize); 2256 ASSERT(vd == vd->vdev_top); 2257 if (zio->io_type == ZIO_TYPE_WRITE) { 2258 bcopy(zio->io_data, abuf, zio->io_size); 2259 bzero(abuf + zio->io_size, asize - zio->io_size); 2260 } 2261 zio_push_transform(zio, abuf, asize, asize, zio_subblock); 2262 } 2263 2264 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2265 ASSERT(P2PHASE(zio->io_size, align) == 0); 2266 ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 2267 2268 /* 2269 * If this is a repair I/O, and there's no self-healing involved -- 2270 * that is, we're just resilvering what we expect to resilver -- 2271 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2272 * This prevents spurious resilvering with nested replication. 2273 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2274 * A is out of date, we'll read from C+D, then use the data to 2275 * resilver A+B -- but we don't actually want to resilver B, just A. 2276 * The top-level mirror has no way to know this, so instead we just 2277 * discard unnecessary repairs as we work our way down the vdev tree. 2278 * The same logic applies to any form of nested replication: 2279 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2280 */ 2281 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2282 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2283 zio->io_txg != 0 && /* not a delegated i/o */ 2284 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2285 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2286 zio_vdev_io_bypass(zio); 2287 return (ZIO_PIPELINE_CONTINUE); 2288 } 2289 2290 if (vd->vdev_ops->vdev_op_leaf && 2291 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2292 2293 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2294 return (ZIO_PIPELINE_CONTINUE); 2295 2296 if ((zio = vdev_queue_io(zio)) == NULL) 2297 return (ZIO_PIPELINE_STOP); 2298 2299 if (!vdev_accessible(vd, zio)) { 2300 zio->io_error = ENXIO; 2301 zio_interrupt(zio); 2302 return (ZIO_PIPELINE_STOP); 2303 } 2304 } 2305 2306 return (vd->vdev_ops->vdev_op_io_start(zio)); 2307 } 2308 2309 static int 2310 zio_vdev_io_done(zio_t *zio) 2311 { 2312 vdev_t *vd = zio->io_vd; 2313 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2314 boolean_t unexpected_error = B_FALSE; 2315 2316 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2317 return (ZIO_PIPELINE_STOP); 2318 2319 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2320 2321 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 2322 2323 vdev_queue_io_done(zio); 2324 2325 if (zio->io_type == ZIO_TYPE_WRITE) 2326 vdev_cache_write(zio); 2327 2328 if (zio_injection_enabled && zio->io_error == 0) 2329 zio->io_error = zio_handle_device_injection(vd, 2330 zio, EIO); 2331 2332 if (zio_injection_enabled && zio->io_error == 0) 2333 zio->io_error = zio_handle_label_injection(zio, EIO); 2334 2335 if (zio->io_error) { 2336 if (!vdev_accessible(vd, zio)) { 2337 zio->io_error = ENXIO; 2338 } else { 2339 unexpected_error = B_TRUE; 2340 } 2341 } 2342 } 2343 2344 ops->vdev_op_io_done(zio); 2345 2346 if (unexpected_error) 2347 VERIFY(vdev_probe(vd, zio) == NULL); 2348 2349 return (ZIO_PIPELINE_CONTINUE); 2350 } 2351 2352 /* 2353 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2354 * disk, and use that to finish the checksum ereport later. 2355 */ 2356 static void 2357 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2358 const void *good_buf) 2359 { 2360 /* no processing needed */ 2361 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2362 } 2363 2364 /*ARGSUSED*/ 2365 void 2366 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2367 { 2368 void *buf = zio_buf_alloc(zio->io_size); 2369 2370 bcopy(zio->io_data, buf, zio->io_size); 2371 2372 zcr->zcr_cbinfo = zio->io_size; 2373 zcr->zcr_cbdata = buf; 2374 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2375 zcr->zcr_free = zio_buf_free; 2376 } 2377 2378 static int 2379 zio_vdev_io_assess(zio_t *zio) 2380 { 2381 vdev_t *vd = zio->io_vd; 2382 2383 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2384 return (ZIO_PIPELINE_STOP); 2385 2386 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2387 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2388 2389 if (zio->io_vsd != NULL) { 2390 zio->io_vsd_ops->vsd_free(zio); 2391 zio->io_vsd = NULL; 2392 } 2393 2394 if (zio_injection_enabled && zio->io_error == 0) 2395 zio->io_error = zio_handle_fault_injection(zio, EIO); 2396 2397 /* 2398 * If the I/O failed, determine whether we should attempt to retry it. 2399 * 2400 * On retry, we cut in line in the issue queue, since we don't want 2401 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2402 */ 2403 if (zio->io_error && vd == NULL && 2404 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2405 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2406 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2407 zio->io_error = 0; 2408 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2409 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2410 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2411 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2412 zio_requeue_io_start_cut_in_line); 2413 return (ZIO_PIPELINE_STOP); 2414 } 2415 2416 /* 2417 * If we got an error on a leaf device, convert it to ENXIO 2418 * if the device is not accessible at all. 2419 */ 2420 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2421 !vdev_accessible(vd, zio)) 2422 zio->io_error = ENXIO; 2423 2424 /* 2425 * If we can't write to an interior vdev (mirror or RAID-Z), 2426 * set vdev_cant_write so that we stop trying to allocate from it. 2427 */ 2428 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2429 vd != NULL && !vd->vdev_ops->vdev_op_leaf) 2430 vd->vdev_cant_write = B_TRUE; 2431 2432 if (zio->io_error) 2433 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2434 2435 return (ZIO_PIPELINE_CONTINUE); 2436 } 2437 2438 void 2439 zio_vdev_io_reissue(zio_t *zio) 2440 { 2441 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2442 ASSERT(zio->io_error == 0); 2443 2444 zio->io_stage >>= 1; 2445 } 2446 2447 void 2448 zio_vdev_io_redone(zio_t *zio) 2449 { 2450 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2451 2452 zio->io_stage >>= 1; 2453 } 2454 2455 void 2456 zio_vdev_io_bypass(zio_t *zio) 2457 { 2458 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2459 ASSERT(zio->io_error == 0); 2460 2461 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2462 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2463 } 2464 2465 /* 2466 * ========================================================================== 2467 * Generate and verify checksums 2468 * ========================================================================== 2469 */ 2470 static int 2471 zio_checksum_generate(zio_t *zio) 2472 { 2473 blkptr_t *bp = zio->io_bp; 2474 enum zio_checksum checksum; 2475 2476 if (bp == NULL) { 2477 /* 2478 * This is zio_write_phys(). 2479 * We're either generating a label checksum, or none at all. 2480 */ 2481 checksum = zio->io_prop.zp_checksum; 2482 2483 if (checksum == ZIO_CHECKSUM_OFF) 2484 return (ZIO_PIPELINE_CONTINUE); 2485 2486 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2487 } else { 2488 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2489 ASSERT(!IO_IS_ALLOCATING(zio)); 2490 checksum = ZIO_CHECKSUM_GANG_HEADER; 2491 } else { 2492 checksum = BP_GET_CHECKSUM(bp); 2493 } 2494 } 2495 2496 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2497 2498 return (ZIO_PIPELINE_CONTINUE); 2499 } 2500 2501 static int 2502 zio_checksum_verify(zio_t *zio) 2503 { 2504 zio_bad_cksum_t info; 2505 blkptr_t *bp = zio->io_bp; 2506 int error; 2507 2508 ASSERT(zio->io_vd != NULL); 2509 2510 if (bp == NULL) { 2511 /* 2512 * This is zio_read_phys(). 2513 * We're either verifying a label checksum, or nothing at all. 2514 */ 2515 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2516 return (ZIO_PIPELINE_CONTINUE); 2517 2518 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2519 } 2520 2521 if ((error = zio_checksum_error(zio, &info)) != 0) { 2522 zio->io_error = error; 2523 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2524 zfs_ereport_start_checksum(zio->io_spa, 2525 zio->io_vd, zio, zio->io_offset, 2526 zio->io_size, NULL, &info); 2527 } 2528 } 2529 2530 return (ZIO_PIPELINE_CONTINUE); 2531 } 2532 2533 /* 2534 * Called by RAID-Z to ensure we don't compute the checksum twice. 2535 */ 2536 void 2537 zio_checksum_verified(zio_t *zio) 2538 { 2539 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2540 } 2541 2542 /* 2543 * ========================================================================== 2544 * Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2545 * An error of 0 indictes success. ENXIO indicates whole-device failure, 2546 * which may be transient (e.g. unplugged) or permament. ECKSUM and EIO 2547 * indicate errors that are specific to one I/O, and most likely permanent. 2548 * Any other error is presumed to be worse because we weren't expecting it. 2549 * ========================================================================== 2550 */ 2551 int 2552 zio_worst_error(int e1, int e2) 2553 { 2554 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2555 int r1, r2; 2556 2557 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2558 if (e1 == zio_error_rank[r1]) 2559 break; 2560 2561 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2562 if (e2 == zio_error_rank[r2]) 2563 break; 2564 2565 return (r1 > r2 ? e1 : e2); 2566 } 2567 2568 /* 2569 * ========================================================================== 2570 * I/O completion 2571 * ========================================================================== 2572 */ 2573 static int 2574 zio_ready(zio_t *zio) 2575 { 2576 blkptr_t *bp = zio->io_bp; 2577 zio_t *pio, *pio_next; 2578 2579 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2580 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2581 return (ZIO_PIPELINE_STOP); 2582 2583 if (zio->io_ready) { 2584 ASSERT(IO_IS_ALLOCATING(zio)); 2585 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2586 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2587 2588 zio->io_ready(zio); 2589 } 2590 2591 if (bp != NULL && bp != &zio->io_bp_copy) 2592 zio->io_bp_copy = *bp; 2593 2594 if (zio->io_error) 2595 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2596 2597 mutex_enter(&zio->io_lock); 2598 zio->io_state[ZIO_WAIT_READY] = 1; 2599 pio = zio_walk_parents(zio); 2600 mutex_exit(&zio->io_lock); 2601 2602 /* 2603 * As we notify zio's parents, new parents could be added. 2604 * New parents go to the head of zio's io_parent_list, however, 2605 * so we will (correctly) not notify them. The remainder of zio's 2606 * io_parent_list, from 'pio_next' onward, cannot change because 2607 * all parents must wait for us to be done before they can be done. 2608 */ 2609 for (; pio != NULL; pio = pio_next) { 2610 pio_next = zio_walk_parents(zio); 2611 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2612 } 2613 2614 if (zio->io_flags & ZIO_FLAG_NODATA) { 2615 if (BP_IS_GANG(bp)) { 2616 zio->io_flags &= ~ZIO_FLAG_NODATA; 2617 } else { 2618 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2619 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2620 } 2621 } 2622 2623 if (zio_injection_enabled && 2624 zio->io_spa->spa_syncing_txg == zio->io_txg) 2625 zio_handle_ignored_writes(zio); 2626 2627 return (ZIO_PIPELINE_CONTINUE); 2628 } 2629 2630 static int 2631 zio_done(zio_t *zio) 2632 { 2633 spa_t *spa = zio->io_spa; 2634 zio_t *lio = zio->io_logical; 2635 blkptr_t *bp = zio->io_bp; 2636 vdev_t *vd = zio->io_vd; 2637 uint64_t psize = zio->io_size; 2638 zio_t *pio, *pio_next; 2639 2640 /* 2641 * If our children haven't all completed, 2642 * wait for them and then repeat this pipeline stage. 2643 */ 2644 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2645 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2646 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2647 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2648 return (ZIO_PIPELINE_STOP); 2649 2650 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2651 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2652 ASSERT(zio->io_children[c][w] == 0); 2653 2654 if (bp != NULL) { 2655 ASSERT(bp->blk_pad[0] == 0); 2656 ASSERT(bp->blk_pad[1] == 0); 2657 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2658 (bp == zio_unique_parent(zio)->io_bp)); 2659 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2660 zio->io_bp_override == NULL && 2661 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2662 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2663 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2664 ASSERT(BP_COUNT_GANG(bp) == 0 || 2665 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2666 } 2667 } 2668 2669 /* 2670 * If there were child vdev/gang/ddt errors, they apply to us now. 2671 */ 2672 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2673 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2674 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2675 2676 /* 2677 * If the I/O on the transformed data was successful, generate any 2678 * checksum reports now while we still have the transformed data. 2679 */ 2680 if (zio->io_error == 0) { 2681 while (zio->io_cksum_report != NULL) { 2682 zio_cksum_report_t *zcr = zio->io_cksum_report; 2683 uint64_t align = zcr->zcr_align; 2684 uint64_t asize = P2ROUNDUP(psize, align); 2685 char *abuf = zio->io_data; 2686 2687 if (asize != psize) { 2688 abuf = zio_buf_alloc(asize); 2689 bcopy(zio->io_data, abuf, psize); 2690 bzero(abuf + psize, asize - psize); 2691 } 2692 2693 zio->io_cksum_report = zcr->zcr_next; 2694 zcr->zcr_next = NULL; 2695 zcr->zcr_finish(zcr, abuf); 2696 zfs_ereport_free_checksum(zcr); 2697 2698 if (asize != psize) 2699 zio_buf_free(abuf, asize); 2700 } 2701 } 2702 2703 zio_pop_transforms(zio); /* note: may set zio->io_error */ 2704 2705 vdev_stat_update(zio, psize); 2706 2707 if (zio->io_error) { 2708 /* 2709 * If this I/O is attached to a particular vdev, 2710 * generate an error message describing the I/O failure 2711 * at the block level. We ignore these errors if the 2712 * device is currently unavailable. 2713 */ 2714 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 2715 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 2716 2717 if ((zio->io_error == EIO || !(zio->io_flags & 2718 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 2719 zio == lio) { 2720 /* 2721 * For logical I/O requests, tell the SPA to log the 2722 * error and generate a logical data ereport. 2723 */ 2724 spa_log_error(spa, zio); 2725 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 2726 0, 0); 2727 } 2728 } 2729 2730 if (zio->io_error && zio == lio) { 2731 /* 2732 * Determine whether zio should be reexecuted. This will 2733 * propagate all the way to the root via zio_notify_parent(). 2734 */ 2735 ASSERT(vd == NULL && bp != NULL); 2736 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2737 2738 if (IO_IS_ALLOCATING(zio) && 2739 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 2740 if (zio->io_error != ENOSPC) 2741 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 2742 else 2743 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2744 } 2745 2746 if ((zio->io_type == ZIO_TYPE_READ || 2747 zio->io_type == ZIO_TYPE_FREE) && 2748 zio->io_error == ENXIO && 2749 spa_load_state(spa) == SPA_LOAD_NONE && 2750 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 2751 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2752 2753 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 2754 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2755 2756 /* 2757 * Here is a possibly good place to attempt to do 2758 * either combinatorial reconstruction or error correction 2759 * based on checksums. It also might be a good place 2760 * to send out preliminary ereports before we suspend 2761 * processing. 2762 */ 2763 } 2764 2765 /* 2766 * If there were logical child errors, they apply to us now. 2767 * We defer this until now to avoid conflating logical child 2768 * errors with errors that happened to the zio itself when 2769 * updating vdev stats and reporting FMA events above. 2770 */ 2771 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 2772 2773 if ((zio->io_error || zio->io_reexecute) && 2774 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 2775 !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) 2776 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 2777 2778 zio_gang_tree_free(&zio->io_gang_tree); 2779 2780 /* 2781 * Godfather I/Os should never suspend. 2782 */ 2783 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 2784 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 2785 zio->io_reexecute = 0; 2786 2787 if (zio->io_reexecute) { 2788 /* 2789 * This is a logical I/O that wants to reexecute. 2790 * 2791 * Reexecute is top-down. When an i/o fails, if it's not 2792 * the root, it simply notifies its parent and sticks around. 2793 * The parent, seeing that it still has children in zio_done(), 2794 * does the same. This percolates all the way up to the root. 2795 * The root i/o will reexecute or suspend the entire tree. 2796 * 2797 * This approach ensures that zio_reexecute() honors 2798 * all the original i/o dependency relationships, e.g. 2799 * parents not executing until children are ready. 2800 */ 2801 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2802 2803 zio->io_gang_leader = NULL; 2804 2805 mutex_enter(&zio->io_lock); 2806 zio->io_state[ZIO_WAIT_DONE] = 1; 2807 mutex_exit(&zio->io_lock); 2808 2809 /* 2810 * "The Godfather" I/O monitors its children but is 2811 * not a true parent to them. It will track them through 2812 * the pipeline but severs its ties whenever they get into 2813 * trouble (e.g. suspended). This allows "The Godfather" 2814 * I/O to return status without blocking. 2815 */ 2816 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 2817 zio_link_t *zl = zio->io_walk_link; 2818 pio_next = zio_walk_parents(zio); 2819 2820 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 2821 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 2822 zio_remove_child(pio, zio, zl); 2823 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2824 } 2825 } 2826 2827 if ((pio = zio_unique_parent(zio)) != NULL) { 2828 /* 2829 * We're not a root i/o, so there's nothing to do 2830 * but notify our parent. Don't propagate errors 2831 * upward since we haven't permanently failed yet. 2832 */ 2833 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 2834 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 2835 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2836 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 2837 /* 2838 * We'd fail again if we reexecuted now, so suspend 2839 * until conditions improve (e.g. device comes online). 2840 */ 2841 zio_suspend(spa, zio); 2842 } else { 2843 /* 2844 * Reexecution is potentially a huge amount of work. 2845 * Hand it off to the otherwise-unused claim taskq. 2846 */ 2847 (void) taskq_dispatch( 2848 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 2849 (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 2850 } 2851 return (ZIO_PIPELINE_STOP); 2852 } 2853 2854 ASSERT(zio->io_child_count == 0); 2855 ASSERT(zio->io_reexecute == 0); 2856 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 2857 2858 /* 2859 * Report any checksum errors, since the I/O is complete. 2860 */ 2861 while (zio->io_cksum_report != NULL) { 2862 zio_cksum_report_t *zcr = zio->io_cksum_report; 2863 zio->io_cksum_report = zcr->zcr_next; 2864 zcr->zcr_next = NULL; 2865 zcr->zcr_finish(zcr, NULL); 2866 zfs_ereport_free_checksum(zcr); 2867 } 2868 2869 /* 2870 * It is the responsibility of the done callback to ensure that this 2871 * particular zio is no longer discoverable for adoption, and as 2872 * such, cannot acquire any new parents. 2873 */ 2874 if (zio->io_done) 2875 zio->io_done(zio); 2876 2877 mutex_enter(&zio->io_lock); 2878 zio->io_state[ZIO_WAIT_DONE] = 1; 2879 mutex_exit(&zio->io_lock); 2880 2881 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 2882 zio_link_t *zl = zio->io_walk_link; 2883 pio_next = zio_walk_parents(zio); 2884 zio_remove_child(pio, zio, zl); 2885 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2886 } 2887 2888 if (zio->io_waiter != NULL) { 2889 mutex_enter(&zio->io_lock); 2890 zio->io_executor = NULL; 2891 cv_broadcast(&zio->io_cv); 2892 mutex_exit(&zio->io_lock); 2893 } else { 2894 zio_destroy(zio); 2895 } 2896 2897 return (ZIO_PIPELINE_STOP); 2898 } 2899 2900 /* 2901 * ========================================================================== 2902 * I/O pipeline definition 2903 * ========================================================================== 2904 */ 2905 static zio_pipe_stage_t *zio_pipeline[] = { 2906 NULL, 2907 zio_read_bp_init, 2908 zio_free_bp_init, 2909 zio_issue_async, 2910 zio_write_bp_init, 2911 zio_checksum_generate, 2912 zio_ddt_read_start, 2913 zio_ddt_read_done, 2914 zio_ddt_write, 2915 zio_ddt_free, 2916 zio_gang_assemble, 2917 zio_gang_issue, 2918 zio_dva_allocate, 2919 zio_dva_free, 2920 zio_dva_claim, 2921 zio_ready, 2922 zio_vdev_io_start, 2923 zio_vdev_io_done, 2924 zio_vdev_io_assess, 2925 zio_checksum_verify, 2926 zio_done 2927 }; 2928