/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>

/*
 * ==========================================================================
 * I/O priority table
 * ==========================================================================
 */
uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
	0,	/* ZIO_PRIORITY_NOW */
	0,	/* ZIO_PRIORITY_SYNC_READ */
	0,	/* ZIO_PRIORITY_SYNC_WRITE */
	0,	/* ZIO_PRIORITY_LOG_WRITE */
	1,	/* ZIO_PRIORITY_CACHE_FILL */
	1,	/* ZIO_PRIORITY_AGG */
	4,	/* ZIO_PRIORITY_FREE */
	4,	/* ZIO_PRIORITY_ASYNC_WRITE */
	6,	/* ZIO_PRIORITY_ASYNC_READ */
	10,	/* ZIO_PRIORITY_RESILVER */
	20,	/* ZIO_PRIORITY_SCRUB */
};

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (P2PHASE(size, PAGESIZE) == 0) {
			align = PAGESIZE;
		} else if (P2PHASE(size, p2 >> 2) == 0) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL,
			    size > zio_buf_debug_limit ? KMC_NODEBUG : 0);

			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    size > zio_buf_debug_limit ? KMC_NODEBUG : 0);
		}
	}

	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we limit the amount
 * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 * of kernel heap dumped to disk when the kernel panics)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = EIO;
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *        continue calling these functions until they return NULL.
 *        Otherwise, the next caller will pick up the list walk in
 *        some indeterminate state.  (Otherwise every caller would
 *        have to pass in a cookie to keep the state represented by
 *        io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}
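
/*
 * Notify a parent that one of its children has completed the given wait
 * stage: fold the child's error (unless ZIO_FLAG_DONT_PROPAGATE) and
 * reexecute flags into the parent, decrement the parent's outstanding-child
 * count, and if the parent was stalled on exactly that count and it just
 * reached zero, restart the parent's pipeline.
 */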
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    zp->zp_type < DMU_OT_NUMTYPES &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa) &&
	    zp->zp_dedup <= 1 &&
	    zp->zp_dedup_verify <= 1);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private, int priority,
    enum zio_flag flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp);
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "freeing in txg %llu, pass %u",
	    (longlong_t)txg, spa->spa_sync_pass);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, priority, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, int priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL, ZIO_PRIORITY_NOW,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass > SYNC_PASS_DONT_COMPRESS)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
	    pass > SYNC_PASS_REWRITE) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1] != NULL)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
	(void) taskq_dispatch(spa->spa_zio_taskq[t][q],
	    (task_func_t *)zio_execute, zio, flags);
}

static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
			return (B_TRUE);

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 * (1) the I/O completes; (2) the pipeline stalls waiting for
 * dependent child I/Os; (3) the I/O issues, so we're waiting
 * for an I/O completion interrupt; (4) the I/O is delegated by
 * vdev-level caching or aggregation; (5) the I/O is deferred
 * due to vdev-level queueing; (6) the I/O is handed off to
 * another thread.  In all cases, the pipeline stops whenever
 * there's no CPU work; it never burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O, which
		 * will ensure it completes prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */

static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}

/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}
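
/*
 * Walk the assembled gang tree, invoking the per-I/O-type callback on each
 * bp.  Gang headers recurse into their children; for data members, 'data'
 * advances by each member's psize so every leaf sees its own region of the
 * original buffer.
 */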
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = 0;
		zp.zp_dedup_verify = 0;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Dedup
 * ==========================================================================
 */
static void
zio_ddt_child_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp;
	zio_t *pio = zio_unique_parent(zio);

	mutex_enter(&pio->io_lock);
	ddp = ddt_phys_select(dde, bp);
	if (zio->io_error == 0)
		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
		dde->dde_repair_data = zio->io_data;
	else
		zio_buf_free(zio->io_data, zio->io_size);
	mutex_exit(&pio->io_lock);
}

static int
zio_ddt_read_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
			    &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}
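
	/*
	 * Common case: no prior DDT child error, so just read the block
	 * itself as a DDT child.  If this read fails, zio_ddt_read_done()
	 * re-enters this stage to try the other copies recorded in the DDT.
	 */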
	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		if (ddt == NULL) {
			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		if (dde == NULL) {
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
			return (ZIO_PIPELINE_STOP);
		}
		if (dde->dde_repair_data != NULL) {
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}

static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
	 */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		zio_t *lio = dde->dde_lead_zio[p];

		if (lio != NULL) {
			return (lio->io_orig_size != zio->io_orig_size ||
			    bcmp(zio->io_orig_data, lio->io_orig_data,
			    zio->io_orig_size) != 0);
		}
	}

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];

		if (ddp->ddp_phys_birth != 0) {
			arc_buf_t *abuf = NULL;
			uint32_t aflags = ARC_WAIT;
			blkptr_t blk = *zio->io_bp;
			int error;

			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

			ddt_exit(ddt);

			error = arc_read_nolock(NULL, spa, &blk,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zio->io_bookmark);

			if (error == 0) {
				if (arc_buf_size(abuf) != zio->io_orig_size ||
				    bcmp(abuf->b_data, zio->io_orig_data,
				    zio->io_orig_size) != 0)
					error = EEXIST;
				VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
			}

			ddt_enter(ddt);
			return (error != 0);
		}
	}

	return (B_FALSE);
}

static void
zio_ddt_child_write_ready(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	zio_t *pio;

	if (zio->io_error)
		return;

	ddt_enter(ddt);

	ASSERT(dde->dde_lead_zio[p] == zio);

	ddt_phys_fill(ddp, zio->io_bp);

	while ((pio = zio_walk_parents(zio)) != NULL)
		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);

	ddt_exit(ddt);
}

static void
zio_ddt_child_write_done(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
ddt_select(zio->io_spa, zio->io_bp); 1918 ddt_entry_t *dde = zio->io_private; 1919 ddt_phys_t *ddp = &dde->dde_phys[p]; 1920 1921 ddt_enter(ddt); 1922 1923 ASSERT(ddp->ddp_refcnt == 0); 1924 ASSERT(dde->dde_lead_zio[p] == zio); 1925 dde->dde_lead_zio[p] = NULL; 1926 1927 if (zio->io_error == 0) { 1928 while (zio_walk_parents(zio) != NULL) 1929 ddt_phys_addref(ddp); 1930 } else { 1931 ddt_phys_clear(ddp); 1932 } 1933 1934 ddt_exit(ddt); 1935 } 1936 1937 static void 1938 zio_ddt_ditto_write_done(zio_t *zio) 1939 { 1940 int p = DDT_PHYS_DITTO; 1941 zio_prop_t *zp = &zio->io_prop; 1942 blkptr_t *bp = zio->io_bp; 1943 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1944 ddt_entry_t *dde = zio->io_private; 1945 ddt_phys_t *ddp = &dde->dde_phys[p]; 1946 ddt_key_t *ddk = &dde->dde_key; 1947 1948 ddt_enter(ddt); 1949 1950 ASSERT(ddp->ddp_refcnt == 0); 1951 ASSERT(dde->dde_lead_zio[p] == zio); 1952 dde->dde_lead_zio[p] = NULL; 1953 1954 if (zio->io_error == 0) { 1955 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 1956 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 1957 ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 1958 if (ddp->ddp_phys_birth != 0) 1959 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 1960 ddt_phys_fill(ddp, bp); 1961 } 1962 1963 ddt_exit(ddt); 1964 } 1965 1966 static int 1967 zio_ddt_write(zio_t *zio) 1968 { 1969 spa_t *spa = zio->io_spa; 1970 blkptr_t *bp = zio->io_bp; 1971 uint64_t txg = zio->io_txg; 1972 zio_prop_t *zp = &zio->io_prop; 1973 int p = zp->zp_copies; 1974 int ditto_copies; 1975 zio_t *cio = NULL; 1976 zio_t *dio = NULL; 1977 ddt_t *ddt = ddt_select(spa, bp); 1978 ddt_entry_t *dde; 1979 ddt_phys_t *ddp; 1980 1981 ASSERT(BP_GET_DEDUP(bp)); 1982 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 1983 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 1984 1985 ddt_enter(ddt); 1986 dde = ddt_lookup(ddt, bp, B_TRUE); 1987 ddp = &dde->dde_phys[p]; 1988 1989 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 1990 /* 1991 * If we're using a weak checksum, upgrade to a strong checksum 1992 * and try again. If we're already using a strong checksum, 1993 * we can't resolve it, so just convert to an ordinary write. 1994 * (And automatically e-mail a paper to Nature?) 1995 */ 1996 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 1997 zp->zp_checksum = spa_dedup_checksum(spa); 1998 zio_pop_transforms(zio); 1999 zio->io_stage = ZIO_STAGE_OPEN; 2000 BP_ZERO(bp); 2001 } else { 2002 zp->zp_dedup = 0; 2003 } 2004 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2005 ddt_exit(ddt); 2006 return (ZIO_PIPELINE_CONTINUE); 2007 } 2008 2009 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2010 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2011 2012 if (ditto_copies > ddt_ditto_copies_present(dde) && 2013 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2014 zio_prop_t czp = *zp; 2015 2016 czp.zp_copies = ditto_copies; 2017 2018 /* 2019 * If we arrived here with an override bp, we won't have run 2020 * the transform stack, so we won't have the data we need to 2021 * generate a child i/o. So, toss the override bp and restart. 2022 * This is safe, because using the override bp is just an 2023 * optimization; and it's rare, so the cost doesn't matter. 
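* (Resetting io_stage to ZIO_STAGE_OPEN and io_pipeline to
* ZIO_WRITE_PIPELINE below sends the write back through the full
* pipeline, so the transform stack is populated by the time we
* reach this point again and the ditto child can be generated.)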
2024 */ 2025 if (zio->io_bp_override) { 2026 zio_pop_transforms(zio); 2027 zio->io_stage = ZIO_STAGE_OPEN; 2028 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2029 zio->io_bp_override = NULL; 2030 BP_ZERO(bp); 2031 ddt_exit(ddt); 2032 return (ZIO_PIPELINE_CONTINUE); 2033 } 2034 2035 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2036 zio->io_orig_size, &czp, NULL, 2037 zio_ddt_ditto_write_done, dde, zio->io_priority, 2038 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2039 2040 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2041 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2042 } 2043 2044 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2045 if (ddp->ddp_phys_birth != 0) 2046 ddt_bp_fill(ddp, bp, txg); 2047 if (dde->dde_lead_zio[p] != NULL) 2048 zio_add_child(zio, dde->dde_lead_zio[p]); 2049 else 2050 ddt_phys_addref(ddp); 2051 } else if (zio->io_bp_override) { 2052 ASSERT(bp->blk_birth == txg); 2053 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2054 ddt_phys_fill(ddp, bp); 2055 ddt_phys_addref(ddp); 2056 } else { 2057 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2058 zio->io_orig_size, zp, zio_ddt_child_write_ready, 2059 zio_ddt_child_write_done, dde, zio->io_priority, 2060 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2061 2062 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2063 dde->dde_lead_zio[p] = cio; 2064 } 2065 2066 ddt_exit(ddt); 2067 2068 if (cio) 2069 zio_nowait(cio); 2070 if (dio) 2071 zio_nowait(dio); 2072 2073 return (ZIO_PIPELINE_CONTINUE); 2074 } 2075 2076 ddt_entry_t *freedde; /* for debugging */ 2077 2078 static int 2079 zio_ddt_free(zio_t *zio) 2080 { 2081 spa_t *spa = zio->io_spa; 2082 blkptr_t *bp = zio->io_bp; 2083 ddt_t *ddt = ddt_select(spa, bp); 2084 ddt_entry_t *dde; 2085 ddt_phys_t *ddp; 2086 2087 ASSERT(BP_GET_DEDUP(bp)); 2088 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2089 2090 ddt_enter(ddt); 2091 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2092 ddp = ddt_phys_select(dde, bp); 2093 ddt_phys_decref(ddp); 2094 ddt_exit(ddt); 2095 2096 return (ZIO_PIPELINE_CONTINUE); 2097 } 2098 2099 /* 2100 * ========================================================================== 2101 * Allocate and free blocks 2102 * ========================================================================== 2103 */ 2104 static int 2105 zio_dva_allocate(zio_t *zio) 2106 { 2107 spa_t *spa = zio->io_spa; 2108 metaslab_class_t *mc = spa_normal_class(spa); 2109 blkptr_t *bp = zio->io_bp; 2110 int error; 2111 2112 if (zio->io_gang_leader == NULL) { 2113 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2114 zio->io_gang_leader = zio; 2115 } 2116 2117 ASSERT(BP_IS_HOLE(bp)); 2118 ASSERT3U(BP_GET_NDVAS(bp), ==, 0); 2119 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2120 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2121 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2122 2123 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2124 zio->io_prop.zp_copies, zio->io_txg, NULL, 0); 2125 2126 if (error) { 2127 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2128 return (zio_write_gang_block(zio)); 2129 zio->io_error = error; 2130 } 2131 2132 return (ZIO_PIPELINE_CONTINUE); 2133 } 2134 2135 static int 2136 zio_dva_free(zio_t *zio) 2137 { 2138 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2139 2140 return (ZIO_PIPELINE_CONTINUE); 2141 } 2142 2143 static int 2144 zio_dva_claim(zio_t *zio) 2145 { 2146 int error; 2147 2148 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2149 if (error) 2150 zio->io_error = error; 
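/*
 * A claim doesn't allocate anything new; it simply records that an
 * already-written block (e.g. an intent log block claimed at pool open)
 * still occupies its space, so any failure is just noted in io_error.
 */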
2151 2152 return (ZIO_PIPELINE_CONTINUE); 2153 } 2154 2155 /* 2156 * Undo an allocation. This is used by zio_done() when an I/O fails 2157 * and we want to give back the block we just allocated. 2158 * This handles both normal blocks and gang blocks. 2159 */ 2160 static void 2161 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2162 { 2163 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2164 ASSERT(zio->io_bp_override == NULL); 2165 2166 if (!BP_IS_HOLE(bp)) 2167 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2168 2169 if (gn != NULL) { 2170 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2171 zio_dva_unallocate(zio, gn->gn_child[g], 2172 &gn->gn_gbh->zg_blkptr[g]); 2173 } 2174 } 2175 } 2176 2177 /* 2178 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2179 */ 2180 int 2181 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2182 uint64_t size, boolean_t use_slog) 2183 { 2184 int error = 1; 2185 2186 ASSERT(txg > spa_syncing_txg(spa)); 2187 2188 if (use_slog) 2189 error = metaslab_alloc(spa, spa_log_class(spa), size, 2190 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 2191 2192 if (error) 2193 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2194 new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); 2195 2196 if (error == 0) { 2197 BP_SET_LSIZE(new_bp, size); 2198 BP_SET_PSIZE(new_bp, size); 2199 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2200 BP_SET_CHECKSUM(new_bp, 2201 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2202 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2203 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2204 BP_SET_LEVEL(new_bp, 0); 2205 BP_SET_DEDUP(new_bp, 0); 2206 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2207 } 2208 2209 return (error); 2210 } 2211 2212 /* 2213 * Free an intent log block. 2214 */ 2215 void 2216 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2217 { 2218 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2219 ASSERT(!BP_IS_GANG(bp)); 2220 2221 zio_free(spa, txg, bp); 2222 } 2223 2224 /* 2225 * ========================================================================== 2226 * Read and write to physical devices 2227 * ========================================================================== 2228 */ 2229 static int 2230 zio_vdev_io_start(zio_t *zio) 2231 { 2232 vdev_t *vd = zio->io_vd; 2233 uint64_t align; 2234 spa_t *spa = zio->io_spa; 2235 2236 ASSERT(zio->io_error == 0); 2237 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2238 2239 if (vd == NULL) { 2240 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2241 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2242 2243 /* 2244 * The mirror_ops handle multiple DVAs in a single BP. 
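* (A NULL io_vd means this I/O is addressed by block pointer rather
* than by a specific device, so we hand it to the mirror ops, which
* treat each DVA in the BP as a child copy.)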
2245 */ 2246 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2247 } 2248 2249 align = 1ULL << vd->vdev_top->vdev_ashift; 2250 2251 if (P2PHASE(zio->io_size, align) != 0) { 2252 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2253 char *abuf = zio_buf_alloc(asize); 2254 ASSERT(vd == vd->vdev_top); 2255 if (zio->io_type == ZIO_TYPE_WRITE) { 2256 bcopy(zio->io_data, abuf, zio->io_size); 2257 bzero(abuf + zio->io_size, asize - zio->io_size); 2258 } 2259 zio_push_transform(zio, abuf, asize, asize, zio_subblock); 2260 } 2261 2262 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2263 ASSERT(P2PHASE(zio->io_size, align) == 0); 2264 ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 2265 2266 /* 2267 * If this is a repair I/O, and there's no self-healing involved -- 2268 * that is, we're just resilvering what we expect to resilver -- 2269 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2270 * This prevents spurious resilvering with nested replication. 2271 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2272 * A is out of date, we'll read from C+D, then use the data to 2273 * resilver A+B -- but we don't actually want to resilver B, just A. 2274 * The top-level mirror has no way to know this, so instead we just 2275 * discard unnecessary repairs as we work our way down the vdev tree. 2276 * The same logic applies to any form of nested replication: 2277 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 2278 */ 2279 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2280 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2281 zio->io_txg != 0 && /* not a delegated i/o */ 2282 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2283 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2284 zio_vdev_io_bypass(zio); 2285 return (ZIO_PIPELINE_CONTINUE); 2286 } 2287 2288 if (vd->vdev_ops->vdev_op_leaf && 2289 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2290 2291 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2292 return (ZIO_PIPELINE_CONTINUE); 2293 2294 if ((zio = vdev_queue_io(zio)) == NULL) 2295 return (ZIO_PIPELINE_STOP); 2296 2297 if (!vdev_accessible(vd, zio)) { 2298 zio->io_error = ENXIO; 2299 zio_interrupt(zio); 2300 return (ZIO_PIPELINE_STOP); 2301 } 2302 } 2303 2304 return (vd->vdev_ops->vdev_op_io_start(zio)); 2305 } 2306 2307 static int 2308 zio_vdev_io_done(zio_t *zio) 2309 { 2310 vdev_t *vd = zio->io_vd; 2311 vdev_ops_t *ops = vd ? 
vd->vdev_ops : &vdev_mirror_ops; 2312 boolean_t unexpected_error = B_FALSE; 2313 2314 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2315 return (ZIO_PIPELINE_STOP); 2316 2317 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2318 2319 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 2320 2321 vdev_queue_io_done(zio); 2322 2323 if (zio->io_type == ZIO_TYPE_WRITE) 2324 vdev_cache_write(zio); 2325 2326 if (zio_injection_enabled && zio->io_error == 0) 2327 zio->io_error = zio_handle_device_injection(vd, 2328 zio, EIO); 2329 2330 if (zio_injection_enabled && zio->io_error == 0) 2331 zio->io_error = zio_handle_label_injection(zio, EIO); 2332 2333 if (zio->io_error) { 2334 if (!vdev_accessible(vd, zio)) { 2335 zio->io_error = ENXIO; 2336 } else { 2337 unexpected_error = B_TRUE; 2338 } 2339 } 2340 } 2341 2342 ops->vdev_op_io_done(zio); 2343 2344 if (unexpected_error) 2345 VERIFY(vdev_probe(vd, zio) == NULL); 2346 2347 return (ZIO_PIPELINE_CONTINUE); 2348 } 2349 2350 /* 2351 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2352 * disk, and use that to finish the checksum ereport later. 2353 */ 2354 static void 2355 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2356 const void *good_buf) 2357 { 2358 /* no processing needed */ 2359 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2360 } 2361 2362 /*ARGSUSED*/ 2363 void 2364 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2365 { 2366 void *buf = zio_buf_alloc(zio->io_size); 2367 2368 bcopy(zio->io_data, buf, zio->io_size); 2369 2370 zcr->zcr_cbinfo = zio->io_size; 2371 zcr->zcr_cbdata = buf; 2372 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2373 zcr->zcr_free = zio_buf_free; 2374 } 2375 2376 static int 2377 zio_vdev_io_assess(zio_t *zio) 2378 { 2379 vdev_t *vd = zio->io_vd; 2380 2381 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2382 return (ZIO_PIPELINE_STOP); 2383 2384 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2385 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2386 2387 if (zio->io_vsd != NULL) { 2388 zio->io_vsd_ops->vsd_free(zio); 2389 zio->io_vsd = NULL; 2390 } 2391 2392 if (zio_injection_enabled && zio->io_error == 0) 2393 zio->io_error = zio_handle_fault_injection(zio, EIO); 2394 2395 /* 2396 * If the I/O failed, determine whether we should attempt to retry it. 2397 * 2398 * On retry, we cut in line in the issue queue, since we don't want 2399 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 2400 */ 2401 if (zio->io_error && vd == NULL && 2402 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2403 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2404 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2405 zio->io_error = 0; 2406 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2407 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2408 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2409 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2410 zio_requeue_io_start_cut_in_line); 2411 return (ZIO_PIPELINE_STOP); 2412 } 2413 2414 /* 2415 * If we got an error on a leaf device, convert it to ENXIO 2416 * if the device is not accessible at all. 2417 */ 2418 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2419 !vdev_accessible(vd, zio)) 2420 zio->io_error = ENXIO; 2421 2422 /* 2423 * If we can't write to an interior vdev (mirror or RAID-Z), 2424 * set vdev_cant_write so that we stop trying to allocate from it. 
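* (The check below applies only when vd is a non-leaf vdev; an ENXIO
* write error at that level means none of its children could take the
* write, so further allocations there would just fail again.)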
2425 */
2426 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2427 vd != NULL && !vd->vdev_ops->vdev_op_leaf)
2428 vd->vdev_cant_write = B_TRUE;
2429 
2430 if (zio->io_error)
2431 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2432 
2433 return (ZIO_PIPELINE_CONTINUE);
2434 }
2435 
2436 void
2437 zio_vdev_io_reissue(zio_t *zio)
2438 {
2439 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2440 ASSERT(zio->io_error == 0);
2441 
2442 zio->io_stage >>= 1;
2443 }
2444 
2445 void
2446 zio_vdev_io_redone(zio_t *zio)
2447 {
2448 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2449 
2450 zio->io_stage >>= 1;
2451 }
2452 
2453 void
2454 zio_vdev_io_bypass(zio_t *zio)
2455 {
2456 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2457 ASSERT(zio->io_error == 0);
2458 
2459 zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2460 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2461 }
2462 
2463 /*
2464 * ==========================================================================
2465 * Generate and verify checksums
2466 * ==========================================================================
2467 */
2468 static int
2469 zio_checksum_generate(zio_t *zio)
2470 {
2471 blkptr_t *bp = zio->io_bp;
2472 enum zio_checksum checksum;
2473 
2474 if (bp == NULL) {
2475 /*
2476 * This is zio_write_phys().
2477 * We're either generating a label checksum, or none at all.
2478 */
2479 checksum = zio->io_prop.zp_checksum;
2480 
2481 if (checksum == ZIO_CHECKSUM_OFF)
2482 return (ZIO_PIPELINE_CONTINUE);
2483 
2484 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2485 } else {
2486 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2487 ASSERT(!IO_IS_ALLOCATING(zio));
2488 checksum = ZIO_CHECKSUM_GANG_HEADER;
2489 } else {
2490 checksum = BP_GET_CHECKSUM(bp);
2491 }
2492 }
2493 
2494 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2495 
2496 return (ZIO_PIPELINE_CONTINUE);
2497 }
2498 
2499 static int
2500 zio_checksum_verify(zio_t *zio)
2501 {
2502 zio_bad_cksum_t info;
2503 blkptr_t *bp = zio->io_bp;
2504 int error;
2505 
2506 ASSERT(zio->io_vd != NULL);
2507 
2508 if (bp == NULL) {
2509 /*
2510 * This is zio_read_phys().
2511 * We're either verifying a label checksum, or nothing at all.
2512 */
2513 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2514 return (ZIO_PIPELINE_CONTINUE);
2515 
2516 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2517 }
2518 
2519 if ((error = zio_checksum_error(zio, &info)) != 0) {
2520 zio->io_error = error;
2521 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2522 zfs_ereport_start_checksum(zio->io_spa,
2523 zio->io_vd, zio, zio->io_offset,
2524 zio->io_size, NULL, &info);
2525 }
2526 }
2527 
2528 return (ZIO_PIPELINE_CONTINUE);
2529 }
2530 
2531 /*
2532 * Called by RAID-Z to ensure we don't compute the checksum twice.
2533 */
2534 void
2535 zio_checksum_verified(zio_t *zio)
2536 {
2537 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2538 }
2539 
2540 /*
2541 * ==========================================================================
2542 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2543 * An error of 0 indicates success. ENXIO indicates whole-device failure,
2544 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
2545 * indicate errors that are specific to one I/O, and most likely permanent.
2546 * Any other error is presumed to be worse because we weren't expecting it.
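* For example, zio_worst_error(EIO, ENXIO) returns EIO, and an error
* that isn't in the rank table at all (say, EINVAL) outranks any that is.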
2547 * ========================================================================== 2548 */ 2549 int 2550 zio_worst_error(int e1, int e2) 2551 { 2552 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2553 int r1, r2; 2554 2555 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2556 if (e1 == zio_error_rank[r1]) 2557 break; 2558 2559 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2560 if (e2 == zio_error_rank[r2]) 2561 break; 2562 2563 return (r1 > r2 ? e1 : e2); 2564 } 2565 2566 /* 2567 * ========================================================================== 2568 * I/O completion 2569 * ========================================================================== 2570 */ 2571 static int 2572 zio_ready(zio_t *zio) 2573 { 2574 blkptr_t *bp = zio->io_bp; 2575 zio_t *pio, *pio_next; 2576 2577 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2578 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2579 return (ZIO_PIPELINE_STOP); 2580 2581 if (zio->io_ready) { 2582 ASSERT(IO_IS_ALLOCATING(zio)); 2583 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2584 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2585 2586 zio->io_ready(zio); 2587 } 2588 2589 if (bp != NULL && bp != &zio->io_bp_copy) 2590 zio->io_bp_copy = *bp; 2591 2592 if (zio->io_error) 2593 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2594 2595 mutex_enter(&zio->io_lock); 2596 zio->io_state[ZIO_WAIT_READY] = 1; 2597 pio = zio_walk_parents(zio); 2598 mutex_exit(&zio->io_lock); 2599 2600 /* 2601 * As we notify zio's parents, new parents could be added. 2602 * New parents go to the head of zio's io_parent_list, however, 2603 * so we will (correctly) not notify them. The remainder of zio's 2604 * io_parent_list, from 'pio_next' onward, cannot change because 2605 * all parents must wait for us to be done before they can be done. 2606 */ 2607 for (; pio != NULL; pio = pio_next) { 2608 pio_next = zio_walk_parents(zio); 2609 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2610 } 2611 2612 if (zio->io_flags & ZIO_FLAG_NODATA) { 2613 if (BP_IS_GANG(bp)) { 2614 zio->io_flags &= ~ZIO_FLAG_NODATA; 2615 } else { 2616 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2617 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2618 } 2619 } 2620 2621 if (zio_injection_enabled && 2622 zio->io_spa->spa_syncing_txg == zio->io_txg) 2623 zio_handle_ignored_writes(zio); 2624 2625 return (ZIO_PIPELINE_CONTINUE); 2626 } 2627 2628 static int 2629 zio_done(zio_t *zio) 2630 { 2631 spa_t *spa = zio->io_spa; 2632 zio_t *lio = zio->io_logical; 2633 blkptr_t *bp = zio->io_bp; 2634 vdev_t *vd = zio->io_vd; 2635 uint64_t psize = zio->io_size; 2636 zio_t *pio, *pio_next; 2637 2638 /* 2639 * If our children haven't all completed, 2640 * wait for them and then repeat this pipeline stage. 
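* (zio_wait_for_children() returns B_TRUE when children are still
* outstanding; this stage is re-entered once the last child notifies us.)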
2641 */ 2642 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2643 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2644 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2645 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2646 return (ZIO_PIPELINE_STOP); 2647 2648 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2649 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2650 ASSERT(zio->io_children[c][w] == 0); 2651 2652 if (bp != NULL) { 2653 ASSERT(bp->blk_pad[0] == 0); 2654 ASSERT(bp->blk_pad[1] == 0); 2655 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2656 (bp == zio_unique_parent(zio)->io_bp)); 2657 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2658 zio->io_bp_override == NULL && 2659 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2660 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2661 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2662 ASSERT(BP_COUNT_GANG(bp) == 0 || 2663 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2664 } 2665 } 2666 2667 /* 2668 * If there were child vdev/gang/ddt errors, they apply to us now. 2669 */ 2670 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2671 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2672 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2673 2674 /* 2675 * If the I/O on the transformed data was successful, generate any 2676 * checksum reports now while we still have the transformed data. 2677 */ 2678 if (zio->io_error == 0) { 2679 while (zio->io_cksum_report != NULL) { 2680 zio_cksum_report_t *zcr = zio->io_cksum_report; 2681 uint64_t align = zcr->zcr_align; 2682 uint64_t asize = P2ROUNDUP(psize, align); 2683 char *abuf = zio->io_data; 2684 2685 if (asize != psize) { 2686 abuf = zio_buf_alloc(asize); 2687 bcopy(zio->io_data, abuf, psize); 2688 bzero(abuf + psize, asize - psize); 2689 } 2690 2691 zio->io_cksum_report = zcr->zcr_next; 2692 zcr->zcr_next = NULL; 2693 zcr->zcr_finish(zcr, abuf); 2694 zfs_ereport_free_checksum(zcr); 2695 2696 if (asize != psize) 2697 zio_buf_free(abuf, asize); 2698 } 2699 } 2700 2701 zio_pop_transforms(zio); /* note: may set zio->io_error */ 2702 2703 vdev_stat_update(zio, psize); 2704 2705 if (zio->io_error) { 2706 /* 2707 * If this I/O is attached to a particular vdev, 2708 * generate an error message describing the I/O failure 2709 * at the block level. We ignore these errors if the 2710 * device is currently unavailable. 2711 */ 2712 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 2713 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 2714 2715 if ((zio->io_error == EIO || !(zio->io_flags & 2716 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 2717 zio == lio) { 2718 /* 2719 * For logical I/O requests, tell the SPA to log the 2720 * error and generate a logical data ereport. 2721 */ 2722 spa_log_error(spa, zio); 2723 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 2724 0, 0); 2725 } 2726 } 2727 2728 if (zio->io_error && zio == lio) { 2729 /* 2730 * Determine whether zio should be reexecuted. This will 2731 * propagate all the way to the root via zio_notify_parent(). 
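* ZIO_REEXECUTE_NOW makes the root reexecute right away (via the claim
* taskq below); ZIO_REEXECUTE_SUSPEND makes it wait in zio_suspend()
* until the pool is resumed.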
2732 */ 2733 ASSERT(vd == NULL && bp != NULL); 2734 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2735 2736 if (IO_IS_ALLOCATING(zio) && 2737 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 2738 if (zio->io_error != ENOSPC) 2739 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 2740 else 2741 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2742 } 2743 2744 if ((zio->io_type == ZIO_TYPE_READ || 2745 zio->io_type == ZIO_TYPE_FREE) && 2746 zio->io_error == ENXIO && 2747 spa_load_state(spa) == SPA_LOAD_NONE && 2748 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 2749 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2750 2751 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 2752 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2753 2754 /* 2755 * Here is a possibly good place to attempt to do 2756 * either combinatorial reconstruction or error correction 2757 * based on checksums. It also might be a good place 2758 * to send out preliminary ereports before we suspend 2759 * processing. 2760 */ 2761 } 2762 2763 /* 2764 * If there were logical child errors, they apply to us now. 2765 * We defer this until now to avoid conflating logical child 2766 * errors with errors that happened to the zio itself when 2767 * updating vdev stats and reporting FMA events above. 2768 */ 2769 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 2770 2771 if ((zio->io_error || zio->io_reexecute) && 2772 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 2773 !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) 2774 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 2775 2776 zio_gang_tree_free(&zio->io_gang_tree); 2777 2778 /* 2779 * Godfather I/Os should never suspend. 2780 */ 2781 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 2782 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 2783 zio->io_reexecute = 0; 2784 2785 if (zio->io_reexecute) { 2786 /* 2787 * This is a logical I/O that wants to reexecute. 2788 * 2789 * Reexecute is top-down. When an i/o fails, if it's not 2790 * the root, it simply notifies its parent and sticks around. 2791 * The parent, seeing that it still has children in zio_done(), 2792 * does the same. This percolates all the way up to the root. 2793 * The root i/o will reexecute or suspend the entire tree. 2794 * 2795 * This approach ensures that zio_reexecute() honors 2796 * all the original i/o dependency relationships, e.g. 2797 * parents not executing until children are ready. 2798 */ 2799 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2800 2801 zio->io_gang_leader = NULL; 2802 2803 mutex_enter(&zio->io_lock); 2804 zio->io_state[ZIO_WAIT_DONE] = 1; 2805 mutex_exit(&zio->io_lock); 2806 2807 /* 2808 * "The Godfather" I/O monitors its children but is 2809 * not a true parent to them. It will track them through 2810 * the pipeline but severs its ties whenever they get into 2811 * trouble (e.g. suspended). This allows "The Godfather" 2812 * I/O to return status without blocking. 2813 */ 2814 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 2815 zio_link_t *zl = zio->io_walk_link; 2816 pio_next = zio_walk_parents(zio); 2817 2818 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 2819 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 2820 zio_remove_child(pio, zio, zl); 2821 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2822 } 2823 } 2824 2825 if ((pio = zio_unique_parent(zio)) != NULL) { 2826 /* 2827 * We're not a root i/o, so there's nothing to do 2828 * but notify our parent. Don't propagate errors 2829 * upward since we haven't permanently failed yet. 
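* (ZIO_FLAG_DONT_PROPAGATE, set below, keeps our io_error from being
* folded into the parent's child error; the error will be rediscovered
* if the tree is reexecuted.)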
2830 */ 2831 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 2832 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 2833 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2834 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 2835 /* 2836 * We'd fail again if we reexecuted now, so suspend 2837 * until conditions improve (e.g. device comes online). 2838 */ 2839 zio_suspend(spa, zio); 2840 } else { 2841 /* 2842 * Reexecution is potentially a huge amount of work. 2843 * Hand it off to the otherwise-unused claim taskq. 2844 */ 2845 (void) taskq_dispatch( 2846 spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], 2847 (task_func_t *)zio_reexecute, zio, TQ_SLEEP); 2848 } 2849 return (ZIO_PIPELINE_STOP); 2850 } 2851 2852 ASSERT(zio->io_child_count == 0); 2853 ASSERT(zio->io_reexecute == 0); 2854 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 2855 2856 /* 2857 * Report any checksum errors, since the I/O is complete. 2858 */ 2859 while (zio->io_cksum_report != NULL) { 2860 zio_cksum_report_t *zcr = zio->io_cksum_report; 2861 zio->io_cksum_report = zcr->zcr_next; 2862 zcr->zcr_next = NULL; 2863 zcr->zcr_finish(zcr, NULL); 2864 zfs_ereport_free_checksum(zcr); 2865 } 2866 2867 /* 2868 * It is the responsibility of the done callback to ensure that this 2869 * particular zio is no longer discoverable for adoption, and as 2870 * such, cannot acquire any new parents. 2871 */ 2872 if (zio->io_done) 2873 zio->io_done(zio); 2874 2875 mutex_enter(&zio->io_lock); 2876 zio->io_state[ZIO_WAIT_DONE] = 1; 2877 mutex_exit(&zio->io_lock); 2878 2879 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 2880 zio_link_t *zl = zio->io_walk_link; 2881 pio_next = zio_walk_parents(zio); 2882 zio_remove_child(pio, zio, zl); 2883 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 2884 } 2885 2886 if (zio->io_waiter != NULL) { 2887 mutex_enter(&zio->io_lock); 2888 zio->io_executor = NULL; 2889 cv_broadcast(&zio->io_cv); 2890 mutex_exit(&zio->io_lock); 2891 } else { 2892 zio_destroy(zio); 2893 } 2894 2895 return (ZIO_PIPELINE_STOP); 2896 } 2897 2898 /* 2899 * ========================================================================== 2900 * I/O pipeline definition 2901 * ========================================================================== 2902 */ 2903 static zio_pipe_stage_t *zio_pipeline[] = { 2904 NULL, 2905 zio_read_bp_init, 2906 zio_free_bp_init, 2907 zio_issue_async, 2908 zio_write_bp_init, 2909 zio_checksum_generate, 2910 zio_ddt_read_start, 2911 zio_ddt_read_done, 2912 zio_ddt_write, 2913 zio_ddt_free, 2914 zio_gang_assemble, 2915 zio_gang_issue, 2916 zio_dva_allocate, 2917 zio_dva_free, 2918 zio_dva_claim, 2919 zio_ready, 2920 zio_vdev_io_start, 2921 zio_vdev_io_done, 2922 zio_vdev_io_assess, 2923 zio_checksum_verify, 2924 zio_done 2925 }; 2926