1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2013 by Delphix. All rights reserved. 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/fm/fs/zfs.h> 29 #include <sys/spa.h> 30 #include <sys/txg.h> 31 #include <sys/spa_impl.h> 32 #include <sys/vdev_impl.h> 33 #include <sys/zio_impl.h> 34 #include <sys/zio_compress.h> 35 #include <sys/zio_checksum.h> 36 #include <sys/dmu_objset.h> 37 #include <sys/arc.h> 38 #include <sys/ddt.h> 39 40 /* 41 * ========================================================================== 42 * I/O priority table 43 * ========================================================================== 44 */ 45 uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 46 0, /* ZIO_PRIORITY_NOW */ 47 0, /* ZIO_PRIORITY_SYNC_READ */ 48 0, /* ZIO_PRIORITY_SYNC_WRITE */ 49 0, /* ZIO_PRIORITY_LOG_WRITE */ 50 1, /* ZIO_PRIORITY_CACHE_FILL */ 51 1, /* ZIO_PRIORITY_AGG */ 52 4, /* ZIO_PRIORITY_FREE */ 53 4, /* ZIO_PRIORITY_ASYNC_WRITE */ 54 6, /* ZIO_PRIORITY_ASYNC_READ */ 55 10, /* ZIO_PRIORITY_RESILVER */ 56 20, /* ZIO_PRIORITY_SCRUB */ 57 2, /* ZIO_PRIORITY_DDT_PREFETCH */ 58 }; 59 60 /* 61 * ========================================================================== 62 * I/O type descriptions 63 * ========================================================================== 64 */ 65 char *zio_type_name[ZIO_TYPES] = { 66 "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", 67 "zio_ioctl" 68 }; 69 70 /* 71 * ========================================================================== 72 * I/O kmem caches 73 * ========================================================================== 74 */ 75 kmem_cache_t *zio_cache; 76 kmem_cache_t *zio_link_cache; 77 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 78 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; 79 80 #ifdef _KERNEL 81 extern vmem_t *zio_alloc_arena; 82 #endif 83 extern int zfs_mg_alloc_failures; 84 85 /* 86 * The following actions directly effect the spa's sync-to-convergence logic. 87 * The values below define the sync pass when we start performing the action. 88 * Care should be taken when changing these values as they directly impact 89 * spa_sync() performance. Tuning these values may introduce subtle performance 90 * pathologies and should only be done in the context of performance analysis. 91 * These tunables will eventually be removed and replaced with #defines once 92 * enough analysis has been done to determine optimal values. 
93 * 94 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that 95 * regular blocks are not deferred. 96 */ 97 int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ 98 int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ 99 int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ 100 101 /* 102 * An allocating zio is one that either currently has the DVA allocate 103 * stage set or will have it later in its lifetime. 104 */ 105 #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) 106 107 boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; 108 109 #ifdef ZFS_DEBUG 110 int zio_buf_debug_limit = 16384; 111 #else 112 int zio_buf_debug_limit = 0; 113 #endif 114 115 void 116 zio_init(void) 117 { 118 size_t c; 119 vmem_t *data_alloc_arena = NULL; 120 121 #ifdef _KERNEL 122 data_alloc_arena = zio_alloc_arena; 123 #endif 124 zio_cache = kmem_cache_create("zio_cache", 125 sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 126 zio_link_cache = kmem_cache_create("zio_link_cache", 127 sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 128 129 /* 130 * For small buffers, we want a cache for each multiple of 131 * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache 132 * for each quarter-power of 2. For large buffers, we want 133 * a cache for each multiple of PAGESIZE. 134 */ 135 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 136 size_t size = (c + 1) << SPA_MINBLOCKSHIFT; 137 size_t p2 = size; 138 size_t align = 0; 139 size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; 140 141 while (p2 & (p2 - 1)) 142 p2 &= p2 - 1; 143 144 #ifndef _KERNEL 145 /* 146 * If we are using watchpoints, put each buffer on its own page, 147 * to eliminate the performance overhead of trapping to the 148 * kernel when modifying a non-watched buffer that shares the 149 * page with a watched buffer. 150 */ 151 if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) 152 continue; 153 #endif 154 if (size <= 4 * SPA_MINBLOCKSIZE) { 155 align = SPA_MINBLOCKSIZE; 156 } else if (IS_P2ALIGNED(size, PAGESIZE)) { 157 align = PAGESIZE; 158 } else if (IS_P2ALIGNED(size, p2 >> 2)) { 159 align = p2 >> 2; 160 } 161 162 if (align != 0) { 163 char name[36]; 164 (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); 165 zio_buf_cache[c] = kmem_cache_create(name, size, 166 align, NULL, NULL, NULL, NULL, NULL, cflags); 167 168 /* 169 * Since zio_data bufs do not appear in crash dumps, we 170 * pass KMC_NOTOUCH so that no allocator metadata is 171 * stored with the buffers. 172 */ 173 (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); 174 zio_data_buf_cache[c] = kmem_cache_create(name, size, 175 align, NULL, NULL, NULL, NULL, data_alloc_arena, 176 cflags | KMC_NOTOUCH); 177 } 178 } 179 180 while (--c != 0) { 181 ASSERT(zio_buf_cache[c] != NULL); 182 if (zio_buf_cache[c - 1] == NULL) 183 zio_buf_cache[c - 1] = zio_buf_cache[c]; 184 185 ASSERT(zio_data_buf_cache[c] != NULL); 186 if (zio_data_buf_cache[c - 1] == NULL) 187 zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; 188 } 189 190 /* 191 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs 192 * to fail 3 times per txg or 8 failures, whichever is greater. 
193 */ 194 zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8); 195 196 zio_inject_init(); 197 } 198 199 void 200 zio_fini(void) 201 { 202 size_t c; 203 kmem_cache_t *last_cache = NULL; 204 kmem_cache_t *last_data_cache = NULL; 205 206 for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { 207 if (zio_buf_cache[c] != last_cache) { 208 last_cache = zio_buf_cache[c]; 209 kmem_cache_destroy(zio_buf_cache[c]); 210 } 211 zio_buf_cache[c] = NULL; 212 213 if (zio_data_buf_cache[c] != last_data_cache) { 214 last_data_cache = zio_data_buf_cache[c]; 215 kmem_cache_destroy(zio_data_buf_cache[c]); 216 } 217 zio_data_buf_cache[c] = NULL; 218 } 219 220 kmem_cache_destroy(zio_link_cache); 221 kmem_cache_destroy(zio_cache); 222 223 zio_inject_fini(); 224 } 225 226 /* 227 * ========================================================================== 228 * Allocate and free I/O buffers 229 * ========================================================================== 230 */ 231 232 /* 233 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a 234 * crashdump if the kernel panics, so use it judiciously. Obviously, it's 235 * useful to inspect ZFS metadata, but if possible, we should avoid keeping 236 * excess / transient data in-core during a crashdump. 237 */ 238 void * 239 zio_buf_alloc(size_t size) 240 { 241 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 242 243 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 244 245 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); 246 } 247 248 /* 249 * Use zio_data_buf_alloc to allocate data. The data will not appear in a 250 * crashdump if the kernel panics. This exists so that we will limit the amount 251 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount 252 * of kernel heap dumped to disk when the kernel panics) 253 */ 254 void * 255 zio_data_buf_alloc(size_t size) 256 { 257 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 258 259 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 260 261 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); 262 } 263 264 void 265 zio_buf_free(void *buf, size_t size) 266 { 267 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 268 269 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 270 271 kmem_cache_free(zio_buf_cache[c], buf); 272 } 273 274 void 275 zio_data_buf_free(void *buf, size_t size) 276 { 277 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; 278 279 ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); 280 281 kmem_cache_free(zio_data_buf_cache[c], buf); 282 } 283 284 /* 285 * ========================================================================== 286 * Push and pop I/O transform buffers 287 * ========================================================================== 288 */ 289 static void 290 zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, 291 zio_transform_func_t *transform) 292 { 293 zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); 294 295 zt->zt_orig_data = zio->io_data; 296 zt->zt_orig_size = zio->io_size; 297 zt->zt_bufsize = bufsize; 298 zt->zt_transform = transform; 299 300 zt->zt_next = zio->io_transform_stack; 301 zio->io_transform_stack = zt; 302 303 zio->io_data = data; 304 zio->io_size = size; 305 } 306 307 static void 308 zio_pop_transforms(zio_t *zio) 309 { 310 zio_transform_t *zt; 311 312 while ((zt = zio->io_transform_stack) != NULL) { 313 if (zt->zt_transform != NULL) 314 zt->zt_transform(zio, 315 zt->zt_orig_data, zt->zt_orig_size); 316 317 if (zt->zt_bufsize != 0) 318 zio_buf_free(zio->io_data, 
zt->zt_bufsize); 319 320 zio->io_data = zt->zt_orig_data; 321 zio->io_size = zt->zt_orig_size; 322 zio->io_transform_stack = zt->zt_next; 323 324 kmem_free(zt, sizeof (zio_transform_t)); 325 } 326 } 327 328 /* 329 * ========================================================================== 330 * I/O transform callbacks for subblocks and decompression 331 * ========================================================================== 332 */ 333 static void 334 zio_subblock(zio_t *zio, void *data, uint64_t size) 335 { 336 ASSERT(zio->io_size > size); 337 338 if (zio->io_type == ZIO_TYPE_READ) 339 bcopy(zio->io_data, data, size); 340 } 341 342 static void 343 zio_decompress(zio_t *zio, void *data, uint64_t size) 344 { 345 if (zio->io_error == 0 && 346 zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), 347 zio->io_data, data, zio->io_size, size) != 0) 348 zio->io_error = SET_ERROR(EIO); 349 } 350 351 /* 352 * ========================================================================== 353 * I/O parent/child relationships and pipeline interlocks 354 * ========================================================================== 355 */ 356 /* 357 * NOTE - Callers to zio_walk_parents() and zio_walk_children must 358 * continue calling these functions until they return NULL. 359 * Otherwise, the next caller will pick up the list walk in 360 * some indeterminate state. (Otherwise every caller would 361 * have to pass in a cookie to keep the state represented by 362 * io_walk_link, which gets annoying.) 363 */ 364 zio_t * 365 zio_walk_parents(zio_t *cio) 366 { 367 zio_link_t *zl = cio->io_walk_link; 368 list_t *pl = &cio->io_parent_list; 369 370 zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); 371 cio->io_walk_link = zl; 372 373 if (zl == NULL) 374 return (NULL); 375 376 ASSERT(zl->zl_child == cio); 377 return (zl->zl_parent); 378 } 379 380 zio_t * 381 zio_walk_children(zio_t *pio) 382 { 383 zio_link_t *zl = pio->io_walk_link; 384 list_t *cl = &pio->io_child_list; 385 386 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); 387 pio->io_walk_link = zl; 388 389 if (zl == NULL) 390 return (NULL); 391 392 ASSERT(zl->zl_parent == pio); 393 return (zl->zl_child); 394 } 395 396 zio_t * 397 zio_unique_parent(zio_t *cio) 398 { 399 zio_t *pio = zio_walk_parents(cio); 400 401 VERIFY(zio_walk_parents(cio) == NULL); 402 return (pio); 403 } 404 405 void 406 zio_add_child(zio_t *pio, zio_t *cio) 407 { 408 zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); 409 410 /* 411 * Logical I/Os can have logical, gang, or vdev children. 412 * Gang I/Os can have gang or vdev children. 413 * Vdev I/Os can only have vdev children. 414 * The following ASSERT captures all of these constraints. 
415 */ 416 ASSERT(cio->io_child_type <= pio->io_child_type); 417 418 zl->zl_parent = pio; 419 zl->zl_child = cio; 420 421 mutex_enter(&cio->io_lock); 422 mutex_enter(&pio->io_lock); 423 424 ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); 425 426 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 427 pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; 428 429 list_insert_head(&pio->io_child_list, zl); 430 list_insert_head(&cio->io_parent_list, zl); 431 432 pio->io_child_count++; 433 cio->io_parent_count++; 434 435 mutex_exit(&pio->io_lock); 436 mutex_exit(&cio->io_lock); 437 } 438 439 static void 440 zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) 441 { 442 ASSERT(zl->zl_parent == pio); 443 ASSERT(zl->zl_child == cio); 444 445 mutex_enter(&cio->io_lock); 446 mutex_enter(&pio->io_lock); 447 448 list_remove(&pio->io_child_list, zl); 449 list_remove(&cio->io_parent_list, zl); 450 451 pio->io_child_count--; 452 cio->io_parent_count--; 453 454 mutex_exit(&pio->io_lock); 455 mutex_exit(&cio->io_lock); 456 457 kmem_cache_free(zio_link_cache, zl); 458 } 459 460 static boolean_t 461 zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) 462 { 463 uint64_t *countp = &zio->io_children[child][wait]; 464 boolean_t waiting = B_FALSE; 465 466 mutex_enter(&zio->io_lock); 467 ASSERT(zio->io_stall == NULL); 468 if (*countp != 0) { 469 zio->io_stage >>= 1; 470 zio->io_stall = countp; 471 waiting = B_TRUE; 472 } 473 mutex_exit(&zio->io_lock); 474 475 return (waiting); 476 } 477 478 static void 479 zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) 480 { 481 uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; 482 int *errorp = &pio->io_child_error[zio->io_child_type]; 483 484 mutex_enter(&pio->io_lock); 485 if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 486 *errorp = zio_worst_error(*errorp, zio->io_error); 487 pio->io_reexecute |= zio->io_reexecute; 488 ASSERT3U(*countp, >, 0); 489 if (--*countp == 0 && pio->io_stall == countp) { 490 pio->io_stall = NULL; 491 mutex_exit(&pio->io_lock); 492 zio_execute(pio); 493 } else { 494 mutex_exit(&pio->io_lock); 495 } 496 } 497 498 static void 499 zio_inherit_child_errors(zio_t *zio, enum zio_child c) 500 { 501 if (zio->io_child_error[c] != 0 && zio->io_error == 0) 502 zio->io_error = zio->io_child_error[c]; 503 } 504 505 /* 506 * ========================================================================== 507 * Create the various types of I/O (read, write, free, etc) 508 * ========================================================================== 509 */ 510 static zio_t * 511 zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 512 void *data, uint64_t size, zio_done_func_t *done, void *private, 513 zio_type_t type, int priority, enum zio_flag flags, 514 vdev_t *vd, uint64_t offset, const zbookmark_t *zb, 515 enum zio_stage stage, enum zio_stage pipeline) 516 { 517 zio_t *zio; 518 519 ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); 520 ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); 521 ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); 522 523 ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); 524 ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); 525 ASSERT(vd || stage == ZIO_STAGE_OPEN); 526 527 zio = kmem_cache_alloc(zio_cache, KM_SLEEP); 528 bzero(zio, sizeof (zio_t)); 529 530 mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); 531 cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); 532 533 list_create(&zio->io_parent_list, sizeof (zio_link_t), 534 offsetof(zio_link_t, 
zl_parent_node)); 535 list_create(&zio->io_child_list, sizeof (zio_link_t), 536 offsetof(zio_link_t, zl_child_node)); 537 538 if (vd != NULL) 539 zio->io_child_type = ZIO_CHILD_VDEV; 540 else if (flags & ZIO_FLAG_GANG_CHILD) 541 zio->io_child_type = ZIO_CHILD_GANG; 542 else if (flags & ZIO_FLAG_DDT_CHILD) 543 zio->io_child_type = ZIO_CHILD_DDT; 544 else 545 zio->io_child_type = ZIO_CHILD_LOGICAL; 546 547 if (bp != NULL) { 548 zio->io_bp = (blkptr_t *)bp; 549 zio->io_bp_copy = *bp; 550 zio->io_bp_orig = *bp; 551 if (type != ZIO_TYPE_WRITE || 552 zio->io_child_type == ZIO_CHILD_DDT) 553 zio->io_bp = &zio->io_bp_copy; /* so caller can free */ 554 if (zio->io_child_type == ZIO_CHILD_LOGICAL) 555 zio->io_logical = zio; 556 if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) 557 pipeline |= ZIO_GANG_STAGES; 558 } 559 560 zio->io_spa = spa; 561 zio->io_txg = txg; 562 zio->io_done = done; 563 zio->io_private = private; 564 zio->io_type = type; 565 zio->io_priority = priority; 566 zio->io_vd = vd; 567 zio->io_offset = offset; 568 zio->io_orig_data = zio->io_data = data; 569 zio->io_orig_size = zio->io_size = size; 570 zio->io_orig_flags = zio->io_flags = flags; 571 zio->io_orig_stage = zio->io_stage = stage; 572 zio->io_orig_pipeline = zio->io_pipeline = pipeline; 573 574 zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); 575 zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); 576 577 if (zb != NULL) 578 zio->io_bookmark = *zb; 579 580 if (pio != NULL) { 581 if (zio->io_logical == NULL) 582 zio->io_logical = pio->io_logical; 583 if (zio->io_child_type == ZIO_CHILD_GANG) 584 zio->io_gang_leader = pio->io_gang_leader; 585 zio_add_child(pio, zio); 586 } 587 588 return (zio); 589 } 590 591 static void 592 zio_destroy(zio_t *zio) 593 { 594 list_destroy(&zio->io_parent_list); 595 list_destroy(&zio->io_child_list); 596 mutex_destroy(&zio->io_lock); 597 cv_destroy(&zio->io_cv); 598 kmem_cache_free(zio_cache, zio); 599 } 600 601 zio_t * 602 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, 603 void *private, enum zio_flag flags) 604 { 605 zio_t *zio; 606 607 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 608 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, 609 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); 610 611 return (zio); 612 } 613 614 zio_t * 615 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) 616 { 617 return (zio_null(NULL, spa, NULL, done, private, flags)); 618 } 619 620 zio_t * 621 zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, 622 void *data, uint64_t size, zio_done_func_t *done, void *private, 623 int priority, enum zio_flag flags, const zbookmark_t *zb) 624 { 625 zio_t *zio; 626 627 zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, 628 data, size, done, private, 629 ZIO_TYPE_READ, priority, flags, NULL, 0, zb, 630 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
631 ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); 632 633 return (zio); 634 } 635 636 zio_t * 637 zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 638 void *data, uint64_t size, const zio_prop_t *zp, 639 zio_done_func_t *ready, zio_done_func_t *done, void *private, 640 int priority, enum zio_flag flags, const zbookmark_t *zb) 641 { 642 zio_t *zio; 643 644 ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && 645 zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && 646 zp->zp_compress >= ZIO_COMPRESS_OFF && 647 zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && 648 DMU_OT_IS_VALID(zp->zp_type) && 649 zp->zp_level < 32 && 650 zp->zp_copies > 0 && 651 zp->zp_copies <= spa_max_replication(spa)); 652 653 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 654 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 655 ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 656 ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); 657 658 zio->io_ready = ready; 659 zio->io_prop = *zp; 660 661 return (zio); 662 } 663 664 zio_t * 665 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, 666 uint64_t size, zio_done_func_t *done, void *private, int priority, 667 enum zio_flag flags, zbookmark_t *zb) 668 { 669 zio_t *zio; 670 671 zio = zio_create(pio, spa, txg, bp, data, size, done, private, 672 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, 673 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); 674 675 return (zio); 676 } 677 678 void 679 zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) 680 { 681 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 682 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 683 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 684 ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); 685 686 /* 687 * We must reset the io_prop to match the values that existed 688 * when the bp was first written by dmu_sync() keeping in mind 689 * that nopwrite and dedup are mutually exclusive. 690 */ 691 zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; 692 zio->io_prop.zp_nopwrite = nopwrite; 693 zio->io_prop.zp_copies = copies; 694 zio->io_bp_override = bp; 695 } 696 697 void 698 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) 699 { 700 metaslab_check_free(spa, bp); 701 bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); 702 } 703 704 zio_t * 705 zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 706 enum zio_flag flags) 707 { 708 zio_t *zio; 709 710 dprintf_bp(bp, "freeing in txg %llu, pass %u", 711 (longlong_t)txg, spa->spa_sync_pass); 712 713 ASSERT(!BP_IS_HOLE(bp)); 714 ASSERT(spa_syncing_txg(spa) == txg); 715 ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); 716 717 metaslab_check_free(spa, bp); 718 arc_freed(spa, bp); 719 720 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 721 NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, 722 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); 723 724 return (zio); 725 } 726 727 zio_t * 728 zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, 729 zio_done_func_t *done, void *private, enum zio_flag flags) 730 { 731 zio_t *zio; 732 733 /* 734 * A claim is an allocation of a specific block. Claims are needed 735 * to support immediate writes in the intent log. The issue is that 736 * immediate writes contain committed data, but in a txg that was 737 * *not* committed. Upon opening the pool after an unclean shutdown, 738 * the intent log claims all blocks that contain immediate write data 739 * so that the SPA knows they're in use. 
740 * 741 * All claims *must* be resolved in the first txg -- before the SPA 742 * starts allocating blocks -- so that nothing is allocated twice. 743 * If txg == 0 we just verify that the block is claimable. 744 */ 745 ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); 746 ASSERT(txg == spa_first_txg(spa) || txg == 0); 747 ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ 748 749 zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), 750 done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, 751 NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); 752 753 return (zio); 754 } 755 756 zio_t * 757 zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, 758 zio_done_func_t *done, void *private, int priority, enum zio_flag flags) 759 { 760 zio_t *zio; 761 int c; 762 763 if (vd->vdev_children == 0) { 764 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, 765 ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL, 766 ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); 767 768 zio->io_cmd = cmd; 769 } else { 770 zio = zio_null(pio, spa, NULL, NULL, NULL, flags); 771 772 for (c = 0; c < vd->vdev_children; c++) 773 zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, 774 done, private, priority, flags)); 775 } 776 777 return (zio); 778 } 779 780 zio_t * 781 zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 782 void *data, int checksum, zio_done_func_t *done, void *private, 783 int priority, enum zio_flag flags, boolean_t labels) 784 { 785 zio_t *zio; 786 787 ASSERT(vd->vdev_children == 0); 788 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 789 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 790 ASSERT3U(offset + size, <=, vd->vdev_psize); 791 792 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 793 ZIO_TYPE_READ, priority, flags, vd, offset, NULL, 794 ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); 795 796 zio->io_prop.zp_checksum = checksum; 797 798 return (zio); 799 } 800 801 zio_t * 802 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, 803 void *data, int checksum, zio_done_func_t *done, void *private, 804 int priority, enum zio_flag flags, boolean_t labels) 805 { 806 zio_t *zio; 807 808 ASSERT(vd->vdev_children == 0); 809 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || 810 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); 811 ASSERT3U(offset + size, <=, vd->vdev_psize); 812 813 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, 814 ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, 815 ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); 816 817 zio->io_prop.zp_checksum = checksum; 818 819 if (zio_checksum_table[checksum].ci_eck) { 820 /* 821 * zec checksums are necessarily destructive -- they modify 822 * the end of the write buffer to hold the verifier/checksum. 823 * Therefore, we must make a local copy in case the data is 824 * being written to multiple places in parallel. 825 */ 826 void *wbuf = zio_buf_alloc(size); 827 bcopy(data, wbuf, size); 828 zio_push_transform(zio, wbuf, size, size, NULL); 829 } 830 831 return (zio); 832 } 833 834 /* 835 * Create a child I/O to do some work for us. 836 */ 837 zio_t * 838 zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, 839 void *data, uint64_t size, int type, int priority, enum zio_flag flags, 840 zio_done_func_t *done, void *private) 841 { 842 enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; 843 zio_t *zio; 844 845 ASSERT(vd->vdev_parent == 846 (pio->io_vd ? 
pio->io_vd : pio->io_spa->spa_root_vdev)); 847 848 if (type == ZIO_TYPE_READ && bp != NULL) { 849 /* 850 * If we have the bp, then the child should perform the 851 * checksum and the parent need not. This pushes error 852 * detection as close to the leaves as possible and 853 * eliminates redundant checksums in the interior nodes. 854 */ 855 pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; 856 pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 857 } 858 859 if (vd->vdev_children == 0) 860 offset += VDEV_LABEL_START_SIZE; 861 862 flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; 863 864 /* 865 * If we've decided to do a repair, the write is not speculative -- 866 * even if the original read was. 867 */ 868 if (flags & ZIO_FLAG_IO_REPAIR) 869 flags &= ~ZIO_FLAG_SPECULATIVE; 870 871 zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, 872 done, private, type, priority, flags, vd, offset, &pio->io_bookmark, 873 ZIO_STAGE_VDEV_IO_START >> 1, pipeline); 874 875 return (zio); 876 } 877 878 zio_t * 879 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, 880 int type, int priority, enum zio_flag flags, 881 zio_done_func_t *done, void *private) 882 { 883 zio_t *zio; 884 885 ASSERT(vd->vdev_ops->vdev_op_leaf); 886 887 zio = zio_create(NULL, vd->vdev_spa, 0, NULL, 888 data, size, done, private, type, priority, 889 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, 890 vd, offset, NULL, 891 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); 892 893 return (zio); 894 } 895 896 void 897 zio_flush(zio_t *zio, vdev_t *vd) 898 { 899 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 900 NULL, NULL, ZIO_PRIORITY_NOW, 901 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); 902 } 903 904 void 905 zio_shrink(zio_t *zio, uint64_t size) 906 { 907 ASSERT(zio->io_executor == NULL); 908 ASSERT(zio->io_orig_size == zio->io_size); 909 ASSERT(size <= zio->io_size); 910 911 /* 912 * We don't shrink for raidz because of problems with the 913 * reconstruction when reading back less than the block size. 914 * Note, BP_IS_RAIDZ() assumes no compression. 
915 */ 916 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 917 if (!BP_IS_RAIDZ(zio->io_bp)) 918 zio->io_orig_size = zio->io_size = size; 919 } 920 921 /* 922 * ========================================================================== 923 * Prepare to read and write logical blocks 924 * ========================================================================== 925 */ 926 927 static int 928 zio_read_bp_init(zio_t *zio) 929 { 930 blkptr_t *bp = zio->io_bp; 931 932 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && 933 zio->io_child_type == ZIO_CHILD_LOGICAL && 934 !(zio->io_flags & ZIO_FLAG_RAW)) { 935 uint64_t psize = BP_GET_PSIZE(bp); 936 void *cbuf = zio_buf_alloc(psize); 937 938 zio_push_transform(zio, cbuf, psize, psize, zio_decompress); 939 } 940 941 if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) 942 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 943 944 if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) 945 zio->io_flags |= ZIO_FLAG_DONT_CACHE; 946 947 if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) 948 zio->io_pipeline = ZIO_DDT_READ_PIPELINE; 949 950 return (ZIO_PIPELINE_CONTINUE); 951 } 952 953 static int 954 zio_write_bp_init(zio_t *zio) 955 { 956 spa_t *spa = zio->io_spa; 957 zio_prop_t *zp = &zio->io_prop; 958 enum zio_compress compress = zp->zp_compress; 959 blkptr_t *bp = zio->io_bp; 960 uint64_t lsize = zio->io_size; 961 uint64_t psize = lsize; 962 int pass = 1; 963 964 /* 965 * If our children haven't all reached the ready stage, 966 * wait for them and then repeat this pipeline stage. 967 */ 968 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 969 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY)) 970 return (ZIO_PIPELINE_STOP); 971 972 if (!IO_IS_ALLOCATING(zio)) 973 return (ZIO_PIPELINE_CONTINUE); 974 975 ASSERT(zio->io_child_type != ZIO_CHILD_DDT); 976 977 if (zio->io_bp_override) { 978 ASSERT(bp->blk_birth != zio->io_txg); 979 ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); 980 981 *bp = *zio->io_bp_override; 982 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 983 984 /* 985 * If we've been overridden and nopwrite is set then 986 * set the flag accordingly to indicate that a nopwrite 987 * has already occurred. 988 */ 989 if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { 990 ASSERT(!zp->zp_dedup); 991 zio->io_flags |= ZIO_FLAG_NOPWRITE; 992 return (ZIO_PIPELINE_CONTINUE); 993 } 994 995 ASSERT(!zp->zp_nopwrite); 996 997 if (BP_IS_HOLE(bp) || !zp->zp_dedup) 998 return (ZIO_PIPELINE_CONTINUE); 999 1000 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || 1001 zp->zp_dedup_verify); 1002 1003 if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { 1004 BP_SET_DEDUP(bp, 1); 1005 zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; 1006 return (ZIO_PIPELINE_CONTINUE); 1007 } 1008 zio->io_bp_override = NULL; 1009 BP_ZERO(bp); 1010 } 1011 1012 if (bp->blk_birth == zio->io_txg) { 1013 /* 1014 * We're rewriting an existing block, which means we're 1015 * working on behalf of spa_sync(). For spa_sync() to 1016 * converge, it must eventually be the case that we don't 1017 * have to allocate new blocks. But compression changes 1018 * the blocksize, which forces a reallocate, and makes 1019 * convergence take longer. Therefore, after the first 1020 * few passes, stop compressing to ensure convergence. 
1021 */ 1022 pass = spa_sync_pass(spa); 1023 1024 ASSERT(zio->io_txg == spa_syncing_txg(spa)); 1025 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1026 ASSERT(!BP_GET_DEDUP(bp)); 1027 1028 if (pass >= zfs_sync_pass_dont_compress) 1029 compress = ZIO_COMPRESS_OFF; 1030 1031 /* Make sure someone doesn't change their mind on overwrites */ 1032 ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), 1033 spa_max_replication(spa)) == BP_GET_NDVAS(bp)); 1034 } 1035 1036 if (compress != ZIO_COMPRESS_OFF) { 1037 void *cbuf = zio_buf_alloc(lsize); 1038 psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); 1039 if (psize == 0 || psize == lsize) { 1040 compress = ZIO_COMPRESS_OFF; 1041 zio_buf_free(cbuf, lsize); 1042 } else { 1043 ASSERT(psize < lsize); 1044 zio_push_transform(zio, cbuf, psize, lsize, NULL); 1045 } 1046 } 1047 1048 /* 1049 * The final pass of spa_sync() must be all rewrites, but the first 1050 * few passes offer a trade-off: allocating blocks defers convergence, 1051 * but newly allocated blocks are sequential, so they can be written 1052 * to disk faster. Therefore, we allow the first few passes of 1053 * spa_sync() to allocate new blocks, but force rewrites after that. 1054 * There should only be a handful of blocks after pass 1 in any case. 1055 */ 1056 if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && 1057 pass >= zfs_sync_pass_rewrite) { 1058 ASSERT(psize != 0); 1059 enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; 1060 zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; 1061 zio->io_flags |= ZIO_FLAG_IO_REWRITE; 1062 } else { 1063 BP_ZERO(bp); 1064 zio->io_pipeline = ZIO_WRITE_PIPELINE; 1065 } 1066 1067 if (psize == 0) { 1068 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1069 } else { 1070 ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); 1071 BP_SET_LSIZE(bp, lsize); 1072 BP_SET_PSIZE(bp, psize); 1073 BP_SET_COMPRESS(bp, compress); 1074 BP_SET_CHECKSUM(bp, zp->zp_checksum); 1075 BP_SET_TYPE(bp, zp->zp_type); 1076 BP_SET_LEVEL(bp, zp->zp_level); 1077 BP_SET_DEDUP(bp, zp->zp_dedup); 1078 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 1079 if (zp->zp_dedup) { 1080 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1081 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1082 zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; 1083 } 1084 if (zp->zp_nopwrite) { 1085 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1086 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1087 zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; 1088 } 1089 } 1090 1091 return (ZIO_PIPELINE_CONTINUE); 1092 } 1093 1094 static int 1095 zio_free_bp_init(zio_t *zio) 1096 { 1097 blkptr_t *bp = zio->io_bp; 1098 1099 if (zio->io_child_type == ZIO_CHILD_LOGICAL) { 1100 if (BP_GET_DEDUP(bp)) 1101 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; 1102 } 1103 1104 return (ZIO_PIPELINE_CONTINUE); 1105 } 1106 1107 /* 1108 * ========================================================================== 1109 * Execute the I/O pipeline 1110 * ========================================================================== 1111 */ 1112 1113 static void 1114 zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) 1115 { 1116 spa_t *spa = zio->io_spa; 1117 zio_type_t t = zio->io_type; 1118 int flags = (cutinline ? TQ_FRONT : 0); 1119 1120 /* 1121 * If we're a config writer or a probe, the normal issue and 1122 * interrupt threads may all be blocked waiting for the config lock. 1123 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. 
1124 */ 1125 if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) 1126 t = ZIO_TYPE_NULL; 1127 1128 /* 1129 * A similar issue exists for the L2ARC write thread until L2ARC 2.0. 1130 */ 1131 if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) 1132 t = ZIO_TYPE_NULL; 1133 1134 /* 1135 * If this is a high priority I/O, then use the high priority taskq if 1136 * available. 1137 */ 1138 if (zio->io_priority == ZIO_PRIORITY_NOW && 1139 spa->spa_zio_taskq[t][q + 1].stqs_count != 0) 1140 q++; 1141 1142 ASSERT3U(q, <, ZIO_TASKQ_TYPES); 1143 1144 /* 1145 * NB: We are assuming that the zio can only be dispatched 1146 * to a single taskq at a time. It would be a grievous error 1147 * to dispatch the zio to another taskq at the same time. 1148 */ 1149 ASSERT(zio->io_tqent.tqent_next == NULL); 1150 spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, 1151 flags, &zio->io_tqent); 1152 } 1153 1154 static boolean_t 1155 zio_taskq_member(zio_t *zio, zio_taskq_type_t q) 1156 { 1157 kthread_t *executor = zio->io_executor; 1158 spa_t *spa = zio->io_spa; 1159 1160 for (zio_type_t t = 0; t < ZIO_TYPES; t++) { 1161 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1162 uint_t i; 1163 for (i = 0; i < tqs->stqs_count; i++) { 1164 if (taskq_member(tqs->stqs_taskq[i], executor)) 1165 return (B_TRUE); 1166 } 1167 } 1168 1169 return (B_FALSE); 1170 } 1171 1172 static int 1173 zio_issue_async(zio_t *zio) 1174 { 1175 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1176 1177 return (ZIO_PIPELINE_STOP); 1178 } 1179 1180 void 1181 zio_interrupt(zio_t *zio) 1182 { 1183 zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); 1184 } 1185 1186 /* 1187 * Execute the I/O pipeline until one of the following occurs: 1188 * 1189 * (1) the I/O completes 1190 * (2) the pipeline stalls waiting for dependent child I/Os 1191 * (3) the I/O issues, so we're waiting for an I/O completion interrupt 1192 * (4) the I/O is delegated by vdev-level caching or aggregation 1193 * (5) the I/O is deferred due to vdev-level queueing 1194 * (6) the I/O is handed off to another thread. 1195 * 1196 * In all cases, the pipeline stops whenever there's no CPU work; it never 1197 * burns a thread in cv_wait(). 1198 * 1199 * There's no locking on io_stage because there's no legitimate way 1200 * for multiple threads to be attempting to process the same I/O. 1201 */ 1202 static zio_pipe_stage_t *zio_pipeline[]; 1203 1204 void 1205 zio_execute(zio_t *zio) 1206 { 1207 zio->io_executor = curthread; 1208 1209 while (zio->io_stage < ZIO_STAGE_DONE) { 1210 enum zio_stage pipeline = zio->io_pipeline; 1211 enum zio_stage stage = zio->io_stage; 1212 int rv; 1213 1214 ASSERT(!MUTEX_HELD(&zio->io_lock)); 1215 ASSERT(ISP2(stage)); 1216 ASSERT(zio->io_stall == NULL); 1217 1218 do { 1219 stage <<= 1; 1220 } while ((stage & pipeline) == 0); 1221 1222 ASSERT(stage <= ZIO_STAGE_DONE); 1223 1224 /* 1225 * If we are in interrupt context and this pipeline stage 1226 * will grab a config lock that is held across I/O, 1227 * or may wait for an I/O that needs an interrupt thread 1228 * to complete, issue async to avoid deadlock. 1229 * 1230 * For VDEV_IO_START, we cut in line so that the io will 1231 * be sent to disk promptly. 1232 */ 1233 if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && 1234 zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { 1235 boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? 
1236 zio_requeue_io_start_cut_in_line : B_FALSE; 1237 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); 1238 return; 1239 } 1240 1241 zio->io_stage = stage; 1242 rv = zio_pipeline[highbit(stage) - 1](zio); 1243 1244 if (rv == ZIO_PIPELINE_STOP) 1245 return; 1246 1247 ASSERT(rv == ZIO_PIPELINE_CONTINUE); 1248 } 1249 } 1250 1251 /* 1252 * ========================================================================== 1253 * Initiate I/O, either sync or async 1254 * ========================================================================== 1255 */ 1256 int 1257 zio_wait(zio_t *zio) 1258 { 1259 int error; 1260 1261 ASSERT(zio->io_stage == ZIO_STAGE_OPEN); 1262 ASSERT(zio->io_executor == NULL); 1263 1264 zio->io_waiter = curthread; 1265 1266 zio_execute(zio); 1267 1268 mutex_enter(&zio->io_lock); 1269 while (zio->io_executor != NULL) 1270 cv_wait(&zio->io_cv, &zio->io_lock); 1271 mutex_exit(&zio->io_lock); 1272 1273 error = zio->io_error; 1274 zio_destroy(zio); 1275 1276 return (error); 1277 } 1278 1279 void 1280 zio_nowait(zio_t *zio) 1281 { 1282 ASSERT(zio->io_executor == NULL); 1283 1284 if (zio->io_child_type == ZIO_CHILD_LOGICAL && 1285 zio_unique_parent(zio) == NULL) { 1286 /* 1287 * This is a logical async I/O with no parent to wait for it. 1288 * We add it to the spa_async_root_zio "Godfather" I/O which 1289 * will ensure they complete prior to unloading the pool. 1290 */ 1291 spa_t *spa = zio->io_spa; 1292 1293 zio_add_child(spa->spa_async_zio_root, zio); 1294 } 1295 1296 zio_execute(zio); 1297 } 1298 1299 /* 1300 * ========================================================================== 1301 * Reexecute or suspend/resume failed I/O 1302 * ========================================================================== 1303 */ 1304 1305 static void 1306 zio_reexecute(zio_t *pio) 1307 { 1308 zio_t *cio, *cio_next; 1309 1310 ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); 1311 ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); 1312 ASSERT(pio->io_gang_leader == NULL); 1313 ASSERT(pio->io_gang_tree == NULL); 1314 1315 pio->io_flags = pio->io_orig_flags; 1316 pio->io_stage = pio->io_orig_stage; 1317 pio->io_pipeline = pio->io_orig_pipeline; 1318 pio->io_reexecute = 0; 1319 pio->io_flags |= ZIO_FLAG_REEXECUTED; 1320 pio->io_error = 0; 1321 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1322 pio->io_state[w] = 0; 1323 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 1324 pio->io_child_error[c] = 0; 1325 1326 if (IO_IS_ALLOCATING(pio)) 1327 BP_ZERO(pio->io_bp); 1328 1329 /* 1330 * As we reexecute pio's children, new children could be created. 1331 * New children go to the head of pio's io_child_list, however, 1332 * so we will (correctly) not reexecute them. The key is that 1333 * the remainder of pio's io_child_list, from 'cio_next' onward, 1334 * cannot be affected by any side effects of reexecuting 'cio'. 1335 */ 1336 for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { 1337 cio_next = zio_walk_children(pio); 1338 mutex_enter(&pio->io_lock); 1339 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 1340 pio->io_children[cio->io_child_type][w]++; 1341 mutex_exit(&pio->io_lock); 1342 zio_reexecute(cio); 1343 } 1344 1345 /* 1346 * Now that all children have been reexecuted, execute the parent. 1347 * We don't reexecute "The Godfather" I/O here as it's the 1348 * responsibility of the caller to wait on him. 
1349 */ 1350 if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) 1351 zio_execute(pio); 1352 } 1353 1354 void 1355 zio_suspend(spa_t *spa, zio_t *zio) 1356 { 1357 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) 1358 fm_panic("Pool '%s' has encountered an uncorrectable I/O " 1359 "failure and the failure mode property for this pool " 1360 "is set to panic.", spa_name(spa)); 1361 1362 zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); 1363 1364 mutex_enter(&spa->spa_suspend_lock); 1365 1366 if (spa->spa_suspend_zio_root == NULL) 1367 spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 1368 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 1369 ZIO_FLAG_GODFATHER); 1370 1371 spa->spa_suspended = B_TRUE; 1372 1373 if (zio != NULL) { 1374 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 1375 ASSERT(zio != spa->spa_suspend_zio_root); 1376 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1377 ASSERT(zio_unique_parent(zio) == NULL); 1378 ASSERT(zio->io_stage == ZIO_STAGE_DONE); 1379 zio_add_child(spa->spa_suspend_zio_root, zio); 1380 } 1381 1382 mutex_exit(&spa->spa_suspend_lock); 1383 } 1384 1385 int 1386 zio_resume(spa_t *spa) 1387 { 1388 zio_t *pio; 1389 1390 /* 1391 * Reexecute all previously suspended i/o. 1392 */ 1393 mutex_enter(&spa->spa_suspend_lock); 1394 spa->spa_suspended = B_FALSE; 1395 cv_broadcast(&spa->spa_suspend_cv); 1396 pio = spa->spa_suspend_zio_root; 1397 spa->spa_suspend_zio_root = NULL; 1398 mutex_exit(&spa->spa_suspend_lock); 1399 1400 if (pio == NULL) 1401 return (0); 1402 1403 zio_reexecute(pio); 1404 return (zio_wait(pio)); 1405 } 1406 1407 void 1408 zio_resume_wait(spa_t *spa) 1409 { 1410 mutex_enter(&spa->spa_suspend_lock); 1411 while (spa_suspended(spa)) 1412 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); 1413 mutex_exit(&spa->spa_suspend_lock); 1414 } 1415 1416 /* 1417 * ========================================================================== 1418 * Gang blocks. 1419 * 1420 * A gang block is a collection of small blocks that looks to the DMU 1421 * like one large block. When zio_dva_allocate() cannot find a block 1422 * of the requested size, due to either severe fragmentation or the pool 1423 * being nearly full, it calls zio_write_gang_block() to construct the 1424 * block from smaller fragments. 1425 * 1426 * A gang block consists of a gang header (zio_gbh_phys_t) and up to 1427 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like 1428 * an indirect block: it's an array of block pointers. It consumes 1429 * only one sector and hence is allocatable regardless of fragmentation. 1430 * The gang header's bps point to its gang members, which hold the data. 1431 * 1432 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg> 1433 * as the verifier to ensure uniqueness of the SHA256 checksum. 1434 * Critically, the gang block bp's blk_cksum is the checksum of the data, 1435 * not the gang header. This ensures that data block signatures (needed for 1436 * deduplication) are independent of how the block is physically stored. 1437 * 1438 * Gang blocks can be nested: a gang member may itself be a gang block. 1439 * Thus every gang block is a tree in which root and all interior nodes are 1440 * gang headers, and the leaves are normal blocks that contain user data. 1441 * The root of the gang tree is called the gang leader. 
1442 * 1443 * To perform any operation (read, rewrite, free, claim) on a gang block, 1444 * zio_gang_assemble() first assembles the gang tree (minus data leaves) 1445 * in the io_gang_tree field of the original logical i/o by recursively 1446 * reading the gang leader and all gang headers below it. This yields 1447 * an in-core tree containing the contents of every gang header and the 1448 * bps for every constituent of the gang block. 1449 * 1450 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree 1451 * and invokes a callback on each bp. To free a gang block, zio_gang_issue() 1452 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 1453 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). 1454 * zio_read_gang() is a wrapper around zio_read() that omits reading gang 1455 * headers, since we already have those in io_gang_tree. zio_rewrite_gang() 1456 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() 1457 * of the gang header plus zio_checksum_compute() of the data to update the 1458 * gang header's blk_cksum as described above. 1459 * 1460 * The two-phase assemble/issue model solves the problem of partial failure -- 1461 * what if you'd freed part of a gang block but then couldn't read the 1462 * gang header for another part? Assembling the entire gang tree first 1463 * ensures that all the necessary gang header I/O has succeeded before 1464 * starting the actual work of free, claim, or write. Once the gang tree 1465 * is assembled, free and claim are in-memory operations that cannot fail. 1466 * 1467 * In the event that a gang write fails, zio_dva_unallocate() walks the 1468 * gang tree to immediately free (i.e. insert back into the space map) 1469 * everything we've allocated. This ensures that we don't get ENOSPC 1470 * errors during repeated suspend/resume cycles due to a flaky device. 1471 * 1472 * Gang rewrites only happen during sync-to-convergence. If we can't assemble 1473 * the gang tree, we won't modify the block, so we can safely defer the free 1474 * (knowing that the block is still intact). If we *can* assemble the gang 1475 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free 1476 * each constituent bp and we can allocate a new block on the next sync pass. 1477 * 1478 * In all cases, the gang tree allows complete recovery from partial failure. 1479 * ========================================================================== 1480 */ 1481 1482 static zio_t * 1483 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1484 { 1485 if (gn != NULL) 1486 return (pio); 1487 1488 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), 1489 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1490 &pio->io_bookmark)); 1491 } 1492 1493 zio_t * 1494 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1495 { 1496 zio_t *zio; 1497 1498 if (gn != NULL) { 1499 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1500 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, 1501 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1502 /* 1503 * As we rewrite each gang header, the pipeline will compute 1504 * a new gang block header checksum for it; but no one will 1505 * compute a new data checksum, so we do that here. The one 1506 * exception is the gang leader: the pipeline already computed 1507 * its data checksum because that stage precedes gang assembly. 
1508 * (Presently, nothing actually uses interior data checksums; 1509 * this is just good hygiene.) 1510 */ 1511 if (gn != pio->io_gang_leader->io_gang_tree) { 1512 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), 1513 data, BP_GET_PSIZE(bp)); 1514 } 1515 /* 1516 * If we are here to damage data for testing purposes, 1517 * leave the GBH alone so that we can detect the damage. 1518 */ 1519 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) 1520 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 1521 } else { 1522 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, 1523 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, 1524 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1525 } 1526 1527 return (zio); 1528 } 1529 1530 /* ARGSUSED */ 1531 zio_t * 1532 zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1533 { 1534 return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, 1535 ZIO_GANG_CHILD_FLAGS(pio))); 1536 } 1537 1538 /* ARGSUSED */ 1539 zio_t * 1540 zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) 1541 { 1542 return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, 1543 NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); 1544 } 1545 1546 static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { 1547 NULL, 1548 zio_read_gang, 1549 zio_rewrite_gang, 1550 zio_free_gang, 1551 zio_claim_gang, 1552 NULL 1553 }; 1554 1555 static void zio_gang_tree_assemble_done(zio_t *zio); 1556 1557 static zio_gang_node_t * 1558 zio_gang_node_alloc(zio_gang_node_t **gnpp) 1559 { 1560 zio_gang_node_t *gn; 1561 1562 ASSERT(*gnpp == NULL); 1563 1564 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); 1565 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); 1566 *gnpp = gn; 1567 1568 return (gn); 1569 } 1570 1571 static void 1572 zio_gang_node_free(zio_gang_node_t **gnpp) 1573 { 1574 zio_gang_node_t *gn = *gnpp; 1575 1576 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1577 ASSERT(gn->gn_child[g] == NULL); 1578 1579 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); 1580 kmem_free(gn, sizeof (*gn)); 1581 *gnpp = NULL; 1582 } 1583 1584 static void 1585 zio_gang_tree_free(zio_gang_node_t **gnpp) 1586 { 1587 zio_gang_node_t *gn = *gnpp; 1588 1589 if (gn == NULL) 1590 return; 1591 1592 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) 1593 zio_gang_tree_free(&gn->gn_child[g]); 1594 1595 zio_gang_node_free(gnpp); 1596 } 1597 1598 static void 1599 zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) 1600 { 1601 zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); 1602 1603 ASSERT(gio->io_gang_leader == gio); 1604 ASSERT(BP_IS_GANG(bp)); 1605 1606 zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, 1607 SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, 1608 gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); 1609 } 1610 1611 static void 1612 zio_gang_tree_assemble_done(zio_t *zio) 1613 { 1614 zio_t *gio = zio->io_gang_leader; 1615 zio_gang_node_t *gn = zio->io_private; 1616 blkptr_t *bp = zio->io_bp; 1617 1618 ASSERT(gio == zio_unique_parent(zio)); 1619 ASSERT(zio->io_child_count == 0); 1620 1621 if (zio->io_error) 1622 return; 1623 1624 if (BP_SHOULD_BYTESWAP(bp)) 1625 byteswap_uint64_array(zio->io_data, zio->io_size); 1626 1627 ASSERT(zio->io_data == gn->gn_gbh); 1628 ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); 1629 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1630 1631 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1632 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1633 if (!BP_IS_GANG(gbp)) 1634 continue; 1635 zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); 1636 } 1637 } 
1638 1639 static void 1640 zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) 1641 { 1642 zio_t *gio = pio->io_gang_leader; 1643 zio_t *zio; 1644 1645 ASSERT(BP_IS_GANG(bp) == !!gn); 1646 ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); 1647 ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); 1648 1649 /* 1650 * If you're a gang header, your data is in gn->gn_gbh. 1651 * If you're a gang member, your data is in 'data' and gn == NULL. 1652 */ 1653 zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); 1654 1655 if (gn != NULL) { 1656 ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); 1657 1658 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 1659 blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; 1660 if (BP_IS_HOLE(gbp)) 1661 continue; 1662 zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); 1663 data = (char *)data + BP_GET_PSIZE(gbp); 1664 } 1665 } 1666 1667 if (gn == gio->io_gang_tree) 1668 ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); 1669 1670 if (zio != pio) 1671 zio_nowait(zio); 1672 } 1673 1674 static int 1675 zio_gang_assemble(zio_t *zio) 1676 { 1677 blkptr_t *bp = zio->io_bp; 1678 1679 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); 1680 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1681 1682 zio->io_gang_leader = zio; 1683 1684 zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); 1685 1686 return (ZIO_PIPELINE_CONTINUE); 1687 } 1688 1689 static int 1690 zio_gang_issue(zio_t *zio) 1691 { 1692 blkptr_t *bp = zio->io_bp; 1693 1694 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) 1695 return (ZIO_PIPELINE_STOP); 1696 1697 ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); 1698 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 1699 1700 if (zio->io_child_error[ZIO_CHILD_GANG] == 0) 1701 zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); 1702 else 1703 zio_gang_tree_free(&zio->io_gang_tree); 1704 1705 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1706 1707 return (ZIO_PIPELINE_CONTINUE); 1708 } 1709 1710 static void 1711 zio_write_gang_member_ready(zio_t *zio) 1712 { 1713 zio_t *pio = zio_unique_parent(zio); 1714 zio_t *gio = zio->io_gang_leader; 1715 dva_t *cdva = zio->io_bp->blk_dva; 1716 dva_t *pdva = pio->io_bp->blk_dva; 1717 uint64_t asize; 1718 1719 if (BP_IS_HOLE(zio->io_bp)) 1720 return; 1721 1722 ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); 1723 1724 ASSERT(zio->io_child_type == ZIO_CHILD_GANG); 1725 ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); 1726 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); 1727 ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); 1728 ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); 1729 1730 mutex_enter(&pio->io_lock); 1731 for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { 1732 ASSERT(DVA_GET_GANG(&pdva[d])); 1733 asize = DVA_GET_ASIZE(&pdva[d]); 1734 asize += DVA_GET_ASIZE(&cdva[d]); 1735 DVA_SET_ASIZE(&pdva[d], asize); 1736 } 1737 mutex_exit(&pio->io_lock); 1738 } 1739 1740 static int 1741 zio_write_gang_block(zio_t *pio) 1742 { 1743 spa_t *spa = pio->io_spa; 1744 blkptr_t *bp = pio->io_bp; 1745 zio_t *gio = pio->io_gang_leader; 1746 zio_t *zio; 1747 zio_gang_node_t *gn, **gnpp; 1748 zio_gbh_phys_t *gbh; 1749 uint64_t txg = pio->io_txg; 1750 uint64_t resid = pio->io_size; 1751 uint64_t lsize; 1752 int copies = gio->io_prop.zp_copies; 1753 int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); 1754 zio_prop_t zp; 1755 int error; 1756 1757 error = metaslab_alloc(spa, spa_normal_class(spa), 
SPA_GANGBLOCKSIZE, 1758 bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, 1759 METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); 1760 if (error) { 1761 pio->io_error = error; 1762 return (ZIO_PIPELINE_CONTINUE); 1763 } 1764 1765 if (pio == gio) { 1766 gnpp = &gio->io_gang_tree; 1767 } else { 1768 gnpp = pio->io_private; 1769 ASSERT(pio->io_ready == zio_write_gang_member_ready); 1770 } 1771 1772 gn = zio_gang_node_alloc(gnpp); 1773 gbh = gn->gn_gbh; 1774 bzero(gbh, SPA_GANGBLOCKSIZE); 1775 1776 /* 1777 * Create the gang header. 1778 */ 1779 zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, 1780 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); 1781 1782 /* 1783 * Create and nowait the gang children. 1784 */ 1785 for (int g = 0; resid != 0; resid -= lsize, g++) { 1786 lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), 1787 SPA_MINBLOCKSIZE); 1788 ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); 1789 1790 zp.zp_checksum = gio->io_prop.zp_checksum; 1791 zp.zp_compress = ZIO_COMPRESS_OFF; 1792 zp.zp_type = DMU_OT_NONE; 1793 zp.zp_level = 0; 1794 zp.zp_copies = gio->io_prop.zp_copies; 1795 zp.zp_dedup = B_FALSE; 1796 zp.zp_dedup_verify = B_FALSE; 1797 zp.zp_nopwrite = B_FALSE; 1798 1799 zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], 1800 (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, 1801 zio_write_gang_member_ready, NULL, &gn->gn_child[g], 1802 pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), 1803 &pio->io_bookmark)); 1804 } 1805 1806 /* 1807 * Set pio's pipeline to just wait for zio to finish. 1808 */ 1809 pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1810 1811 zio_nowait(zio); 1812 1813 return (ZIO_PIPELINE_CONTINUE); 1814 } 1815 1816 /* 1817 * The zio_nop_write stage in the pipeline determines if allocating 1818 * a new bp is necessary. By leveraging a cryptographically secure checksum, 1819 * such as SHA256, we can compare the checksums of the new data and the old 1820 * to determine if allocating a new block is required. The nopwrite 1821 * feature can handle writes in either syncing or open context (i.e. zil 1822 * writes) and as a result is mutually exclusive with dedup. 1823 */ 1824 static int 1825 zio_nop_write(zio_t *zio) 1826 { 1827 blkptr_t *bp = zio->io_bp; 1828 blkptr_t *bp_orig = &zio->io_bp_orig; 1829 zio_prop_t *zp = &zio->io_prop; 1830 1831 ASSERT(BP_GET_LEVEL(bp) == 0); 1832 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); 1833 ASSERT(zp->zp_nopwrite); 1834 ASSERT(!zp->zp_dedup); 1835 ASSERT(zio->io_bp_override == NULL); 1836 ASSERT(IO_IS_ALLOCATING(zio)); 1837 1838 /* 1839 * Check to see if the original bp and the new bp have matching 1840 * characteristics (i.e. same checksum, compression algorithms, etc). 1841 * If they don't then just continue with the pipeline which will 1842 * allocate a new bp. 1843 */ 1844 if (BP_IS_HOLE(bp_orig) || 1845 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup || 1846 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || 1847 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || 1848 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || 1849 zp->zp_copies != BP_GET_NDVAS(bp_orig)) 1850 return (ZIO_PIPELINE_CONTINUE); 1851 1852 /* 1853 * If the checksums match then reset the pipeline so that we 1854 * avoid allocating a new bp and issuing any I/O. 
1855 */ 1856 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1857 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1858 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1859 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1860 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1861 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1862 sizeof (uint64_t)) == 0); 1863 1864 *bp = *bp_orig; 1865 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1866 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1867 } 1868 1869 return (ZIO_PIPELINE_CONTINUE); 1870 } 1871 1872 /* 1873 * ========================================================================== 1874 * Dedup 1875 * ========================================================================== 1876 */ 1877 static void 1878 zio_ddt_child_read_done(zio_t *zio) 1879 { 1880 blkptr_t *bp = zio->io_bp; 1881 ddt_entry_t *dde = zio->io_private; 1882 ddt_phys_t *ddp; 1883 zio_t *pio = zio_unique_parent(zio); 1884 1885 mutex_enter(&pio->io_lock); 1886 ddp = ddt_phys_select(dde, bp); 1887 if (zio->io_error == 0) 1888 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1889 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1890 dde->dde_repair_data = zio->io_data; 1891 else 1892 zio_buf_free(zio->io_data, zio->io_size); 1893 mutex_exit(&pio->io_lock); 1894 } 1895 1896 static int 1897 zio_ddt_read_start(zio_t *zio) 1898 { 1899 blkptr_t *bp = zio->io_bp; 1900 1901 ASSERT(BP_GET_DEDUP(bp)); 1902 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1903 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1904 1905 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1906 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1907 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 1908 ddt_phys_t *ddp = dde->dde_phys; 1909 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 1910 blkptr_t blk; 1911 1912 ASSERT(zio->io_vsd == NULL); 1913 zio->io_vsd = dde; 1914 1915 if (ddp_self == NULL) 1916 return (ZIO_PIPELINE_CONTINUE); 1917 1918 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1919 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 1920 continue; 1921 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 1922 &blk); 1923 zio_nowait(zio_read(zio, zio->io_spa, &blk, 1924 zio_buf_alloc(zio->io_size), zio->io_size, 1925 zio_ddt_child_read_done, dde, zio->io_priority, 1926 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 1927 &zio->io_bookmark)); 1928 } 1929 return (ZIO_PIPELINE_CONTINUE); 1930 } 1931 1932 zio_nowait(zio_read(zio, zio->io_spa, bp, 1933 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 1934 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 1935 1936 return (ZIO_PIPELINE_CONTINUE); 1937 } 1938 1939 static int 1940 zio_ddt_read_done(zio_t *zio) 1941 { 1942 blkptr_t *bp = zio->io_bp; 1943 1944 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 1945 return (ZIO_PIPELINE_STOP); 1946 1947 ASSERT(BP_GET_DEDUP(bp)); 1948 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1949 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1950 1951 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1952 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1953 ddt_entry_t *dde = zio->io_vsd; 1954 if (ddt == NULL) { 1955 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 1956 return (ZIO_PIPELINE_CONTINUE); 1957 } 1958 if (dde == NULL) { 1959 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 1960 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1961 return (ZIO_PIPELINE_STOP); 1962 } 1963 if (dde->dde_repair_data != NULL) { 1964 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 1965 
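			/*
			 * A usable copy was read from another DVA and copied
			 * into io_data above, so the logical read has been
			 * repaired; clear the DDT child error before handing
			 * the data back.
			 */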
zio->io_child_error[ZIO_CHILD_DDT] = 0; 1966 } 1967 ddt_repair_done(ddt, dde); 1968 zio->io_vsd = NULL; 1969 } 1970 1971 ASSERT(zio->io_vsd == NULL); 1972 1973 return (ZIO_PIPELINE_CONTINUE); 1974 } 1975 1976 static boolean_t 1977 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 1978 { 1979 spa_t *spa = zio->io_spa; 1980 1981 /* 1982 * Note: we compare the original data, not the transformed data, 1983 * because when zio->io_bp is an override bp, we will not have 1984 * pushed the I/O transforms. That's an important optimization 1985 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 1986 */ 1987 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 1988 zio_t *lio = dde->dde_lead_zio[p]; 1989 1990 if (lio != NULL) { 1991 return (lio->io_orig_size != zio->io_orig_size || 1992 bcmp(zio->io_orig_data, lio->io_orig_data, 1993 zio->io_orig_size) != 0); 1994 } 1995 } 1996 1997 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 1998 ddt_phys_t *ddp = &dde->dde_phys[p]; 1999 2000 if (ddp->ddp_phys_birth != 0) { 2001 arc_buf_t *abuf = NULL; 2002 uint32_t aflags = ARC_WAIT; 2003 blkptr_t blk = *zio->io_bp; 2004 int error; 2005 2006 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2007 2008 ddt_exit(ddt); 2009 2010 error = arc_read(NULL, spa, &blk, 2011 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2012 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2013 &aflags, &zio->io_bookmark); 2014 2015 if (error == 0) { 2016 if (arc_buf_size(abuf) != zio->io_orig_size || 2017 bcmp(abuf->b_data, zio->io_orig_data, 2018 zio->io_orig_size) != 0) 2019 error = SET_ERROR(EEXIST); 2020 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2021 } 2022 2023 ddt_enter(ddt); 2024 return (error != 0); 2025 } 2026 } 2027 2028 return (B_FALSE); 2029 } 2030 2031 static void 2032 zio_ddt_child_write_ready(zio_t *zio) 2033 { 2034 int p = zio->io_prop.zp_copies; 2035 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2036 ddt_entry_t *dde = zio->io_private; 2037 ddt_phys_t *ddp = &dde->dde_phys[p]; 2038 zio_t *pio; 2039 2040 if (zio->io_error) 2041 return; 2042 2043 ddt_enter(ddt); 2044 2045 ASSERT(dde->dde_lead_zio[p] == zio); 2046 2047 ddt_phys_fill(ddp, zio->io_bp); 2048 2049 while ((pio = zio_walk_parents(zio)) != NULL) 2050 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2051 2052 ddt_exit(ddt); 2053 } 2054 2055 static void 2056 zio_ddt_child_write_done(zio_t *zio) 2057 { 2058 int p = zio->io_prop.zp_copies; 2059 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2060 ddt_entry_t *dde = zio->io_private; 2061 ddt_phys_t *ddp = &dde->dde_phys[p]; 2062 2063 ddt_enter(ddt); 2064 2065 ASSERT(ddp->ddp_refcnt == 0); 2066 ASSERT(dde->dde_lead_zio[p] == zio); 2067 dde->dde_lead_zio[p] = NULL; 2068 2069 if (zio->io_error == 0) { 2070 while (zio_walk_parents(zio) != NULL) 2071 ddt_phys_addref(ddp); 2072 } else { 2073 ddt_phys_clear(ddp); 2074 } 2075 2076 ddt_exit(ddt); 2077 } 2078 2079 static void 2080 zio_ddt_ditto_write_done(zio_t *zio) 2081 { 2082 int p = DDT_PHYS_DITTO; 2083 zio_prop_t *zp = &zio->io_prop; 2084 blkptr_t *bp = zio->io_bp; 2085 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2086 ddt_entry_t *dde = zio->io_private; 2087 ddt_phys_t *ddp = &dde->dde_phys[p]; 2088 ddt_key_t *ddk = &dde->dde_key; 2089 2090 ddt_enter(ddt); 2091 2092 ASSERT(ddp->ddp_refcnt == 0); 2093 ASSERT(dde->dde_lead_zio[p] == zio); 2094 dde->dde_lead_zio[p] = NULL; 2095 2096 if (zio->io_error == 0) { 2097 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2098 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2099 ASSERT(zp->zp_copies == 
BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2100 if (ddp->ddp_phys_birth != 0) 2101 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2102 ddt_phys_fill(ddp, bp); 2103 } 2104 2105 ddt_exit(ddt); 2106 } 2107 2108 static int 2109 zio_ddt_write(zio_t *zio) 2110 { 2111 spa_t *spa = zio->io_spa; 2112 blkptr_t *bp = zio->io_bp; 2113 uint64_t txg = zio->io_txg; 2114 zio_prop_t *zp = &zio->io_prop; 2115 int p = zp->zp_copies; 2116 int ditto_copies; 2117 zio_t *cio = NULL; 2118 zio_t *dio = NULL; 2119 ddt_t *ddt = ddt_select(spa, bp); 2120 ddt_entry_t *dde; 2121 ddt_phys_t *ddp; 2122 2123 ASSERT(BP_GET_DEDUP(bp)); 2124 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2125 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2126 2127 ddt_enter(ddt); 2128 dde = ddt_lookup(ddt, bp, B_TRUE); 2129 ddp = &dde->dde_phys[p]; 2130 2131 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2132 /* 2133 * If we're using a weak checksum, upgrade to a strong checksum 2134 * and try again. If we're already using a strong checksum, 2135 * we can't resolve it, so just convert to an ordinary write. 2136 * (And automatically e-mail a paper to Nature?) 2137 */ 2138 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2139 zp->zp_checksum = spa_dedup_checksum(spa); 2140 zio_pop_transforms(zio); 2141 zio->io_stage = ZIO_STAGE_OPEN; 2142 BP_ZERO(bp); 2143 } else { 2144 zp->zp_dedup = B_FALSE; 2145 } 2146 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2147 ddt_exit(ddt); 2148 return (ZIO_PIPELINE_CONTINUE); 2149 } 2150 2151 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2152 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2153 2154 if (ditto_copies > ddt_ditto_copies_present(dde) && 2155 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2156 zio_prop_t czp = *zp; 2157 2158 czp.zp_copies = ditto_copies; 2159 2160 /* 2161 * If we arrived here with an override bp, we won't have run 2162 * the transform stack, so we won't have the data we need to 2163 * generate a child i/o. So, toss the override bp and restart. 2164 * This is safe, because using the override bp is just an 2165 * optimization; and it's rare, so the cost doesn't matter. 
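		 *
		 * (After the restart the data goes back through the normal
		 * write pipeline, so the transform stack runs and the ditto
		 * child can be generated on the next pass through
		 * zio_ddt_write().)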
2166 */ 2167 if (zio->io_bp_override) { 2168 zio_pop_transforms(zio); 2169 zio->io_stage = ZIO_STAGE_OPEN; 2170 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2171 zio->io_bp_override = NULL; 2172 BP_ZERO(bp); 2173 ddt_exit(ddt); 2174 return (ZIO_PIPELINE_CONTINUE); 2175 } 2176 2177 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2178 zio->io_orig_size, &czp, NULL, 2179 zio_ddt_ditto_write_done, dde, zio->io_priority, 2180 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2181 2182 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2183 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2184 } 2185 2186 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2187 if (ddp->ddp_phys_birth != 0) 2188 ddt_bp_fill(ddp, bp, txg); 2189 if (dde->dde_lead_zio[p] != NULL) 2190 zio_add_child(zio, dde->dde_lead_zio[p]); 2191 else 2192 ddt_phys_addref(ddp); 2193 } else if (zio->io_bp_override) { 2194 ASSERT(bp->blk_birth == txg); 2195 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2196 ddt_phys_fill(ddp, bp); 2197 ddt_phys_addref(ddp); 2198 } else { 2199 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2200 zio->io_orig_size, zp, zio_ddt_child_write_ready, 2201 zio_ddt_child_write_done, dde, zio->io_priority, 2202 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2203 2204 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2205 dde->dde_lead_zio[p] = cio; 2206 } 2207 2208 ddt_exit(ddt); 2209 2210 if (cio) 2211 zio_nowait(cio); 2212 if (dio) 2213 zio_nowait(dio); 2214 2215 return (ZIO_PIPELINE_CONTINUE); 2216 } 2217 2218 ddt_entry_t *freedde; /* for debugging */ 2219 2220 static int 2221 zio_ddt_free(zio_t *zio) 2222 { 2223 spa_t *spa = zio->io_spa; 2224 blkptr_t *bp = zio->io_bp; 2225 ddt_t *ddt = ddt_select(spa, bp); 2226 ddt_entry_t *dde; 2227 ddt_phys_t *ddp; 2228 2229 ASSERT(BP_GET_DEDUP(bp)); 2230 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2231 2232 ddt_enter(ddt); 2233 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2234 ddp = ddt_phys_select(dde, bp); 2235 ddt_phys_decref(ddp); 2236 ddt_exit(ddt); 2237 2238 return (ZIO_PIPELINE_CONTINUE); 2239 } 2240 2241 /* 2242 * ========================================================================== 2243 * Allocate and free blocks 2244 * ========================================================================== 2245 */ 2246 static int 2247 zio_dva_allocate(zio_t *zio) 2248 { 2249 spa_t *spa = zio->io_spa; 2250 metaslab_class_t *mc = spa_normal_class(spa); 2251 blkptr_t *bp = zio->io_bp; 2252 int error; 2253 int flags = 0; 2254 2255 if (zio->io_gang_leader == NULL) { 2256 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2257 zio->io_gang_leader = zio; 2258 } 2259 2260 ASSERT(BP_IS_HOLE(bp)); 2261 ASSERT0(BP_GET_NDVAS(bp)); 2262 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2263 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2264 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2265 2266 /* 2267 * The dump device does not support gang blocks so allocation on 2268 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2269 * the "fast" gang feature. 2270 */ 2271 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2272 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2273 METASLAB_GANG_CHILD : 0; 2274 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2275 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2276 2277 if (error) { 2278 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2279 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2280 error); 2281 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2282 return (zio_write_gang_block(zio)); 2283 zio->io_error = error; 2284 } 2285 2286 return (ZIO_PIPELINE_CONTINUE); 2287 } 2288 2289 static int 2290 zio_dva_free(zio_t *zio) 2291 { 2292 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2293 2294 return (ZIO_PIPELINE_CONTINUE); 2295 } 2296 2297 static int 2298 zio_dva_claim(zio_t *zio) 2299 { 2300 int error; 2301 2302 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2303 if (error) 2304 zio->io_error = error; 2305 2306 return (ZIO_PIPELINE_CONTINUE); 2307 } 2308 2309 /* 2310 * Undo an allocation. This is used by zio_done() when an I/O fails 2311 * and we want to give back the block we just allocated. 2312 * This handles both normal blocks and gang blocks. 2313 */ 2314 static void 2315 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2316 { 2317 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2318 ASSERT(zio->io_bp_override == NULL); 2319 2320 if (!BP_IS_HOLE(bp)) 2321 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2322 2323 if (gn != NULL) { 2324 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2325 zio_dva_unallocate(zio, gn->gn_child[g], 2326 &gn->gn_gbh->zg_blkptr[g]); 2327 } 2328 } 2329 } 2330 2331 /* 2332 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2333 */ 2334 int 2335 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2336 uint64_t size, boolean_t use_slog) 2337 { 2338 int error = 1; 2339 2340 ASSERT(txg > spa_syncing_txg(spa)); 2341 2342 /* 2343 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2344 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2345 * when allocating them. 2346 */ 2347 if (use_slog) { 2348 error = metaslab_alloc(spa, spa_log_class(spa), size, 2349 new_bp, 1, txg, old_bp, 2350 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2351 } 2352 2353 if (error) { 2354 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2355 new_bp, 1, txg, old_bp, 2356 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2357 } 2358 2359 if (error == 0) { 2360 BP_SET_LSIZE(new_bp, size); 2361 BP_SET_PSIZE(new_bp, size); 2362 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2363 BP_SET_CHECKSUM(new_bp, 2364 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2365 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2366 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2367 BP_SET_LEVEL(new_bp, 0); 2368 BP_SET_DEDUP(new_bp, 0); 2369 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2370 } 2371 2372 return (error); 2373 } 2374 2375 /* 2376 * Free an intent log block. 
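 *
 * Hedged usage sketch of the alloc/free pair; the caller shown is
 * illustrative, and in practice these calls are driven from the ZIL
 * code in zil.c:
 *
 *	blkptr_t new_bp;
 *	BP_ZERO(&new_bp);
 *	if (zio_alloc_zil(spa, txg, &new_bp, old_bp, size, use_slog) == 0) {
 *		(issue the log write against new_bp)
 *	}
 *	(later, once the log chain no longer references the block)
 *	zio_free_zil(spa, txg, &new_bp);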
2377 */ 2378 void 2379 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2380 { 2381 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2382 ASSERT(!BP_IS_GANG(bp)); 2383 2384 zio_free(spa, txg, bp); 2385 } 2386 2387 /* 2388 * ========================================================================== 2389 * Read and write to physical devices 2390 * ========================================================================== 2391 */ 2392 static int 2393 zio_vdev_io_start(zio_t *zio) 2394 { 2395 vdev_t *vd = zio->io_vd; 2396 uint64_t align; 2397 spa_t *spa = zio->io_spa; 2398 2399 ASSERT(zio->io_error == 0); 2400 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2401 2402 if (vd == NULL) { 2403 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2404 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2405 2406 /* 2407 * The mirror_ops handle multiple DVAs in a single BP. 2408 */ 2409 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2410 } 2411 2412 /* 2413 * We keep track of time-sensitive I/Os so that the scan thread 2414 * can quickly react to certain workloads. In particular, we care 2415 * about non-scrubbing, top-level reads and writes with the following 2416 * characteristics: 2417 * - synchronous writes of user data to non-slog devices 2418 * - any reads of user data 2419 * When these conditions are met, adjust the timestamp of spa_last_io 2420 * which allows the scan thread to adjust its workload accordingly. 2421 */ 2422 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2423 vd == vd->vdev_top && !vd->vdev_islog && 2424 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2425 zio->io_txg != spa_syncing_txg(spa)) { 2426 uint64_t old = spa->spa_last_io; 2427 uint64_t new = ddi_get_lbolt64(); 2428 if (old != new) 2429 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2430 } 2431 2432 align = 1ULL << vd->vdev_top->vdev_ashift; 2433 2434 if (P2PHASE(zio->io_size, align) != 0) { 2435 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2436 char *abuf = zio_buf_alloc(asize); 2437 ASSERT(vd == vd->vdev_top); 2438 if (zio->io_type == ZIO_TYPE_WRITE) { 2439 bcopy(zio->io_data, abuf, zio->io_size); 2440 bzero(abuf + zio->io_size, asize - zio->io_size); 2441 } 2442 zio_push_transform(zio, abuf, asize, asize, zio_subblock); 2443 } 2444 2445 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2446 ASSERT(P2PHASE(zio->io_size, align) == 0); 2447 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 2448 2449 /* 2450 * If this is a repair I/O, and there's no self-healing involved -- 2451 * that is, we're just resilvering what we expect to resilver -- 2452 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2453 * This prevents spurious resilvering with nested replication. 2454 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2455 * A is out of date, we'll read from C+D, then use the data to 2456 * resilver A+B -- but we don't actually want to resilver B, just A. 2457 * The top-level mirror has no way to know this, so instead we just 2458 * discard unnecessary repairs as we work our way down the vdev tree. 2459 * The same logic applies to any form of nested replication: 2460 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
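	 *
	 * In short, the test below bypasses a plain repair write when this
	 * vdev's DTL says it never actually missed the txg in question
	 * (sketch of the code that follows):
	 *
	 *	if (IO_REPAIR && !SELF_HEAL && io_txg != 0 &&
	 *	    !vdev_dtl_contains(vd, DTL_PARTIAL, io_txg, 1))
	 *		zio_vdev_io_bypass(zio);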
2461 */ 2462 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2463 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2464 zio->io_txg != 0 && /* not a delegated i/o */ 2465 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2466 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2467 zio_vdev_io_bypass(zio); 2468 return (ZIO_PIPELINE_CONTINUE); 2469 } 2470 2471 if (vd->vdev_ops->vdev_op_leaf && 2472 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2473 2474 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) 2475 return (ZIO_PIPELINE_CONTINUE); 2476 2477 if ((zio = vdev_queue_io(zio)) == NULL) 2478 return (ZIO_PIPELINE_STOP); 2479 2480 if (!vdev_accessible(vd, zio)) { 2481 zio->io_error = SET_ERROR(ENXIO); 2482 zio_interrupt(zio); 2483 return (ZIO_PIPELINE_STOP); 2484 } 2485 } 2486 2487 return (vd->vdev_ops->vdev_op_io_start(zio)); 2488 } 2489 2490 static int 2491 zio_vdev_io_done(zio_t *zio) 2492 { 2493 vdev_t *vd = zio->io_vd; 2494 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2495 boolean_t unexpected_error = B_FALSE; 2496 2497 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2498 return (ZIO_PIPELINE_STOP); 2499 2500 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2501 2502 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 2503 2504 vdev_queue_io_done(zio); 2505 2506 if (zio->io_type == ZIO_TYPE_WRITE) 2507 vdev_cache_write(zio); 2508 2509 if (zio_injection_enabled && zio->io_error == 0) 2510 zio->io_error = zio_handle_device_injection(vd, 2511 zio, EIO); 2512 2513 if (zio_injection_enabled && zio->io_error == 0) 2514 zio->io_error = zio_handle_label_injection(zio, EIO); 2515 2516 if (zio->io_error) { 2517 if (!vdev_accessible(vd, zio)) { 2518 zio->io_error = SET_ERROR(ENXIO); 2519 } else { 2520 unexpected_error = B_TRUE; 2521 } 2522 } 2523 } 2524 2525 ops->vdev_op_io_done(zio); 2526 2527 if (unexpected_error) 2528 VERIFY(vdev_probe(vd, zio) == NULL); 2529 2530 return (ZIO_PIPELINE_CONTINUE); 2531 } 2532 2533 /* 2534 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2535 * disk, and use that to finish the checksum ereport later. 2536 */ 2537 static void 2538 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2539 const void *good_buf) 2540 { 2541 /* no processing needed */ 2542 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2543 } 2544 2545 /*ARGSUSED*/ 2546 void 2547 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2548 { 2549 void *buf = zio_buf_alloc(zio->io_size); 2550 2551 bcopy(zio->io_data, buf, zio->io_size); 2552 2553 zcr->zcr_cbinfo = zio->io_size; 2554 zcr->zcr_cbdata = buf; 2555 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2556 zcr->zcr_free = zio_buf_free; 2557 } 2558 2559 static int 2560 zio_vdev_io_assess(zio_t *zio) 2561 { 2562 vdev_t *vd = zio->io_vd; 2563 2564 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2565 return (ZIO_PIPELINE_STOP); 2566 2567 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2568 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2569 2570 if (zio->io_vsd != NULL) { 2571 zio->io_vsd_ops->vsd_free(zio); 2572 zio->io_vsd = NULL; 2573 } 2574 2575 if (zio_injection_enabled && zio->io_error == 0) 2576 zio->io_error = zio_handle_fault_injection(zio, EIO); 2577 2578 /* 2579 * If the I/O failed, determine whether we should attempt to retry it. 2580 * 2581 * On retry, we cut in line in the issue queue, since we don't want 2582 * compression/checksumming/etc. 
work to prevent our (cheap) IO reissue. 2583 */ 2584 if (zio->io_error && vd == NULL && 2585 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2586 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2587 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2588 zio->io_error = 0; 2589 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2590 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2591 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2592 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2593 zio_requeue_io_start_cut_in_line); 2594 return (ZIO_PIPELINE_STOP); 2595 } 2596 2597 /* 2598 * If we got an error on a leaf device, convert it to ENXIO 2599 * if the device is not accessible at all. 2600 */ 2601 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2602 !vdev_accessible(vd, zio)) 2603 zio->io_error = SET_ERROR(ENXIO); 2604 2605 /* 2606 * If we can't write to an interior vdev (mirror or RAID-Z), 2607 * set vdev_cant_write so that we stop trying to allocate from it. 2608 */ 2609 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2610 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2611 vd->vdev_cant_write = B_TRUE; 2612 } 2613 2614 if (zio->io_error) 2615 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2616 2617 return (ZIO_PIPELINE_CONTINUE); 2618 } 2619 2620 void 2621 zio_vdev_io_reissue(zio_t *zio) 2622 { 2623 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2624 ASSERT(zio->io_error == 0); 2625 2626 zio->io_stage >>= 1; 2627 } 2628 2629 void 2630 zio_vdev_io_redone(zio_t *zio) 2631 { 2632 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2633 2634 zio->io_stage >>= 1; 2635 } 2636 2637 void 2638 zio_vdev_io_bypass(zio_t *zio) 2639 { 2640 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2641 ASSERT(zio->io_error == 0); 2642 2643 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2644 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2645 } 2646 2647 /* 2648 * ========================================================================== 2649 * Generate and verify checksums 2650 * ========================================================================== 2651 */ 2652 static int 2653 zio_checksum_generate(zio_t *zio) 2654 { 2655 blkptr_t *bp = zio->io_bp; 2656 enum zio_checksum checksum; 2657 2658 if (bp == NULL) { 2659 /* 2660 * This is zio_write_phys(). 2661 * We're either generating a label checksum, or none at all. 2662 */ 2663 checksum = zio->io_prop.zp_checksum; 2664 2665 if (checksum == ZIO_CHECKSUM_OFF) 2666 return (ZIO_PIPELINE_CONTINUE); 2667 2668 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2669 } else { 2670 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2671 ASSERT(!IO_IS_ALLOCATING(zio)); 2672 checksum = ZIO_CHECKSUM_GANG_HEADER; 2673 } else { 2674 checksum = BP_GET_CHECKSUM(bp); 2675 } 2676 } 2677 2678 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2679 2680 return (ZIO_PIPELINE_CONTINUE); 2681 } 2682 2683 static int 2684 zio_checksum_verify(zio_t *zio) 2685 { 2686 zio_bad_cksum_t info; 2687 blkptr_t *bp = zio->io_bp; 2688 int error; 2689 2690 ASSERT(zio->io_vd != NULL); 2691 2692 if (bp == NULL) { 2693 /* 2694 * This is zio_read_phys(). 2695 * We're either verifying a label checksum, or nothing at all. 
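		 *
		 * (Label reads issued through vdev_label.c pass
		 * ZIO_CHECKSUM_LABEL explicitly; anything else that arrives
		 * here with bp == NULL is expected to have checksums off.)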
2696 */ 2697 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2698 return (ZIO_PIPELINE_CONTINUE); 2699 2700 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2701 } 2702 2703 if ((error = zio_checksum_error(zio, &info)) != 0) { 2704 zio->io_error = error; 2705 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2706 zfs_ereport_start_checksum(zio->io_spa, 2707 zio->io_vd, zio, zio->io_offset, 2708 zio->io_size, NULL, &info); 2709 } 2710 } 2711 2712 return (ZIO_PIPELINE_CONTINUE); 2713 } 2714 2715 /* 2716 * Called by RAID-Z to ensure we don't compute the checksum twice. 2717 */ 2718 void 2719 zio_checksum_verified(zio_t *zio) 2720 { 2721 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2722 } 2723 2724 /* 2725 * ========================================================================== 2726 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2727 * An error of 0 indicates success. ENXIO indicates whole-device failure, 2728 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO 2729 * indicate errors that are specific to one I/O, and most likely permanent. 2730 * Any other error is presumed to be worse because we weren't expecting it. 2731 * ========================================================================== 2732 */ 2733 int 2734 zio_worst_error(int e1, int e2) 2735 { 2736 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2737 int r1, r2; 2738 2739 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2740 if (e1 == zio_error_rank[r1]) 2741 break; 2742 2743 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2744 if (e2 == zio_error_rank[r2]) 2745 break; 2746 2747 return (r1 > r2 ? e1 : e2); 2748 } 2749 2750 /* 2751 * ========================================================================== 2752 * I/O completion 2753 * ========================================================================== 2754 */ 2755 static int 2756 zio_ready(zio_t *zio) 2757 { 2758 blkptr_t *bp = zio->io_bp; 2759 zio_t *pio, *pio_next; 2760 2761 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2762 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2763 return (ZIO_PIPELINE_STOP); 2764 2765 if (zio->io_ready) { 2766 ASSERT(IO_IS_ALLOCATING(zio)); 2767 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 2768 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 2769 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2770 2771 zio->io_ready(zio); 2772 } 2773 2774 if (bp != NULL && bp != &zio->io_bp_copy) 2775 zio->io_bp_copy = *bp; 2776 2777 if (zio->io_error) 2778 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2779 2780 mutex_enter(&zio->io_lock); 2781 zio->io_state[ZIO_WAIT_READY] = 1; 2782 pio = zio_walk_parents(zio); 2783 mutex_exit(&zio->io_lock); 2784 2785 /* 2786 * As we notify zio's parents, new parents could be added. 2787 * New parents go to the head of zio's io_parent_list, however, 2788 * so we will (correctly) not notify them. The remainder of zio's 2789 * io_parent_list, from 'pio_next' onward, cannot change because 2790 * all parents must wait for us to be done before they can be done.
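	 *
	 * (A parent added after we drop io_lock sees
	 * io_state[ZIO_WAIT_READY] already set, and zio_add_child() does
	 * not count this child toward that parent's READY wait, so skipping
	 * it in the walk below loses nothing.)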
2791 */ 2792 for (; pio != NULL; pio = pio_next) { 2793 pio_next = zio_walk_parents(zio); 2794 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2795 } 2796 2797 if (zio->io_flags & ZIO_FLAG_NODATA) { 2798 if (BP_IS_GANG(bp)) { 2799 zio->io_flags &= ~ZIO_FLAG_NODATA; 2800 } else { 2801 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2802 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2803 } 2804 } 2805 2806 if (zio_injection_enabled && 2807 zio->io_spa->spa_syncing_txg == zio->io_txg) 2808 zio_handle_ignored_writes(zio); 2809 2810 return (ZIO_PIPELINE_CONTINUE); 2811 } 2812 2813 static int 2814 zio_done(zio_t *zio) 2815 { 2816 spa_t *spa = zio->io_spa; 2817 zio_t *lio = zio->io_logical; 2818 blkptr_t *bp = zio->io_bp; 2819 vdev_t *vd = zio->io_vd; 2820 uint64_t psize = zio->io_size; 2821 zio_t *pio, *pio_next; 2822 2823 /* 2824 * If our children haven't all completed, 2825 * wait for them and then repeat this pipeline stage. 2826 */ 2827 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2828 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2829 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2830 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2831 return (ZIO_PIPELINE_STOP); 2832 2833 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2834 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2835 ASSERT(zio->io_children[c][w] == 0); 2836 2837 if (bp != NULL) { 2838 ASSERT(bp->blk_pad[0] == 0); 2839 ASSERT(bp->blk_pad[1] == 0); 2840 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2841 (bp == zio_unique_parent(zio)->io_bp)); 2842 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2843 zio->io_bp_override == NULL && 2844 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2845 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2846 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2847 ASSERT(BP_COUNT_GANG(bp) == 0 || 2848 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2849 } 2850 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 2851 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 2852 } 2853 2854 /* 2855 * If there were child vdev/gang/ddt errors, they apply to us now. 2856 */ 2857 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2858 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2859 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2860 2861 /* 2862 * If the I/O on the transformed data was successful, generate any 2863 * checksum reports now while we still have the transformed data. 2864 */ 2865 if (zio->io_error == 0) { 2866 while (zio->io_cksum_report != NULL) { 2867 zio_cksum_report_t *zcr = zio->io_cksum_report; 2868 uint64_t align = zcr->zcr_align; 2869 uint64_t asize = P2ROUNDUP(psize, align); 2870 char *abuf = zio->io_data; 2871 2872 if (asize != psize) { 2873 abuf = zio_buf_alloc(asize); 2874 bcopy(zio->io_data, abuf, psize); 2875 bzero(abuf + psize, asize - psize); 2876 } 2877 2878 zio->io_cksum_report = zcr->zcr_next; 2879 zcr->zcr_next = NULL; 2880 zcr->zcr_finish(zcr, abuf); 2881 zfs_ereport_free_checksum(zcr); 2882 2883 if (asize != psize) 2884 zio_buf_free(abuf, asize); 2885 } 2886 } 2887 2888 zio_pop_transforms(zio); /* note: may set zio->io_error */ 2889 2890 vdev_stat_update(zio, psize); 2891 2892 if (zio->io_error) { 2893 /* 2894 * If this I/O is attached to a particular vdev, 2895 * generate an error message describing the I/O failure 2896 * at the block level. We ignore these errors if the 2897 * device is currently unavailable. 
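		 *
		 * (ECKSUM is excluded below because checksum failures are
		 * reported through the checksum-ereport path, i.e.
		 * zfs_ereport_start_checksum() and the io_cksum_report list
		 * handled above, rather than as a generic I/O ereport.)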
2898 */ 2899 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 2900 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 2901 2902 if ((zio->io_error == EIO || !(zio->io_flags & 2903 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 2904 zio == lio) { 2905 /* 2906 * For logical I/O requests, tell the SPA to log the 2907 * error and generate a logical data ereport. 2908 */ 2909 spa_log_error(spa, zio); 2910 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 2911 0, 0); 2912 } 2913 } 2914 2915 if (zio->io_error && zio == lio) { 2916 /* 2917 * Determine whether zio should be reexecuted. This will 2918 * propagate all the way to the root via zio_notify_parent(). 2919 */ 2920 ASSERT(vd == NULL && bp != NULL); 2921 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2922 2923 if (IO_IS_ALLOCATING(zio) && 2924 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 2925 if (zio->io_error != ENOSPC) 2926 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 2927 else 2928 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2929 } 2930 2931 if ((zio->io_type == ZIO_TYPE_READ || 2932 zio->io_type == ZIO_TYPE_FREE) && 2933 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 2934 zio->io_error == ENXIO && 2935 spa_load_state(spa) == SPA_LOAD_NONE && 2936 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 2937 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2938 2939 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 2940 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2941 2942 /* 2943 * Here is a possibly good place to attempt to do 2944 * either combinatorial reconstruction or error correction 2945 * based on checksums. It also might be a good place 2946 * to send out preliminary ereports before we suspend 2947 * processing. 2948 */ 2949 } 2950 2951 /* 2952 * If there were logical child errors, they apply to us now. 2953 * We defer this until now to avoid conflating logical child 2954 * errors with errors that happened to the zio itself when 2955 * updating vdev stats and reporting FMA events above. 2956 */ 2957 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 2958 2959 if ((zio->io_error || zio->io_reexecute) && 2960 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 2961 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 2962 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 2963 2964 zio_gang_tree_free(&zio->io_gang_tree); 2965 2966 /* 2967 * Godfather I/Os should never suspend. 2968 */ 2969 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 2970 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 2971 zio->io_reexecute = 0; 2972 2973 if (zio->io_reexecute) { 2974 /* 2975 * This is a logical I/O that wants to reexecute. 2976 * 2977 * Reexecute is top-down. When an i/o fails, if it's not 2978 * the root, it simply notifies its parent and sticks around. 2979 * The parent, seeing that it still has children in zio_done(), 2980 * does the same. This percolates all the way up to the root. 2981 * The root i/o will reexecute or suspend the entire tree. 2982 * 2983 * This approach ensures that zio_reexecute() honors 2984 * all the original i/o dependency relationships, e.g. 2985 * parents not executing until children are ready. 2986 */ 2987 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2988 2989 zio->io_gang_leader = NULL; 2990 2991 mutex_enter(&zio->io_lock); 2992 zio->io_state[ZIO_WAIT_DONE] = 1; 2993 mutex_exit(&zio->io_lock); 2994 2995 /* 2996 * "The Godfather" I/O monitors its children but is 2997 * not a true parent to them. 
It will track them through 2998 * the pipeline but severs its ties whenever they get into 2999 * trouble (e.g. suspended). This allows "The Godfather" 3000 * I/O to return status without blocking. 3001 */ 3002 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3003 zio_link_t *zl = zio->io_walk_link; 3004 pio_next = zio_walk_parents(zio); 3005 3006 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3007 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3008 zio_remove_child(pio, zio, zl); 3009 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3010 } 3011 } 3012 3013 if ((pio = zio_unique_parent(zio)) != NULL) { 3014 /* 3015 * We're not a root i/o, so there's nothing to do 3016 * but notify our parent. Don't propagate errors 3017 * upward since we haven't permanently failed yet. 3018 */ 3019 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3020 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3021 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3022 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3023 /* 3024 * We'd fail again if we reexecuted now, so suspend 3025 * until conditions improve (e.g. device comes online). 3026 */ 3027 zio_suspend(spa, zio); 3028 } else { 3029 /* 3030 * Reexecution is potentially a huge amount of work. 3031 * Hand it off to the otherwise-unused claim taskq. 3032 */ 3033 ASSERT(zio->io_tqent.tqent_next == NULL); 3034 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3035 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3036 0, &zio->io_tqent); 3037 } 3038 return (ZIO_PIPELINE_STOP); 3039 } 3040 3041 ASSERT(zio->io_child_count == 0); 3042 ASSERT(zio->io_reexecute == 0); 3043 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3044 3045 /* 3046 * Report any checksum errors, since the I/O is complete. 3047 */ 3048 while (zio->io_cksum_report != NULL) { 3049 zio_cksum_report_t *zcr = zio->io_cksum_report; 3050 zio->io_cksum_report = zcr->zcr_next; 3051 zcr->zcr_next = NULL; 3052 zcr->zcr_finish(zcr, NULL); 3053 zfs_ereport_free_checksum(zcr); 3054 } 3055 3056 /* 3057 * It is the responsibility of the done callback to ensure that this 3058 * particular zio is no longer discoverable for adoption, and as 3059 * such, cannot acquire any new parents. 
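	 *
	 * (Concretely: if there is no waiter, zio_destroy() runs a few
	 * lines below, so a zio that was still discoverable after its done
	 * callback would risk use-after-free.)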
3060 */ 3061 if (zio->io_done) 3062 zio->io_done(zio); 3063 3064 mutex_enter(&zio->io_lock); 3065 zio->io_state[ZIO_WAIT_DONE] = 1; 3066 mutex_exit(&zio->io_lock); 3067 3068 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3069 zio_link_t *zl = zio->io_walk_link; 3070 pio_next = zio_walk_parents(zio); 3071 zio_remove_child(pio, zio, zl); 3072 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3073 } 3074 3075 if (zio->io_waiter != NULL) { 3076 mutex_enter(&zio->io_lock); 3077 zio->io_executor = NULL; 3078 cv_broadcast(&zio->io_cv); 3079 mutex_exit(&zio->io_lock); 3080 } else { 3081 zio_destroy(zio); 3082 } 3083 3084 return (ZIO_PIPELINE_STOP); 3085 } 3086 3087 /* 3088 * ========================================================================== 3089 * I/O pipeline definition 3090 * ========================================================================== 3091 */ 3092 static zio_pipe_stage_t *zio_pipeline[] = { 3093 NULL, 3094 zio_read_bp_init, 3095 zio_free_bp_init, 3096 zio_issue_async, 3097 zio_write_bp_init, 3098 zio_checksum_generate, 3099 zio_nop_write, 3100 zio_ddt_read_start, 3101 zio_ddt_read_done, 3102 zio_ddt_write, 3103 zio_ddt_free, 3104 zio_gang_assemble, 3105 zio_gang_issue, 3106 zio_dva_allocate, 3107 zio_dva_free, 3108 zio_dva_claim, 3109 zio_ready, 3110 zio_vdev_io_start, 3111 zio_vdev_io_done, 3112 zio_vdev_io_assess, 3113 zio_checksum_verify, 3114 zio_done 3115 }; 3116 3117 /* dnp is the dnode for zb1->zb_object */ 3118 boolean_t 3119 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, 3120 const zbookmark_t *zb2) 3121 { 3122 uint64_t zb1nextL0, zb2thisobj; 3123 3124 ASSERT(zb1->zb_objset == zb2->zb_objset); 3125 ASSERT(zb2->zb_level == 0); 3126 3127 /* 3128 * A bookmark in the deadlist is considered to be after 3129 * everything else. 3130 */ 3131 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 3132 return (B_TRUE); 3133 3134 /* The objset_phys_t isn't before anything. */ 3135 if (dnp == NULL) 3136 return (B_FALSE); 3137 3138 zb1nextL0 = (zb1->zb_blkid + 1) << 3139 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3140 3141 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3142 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3143 3144 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3145 uint64_t nextobj = zb1nextL0 * 3146 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3147 return (nextobj <= zb2thisobj); 3148 } 3149 3150 if (zb1->zb_object < zb2thisobj) 3151 return (B_TRUE); 3152 if (zb1->zb_object > zb2thisobj) 3153 return (B_FALSE); 3154 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3155 return (B_FALSE); 3156 return (zb1nextL0 <= zb2->zb_blkid); 3157 } 3158