/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>

/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;
kmem_cache_t *zio_link_cache;
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
extern int zfs_mg_alloc_failures;

/*
 * The following actions directly affect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
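
/*
 * For example, with the defaults above: pass 1 of spa_sync() processes
 * frees, compresses data, and allocates new blocks as usual; from pass 2
 * onward frees are deferred rather than processed immediately and same-size
 * blocks are rewritten in place rather than reallocated (see zio_free() and
 * zio_write_bp_init() below); and from pass 5 onward compression is disabled
 * so that block sizes stop changing and spa_sync() can converge.
 */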

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif

void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
			align = PAGESIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    cflags | KMC_NOTOUCH);
		}
	}

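	/*
	 * Not every size has its own cache: the loop above only created a
	 * cache where an alignment was chosen.  Walk backward and point each
	 * remaining size class at the next larger cache that does exist, so
	 * every index of zio_buf_cache[] and zio_data_buf_cache[] is usable.
	 */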
	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	/*
	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
	 * to fail 3 times per txg or 8 failures, whichever is greater.
	 */
	if (zfs_mg_alloc_failures == 0)
		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);

	zio_inject_init();
}

void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}

/*
 * ==========================================================================
 * Allocate and free I/O buffers
 * ==========================================================================
 */

/*
 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 * excess / transient data in-core during a crashdump.
 */
void *
zio_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
}

/*
 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 * crashdump if the kernel panics.  This exists so that we will limit the
 * amount of ZFS data that shows up in a kernel crashdump.  (Thus reducing the
 * amount of kernel heap dumped to disk when the kernel panics.)
 */
void *
zio_data_buf_alloc(size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
}

void
zio_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_buf_cache[c], buf);
}

void
zio_data_buf_free(void *buf, size_t size)
{
	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;

	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);

	kmem_cache_free(zio_data_buf_cache[c], buf);
}

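/*
 * Both allocators map a size onto a cache index by rounding up to the next
 * multiple of SPA_MINBLOCKSIZE.  As an illustration (assuming the usual
 * SPA_MINBLOCKSHIFT of 9, i.e. a 512-byte minimum block): a request for
 * 20480 bytes computes c = (20480 - 1) >> 9 = 39, which selects the
 * 40 * 512 = 20480 byte size class -- either its own kmem cache or, if none
 * was created for that exact size, the next larger cache that zio_init()
 * filled in for it.
 */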

/*
 * ==========================================================================
 * Push and pop I/O transform buffers
 * ==========================================================================
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	zio->io_data = data;
	zio->io_size = size;
}

static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}

/*
 * ==========================================================================
 * I/O transform callbacks for subblocks and decompression
 * ==========================================================================
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}

static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}

/*
 * ==========================================================================
 * I/O parent/child relationships and pipeline interlocks
 * ==========================================================================
 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
 *	  continue calling these functions until they return NULL.
 *	  Otherwise, the next caller will pick up the list walk in
 *	  some indeterminate state.  (Otherwise every caller would
 *	  have to pass in a cookie to keep the state represented by
 *	  io_walk_link, which gets annoying.)
 */
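/*
 * For example, a complete walk of a child's parents looks like this
 * (sketch only):
 *
 *	zio_t *pio;
 *	while ((pio = zio_walk_parents(cio)) != NULL) {
 *		... examine pio ...
 *	}
 *
 * Stopping early would leave cio->io_walk_link pointing into the list,
 * which is exactly the indeterminate state the note above warns about.
 */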
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}

zio_t *
zio_walk_children(zio_t *pio)
{
	zio_link_t *zl = pio->io_walk_link;
	list_t *cl = &pio->io_child_list;

	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
	pio->io_walk_link = zl;

	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_parent == pio);
	return (zl->zl_child);
}

zio_t *
zio_unique_parent(zio_t *cio)
{
	zio_t *pio = zio_walk_parents(cio);

	VERIFY(zio_walk_parents(cio) == NULL);
	return (pio);
}

void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}

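/*
 * zio_wait_for_children() and zio_notify_parent() together implement the
 * pipeline interlock: a stage that must wait for outstanding children of a
 * given type backs io_stage up one step, records the counter it is waiting
 * on in io_stall, and stops executing.  When the last such child completes,
 * zio_notify_parent() clears io_stall and re-dispatches the parent through
 * zio_execute(), which re-runs the waiting stage.
 */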
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}

static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}

static void
zio_inherit_child_errors(zio_t *zio, enum zio_child c)
{
	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
		zio->io_error = zio->io_child_error[c];
}

/*
 * ==========================================================================
 * Create the various types of I/O (read, write, free, etc)
 * ==========================================================================
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	}

	return (zio);
}

static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}

zio_t *
zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
    void *private, enum zio_flag flags)
{
	zio_t *zio;

	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);

	return (zio);
}

zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
{
	return (zio_null(NULL, spa, NULL, done, private, flags));
}

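/*
 * A typical pattern for issuing several dependent I/Os is to hang them off
 * a root zio and wait for the whole tree at once; roughly (sketch only,
 * error handling omitted, 'bp', 'buf', 'size' and 'zb' supplied by the
 * caller):
 *
 *	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	zio_nowait(zio_read(rio, spa, bp, buf, size, NULL, NULL,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, zb));
 *	error = zio_wait(rio);
 *
 * zio_wait() on the root returns only after every child has completed.
 */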
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}

zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
{
	zio_t *zio;

	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	return (zio);
}

zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
    uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
{
	zio_t *zio;

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);

	return (zio);
}

void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync() keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}

void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately.  Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
	}
}

zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	dprintf_bp(bp, "freeing in txg %llu, pass %u",
	    (longlong_t)txg, spa->spa_sync_pass);

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}

zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}

zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, flags));
	}

	return (zio);
}

zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}

zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_eck) {
		/*
		 * zec checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}

/*
 * Create a child I/O to do some work for us.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}

zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
    int type, zio_priority_t priority, enum zio_flag flags,
    zio_done_func_t *done, void *private)
{
	zio_t *zio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
	    data, size, done, private, type, priority,
	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
	    vd, offset, NULL,
	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);

	return (zio);
}

void
zio_flush(zio_t *zio, vdev_t *vd)
{
	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
	    NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
}

void
zio_shrink(zio_t *zio, uint64_t size)
{
	ASSERT(zio->io_executor == NULL);
	ASSERT(zio->io_orig_size == zio->io_size);
	ASSERT(size <= zio->io_size);

	/*
	 * We don't shrink for raidz because of problems with the
	 * reconstruction when reading back less than the block size.
	 * Note, BP_IS_RAIDZ() assumes no compression.
	 */
	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	if (!BP_IS_RAIDZ(zio->io_bp))
		zio->io_orig_size = zio->io_size = size;
}

/*
 * ==========================================================================
 * Prepare to read and write logical blocks
 * ==========================================================================
 */

static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else {
			ASSERT(psize < lsize);
			zio_push_transform(zio, cbuf, psize, lsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
	    BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		if (zio->io_bp_orig.blk_birth != 0 &&
		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_BIRTH(bp, zio->io_txg, 0);
		}
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_free_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
		if (BP_GET_DEDUP(bp))
			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Execute the I/O pipeline
 * ==========================================================================
 */

static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
	ASSERT(zio->io_tqent.tqent_next == NULL);
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}

static boolean_t
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
	kthread_t *executor = zio->io_executor;
	spa_t *spa = zio->io_spa;

	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
		uint_t i;
		for (i = 0; i < tqs->stqs_count; i++) {
			if (taskq_member(tqs->stqs_taskq[i], executor))
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	return (ZIO_PIPELINE_STOP);
}

void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Execute the I/O pipeline until one of the following occurs:
 *
 *	(1) the I/O completes
 *	(2) the pipeline stalls waiting for dependent child I/Os
 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
 *	(4) the I/O is delegated by vdev-level caching or aggregation
 *	(5) the I/O is deferred due to vdev-level queueing
 *	(6) the I/O is handed off to another thread.
 *
 * In all cases, the pipeline stops whenever there's no CPU work; it never
 * burns a thread in cv_wait().
 *
 * There's no locking on io_stage because there's no legitimate way
 * for multiple threads to be attempting to process the same I/O.
 */
static zio_pipe_stage_t *zio_pipeline[];

void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

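		/*
		 * Pipeline stages are one-hot bits (hence the ISP2 assertion
		 * above); advance to the next stage that is actually present
		 * in this zio's pipeline mask.
		 */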
		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[highbit(stage) - 1](zio);

		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}

/*
 * ==========================================================================
 * Initiate I/O, either sync or async
 * ==========================================================================
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}

void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root, zio);
	}

	zio_execute(zio);
}

/*
 * ==========================================================================
 * Reexecute or suspend/resume failed I/O
 * ==========================================================================
 */

static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}

void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}

int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}

void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}

/*
 * ==========================================================================
 * Gang blocks.
 *
 * A gang block is a collection of small blocks that looks to the DMU
 * like one large block.  When zio_dva_allocate() cannot find a block
 * of the requested size, due to either severe fragmentation or the pool
 * being nearly full, it calls zio_write_gang_block() to construct the
 * block from smaller fragments.
 *
 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
 * an indirect block: it's an array of block pointers.  It consumes
 * only one sector and hence is allocatable regardless of fragmentation.
 * The gang header's bps point to its gang members, which hold the data.
 *
 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
 * as the verifier to ensure uniqueness of the SHA256 checksum.
 * Critically, the gang block bp's blk_cksum is the checksum of the data,
 * not the gang header.  This ensures that data block signatures (needed for
 * deduplication) are independent of how the block is physically stored.
 *
 * Gang blocks can be nested: a gang member may itself be a gang block.
 * Thus every gang block is a tree in which root and all interior nodes are
 * gang headers, and the leaves are normal blocks that contain user data.
 * The root of the gang tree is called the gang leader.
 *
 * To perform any operation (read, rewrite, free, claim) on a gang block,
 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
 * in the io_gang_tree field of the original logical i/o by recursively
 * reading the gang leader and all gang headers below it.  This yields
 * an in-core tree containing the contents of every gang header and the
 * bps for every constituent of the gang block.
 *
 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
 * of the gang header plus zio_checksum_compute() of the data to update the
 * gang header's blk_cksum as described above.
 *
 * The two-phase assemble/issue model solves the problem of partial failure --
 * what if you'd freed part of a gang block but then couldn't read the
 * gang header for another part?  Assembling the entire gang tree first
 * ensures that all the necessary gang header I/O has succeeded before
 * starting the actual work of free, claim, or write.  Once the gang tree
 * is assembled, free and claim are in-memory operations that cannot fail.
 *
 * In the event that a gang write fails, zio_dva_unallocate() walks the
 * gang tree to immediately free (i.e. insert back into the space map)
 * everything we've allocated.  This ensures that we don't get ENOSPC
 * errors during repeated suspend/resume cycles due to a flaky device.
 *
 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
 * the gang tree, we won't modify the block, so we can safely defer the free
 * (knowing that the block is still intact).  If we *can* assemble the gang
 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
 * each constituent bp and we can allocate a new block on the next sync pass.
 *
 * In all cases, the gang tree allows complete recovery from partial failure.
 * ==========================================================================
 */

static zio_t *
zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	if (gn != NULL)
		return (pio);

	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
	    &pio->io_bookmark));
}

zio_t *
zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	zio_t *zio;

	if (gn != NULL) {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
		/*
		 * As we rewrite each gang header, the pipeline will compute
		 * a new gang block header checksum for it; but no one will
		 * compute a new data checksum, so we do that here.  The one
		 * exception is the gang leader: the pipeline already computed
		 * its data checksum because that stage precedes gang assembly.
		 * (Presently, nothing actually uses interior data checksums;
		 * this is just good hygiene.)
		 */
		if (gn != pio->io_gang_leader->io_gang_tree) {
			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
			    data, BP_GET_PSIZE(bp));
		}
		/*
		 * If we are here to damage data for testing purposes,
		 * leave the GBH alone so that we can detect the damage.
		 */
		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
	} else {
		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
	}

	return (zio);
}

/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    ZIO_GANG_CHILD_FLAGS(pio)));
}

/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}

static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};

static void zio_gang_tree_assemble_done(zio_t *zio);

static zio_gang_node_t *
zio_gang_node_alloc(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn;

	ASSERT(*gnpp == NULL);

	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
	*gnpp = gn;

	return (gn);
}

static void
zio_gang_node_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		ASSERT(gn->gn_child[g] == NULL);

	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
	kmem_free(gn, sizeof (*gn));
	*gnpp = NULL;
}

static void
zio_gang_tree_free(zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = *gnpp;

	if (gn == NULL)
		return;

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
		zio_gang_tree_free(&gn->gn_child[g]);

	zio_gang_node_free(gnpp);
}

static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}

static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}

static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	if (gn == gio->io_gang_tree)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}

static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}

static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}

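/*
 * Turn a write that could not be satisfied by a single allocation into a
 * gang write: allocate a one-sector gang header (with one more copy than
 * the data, capped at spa_max_replication()), carve the payload into up
 * to SPA_GBH_NBLKPTRS child writes, and reduce the parent's pipeline to
 * the interlock stages so it simply waits for those children.
 */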
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = B_FALSE;
		zp.zp_dedup_verify = B_FALSE;
		zp.zp_nopwrite = B_FALSE;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * The zio_nop_write stage in the pipeline determines if allocating
 * a new bp is necessary.  By leveraging a cryptographically secure checksum,
 * such as SHA256, we can compare the checksums of the new data and the old
 * to determine if allocating a new block is required.  The nopwrite
 * feature can handle writes in either syncing or open context (i.e. zil
 * writes) and as a result is mutually exclusive with dedup.
 */
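/*
 * For example, rewriting a block whose contents have not changed, with a
 * dedup-strength checksum and nopwrite enabled, keeps the original bp:
 * zio_nop_write() copies bp_orig back into io_bp, sets ZIO_FLAG_NOPWRITE,
 * and collapses the pipeline to the interlock stages, so no new DVA is
 * allocated and no device I/O is issued for the block.
 */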
1877 */ 1878 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { 1879 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup); 1880 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); 1881 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); 1882 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); 1883 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, 1884 sizeof (uint64_t)) == 0); 1885 1886 *bp = *bp_orig; 1887 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 1888 zio->io_flags |= ZIO_FLAG_NOPWRITE; 1889 } 1890 1891 return (ZIO_PIPELINE_CONTINUE); 1892 } 1893 1894 /* 1895 * ========================================================================== 1896 * Dedup 1897 * ========================================================================== 1898 */ 1899 static void 1900 zio_ddt_child_read_done(zio_t *zio) 1901 { 1902 blkptr_t *bp = zio->io_bp; 1903 ddt_entry_t *dde = zio->io_private; 1904 ddt_phys_t *ddp; 1905 zio_t *pio = zio_unique_parent(zio); 1906 1907 mutex_enter(&pio->io_lock); 1908 ddp = ddt_phys_select(dde, bp); 1909 if (zio->io_error == 0) 1910 ddt_phys_clear(ddp); /* this ddp doesn't need repair */ 1911 if (zio->io_error == 0 && dde->dde_repair_data == NULL) 1912 dde->dde_repair_data = zio->io_data; 1913 else 1914 zio_buf_free(zio->io_data, zio->io_size); 1915 mutex_exit(&pio->io_lock); 1916 } 1917 1918 static int 1919 zio_ddt_read_start(zio_t *zio) 1920 { 1921 blkptr_t *bp = zio->io_bp; 1922 1923 ASSERT(BP_GET_DEDUP(bp)); 1924 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1925 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1926 1927 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1928 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1929 ddt_entry_t *dde = ddt_repair_start(ddt, bp); 1930 ddt_phys_t *ddp = dde->dde_phys; 1931 ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); 1932 blkptr_t blk; 1933 1934 ASSERT(zio->io_vsd == NULL); 1935 zio->io_vsd = dde; 1936 1937 if (ddp_self == NULL) 1938 return (ZIO_PIPELINE_CONTINUE); 1939 1940 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1941 if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) 1942 continue; 1943 ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, 1944 &blk); 1945 zio_nowait(zio_read(zio, zio->io_spa, &blk, 1946 zio_buf_alloc(zio->io_size), zio->io_size, 1947 zio_ddt_child_read_done, dde, zio->io_priority, 1948 ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, 1949 &zio->io_bookmark)); 1950 } 1951 return (ZIO_PIPELINE_CONTINUE); 1952 } 1953 1954 zio_nowait(zio_read(zio, zio->io_spa, bp, 1955 zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, 1956 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); 1957 1958 return (ZIO_PIPELINE_CONTINUE); 1959 } 1960 1961 static int 1962 zio_ddt_read_done(zio_t *zio) 1963 { 1964 blkptr_t *bp = zio->io_bp; 1965 1966 if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) 1967 return (ZIO_PIPELINE_STOP); 1968 1969 ASSERT(BP_GET_DEDUP(bp)); 1970 ASSERT(BP_GET_PSIZE(bp) == zio->io_size); 1971 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 1972 1973 if (zio->io_child_error[ZIO_CHILD_DDT]) { 1974 ddt_t *ddt = ddt_select(zio->io_spa, bp); 1975 ddt_entry_t *dde = zio->io_vsd; 1976 if (ddt == NULL) { 1977 ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); 1978 return (ZIO_PIPELINE_CONTINUE); 1979 } 1980 if (dde == NULL) { 1981 zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; 1982 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); 1983 return (ZIO_PIPELINE_STOP); 1984 } 1985 if (dde->dde_repair_data != NULL) { 1986 bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); 1987 
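			/*
			 * The bcopy() above satisfied this read from another
			 * dedup copy of the same block, so the earlier DDT
			 * child error no longer reflects the data we are
			 * returning; clear it before completing.
			 */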
zio->io_child_error[ZIO_CHILD_DDT] = 0; 1988 } 1989 ddt_repair_done(ddt, dde); 1990 zio->io_vsd = NULL; 1991 } 1992 1993 ASSERT(zio->io_vsd == NULL); 1994 1995 return (ZIO_PIPELINE_CONTINUE); 1996 } 1997 1998 static boolean_t 1999 zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) 2000 { 2001 spa_t *spa = zio->io_spa; 2002 2003 /* 2004 * Note: we compare the original data, not the transformed data, 2005 * because when zio->io_bp is an override bp, we will not have 2006 * pushed the I/O transforms. That's an important optimization 2007 * because otherwise we'd compress/encrypt all dmu_sync() data twice. 2008 */ 2009 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2010 zio_t *lio = dde->dde_lead_zio[p]; 2011 2012 if (lio != NULL) { 2013 return (lio->io_orig_size != zio->io_orig_size || 2014 bcmp(zio->io_orig_data, lio->io_orig_data, 2015 zio->io_orig_size) != 0); 2016 } 2017 } 2018 2019 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 2020 ddt_phys_t *ddp = &dde->dde_phys[p]; 2021 2022 if (ddp->ddp_phys_birth != 0) { 2023 arc_buf_t *abuf = NULL; 2024 uint32_t aflags = ARC_WAIT; 2025 blkptr_t blk = *zio->io_bp; 2026 int error; 2027 2028 ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); 2029 2030 ddt_exit(ddt); 2031 2032 error = arc_read(NULL, spa, &blk, 2033 arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, 2034 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2035 &aflags, &zio->io_bookmark); 2036 2037 if (error == 0) { 2038 if (arc_buf_size(abuf) != zio->io_orig_size || 2039 bcmp(abuf->b_data, zio->io_orig_data, 2040 zio->io_orig_size) != 0) 2041 error = SET_ERROR(EEXIST); 2042 VERIFY(arc_buf_remove_ref(abuf, &abuf)); 2043 } 2044 2045 ddt_enter(ddt); 2046 return (error != 0); 2047 } 2048 } 2049 2050 return (B_FALSE); 2051 } 2052 2053 static void 2054 zio_ddt_child_write_ready(zio_t *zio) 2055 { 2056 int p = zio->io_prop.zp_copies; 2057 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2058 ddt_entry_t *dde = zio->io_private; 2059 ddt_phys_t *ddp = &dde->dde_phys[p]; 2060 zio_t *pio; 2061 2062 if (zio->io_error) 2063 return; 2064 2065 ddt_enter(ddt); 2066 2067 ASSERT(dde->dde_lead_zio[p] == zio); 2068 2069 ddt_phys_fill(ddp, zio->io_bp); 2070 2071 while ((pio = zio_walk_parents(zio)) != NULL) 2072 ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); 2073 2074 ddt_exit(ddt); 2075 } 2076 2077 static void 2078 zio_ddt_child_write_done(zio_t *zio) 2079 { 2080 int p = zio->io_prop.zp_copies; 2081 ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); 2082 ddt_entry_t *dde = zio->io_private; 2083 ddt_phys_t *ddp = &dde->dde_phys[p]; 2084 2085 ddt_enter(ddt); 2086 2087 ASSERT(ddp->ddp_refcnt == 0); 2088 ASSERT(dde->dde_lead_zio[p] == zio); 2089 dde->dde_lead_zio[p] = NULL; 2090 2091 if (zio->io_error == 0) { 2092 while (zio_walk_parents(zio) != NULL) 2093 ddt_phys_addref(ddp); 2094 } else { 2095 ddt_phys_clear(ddp); 2096 } 2097 2098 ddt_exit(ddt); 2099 } 2100 2101 static void 2102 zio_ddt_ditto_write_done(zio_t *zio) 2103 { 2104 int p = DDT_PHYS_DITTO; 2105 zio_prop_t *zp = &zio->io_prop; 2106 blkptr_t *bp = zio->io_bp; 2107 ddt_t *ddt = ddt_select(zio->io_spa, bp); 2108 ddt_entry_t *dde = zio->io_private; 2109 ddt_phys_t *ddp = &dde->dde_phys[p]; 2110 ddt_key_t *ddk = &dde->dde_key; 2111 2112 ddt_enter(ddt); 2113 2114 ASSERT(ddp->ddp_refcnt == 0); 2115 ASSERT(dde->dde_lead_zio[p] == zio); 2116 dde->dde_lead_zio[p] = NULL; 2117 2118 if (zio->io_error == 0) { 2119 ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); 2120 ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); 2121 ASSERT(zp->zp_copies == 
BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); 2122 if (ddp->ddp_phys_birth != 0) 2123 ddt_phys_free(ddt, ddk, ddp, zio->io_txg); 2124 ddt_phys_fill(ddp, bp); 2125 } 2126 2127 ddt_exit(ddt); 2128 } 2129 2130 static int 2131 zio_ddt_write(zio_t *zio) 2132 { 2133 spa_t *spa = zio->io_spa; 2134 blkptr_t *bp = zio->io_bp; 2135 uint64_t txg = zio->io_txg; 2136 zio_prop_t *zp = &zio->io_prop; 2137 int p = zp->zp_copies; 2138 int ditto_copies; 2139 zio_t *cio = NULL; 2140 zio_t *dio = NULL; 2141 ddt_t *ddt = ddt_select(spa, bp); 2142 ddt_entry_t *dde; 2143 ddt_phys_t *ddp; 2144 2145 ASSERT(BP_GET_DEDUP(bp)); 2146 ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); 2147 ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); 2148 2149 ddt_enter(ddt); 2150 dde = ddt_lookup(ddt, bp, B_TRUE); 2151 ddp = &dde->dde_phys[p]; 2152 2153 if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { 2154 /* 2155 * If we're using a weak checksum, upgrade to a strong checksum 2156 * and try again. If we're already using a strong checksum, 2157 * we can't resolve it, so just convert to an ordinary write. 2158 * (And automatically e-mail a paper to Nature?) 2159 */ 2160 if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { 2161 zp->zp_checksum = spa_dedup_checksum(spa); 2162 zio_pop_transforms(zio); 2163 zio->io_stage = ZIO_STAGE_OPEN; 2164 BP_ZERO(bp); 2165 } else { 2166 zp->zp_dedup = B_FALSE; 2167 } 2168 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2169 ddt_exit(ddt); 2170 return (ZIO_PIPELINE_CONTINUE); 2171 } 2172 2173 ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); 2174 ASSERT(ditto_copies < SPA_DVAS_PER_BP); 2175 2176 if (ditto_copies > ddt_ditto_copies_present(dde) && 2177 dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { 2178 zio_prop_t czp = *zp; 2179 2180 czp.zp_copies = ditto_copies; 2181 2182 /* 2183 * If we arrived here with an override bp, we won't have run 2184 * the transform stack, so we won't have the data we need to 2185 * generate a child i/o. So, toss the override bp and restart. 2186 * This is safe, because using the override bp is just an 2187 * optimization; and it's rare, so the cost doesn't matter. 
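		 * Concretely, the restart below pops any I/O transforms,
		 * rewinds io_stage to ZIO_STAGE_OPEN, restores the full
		 * ZIO_WRITE_PIPELINE, drops the override bp, and zeroes the
		 * bp, so the write is simply re-issued from scratch with the
		 * untransformed data.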
2188 */ 2189 if (zio->io_bp_override) { 2190 zio_pop_transforms(zio); 2191 zio->io_stage = ZIO_STAGE_OPEN; 2192 zio->io_pipeline = ZIO_WRITE_PIPELINE; 2193 zio->io_bp_override = NULL; 2194 BP_ZERO(bp); 2195 ddt_exit(ddt); 2196 return (ZIO_PIPELINE_CONTINUE); 2197 } 2198 2199 dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2200 zio->io_orig_size, &czp, NULL, NULL, 2201 zio_ddt_ditto_write_done, dde, zio->io_priority, 2202 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2203 2204 zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); 2205 dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; 2206 } 2207 2208 if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { 2209 if (ddp->ddp_phys_birth != 0) 2210 ddt_bp_fill(ddp, bp, txg); 2211 if (dde->dde_lead_zio[p] != NULL) 2212 zio_add_child(zio, dde->dde_lead_zio[p]); 2213 else 2214 ddt_phys_addref(ddp); 2215 } else if (zio->io_bp_override) { 2216 ASSERT(bp->blk_birth == txg); 2217 ASSERT(BP_EQUAL(bp, zio->io_bp_override)); 2218 ddt_phys_fill(ddp, bp); 2219 ddt_phys_addref(ddp); 2220 } else { 2221 cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, 2222 zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, 2223 zio_ddt_child_write_done, dde, zio->io_priority, 2224 ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); 2225 2226 zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); 2227 dde->dde_lead_zio[p] = cio; 2228 } 2229 2230 ddt_exit(ddt); 2231 2232 if (cio) 2233 zio_nowait(cio); 2234 if (dio) 2235 zio_nowait(dio); 2236 2237 return (ZIO_PIPELINE_CONTINUE); 2238 } 2239 2240 ddt_entry_t *freedde; /* for debugging */ 2241 2242 static int 2243 zio_ddt_free(zio_t *zio) 2244 { 2245 spa_t *spa = zio->io_spa; 2246 blkptr_t *bp = zio->io_bp; 2247 ddt_t *ddt = ddt_select(spa, bp); 2248 ddt_entry_t *dde; 2249 ddt_phys_t *ddp; 2250 2251 ASSERT(BP_GET_DEDUP(bp)); 2252 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2253 2254 ddt_enter(ddt); 2255 freedde = dde = ddt_lookup(ddt, bp, B_TRUE); 2256 ddp = ddt_phys_select(dde, bp); 2257 ddt_phys_decref(ddp); 2258 ddt_exit(ddt); 2259 2260 return (ZIO_PIPELINE_CONTINUE); 2261 } 2262 2263 /* 2264 * ========================================================================== 2265 * Allocate and free blocks 2266 * ========================================================================== 2267 */ 2268 static int 2269 zio_dva_allocate(zio_t *zio) 2270 { 2271 spa_t *spa = zio->io_spa; 2272 metaslab_class_t *mc = spa_normal_class(spa); 2273 blkptr_t *bp = zio->io_bp; 2274 int error; 2275 int flags = 0; 2276 2277 if (zio->io_gang_leader == NULL) { 2278 ASSERT(zio->io_child_type > ZIO_CHILD_GANG); 2279 zio->io_gang_leader = zio; 2280 } 2281 2282 ASSERT(BP_IS_HOLE(bp)); 2283 ASSERT0(BP_GET_NDVAS(bp)); 2284 ASSERT3U(zio->io_prop.zp_copies, >, 0); 2285 ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); 2286 ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); 2287 2288 /* 2289 * The dump device does not support gang blocks so allocation on 2290 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid 2291 * the "fast" gang feature. 2292 */ 2293 flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0; 2294 flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ? 
2295 METASLAB_GANG_CHILD : 0; 2296 error = metaslab_alloc(spa, mc, zio->io_size, bp, 2297 zio->io_prop.zp_copies, zio->io_txg, NULL, flags); 2298 2299 if (error) { 2300 spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, " 2301 "size %llu, error %d", spa_name(spa), zio, zio->io_size, 2302 error); 2303 if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) 2304 return (zio_write_gang_block(zio)); 2305 zio->io_error = error; 2306 } 2307 2308 return (ZIO_PIPELINE_CONTINUE); 2309 } 2310 2311 static int 2312 zio_dva_free(zio_t *zio) 2313 { 2314 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); 2315 2316 return (ZIO_PIPELINE_CONTINUE); 2317 } 2318 2319 static int 2320 zio_dva_claim(zio_t *zio) 2321 { 2322 int error; 2323 2324 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); 2325 if (error) 2326 zio->io_error = error; 2327 2328 return (ZIO_PIPELINE_CONTINUE); 2329 } 2330 2331 /* 2332 * Undo an allocation. This is used by zio_done() when an I/O fails 2333 * and we want to give back the block we just allocated. 2334 * This handles both normal blocks and gang blocks. 2335 */ 2336 static void 2337 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) 2338 { 2339 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); 2340 ASSERT(zio->io_bp_override == NULL); 2341 2342 if (!BP_IS_HOLE(bp)) 2343 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); 2344 2345 if (gn != NULL) { 2346 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { 2347 zio_dva_unallocate(zio, gn->gn_child[g], 2348 &gn->gn_gbh->zg_blkptr[g]); 2349 } 2350 } 2351 } 2352 2353 /* 2354 * Try to allocate an intent log block. Return 0 on success, errno on failure. 2355 */ 2356 int 2357 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, 2358 uint64_t size, boolean_t use_slog) 2359 { 2360 int error = 1; 2361 2362 ASSERT(txg > spa_syncing_txg(spa)); 2363 2364 /* 2365 * ZIL blocks are always contiguous (i.e. not gang blocks) so we 2366 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang" 2367 * when allocating them. 2368 */ 2369 if (use_slog) { 2370 error = metaslab_alloc(spa, spa_log_class(spa), size, 2371 new_bp, 1, txg, old_bp, 2372 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID); 2373 } 2374 2375 if (error) { 2376 error = metaslab_alloc(spa, spa_normal_class(spa), size, 2377 new_bp, 1, txg, old_bp, 2378 METASLAB_HINTBP_AVOID); 2379 } 2380 2381 if (error == 0) { 2382 BP_SET_LSIZE(new_bp, size); 2383 BP_SET_PSIZE(new_bp, size); 2384 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); 2385 BP_SET_CHECKSUM(new_bp, 2386 spa_version(spa) >= SPA_VERSION_SLIM_ZIL 2387 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); 2388 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); 2389 BP_SET_LEVEL(new_bp, 0); 2390 BP_SET_DEDUP(new_bp, 0); 2391 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); 2392 } 2393 2394 return (error); 2395 } 2396 2397 /* 2398 * Free an intent log block. 
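 * ZIL blocks are allocated contiguously by zio_alloc_zil() above and are
 * never gang blocks, so the assertions below hold and an ordinary
 * zio_free() is sufficient.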
2399 */ 2400 void 2401 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) 2402 { 2403 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); 2404 ASSERT(!BP_IS_GANG(bp)); 2405 2406 zio_free(spa, txg, bp); 2407 } 2408 2409 /* 2410 * ========================================================================== 2411 * Read and write to physical devices 2412 * ========================================================================== 2413 */ 2414 static int 2415 zio_vdev_io_start(zio_t *zio) 2416 { 2417 vdev_t *vd = zio->io_vd; 2418 uint64_t align; 2419 spa_t *spa = zio->io_spa; 2420 2421 ASSERT(zio->io_error == 0); 2422 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); 2423 2424 if (vd == NULL) { 2425 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2426 spa_config_enter(spa, SCL_ZIO, zio, RW_READER); 2427 2428 /* 2429 * The mirror_ops handle multiple DVAs in a single BP. 2430 */ 2431 return (vdev_mirror_ops.vdev_op_io_start(zio)); 2432 } 2433 2434 /* 2435 * We keep track of time-sensitive I/Os so that the scan thread 2436 * can quickly react to certain workloads. In particular, we care 2437 * about non-scrubbing, top-level reads and writes with the following 2438 * characteristics: 2439 * - synchronous writes of user data to non-slog devices 2440 * - any reads of user data 2441 * When these conditions are met, adjust the timestamp of spa_last_io 2442 * which allows the scan thread to adjust its workload accordingly. 2443 */ 2444 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && 2445 vd == vd->vdev_top && !vd->vdev_islog && 2446 zio->io_bookmark.zb_objset != DMU_META_OBJSET && 2447 zio->io_txg != spa_syncing_txg(spa)) { 2448 uint64_t old = spa->spa_last_io; 2449 uint64_t new = ddi_get_lbolt64(); 2450 if (old != new) 2451 (void) atomic_cas_64(&spa->spa_last_io, old, new); 2452 } 2453 2454 align = 1ULL << vd->vdev_top->vdev_ashift; 2455 2456 if (P2PHASE(zio->io_size, align) != 0) { 2457 uint64_t asize = P2ROUNDUP(zio->io_size, align); 2458 char *abuf = zio_buf_alloc(asize); 2459 ASSERT(vd == vd->vdev_top); 2460 if (zio->io_type == ZIO_TYPE_WRITE) { 2461 bcopy(zio->io_data, abuf, zio->io_size); 2462 bzero(abuf + zio->io_size, asize - zio->io_size); 2463 } 2464 zio_push_transform(zio, abuf, asize, asize, zio_subblock); 2465 } 2466 2467 ASSERT(P2PHASE(zio->io_offset, align) == 0); 2468 ASSERT(P2PHASE(zio->io_size, align) == 0); 2469 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); 2470 2471 /* 2472 * If this is a repair I/O, and there's no self-healing involved -- 2473 * that is, we're just resilvering what we expect to resilver -- 2474 * then don't do the I/O unless zio's txg is actually in vd's DTL. 2475 * This prevents spurious resilvering with nested replication. 2476 * For example, given a mirror of mirrors, (A+B)+(C+D), if only 2477 * A is out of date, we'll read from C+D, then use the data to 2478 * resilver A+B -- but we don't actually want to resilver B, just A. 2479 * The top-level mirror has no way to know this, so instead we just 2480 * discard unnecessary repairs as we work our way down the vdev tree. 2481 * The same logic applies to any form of nested replication: 2482 * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
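	 * "Discarding" a repair here simply means calling
	 * zio_vdev_io_bypass() below, which marks the write
	 * ZIO_FLAG_IO_BYPASS and advances it straight to the
	 * VDEV_IO_ASSESS stage without ever touching the device.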
2483 */ 2484 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && 2485 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && 2486 zio->io_txg != 0 && /* not a delegated i/o */ 2487 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { 2488 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 2489 zio_vdev_io_bypass(zio); 2490 return (ZIO_PIPELINE_CONTINUE); 2491 } 2492 2493 if (vd->vdev_ops->vdev_op_leaf && 2494 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { 2495 2496 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) 2497 return (ZIO_PIPELINE_CONTINUE); 2498 2499 if ((zio = vdev_queue_io(zio)) == NULL) 2500 return (ZIO_PIPELINE_STOP); 2501 2502 if (!vdev_accessible(vd, zio)) { 2503 zio->io_error = SET_ERROR(ENXIO); 2504 zio_interrupt(zio); 2505 return (ZIO_PIPELINE_STOP); 2506 } 2507 } 2508 2509 return (vd->vdev_ops->vdev_op_io_start(zio)); 2510 } 2511 2512 static int 2513 zio_vdev_io_done(zio_t *zio) 2514 { 2515 vdev_t *vd = zio->io_vd; 2516 vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; 2517 boolean_t unexpected_error = B_FALSE; 2518 2519 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2520 return (ZIO_PIPELINE_STOP); 2521 2522 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 2523 2524 if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { 2525 2526 vdev_queue_io_done(zio); 2527 2528 if (zio->io_type == ZIO_TYPE_WRITE) 2529 vdev_cache_write(zio); 2530 2531 if (zio_injection_enabled && zio->io_error == 0) 2532 zio->io_error = zio_handle_device_injection(vd, 2533 zio, EIO); 2534 2535 if (zio_injection_enabled && zio->io_error == 0) 2536 zio->io_error = zio_handle_label_injection(zio, EIO); 2537 2538 if (zio->io_error) { 2539 if (!vdev_accessible(vd, zio)) { 2540 zio->io_error = SET_ERROR(ENXIO); 2541 } else { 2542 unexpected_error = B_TRUE; 2543 } 2544 } 2545 } 2546 2547 ops->vdev_op_io_done(zio); 2548 2549 if (unexpected_error) 2550 VERIFY(vdev_probe(vd, zio) == NULL); 2551 2552 return (ZIO_PIPELINE_CONTINUE); 2553 } 2554 2555 /* 2556 * For non-raidz ZIOs, we can just copy aside the bad data read from the 2557 * disk, and use that to finish the checksum ereport later. 2558 */ 2559 static void 2560 zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, 2561 const void *good_buf) 2562 { 2563 /* no processing needed */ 2564 zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); 2565 } 2566 2567 /*ARGSUSED*/ 2568 void 2569 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) 2570 { 2571 void *buf = zio_buf_alloc(zio->io_size); 2572 2573 bcopy(zio->io_data, buf, zio->io_size); 2574 2575 zcr->zcr_cbinfo = zio->io_size; 2576 zcr->zcr_cbdata = buf; 2577 zcr->zcr_finish = zio_vsd_default_cksum_finish; 2578 zcr->zcr_free = zio_buf_free; 2579 } 2580 2581 static int 2582 zio_vdev_io_assess(zio_t *zio) 2583 { 2584 vdev_t *vd = zio->io_vd; 2585 2586 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) 2587 return (ZIO_PIPELINE_STOP); 2588 2589 if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) 2590 spa_config_exit(zio->io_spa, SCL_ZIO, zio); 2591 2592 if (zio->io_vsd != NULL) { 2593 zio->io_vsd_ops->vsd_free(zio); 2594 zio->io_vsd = NULL; 2595 } 2596 2597 if (zio_injection_enabled && zio->io_error == 0) 2598 zio->io_error = zio_handle_fault_injection(zio, EIO); 2599 2600 /* 2601 * If the I/O failed, determine whether we should attempt to retry it. 2602 * 2603 * On retry, we cut in line in the issue queue, since we don't want 2604 * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 
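	 * Only i/os with no vdev (vd == NULL) are retried here; the retry
	 * clears io_error, sets ZIO_FLAG_IO_RETRY (plus DONT_CACHE and
	 * DONT_AGGREGATE), rewinds io_stage so VDEV_IO_START runs again,
	 * and redispatches to the ISSUE taskq, cutting in line when
	 * zio_requeue_io_start_cut_in_line is set.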
2605 */ 2606 if (zio->io_error && vd == NULL && 2607 !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { 2608 ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ 2609 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ 2610 zio->io_error = 0; 2611 zio->io_flags |= ZIO_FLAG_IO_RETRY | 2612 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; 2613 zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; 2614 zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, 2615 zio_requeue_io_start_cut_in_line); 2616 return (ZIO_PIPELINE_STOP); 2617 } 2618 2619 /* 2620 * If we got an error on a leaf device, convert it to ENXIO 2621 * if the device is not accessible at all. 2622 */ 2623 if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && 2624 !vdev_accessible(vd, zio)) 2625 zio->io_error = SET_ERROR(ENXIO); 2626 2627 /* 2628 * If we can't write to an interior vdev (mirror or RAID-Z), 2629 * set vdev_cant_write so that we stop trying to allocate from it. 2630 */ 2631 if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && 2632 vd != NULL && !vd->vdev_ops->vdev_op_leaf) { 2633 vd->vdev_cant_write = B_TRUE; 2634 } 2635 2636 if (zio->io_error) 2637 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2638 2639 if (vd != NULL && vd->vdev_ops->vdev_op_leaf && 2640 zio->io_physdone != NULL) { 2641 ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); 2642 ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); 2643 zio->io_physdone(zio->io_logical); 2644 } 2645 2646 return (ZIO_PIPELINE_CONTINUE); 2647 } 2648 2649 void 2650 zio_vdev_io_reissue(zio_t *zio) 2651 { 2652 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2653 ASSERT(zio->io_error == 0); 2654 2655 zio->io_stage >>= 1; 2656 } 2657 2658 void 2659 zio_vdev_io_redone(zio_t *zio) 2660 { 2661 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); 2662 2663 zio->io_stage >>= 1; 2664 } 2665 2666 void 2667 zio_vdev_io_bypass(zio_t *zio) 2668 { 2669 ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); 2670 ASSERT(zio->io_error == 0); 2671 2672 zio->io_flags |= ZIO_FLAG_IO_BYPASS; 2673 zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; 2674 } 2675 2676 /* 2677 * ========================================================================== 2678 * Generate and verify checksums 2679 * ========================================================================== 2680 */ 2681 static int 2682 zio_checksum_generate(zio_t *zio) 2683 { 2684 blkptr_t *bp = zio->io_bp; 2685 enum zio_checksum checksum; 2686 2687 if (bp == NULL) { 2688 /* 2689 * This is zio_write_phys(). 2690 * We're either generating a label checksum, or none at all. 2691 */ 2692 checksum = zio->io_prop.zp_checksum; 2693 2694 if (checksum == ZIO_CHECKSUM_OFF) 2695 return (ZIO_PIPELINE_CONTINUE); 2696 2697 ASSERT(checksum == ZIO_CHECKSUM_LABEL); 2698 } else { 2699 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { 2700 ASSERT(!IO_IS_ALLOCATING(zio)); 2701 checksum = ZIO_CHECKSUM_GANG_HEADER; 2702 } else { 2703 checksum = BP_GET_CHECKSUM(bp); 2704 } 2705 } 2706 2707 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); 2708 2709 return (ZIO_PIPELINE_CONTINUE); 2710 } 2711 2712 static int 2713 zio_checksum_verify(zio_t *zio) 2714 { 2715 zio_bad_cksum_t info; 2716 blkptr_t *bp = zio->io_bp; 2717 int error; 2718 2719 ASSERT(zio->io_vd != NULL); 2720 2721 if (bp == NULL) { 2722 /* 2723 * This is zio_read_phys(). 2724 * We're either verifying a label checksum, or nothing at all. 
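		 * For ordinary block-pointer reads (bp != NULL) the expected
		 * checksum algorithm and value are taken from the bp itself,
		 * so zio_checksum_error() below needs no hint from io_prop.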
2725 */ 2726 if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) 2727 return (ZIO_PIPELINE_CONTINUE); 2728 2729 ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); 2730 } 2731 2732 if ((error = zio_checksum_error(zio, &info)) != 0) { 2733 zio->io_error = error; 2734 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 2735 zfs_ereport_start_checksum(zio->io_spa, 2736 zio->io_vd, zio, zio->io_offset, 2737 zio->io_size, NULL, &info); 2738 } 2739 } 2740 2741 return (ZIO_PIPELINE_CONTINUE); 2742 } 2743 2744 /* 2745 * Called by RAID-Z to ensure we don't compute the checksum twice. 2746 */ 2747 void 2748 zio_checksum_verified(zio_t *zio) 2749 { 2750 zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; 2751 } 2752 2753 /* 2754 * ========================================================================== 2755 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. 2756 * An error of 0 indicates success. ENXIO indicates whole-device failure, 2757 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO 2758 * indicate errors that are specific to one I/O, and most likely permanent. 2759 * Any other error is presumed to be worse because we weren't expecting it. 2760 * ========================================================================== 2761 */ 2762 int 2763 zio_worst_error(int e1, int e2) 2764 { 2765 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; 2766 int r1, r2; 2767 2768 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) 2769 if (e1 == zio_error_rank[r1]) 2770 break; 2771 2772 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) 2773 if (e2 == zio_error_rank[r2]) 2774 break; 2775 2776 return (r1 > r2 ? e1 : e2); 2777 } 2778 2779 /* 2780 * ========================================================================== 2781 * I/O completion 2782 * ========================================================================== 2783 */ 2784 static int 2785 zio_ready(zio_t *zio) 2786 { 2787 blkptr_t *bp = zio->io_bp; 2788 zio_t *pio, *pio_next; 2789 2790 if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || 2791 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) 2792 return (ZIO_PIPELINE_STOP); 2793 2794 if (zio->io_ready) { 2795 ASSERT(IO_IS_ALLOCATING(zio)); 2796 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || 2797 (zio->io_flags & ZIO_FLAG_NOPWRITE)); 2798 ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); 2799 2800 zio->io_ready(zio); 2801 } 2802 2803 if (bp != NULL && bp != &zio->io_bp_copy) 2804 zio->io_bp_copy = *bp; 2805 2806 if (zio->io_error) 2807 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; 2808 2809 mutex_enter(&zio->io_lock); 2810 zio->io_state[ZIO_WAIT_READY] = 1; 2811 pio = zio_walk_parents(zio); 2812 mutex_exit(&zio->io_lock); 2813 2814 /* 2815 * As we notify zio's parents, new parents could be added. 2816 * New parents go to the head of zio's io_parent_list, however, 2817 * so we will (correctly) not notify them. The remainder of zio's 2818 * io_parent_list, from 'pio_next' onward, cannot change because 2819 * all parents must wait for us to be done before they can be done. 
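	 * (A parent that attaches after this point picks up our current
	 * io_state in zio_add_child(), so missing this READY notification
	 * does not leave it waiting on us.)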
2820 */ 2821 for (; pio != NULL; pio = pio_next) { 2822 pio_next = zio_walk_parents(zio); 2823 zio_notify_parent(pio, zio, ZIO_WAIT_READY); 2824 } 2825 2826 if (zio->io_flags & ZIO_FLAG_NODATA) { 2827 if (BP_IS_GANG(bp)) { 2828 zio->io_flags &= ~ZIO_FLAG_NODATA; 2829 } else { 2830 ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); 2831 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; 2832 } 2833 } 2834 2835 if (zio_injection_enabled && 2836 zio->io_spa->spa_syncing_txg == zio->io_txg) 2837 zio_handle_ignored_writes(zio); 2838 2839 return (ZIO_PIPELINE_CONTINUE); 2840 } 2841 2842 static int 2843 zio_done(zio_t *zio) 2844 { 2845 spa_t *spa = zio->io_spa; 2846 zio_t *lio = zio->io_logical; 2847 blkptr_t *bp = zio->io_bp; 2848 vdev_t *vd = zio->io_vd; 2849 uint64_t psize = zio->io_size; 2850 zio_t *pio, *pio_next; 2851 2852 /* 2853 * If our children haven't all completed, 2854 * wait for them and then repeat this pipeline stage. 2855 */ 2856 if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || 2857 zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || 2858 zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || 2859 zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) 2860 return (ZIO_PIPELINE_STOP); 2861 2862 for (int c = 0; c < ZIO_CHILD_TYPES; c++) 2863 for (int w = 0; w < ZIO_WAIT_TYPES; w++) 2864 ASSERT(zio->io_children[c][w] == 0); 2865 2866 if (bp != NULL) { 2867 ASSERT(bp->blk_pad[0] == 0); 2868 ASSERT(bp->blk_pad[1] == 0); 2869 ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || 2870 (bp == zio_unique_parent(zio)->io_bp)); 2871 if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && 2872 zio->io_bp_override == NULL && 2873 !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { 2874 ASSERT(!BP_SHOULD_BYTESWAP(bp)); 2875 ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); 2876 ASSERT(BP_COUNT_GANG(bp) == 0 || 2877 (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); 2878 } 2879 if (zio->io_flags & ZIO_FLAG_NOPWRITE) 2880 VERIFY(BP_EQUAL(bp, &zio->io_bp_orig)); 2881 } 2882 2883 /* 2884 * If there were child vdev/gang/ddt errors, they apply to us now. 2885 */ 2886 zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); 2887 zio_inherit_child_errors(zio, ZIO_CHILD_GANG); 2888 zio_inherit_child_errors(zio, ZIO_CHILD_DDT); 2889 2890 /* 2891 * If the I/O on the transformed data was successful, generate any 2892 * checksum reports now while we still have the transformed data. 2893 */ 2894 if (zio->io_error == 0) { 2895 while (zio->io_cksum_report != NULL) { 2896 zio_cksum_report_t *zcr = zio->io_cksum_report; 2897 uint64_t align = zcr->zcr_align; 2898 uint64_t asize = P2ROUNDUP(psize, align); 2899 char *abuf = zio->io_data; 2900 2901 if (asize != psize) { 2902 abuf = zio_buf_alloc(asize); 2903 bcopy(zio->io_data, abuf, psize); 2904 bzero(abuf + psize, asize - psize); 2905 } 2906 2907 zio->io_cksum_report = zcr->zcr_next; 2908 zcr->zcr_next = NULL; 2909 zcr->zcr_finish(zcr, abuf); 2910 zfs_ereport_free_checksum(zcr); 2911 2912 if (asize != psize) 2913 zio_buf_free(abuf, asize); 2914 } 2915 } 2916 2917 zio_pop_transforms(zio); /* note: may set zio->io_error */ 2918 2919 vdev_stat_update(zio, psize); 2920 2921 if (zio->io_error) { 2922 /* 2923 * If this I/O is attached to a particular vdev, 2924 * generate an error message describing the I/O failure 2925 * at the block level. We ignore these errors if the 2926 * device is currently unavailable. 
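		 * Checksum failures (ECKSUM) are excluded below because they
		 * are reported through the dedicated checksum-ereport path
		 * (zfs_ereport_start_checksum() and the zio_cksum_report
		 * callbacks in this function) rather than as generic I/O
		 * ereports.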
2927 */ 2928 if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) 2929 zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); 2930 2931 if ((zio->io_error == EIO || !(zio->io_flags & 2932 (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && 2933 zio == lio) { 2934 /* 2935 * For logical I/O requests, tell the SPA to log the 2936 * error and generate a logical data ereport. 2937 */ 2938 spa_log_error(spa, zio); 2939 zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, 2940 0, 0); 2941 } 2942 } 2943 2944 if (zio->io_error && zio == lio) { 2945 /* 2946 * Determine whether zio should be reexecuted. This will 2947 * propagate all the way to the root via zio_notify_parent(). 2948 */ 2949 ASSERT(vd == NULL && bp != NULL); 2950 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 2951 2952 if (IO_IS_ALLOCATING(zio) && 2953 !(zio->io_flags & ZIO_FLAG_CANFAIL)) { 2954 if (zio->io_error != ENOSPC) 2955 zio->io_reexecute |= ZIO_REEXECUTE_NOW; 2956 else 2957 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2958 } 2959 2960 if ((zio->io_type == ZIO_TYPE_READ || 2961 zio->io_type == ZIO_TYPE_FREE) && 2962 !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && 2963 zio->io_error == ENXIO && 2964 spa_load_state(spa) == SPA_LOAD_NONE && 2965 spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) 2966 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2967 2968 if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) 2969 zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; 2970 2971 /* 2972 * Here is a possibly good place to attempt to do 2973 * either combinatorial reconstruction or error correction 2974 * based on checksums. It also might be a good place 2975 * to send out preliminary ereports before we suspend 2976 * processing. 2977 */ 2978 } 2979 2980 /* 2981 * If there were logical child errors, they apply to us now. 2982 * We defer this until now to avoid conflating logical child 2983 * errors with errors that happened to the zio itself when 2984 * updating vdev stats and reporting FMA events above. 2985 */ 2986 zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); 2987 2988 if ((zio->io_error || zio->io_reexecute) && 2989 IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && 2990 !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) 2991 zio_dva_unallocate(zio, zio->io_gang_tree, bp); 2992 2993 zio_gang_tree_free(&zio->io_gang_tree); 2994 2995 /* 2996 * Godfather I/Os should never suspend. 2997 */ 2998 if ((zio->io_flags & ZIO_FLAG_GODFATHER) && 2999 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) 3000 zio->io_reexecute = 0; 3001 3002 if (zio->io_reexecute) { 3003 /* 3004 * This is a logical I/O that wants to reexecute. 3005 * 3006 * Reexecute is top-down. When an i/o fails, if it's not 3007 * the root, it simply notifies its parent and sticks around. 3008 * The parent, seeing that it still has children in zio_done(), 3009 * does the same. This percolates all the way up to the root. 3010 * The root i/o will reexecute or suspend the entire tree. 3011 * 3012 * This approach ensures that zio_reexecute() honors 3013 * all the original i/o dependency relationships, e.g. 3014 * parents not executing until children are ready. 3015 */ 3016 ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); 3017 3018 zio->io_gang_leader = NULL; 3019 3020 mutex_enter(&zio->io_lock); 3021 zio->io_state[ZIO_WAIT_DONE] = 1; 3022 mutex_exit(&zio->io_lock); 3023 3024 /* 3025 * "The Godfather" I/O monitors its children but is 3026 * not a true parent to them. 
It will track them through 3027 * the pipeline but severs its ties whenever they get into 3028 * trouble (e.g. suspended). This allows "The Godfather" 3029 * I/O to return status without blocking. 3030 */ 3031 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3032 zio_link_t *zl = zio->io_walk_link; 3033 pio_next = zio_walk_parents(zio); 3034 3035 if ((pio->io_flags & ZIO_FLAG_GODFATHER) && 3036 (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { 3037 zio_remove_child(pio, zio, zl); 3038 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3039 } 3040 } 3041 3042 if ((pio = zio_unique_parent(zio)) != NULL) { 3043 /* 3044 * We're not a root i/o, so there's nothing to do 3045 * but notify our parent. Don't propagate errors 3046 * upward since we haven't permanently failed yet. 3047 */ 3048 ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); 3049 zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; 3050 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3051 } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { 3052 /* 3053 * We'd fail again if we reexecuted now, so suspend 3054 * until conditions improve (e.g. device comes online). 3055 */ 3056 zio_suspend(spa, zio); 3057 } else { 3058 /* 3059 * Reexecution is potentially a huge amount of work. 3060 * Hand it off to the otherwise-unused claim taskq. 3061 */ 3062 ASSERT(zio->io_tqent.tqent_next == NULL); 3063 spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM, 3064 ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio, 3065 0, &zio->io_tqent); 3066 } 3067 return (ZIO_PIPELINE_STOP); 3068 } 3069 3070 ASSERT(zio->io_child_count == 0); 3071 ASSERT(zio->io_reexecute == 0); 3072 ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); 3073 3074 /* 3075 * Report any checksum errors, since the I/O is complete. 3076 */ 3077 while (zio->io_cksum_report != NULL) { 3078 zio_cksum_report_t *zcr = zio->io_cksum_report; 3079 zio->io_cksum_report = zcr->zcr_next; 3080 zcr->zcr_next = NULL; 3081 zcr->zcr_finish(zcr, NULL); 3082 zfs_ereport_free_checksum(zcr); 3083 } 3084 3085 /* 3086 * It is the responsibility of the done callback to ensure that this 3087 * particular zio is no longer discoverable for adoption, and as 3088 * such, cannot acquire any new parents. 
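	 * Once the callback returns we mark ZIO_WAIT_DONE, notify and unlink
	 * every remaining parent, and then either wake a waiter in zio_wait()
	 * or destroy the zio outright; a parent acquired after this point
	 * would be left referencing freed memory.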
3089 */ 3090 if (zio->io_done) 3091 zio->io_done(zio); 3092 3093 mutex_enter(&zio->io_lock); 3094 zio->io_state[ZIO_WAIT_DONE] = 1; 3095 mutex_exit(&zio->io_lock); 3096 3097 for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { 3098 zio_link_t *zl = zio->io_walk_link; 3099 pio_next = zio_walk_parents(zio); 3100 zio_remove_child(pio, zio, zl); 3101 zio_notify_parent(pio, zio, ZIO_WAIT_DONE); 3102 } 3103 3104 if (zio->io_waiter != NULL) { 3105 mutex_enter(&zio->io_lock); 3106 zio->io_executor = NULL; 3107 cv_broadcast(&zio->io_cv); 3108 mutex_exit(&zio->io_lock); 3109 } else { 3110 zio_destroy(zio); 3111 } 3112 3113 return (ZIO_PIPELINE_STOP); 3114 } 3115 3116 /* 3117 * ========================================================================== 3118 * I/O pipeline definition 3119 * ========================================================================== 3120 */ 3121 static zio_pipe_stage_t *zio_pipeline[] = { 3122 NULL, 3123 zio_read_bp_init, 3124 zio_free_bp_init, 3125 zio_issue_async, 3126 zio_write_bp_init, 3127 zio_checksum_generate, 3128 zio_nop_write, 3129 zio_ddt_read_start, 3130 zio_ddt_read_done, 3131 zio_ddt_write, 3132 zio_ddt_free, 3133 zio_gang_assemble, 3134 zio_gang_issue, 3135 zio_dva_allocate, 3136 zio_dva_free, 3137 zio_dva_claim, 3138 zio_ready, 3139 zio_vdev_io_start, 3140 zio_vdev_io_done, 3141 zio_vdev_io_assess, 3142 zio_checksum_verify, 3143 zio_done 3144 }; 3145 3146 /* dnp is the dnode for zb1->zb_object */ 3147 boolean_t 3148 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1, 3149 const zbookmark_t *zb2) 3150 { 3151 uint64_t zb1nextL0, zb2thisobj; 3152 3153 ASSERT(zb1->zb_objset == zb2->zb_objset); 3154 ASSERT(zb2->zb_level == 0); 3155 3156 /* 3157 * A bookmark in the deadlist is considered to be after 3158 * everything else. 3159 */ 3160 if (zb2->zb_object == DMU_DEADLIST_OBJECT) 3161 return (B_TRUE); 3162 3163 /* The objset_phys_t isn't before anything. */ 3164 if (dnp == NULL) 3165 return (B_FALSE); 3166 3167 zb1nextL0 = (zb1->zb_blkid + 1) << 3168 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); 3169 3170 zb2thisobj = zb2->zb_object ? zb2->zb_object : 3171 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); 3172 3173 if (zb1->zb_object == DMU_META_DNODE_OBJECT) { 3174 uint64_t nextobj = zb1nextL0 * 3175 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; 3176 return (nextobj <= zb2thisobj); 3177 } 3178 3179 if (zb1->zb_object < zb2thisobj) 3180 return (B_TRUE); 3181 if (zb1->zb_object > zb2thisobj) 3182 return (B_FALSE); 3183 if (zb2->zb_object == DMU_META_DNODE_OBJECT) 3184 return (B_FALSE); 3185 return (zb1nextL0 <= zb2->zb_blkid); 3186 } 3187
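/*
 * For illustration, assuming dn_indblkshift == 14 (16K indirect blocks,
 * i.e. 2^(14 - SPA_BLKPTRSHIFT) == 128 block pointers each): a level-1
 * bookmark at blkid B covers level-0 blkids [B * 128, (B + 1) * 128),
 * so zb1nextL0 above works out to (B + 1) * 128, and zb1 is "before" a
 * level-0 bookmark zb2 in the same (non-meta-dnode) object exactly when
 * that first uncovered level-0 blkid is <= zb2->zb_blkid.
 */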