/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/callb.h>
#include <sys/zfeature.h>

int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
int32_t send_holes_without_birth_time = 1;

typedef struct prefetch_data {
	kmutex_t pd_mtx;
	kcondvar_t pd_cv;
	int32_t pd_bytes_fetched;
	int pd_flags;
	boolean_t pd_cancel;
	boolean_t pd_exited;
	zbookmark_phys_t pd_resume;
} prefetch_data_t;

typedef struct traverse_data {
	spa_t *td_spa;
	uint64_t td_objset;
	blkptr_t *td_rootbp;
	uint64_t td_min_txg;
	zbookmark_phys_t *td_resume;
	int td_flags;
	prefetch_data_t *td_pfd;
	boolean_t td_paused;
	uint64_t td_hole_birth_enabled_txg;
	blkptr_cb_t *td_func;
	void *td_arg;
	boolean_t td_realloc_possible;
} traverse_data_t;

static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
    const dnode_phys_t *dnp, uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
    uint64_t objset, uint64_t object);

static int
traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
    uint64_t claim_txg)
{
	traverse_data_t *td = arg;
	zbookmark_phys_t zb;

	if (BP_IS_HOLE(bp))
		return (0);

	if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
		return (-1);

	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);

	return (0);
}

static int
traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
    uint64_t claim_txg)
{
	traverse_data_t *td = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_phys_t zb;

		if (BP_IS_HOLE(bp))
			return (0);

		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
		    td->td_arg);
	}
	return (0);
}

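/*
 * Purely illustrative sketch (not part of this file, not compiled):
 * a minimal blkptr_cb_t callback of the kind passed to
 * traverse_dataset()/traverse_pool() below.  The names count_arg_t and
 * count_blocks_cb are hypothetical.  Returning 0 continues the
 * traversal; a nonzero error pauses it and records the resume
 * bookmark (see traverse_visitbp()).
 */
#if 0
typedef struct count_arg {
	uint64_t ca_blocks;	/* level-0 blocks visited so far */
} count_arg_t;

static int
count_blocks_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	count_arg_t *ca = arg;

	/* Skip dnode-level callbacks, indirect blocks, and holes. */
	if (zb->zb_level != 0 || BP_IS_HOLE(bp))
		return (0);
	ca->ca_blocks++;
	return (0);
}
#endif
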
static void
traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed; plus blocks that are already stable in read-only mode.
	 */
	if (claim_txg == 0 && spa_writeable(td->td_spa))
		return;

	zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
	    claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
	zil_free(zilog);
}

typedef enum resume_skip {
	RESUME_SKIP_ALL,
	RESUME_SKIP_NONE,
	RESUME_SKIP_CHILDREN
} resume_skip_t;

/*
 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
 * the block indicated by zb does not need to be visited at all. Returns
 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
 * resume point. This indicates that this block should be visited but not its
 * children (since they must have been visited in a previous traversal).
 * Otherwise returns RESUME_SKIP_NONE.
 */
static resume_skip_t
resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
    const zbookmark_phys_t *zb)
{
	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
		/*
		 * If we already visited this bp & everything below,
		 * don't bother doing it again.
		 */
		if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
			return (RESUME_SKIP_ALL);

		/*
		 * If we found the block we're trying to resume from, zero
		 * the bookmark out to indicate that we have resumed.
		 */
		if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
			bzero(td->td_resume, sizeof (*zb));
			if (td->td_flags & TRAVERSE_POST)
				return (RESUME_SKIP_CHILDREN);
		}
	}
	return (RESUME_SKIP_NONE);
}

static void
traverse_prefetch_metadata(traverse_data_t *td,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;

	if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
		return;
	/*
	 * If we are in the process of resuming, don't prefetch, because
	 * some children will not be needed (and in fact may have already
	 * been freed).
	 */
	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
		return;
	if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
		return;
	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
		return;
	ASSERT(!BP_IS_REDACTED(bp));

	if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
}

static boolean_t
prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
{
	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp))
		return (B_FALSE);
	return (B_TRUE);
}

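/*
 * Visit bp and, in depth-first order, everything it points to.  The
 * callback runs before and/or after the children as requested by
 * TRAVERSE_PRE/TRAVERSE_POST.  If a data prefetch thread is running,
 * first wait until it has fetched this block, which keeps the
 * prefetcher no more than zfs_pd_bytes_max bytes ahead of us.  On
 * error the resume bookmark is recorded so that a later traversal can
 * pick up where this one stopped.
 */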
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	int err = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;

	switch (resume_skip_check(td, dnp, zb)) {
	case RESUME_SKIP_ALL:
		return (0);
	case RESUME_SKIP_CHILDREN:
		goto post;
	case RESUME_SKIP_NONE:
		break;
	default:
		ASSERT(0);
	}

	if (bp->blk_birth == 0) {
		/*
		 * Since this block has a birth time of 0 it must be one of
		 * two things: a hole created before the
		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
		 * which has always been a hole in an object.
		 *
		 * If a file is written sparsely, then the unwritten parts of
		 * the file were "always holes" -- that is, they have been
		 * holes since this object was allocated.  However, we (and
		 * our callers) can not necessarily tell when an object was
		 * allocated.  Therefore, if it's possible that this object
		 * was freed and then its object number reused, we need to
		 * visit all the holes with birth==0.
		 *
		 * If it isn't possible that the object number was reused,
		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
		 * all the blocks we will visit as part of this traversal,
		 * then this hole must have always existed, so we can skip
		 * it.  We visit blocks born after (exclusive) td_min_txg.
		 *
		 * Note that the meta-dnode cannot be reallocated.
		 */
		if (!send_holes_without_birth_time &&
		    (!td->td_realloc_possible ||
		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
			return (0);
	} else if (bp->blk_birth <= td->td_min_txg) {
		return (0);
	}

	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
		uint64_t size = BP_GET_LSIZE(bp);
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_bytes_fetched >= 0);
		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
			cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_bytes_fetched -= size;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err != 0)
			goto post;
		return (0);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			goto post;
	}

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_FLAG_WAIT;
		int32_t i;
		int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
		zbookmark_phys_t *czb;

		ASSERT(!BP_IS_PROTECTED(bp));

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;

		czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);

		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			traverse_prefetch_metadata(td,
			    &((blkptr_t *)buf->b_data)[i], czb);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp,
			    &((blkptr_t *)buf->b_data)[i], czb);
			if (err != 0)
				break;
		}

		kmem_free(czb, sizeof (zbookmark_phys_t));

	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		uint32_t flags = ARC_FLAG_WAIT;
		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
		int32_t i;
		int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		dnode_phys_t *child_dnp;

		/*
		 * dnode blocks might have their bonus buffers encrypted, so
		 * we must be careful to honor TRAVERSE_NO_DECRYPT
		 */
		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err != 0)
			goto post;

		child_dnp = buf->b_data;

		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
			prefetch_dnode_metadata(td, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
			err = traverse_dnode(td, bp, &child_dnp[i],
			    zb->zb_objset, zb->zb_blkid * epb + i);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		uint32_t zio_flags = ZIO_FLAG_CANFAIL;
		arc_flags_t flags = ARC_FLAG_WAIT;
		objset_phys_t *osp;

		if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
		if (err != 0)
			goto post;

		osp = buf->b_data;
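
		/*
		 * The objset's meta-dnode and, when this objset maintains
		 * them, the user/group/project accounting dnodes are
		 * prefetched first, then traversed in turn below.
		 */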
		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		/*
		 * See the block comment above for the goal of this variable.
		 * If the maxblkid of the meta-dnode is 0, then we know that
		 * we've never had more than DNODES_PER_BLOCK objects in the
		 * dataset, which means we can't have reused any object ids.
		 */
		if (osp->os_meta_dnode.dn_maxblkid == 0)
			td->td_realloc_possible = B_FALSE;

		if (OBJSET_BUF_HAS_USERUSED(buf)) {
			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
				prefetch_dnode_metadata(td,
				    &osp->os_projectused_dnode,
				    zb->zb_objset, DMU_PROJECTUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
			    zb->zb_objset, DMU_USERUSED_OBJECT);
		}

		err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) {
			if (OBJSET_BUF_HAS_PROJECTUSED(buf))
				err = traverse_dnode(td, bp,
				    &osp->os_projectused_dnode, zb->zb_objset,
				    DMU_PROJECTUSED_OBJECT);
			if (err == 0)
				err = traverse_dnode(td, bp,
				    &osp->os_groupused_dnode, zb->zb_objset,
				    DMU_GROUPUSED_OBJECT);
			if (err == 0)
				err = traverse_dnode(td, bp,
				    &osp->os_userused_dnode, zb->zb_objset,
				    DMU_USERUSED_OBJECT);
		}
	}

	if (buf)
		arc_buf_destroy(buf, &buf);

post:
	if (err == 0 && (td->td_flags & TRAVERSE_POST))
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);

	if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
		/*
		 * Ignore this disk error as requested by the HARD flag,
		 * and continue traversal.
		 */
		err = 0;
	}

	/*
	 * If we are stopping here, set td_resume.
	 */
	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
		td->td_resume->zb_objset = zb->zb_objset;
		td->td_resume->zb_object = zb->zb_object;
		td->td_resume->zb_level = 0;
		/*
		 * If we have stopped on an indirect block (e.g. due to
		 * i/o error), we have not visited anything below it.
		 * Set the bookmark to the first level-0 block that we need
		 * to visit.  This way, the resuming code does not need to
		 * deal with resuming from indirect blocks.
		 *
		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
		 * to dereference it.
		 */
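		/*
		 * Worked example (illustrative figures, assuming 128K
		 * indirect blocks): dn_indblkshift == 17 and
		 * SPA_BLKPTRSHIFT == 7 give 2^10 == 1024 block pointers per
		 * indirect block, so pausing at a level-2 blkid of 3
		 * resumes from level-0 blkid 3 << (2 * 10) == 3145728.
		 */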
		td->td_resume->zb_blkid = zb->zb_blkid;
		if (zb->zb_level > 0) {
			td->td_resume->zb_blkid <<= zb->zb_level *
			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
		}
		td->td_paused = B_TRUE;
	}

	return (err);
}

static void
prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j;
	zbookmark_phys_t czb;

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
	}

	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
	}
}

static int
traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp,
    uint64_t objset, uint64_t object)
{
	int j, err = 0;
	zbookmark_phys_t czb;

	if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
	    object < td->td_resume->zb_object)
		return (0);

	if (td->td_flags & TRAVERSE_PRE) {
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
		if (err != 0)
			break;
	}

	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
		SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
		err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
	}

	if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
		SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
		    ZB_DNODE_BLKID);
		err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			return (err);
	}
	return (err);
}

/* ARGSUSED */
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	prefetch_data_t *pfd = arg;
	int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
	    ARC_FLAG_PRESCIENT_PREFETCH;

	ASSERT(pfd->pd_bytes_fetched >= 0);
	if (zb->zb_level == ZB_DNODE_LEVEL)
		return (0);
	if (pfd->pd_cancel)
		return (SET_ERROR(EINTR));

	if (!prefetch_needed(pfd, bp))
		return (0);

	mutex_enter(&pfd->pd_mtx);
	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
		cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx);
	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);

	if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
		zio_flags |= ZIO_FLAG_RAW;

	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
	    zio_flags, &aflags, zb);

	return (0);
}

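/*
 * Body of the data prefetch thread dispatched from traverse_impl().
 * It walks the same tree with a private copy of the traverse_data,
 * substituting traverse_prefetcher() as the callback so that reads are
 * issued ahead of the consuming traversal.
 */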
static void
traverse_prefetch_thread(void *arg)
{
	traverse_data_t *td_main = arg;
	traverse_data_t td = *td_main;
	zbookmark_phys_t czb;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	td.td_func = traverse_prefetcher;
	td.td_arg = td_main->td_pfd;
	td.td_pfd = NULL;
	td.td_resume = &td_main->td_pfd->pd_resume;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);

	mutex_enter(&td_main->td_pfd->pd_mtx);
	td_main->td_pfd->pd_exited = B_TRUE;
	cv_broadcast(&td_main->td_pfd->pd_cv);
	mutex_exit(&td_main->td_pfd->pd_mtx);
	spl_fstrans_unmark(cookie);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 */
static int
traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	traverse_data_t *td;
	prefetch_data_t *pd;
	zbookmark_phys_t *czb;
	int err;

	ASSERT(ds == NULL || objset == ds->ds_object);
	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));

	td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
	pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
	czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);

	td->td_spa = spa;
	td->td_objset = objset;
	td->td_rootbp = rootbp;
	td->td_min_txg = txg_start;
	td->td_resume = resume;
	td->td_func = func;
	td->td_arg = arg;
	td->td_pfd = pd;
	td->td_flags = flags;
	td->td_paused = B_FALSE;
	td->td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);

	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
		VERIFY(spa_feature_enabled_txg(spa,
		    SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg));
	} else {
		td->td_hole_birth_enabled_txg = UINT64_MAX;
	}

	pd->pd_flags = flags;
	if (resume != NULL)
		pd->pd_resume = *resume;
	mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);

	SET_BOOKMARK(czb, td->td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);

	/* See comment on ZIL traversal in dsl_scan_visitds. */
	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
		enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
		uint32_t flags = ARC_FLAG_WAIT;
		objset_phys_t *osp;
		arc_buf_t *buf;
		ASSERT(!BP_IS_REDACTED(rootbp));

		if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
		    BP_IS_PROTECTED(rootbp))
			zio_flags |= ZIO_FLAG_RAW;

		err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
		    &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
		if (err != 0) {
			/*
			 * If both TRAVERSE_HARD and TRAVERSE_PRE are set,
			 * continue to visitbp so that td_func can be called
			 * in pre stage, and err will reset to zero.
			 */
			if (!(td->td_flags & TRAVERSE_HARD) ||
			    !(td->td_flags & TRAVERSE_PRE))
				goto out;
		} else {
			osp = buf->b_data;
			traverse_zil(td, &osp->os_zil_header);
			arc_buf_destroy(buf, &buf);
		}
	}

	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
	    taskq_dispatch(spa->spa_prefetch_taskq, traverse_prefetch_thread,
	    td, TQ_NOQUEUE) == TASKQID_INVALID)
		pd->pd_exited = B_TRUE;

	err = traverse_visitbp(td, NULL, rootbp, czb);

	mutex_enter(&pd->pd_mtx);
	pd->pd_cancel = B_TRUE;
	cv_broadcast(&pd->pd_cv);
	while (!pd->pd_exited)
		cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
	mutex_exit(&pd->pd_mtx);
out:
	mutex_destroy(&pd->pd_mtx);
	cv_destroy(&pd->pd_cv);

	kmem_free(czb, sizeof (zbookmark_phys_t));
	kmem_free(pd, sizeof (struct prefetch_data));
	kmem_free(td, sizeof (struct traverse_data));

	return (err);
}

/*
 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
 * in syncing context).
 */
int
traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
    zbookmark_phys_t *resume,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
	    &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
}

int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
    int flags, blkptr_cb_t func, void *arg)
{
	return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
}

int
traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
    uint64_t txg_start, zbookmark_phys_t *resume, int flags,
    blkptr_cb_t func, void *arg)
{
	return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
	    blkptr, txg_start, resume, flags, func, arg));
}

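/*
 * Illustration only, continuing the hypothetical count_blocks_cb
 * sketch near the top of this file: visit every block of a dataset in
 * pre-order with prefetch enabled and report how many level-0 blocks
 * were seen.
 */
#if 0
static int
count_dataset_blocks(dsl_dataset_t *ds, uint64_t *countp)
{
	count_arg_t ca = { 0 };
	int err = traverse_dataset(ds, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    count_blocks_cb, &ca);

	*countp = ca.ca_blocks;
	return (err);
}
#endif
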
/*
 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
 */
int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;
	boolean_t hard = (flags & TRAVERSE_HARD);

	/* visit the MOS */
	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
	    txg_start, NULL, flags, func, arg);
	if (err != 0)
		return (err);

	/* visit each dataset */
	for (uint64_t obj = 1; err == 0;
	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);
		if (err != 0) {
			if (hard)
				continue;
			break;
		}

		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
			dsl_dataset_t *ds;
			uint64_t txg = txg_start;

			dsl_pool_config_enter(dp, FTAG);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			dsl_pool_config_exit(dp, FTAG);
			if (err != 0) {
				if (hard)
					continue;
				break;
			}
			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
			err = traverse_dataset(ds, txg, flags, func, arg);
			dsl_dataset_rele(ds, FTAG);
			if (err != 0)
				break;
		}
	}
	if (err == ESRCH)
		err = 0;
	return (err);
}

EXPORT_SYMBOL(traverse_dataset);
EXPORT_SYMBOL(traverse_pool);

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW,
	"Max number of bytes to prefetch");

#if defined(_KERNEL)
module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644);
MODULE_PARM_DESC(ignore_hole_birth,
	"Alias for send_holes_without_birth_time");
#endif

ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW,
	"Ignore hole_birth txg for zfs send");
/* END CSTYLED */