1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/dmu_objset.h> 28 #include <sys/dmu_traverse.h> 29 #include <sys/dsl_dataset.h> 30 #include <sys/dsl_dir.h> 31 #include <sys/dsl_pool.h> 32 #include <sys/dnode.h> 33 #include <sys/spa.h> 34 #include <sys/zio.h> 35 #include <sys/dmu_impl.h> 36 #include <sys/sa.h> 37 #include <sys/sa_impl.h> 38 #include <sys/callb.h> 39 40 int zfs_pd_blks_max = 100; 41 42 typedef struct prefetch_data { 43 kmutex_t pd_mtx; 44 kcondvar_t pd_cv; 45 int pd_blks_max; 46 int pd_blks_fetched; 47 int pd_flags; 48 boolean_t pd_cancel; 49 boolean_t pd_exited; 50 } prefetch_data_t; 51 52 typedef struct traverse_data { 53 spa_t *td_spa; 54 uint64_t td_objset; 55 blkptr_t *td_rootbp; 56 uint64_t td_min_txg; 57 zbookmark_t *td_resume; 58 int td_flags; 59 prefetch_data_t *td_pfd; 60 blkptr_cb_t *td_func; 61 void *td_arg; 62 } traverse_data_t; 63 64 static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, 65 uint64_t objset, uint64_t object); 66 static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, 67 uint64_t objset, uint64_t object); 68 69 static int 70 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 71 { 72 traverse_data_t *td = arg; 73 zbookmark_t zb; 74 75 if (bp->blk_birth == 0) 76 return (0); 77 78 if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) 79 return (0); 80 81 SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 82 bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 83 84 (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); 85 86 return (0); 87 } 88 89 static int 90 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 91 { 92 traverse_data_t *td = arg; 93 94 if (lrc->lrc_txtype == TX_WRITE) { 95 lr_write_t *lr = (lr_write_t *)lrc; 96 blkptr_t *bp = &lr->lr_blkptr; 97 zbookmark_t zb; 98 99 if (bp->blk_birth == 0) 100 return (0); 101 102 if (claim_txg == 0 || bp->blk_birth < claim_txg) 103 return (0); 104 105 SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, 106 ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); 107 108 (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, 109 td->td_arg); 110 } 111 return (0); 112 } 113 114 static void 115 traverse_zil(traverse_data_t *td, zil_header_t *zh) 116 { 117 uint64_t claim_txg = zh->zh_claim_txg; 118 zilog_t *zilog; 119 120 /* 121 * We only want to visit blocks that have been claimed but not yet 122 * replayed; plus, in read-only mode, blocks that are already stable. 123 */ 124 if (claim_txg == 0 && spa_writeable(td->td_spa)) 125 return; 126 127 zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); 128 129 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, 130 claim_txg); 131 132 zil_free(zilog); 133 } 134 135 typedef enum resume_skip { 136 RESUME_SKIP_ALL, 137 RESUME_SKIP_NONE, 138 RESUME_SKIP_CHILDREN 139 } resume_skip_t; 140 141 /* 142 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and 143 * the block indicated by zb does not need to be visited at all. Returns 144 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the 145 * resume point. This indicates that this block should be visited but not its 146 * children (since they must have been visited in a previous traversal). 147 * Otherwise returns RESUME_SKIP_NONE. 148 */ 149 static resume_skip_t 150 resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, 151 const zbookmark_t *zb) 152 { 153 if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { 154 /* 155 * If we already visited this bp & everything below, 156 * don't bother doing it again. 157 */ 158 if (zbookmark_is_before(dnp, zb, td->td_resume)) 159 return (RESUME_SKIP_ALL); 160 161 /* 162 * If we found the block we're trying to resume from, zero 163 * the bookmark out to indicate that we have resumed. 164 */ 165 ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object); 166 if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { 167 bzero(td->td_resume, sizeof (*zb)); 168 if (td->td_flags & TRAVERSE_POST) 169 return (RESUME_SKIP_CHILDREN); 170 } 171 } 172 return (RESUME_SKIP_NONE); 173 } 174 175 static void 176 traverse_pause(traverse_data_t *td, const zbookmark_t *zb) 177 { 178 ASSERT(td->td_resume != NULL); 179 ASSERT0(zb->zb_level); 180 bcopy(zb, td->td_resume, sizeof (*td->td_resume)); 181 } 182 183 static void 184 traverse_prefetch_metadata(traverse_data_t *td, 185 const blkptr_t *bp, const zbookmark_t *zb) 186 { 187 uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; 188 189 if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) 190 return; 191 /* 192 * If we are in the process of resuming, don't prefetch, because 193 * some children will not be needed (and in fact may have already 194 * been freed). 195 */ 196 if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) 197 return; 198 if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) 199 return; 200 if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) 201 return; 202 203 (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, 204 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 205 } 206 207 static int 208 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, 209 const blkptr_t *bp, const zbookmark_t *zb) 210 { 211 zbookmark_t czb; 212 int err = 0, lasterr = 0; 213 arc_buf_t *buf = NULL; 214 prefetch_data_t *pd = td->td_pfd; 215 boolean_t hard = td->td_flags & TRAVERSE_HARD; 216 boolean_t pause = B_FALSE; 217 218 switch (resume_skip_check(td, dnp, zb)) { 219 case RESUME_SKIP_ALL: 220 return (0); 221 case RESUME_SKIP_CHILDREN: 222 goto post; 223 case RESUME_SKIP_NONE: 224 break; 225 default: 226 ASSERT(0); 227 } 228 229 if (BP_IS_HOLE(bp)) { 230 err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg); 231 return (err); 232 } 233 234 if (bp->blk_birth <= td->td_min_txg) 235 return (0); 236 237 if (pd && !pd->pd_exited && 238 ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || 239 BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { 240 mutex_enter(&pd->pd_mtx); 241 ASSERT(pd->pd_blks_fetched >= 0); 242 while (pd->pd_blks_fetched == 0 && !pd->pd_exited) 243 cv_wait(&pd->pd_cv, &pd->pd_mtx); 244 pd->pd_blks_fetched--; 245 cv_broadcast(&pd->pd_cv); 246 mutex_exit(&pd->pd_mtx); 247 } 248 249 if (td->td_flags & TRAVERSE_PRE) { 250 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, 251 td->td_arg); 252 if (err == TRAVERSE_VISIT_NO_CHILDREN) 253 return (0); 254 if (err == ERESTART) 255 pause = B_TRUE; /* handle pausing at a common point */ 256 if (err != 0) 257 goto post; 258 } 259 260 if (BP_GET_LEVEL(bp) > 0) { 261 uint32_t flags = ARC_WAIT; 262 int i; 263 blkptr_t *cbp; 264 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 265 266 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, 267 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 268 if (err) 269 return (err); 270 cbp = buf->b_data; 271 272 for (i = 0; i < epb; i++) { 273 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 274 zb->zb_level - 1, 275 zb->zb_blkid * epb + i); 276 traverse_prefetch_metadata(td, &cbp[i], &czb); 277 } 278 279 /* recursively visitbp() blocks below this */ 280 for (i = 0; i < epb; i++) { 281 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 282 zb->zb_level - 1, 283 zb->zb_blkid * epb + i); 284 err = traverse_visitbp(td, dnp, &cbp[i], &czb); 285 if (err) { 286 if (!hard) 287 break; 288 lasterr = err; 289 } 290 } 291 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 292 uint32_t flags = ARC_WAIT; 293 int i; 294 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 295 296 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, 297 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 298 if (err) 299 return (err); 300 dnp = buf->b_data; 301 302 for (i = 0; i < epb; i++) { 303 prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset, 304 zb->zb_blkid * epb + i); 305 } 306 307 /* recursively visitbp() blocks below this */ 308 for (i = 0; i < epb; i++) { 309 err = traverse_dnode(td, &dnp[i], zb->zb_objset, 310 zb->zb_blkid * epb + i); 311 if (err) { 312 if (!hard) 313 break; 314 lasterr = err; 315 } 316 } 317 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 318 uint32_t flags = ARC_WAIT; 319 objset_phys_t *osp; 320 dnode_phys_t *dnp; 321 322 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, 323 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 324 if (err) 325 return (err); 326 327 osp = buf->b_data; 328 dnp = &osp->os_meta_dnode; 329 prefetch_dnode_metadata(td, dnp, zb->zb_objset, 330 DMU_META_DNODE_OBJECT); 331 if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { 332 prefetch_dnode_metadata(td, &osp->os_userused_dnode, 333 zb->zb_objset, DMU_USERUSED_OBJECT); 334 prefetch_dnode_metadata(td, &osp->os_groupused_dnode, 335 zb->zb_objset, DMU_USERUSED_OBJECT); 336 } 337 338 err = traverse_dnode(td, dnp, zb->zb_objset, 339 DMU_META_DNODE_OBJECT); 340 if (err && hard) { 341 lasterr = err; 342 err = 0; 343 } 344 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { 345 dnp = &osp->os_userused_dnode; 346 err = traverse_dnode(td, dnp, zb->zb_objset, 347 DMU_USERUSED_OBJECT); 348 } 349 if (err && hard) { 350 lasterr = err; 351 err = 0; 352 } 353 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { 354 dnp = &osp->os_groupused_dnode; 355 err = traverse_dnode(td, dnp, zb->zb_objset, 356 DMU_GROUPUSED_OBJECT); 357 } 358 } 359 360 if (buf) 361 (void) arc_buf_remove_ref(buf, &buf); 362 363 post: 364 if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { 365 err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); 366 if (err == ERESTART) 367 pause = B_TRUE; 368 } 369 370 if (pause && td->td_resume != NULL) { 371 ASSERT3U(err, ==, ERESTART); 372 ASSERT(!hard); 373 traverse_pause(td, zb); 374 } 375 376 return (err != 0 ? err : lasterr); 377 } 378 379 static void 380 prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, 381 uint64_t objset, uint64_t object) 382 { 383 int j; 384 zbookmark_t czb; 385 386 for (j = 0; j < dnp->dn_nblkptr; j++) { 387 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); 388 traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); 389 } 390 391 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 392 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); 393 traverse_prefetch_metadata(td, &dnp->dn_spill, &czb); 394 } 395 } 396 397 static int 398 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, 399 uint64_t objset, uint64_t object) 400 { 401 int j, err = 0, lasterr = 0; 402 zbookmark_t czb; 403 boolean_t hard = (td->td_flags & TRAVERSE_HARD); 404 405 for (j = 0; j < dnp->dn_nblkptr; j++) { 406 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); 407 err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); 408 if (err) { 409 if (!hard) 410 break; 411 lasterr = err; 412 } 413 } 414 415 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 416 SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); 417 err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); 418 if (err) { 419 if (!hard) 420 return (err); 421 lasterr = err; 422 } 423 } 424 return (err != 0 ? err : lasterr); 425 } 426 427 /* ARGSUSED */ 428 static int 429 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 430 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 431 { 432 prefetch_data_t *pfd = arg; 433 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 434 435 ASSERT(pfd->pd_blks_fetched >= 0); 436 if (pfd->pd_cancel) 437 return (EINTR); 438 439 if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || 440 BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || 441 BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) 442 return (0); 443 444 mutex_enter(&pfd->pd_mtx); 445 while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max) 446 cv_wait(&pfd->pd_cv, &pfd->pd_mtx); 447 pfd->pd_blks_fetched++; 448 cv_broadcast(&pfd->pd_cv); 449 mutex_exit(&pfd->pd_mtx); 450 451 (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 452 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); 453 454 return (0); 455 } 456 457 static void 458 traverse_prefetch_thread(void *arg) 459 { 460 traverse_data_t *td_main = arg; 461 traverse_data_t td = *td_main; 462 zbookmark_t czb; 463 464 td.td_func = traverse_prefetcher; 465 td.td_arg = td_main->td_pfd; 466 td.td_pfd = NULL; 467 468 SET_BOOKMARK(&czb, td.td_objset, 469 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 470 (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb); 471 472 mutex_enter(&td_main->td_pfd->pd_mtx); 473 td_main->td_pfd->pd_exited = B_TRUE; 474 cv_broadcast(&td_main->td_pfd->pd_cv); 475 mutex_exit(&td_main->td_pfd->pd_mtx); 476 } 477 478 /* 479 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are 480 * in syncing context). 481 */ 482 static int 483 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, 484 uint64_t txg_start, zbookmark_t *resume, int flags, 485 blkptr_cb_t func, void *arg) 486 { 487 traverse_data_t td; 488 prefetch_data_t pd = { 0 }; 489 zbookmark_t czb; 490 int err; 491 492 ASSERT(ds == NULL || objset == ds->ds_object); 493 ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); 494 495 /* 496 * The data prefetching mechanism (the prefetch thread) is incompatible 497 * with resuming from a bookmark. 498 */ 499 ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA)); 500 501 td.td_spa = spa; 502 td.td_objset = objset; 503 td.td_rootbp = rootbp; 504 td.td_min_txg = txg_start; 505 td.td_resume = resume; 506 td.td_func = func; 507 td.td_arg = arg; 508 td.td_pfd = &pd; 509 td.td_flags = flags; 510 511 pd.pd_blks_max = zfs_pd_blks_max; 512 pd.pd_flags = flags; 513 mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); 514 cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); 515 516 /* See comment on ZIL traversal in dsl_scan_visitds. */ 517 if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { 518 objset_t *os; 519 520 err = dmu_objset_from_ds(ds, &os); 521 if (err) 522 return (err); 523 524 traverse_zil(&td, &os->os_zil_header); 525 } 526 527 if (!(flags & TRAVERSE_PREFETCH_DATA) || 528 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, 529 &td, TQ_NOQUEUE)) 530 pd.pd_exited = B_TRUE; 531 532 SET_BOOKMARK(&czb, td.td_objset, 533 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 534 err = traverse_visitbp(&td, NULL, rootbp, &czb); 535 536 mutex_enter(&pd.pd_mtx); 537 pd.pd_cancel = B_TRUE; 538 cv_broadcast(&pd.pd_cv); 539 while (!pd.pd_exited) 540 cv_wait(&pd.pd_cv, &pd.pd_mtx); 541 mutex_exit(&pd.pd_mtx); 542 543 mutex_destroy(&pd.pd_mtx); 544 cv_destroy(&pd.pd_cv); 545 546 return (err); 547 } 548 549 /* 550 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are 551 * in syncing context). 552 */ 553 int 554 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, 555 blkptr_cb_t func, void *arg) 556 { 557 return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, 558 &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg)); 559 } 560 561 int 562 traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, 563 uint64_t txg_start, zbookmark_t *resume, int flags, 564 blkptr_cb_t func, void *arg) 565 { 566 return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, 567 blkptr, txg_start, resume, flags, func, arg)); 568 } 569 570 /* 571 * NB: pool must not be changing on-disk (eg, from zdb or sync context). 572 */ 573 int 574 traverse_pool(spa_t *spa, uint64_t txg_start, int flags, 575 blkptr_cb_t func, void *arg) 576 { 577 int err, lasterr = 0; 578 uint64_t obj; 579 dsl_pool_t *dp = spa_get_dsl(spa); 580 objset_t *mos = dp->dp_meta_objset; 581 boolean_t hard = (flags & TRAVERSE_HARD); 582 583 /* visit the MOS */ 584 err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), 585 txg_start, NULL, flags, func, arg); 586 if (err) 587 return (err); 588 589 /* visit each dataset */ 590 for (obj = 1; err == 0 || (err != ESRCH && hard); 591 err = dmu_object_next(mos, &obj, FALSE, txg_start)) { 592 dmu_object_info_t doi; 593 594 err = dmu_object_info(mos, obj, &doi); 595 if (err) { 596 if (!hard) 597 return (err); 598 lasterr = err; 599 continue; 600 } 601 602 if (doi.doi_type == DMU_OT_DSL_DATASET) { 603 dsl_dataset_t *ds; 604 uint64_t txg = txg_start; 605 606 rw_enter(&dp->dp_config_rwlock, RW_READER); 607 err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); 608 rw_exit(&dp->dp_config_rwlock); 609 if (err) { 610 if (!hard) 611 return (err); 612 lasterr = err; 613 continue; 614 } 615 if (ds->ds_phys->ds_prev_snap_txg > txg) 616 txg = ds->ds_phys->ds_prev_snap_txg; 617 err = traverse_dataset(ds, txg, flags, func, arg); 618 dsl_dataset_rele(ds, FTAG); 619 if (err) { 620 if (!hard) 621 return (err); 622 lasterr = err; 623 } 624 } 625 } 626 if (err == ESRCH) 627 err = 0; 628 return (err != 0 ? err : lasterr); 629 } 630