1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/dmu_objset.h> 28 #include <sys/dmu_traverse.h> 29 #include <sys/dsl_dataset.h> 30 #include <sys/dsl_dir.h> 31 #include <sys/dsl_pool.h> 32 #include <sys/dnode.h> 33 #include <sys/spa.h> 34 #include <sys/zio.h> 35 #include <sys/dmu_impl.h> 36 #include <sys/sa.h> 37 #include <sys/sa_impl.h> 38 #include <sys/callb.h> 39 40 int zfs_pd_blks_max = 100; 41 42 typedef struct prefetch_data { 43 kmutex_t pd_mtx; 44 kcondvar_t pd_cv; 45 int pd_blks_max; 46 int pd_blks_fetched; 47 int pd_flags; 48 boolean_t pd_cancel; 49 boolean_t pd_exited; 50 } prefetch_data_t; 51 52 typedef struct traverse_data { 53 spa_t *td_spa; 54 uint64_t td_objset; 55 blkptr_t *td_rootbp; 56 uint64_t td_min_txg; 57 zbookmark_t *td_resume; 58 int td_flags; 59 prefetch_data_t *td_pfd; 60 blkptr_cb_t *td_func; 61 void *td_arg; 62 } traverse_data_t; 63 64 static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, 65 arc_buf_t *buf, uint64_t objset, uint64_t object); 66 67 static int 68 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 69 { 70 traverse_data_t *td = arg; 71 zbookmark_t zb; 72 73 if (bp->blk_birth == 0) 74 return (0); 75 76 if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) 77 return (0); 78 79 SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 80 bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 81 82 (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg); 83 84 return (0); 85 } 86 87 static int 88 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) 89 { 90 traverse_data_t *td = arg; 91 92 if (lrc->lrc_txtype == TX_WRITE) { 93 lr_write_t *lr = (lr_write_t *)lrc; 94 blkptr_t *bp = &lr->lr_blkptr; 95 zbookmark_t zb; 96 97 if (bp->blk_birth == 0) 98 return (0); 99 100 if (claim_txg == 0 || bp->blk_birth < claim_txg) 101 return (0); 102 103 SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, 104 ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); 105 106 (void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, 107 td->td_arg); 108 } 109 return (0); 110 } 111 112 static void 113 traverse_zil(traverse_data_t *td, zil_header_t *zh) 114 { 115 uint64_t claim_txg = zh->zh_claim_txg; 116 zilog_t *zilog; 117 118 /* 119 * We only want to visit blocks that have been claimed but not yet 120 * replayed; plus, in read-only mode, blocks that are already stable. 121 */ 122 if (claim_txg == 0 && spa_writeable(td->td_spa)) 123 return; 124 125 zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); 126 127 (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, 128 claim_txg); 129 130 zil_free(zilog); 131 } 132 133 typedef enum resume_skip { 134 RESUME_SKIP_ALL, 135 RESUME_SKIP_NONE, 136 RESUME_SKIP_CHILDREN 137 } resume_skip_t; 138 139 /* 140 * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and 141 * the block indicated by zb does not need to be visited at all. Returns 142 * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the 143 * resume point. This indicates that this block should be visited but not its 144 * children (since they must have been visited in a previous traversal). 145 * Otherwise returns RESUME_SKIP_NONE. 146 */ 147 static resume_skip_t 148 resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, 149 const zbookmark_t *zb) 150 { 151 if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { 152 /* 153 * If we already visited this bp & everything below, 154 * don't bother doing it again. 155 */ 156 if (zbookmark_is_before(dnp, zb, td->td_resume)) 157 return (RESUME_SKIP_ALL); 158 159 /* 160 * If we found the block we're trying to resume from, zero 161 * the bookmark out to indicate that we have resumed. 162 */ 163 ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object); 164 if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { 165 bzero(td->td_resume, sizeof (*zb)); 166 if (td->td_flags & TRAVERSE_POST) 167 return (RESUME_SKIP_CHILDREN); 168 } 169 } 170 return (RESUME_SKIP_NONE); 171 } 172 173 static void 174 traverse_pause(traverse_data_t *td, const zbookmark_t *zb) 175 { 176 ASSERT(td->td_resume != NULL); 177 ASSERT3U(zb->zb_level, ==, 0); 178 bcopy(zb, td->td_resume, sizeof (*td->td_resume)); 179 } 180 181 static int 182 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, 183 arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) 184 { 185 zbookmark_t czb; 186 int err = 0, lasterr = 0; 187 arc_buf_t *buf = NULL; 188 prefetch_data_t *pd = td->td_pfd; 189 boolean_t hard = td->td_flags & TRAVERSE_HARD; 190 boolean_t pause = B_FALSE; 191 192 switch (resume_skip_check(td, dnp, zb)) { 193 case RESUME_SKIP_ALL: 194 return (0); 195 case RESUME_SKIP_CHILDREN: 196 goto post; 197 case RESUME_SKIP_NONE: 198 break; 199 default: 200 ASSERT(0); 201 } 202 203 if (BP_IS_HOLE(bp)) { 204 err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp, 205 td->td_arg); 206 return (err); 207 } 208 209 if (bp->blk_birth <= td->td_min_txg) 210 return (0); 211 212 if (pd && !pd->pd_exited && 213 ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || 214 BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { 215 mutex_enter(&pd->pd_mtx); 216 ASSERT(pd->pd_blks_fetched >= 0); 217 while (pd->pd_blks_fetched == 0 && !pd->pd_exited) 218 cv_wait(&pd->pd_cv, &pd->pd_mtx); 219 pd->pd_blks_fetched--; 220 cv_broadcast(&pd->pd_cv); 221 mutex_exit(&pd->pd_mtx); 222 } 223 224 if (td->td_flags & TRAVERSE_PRE) { 225 err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, 226 td->td_arg); 227 if (err == TRAVERSE_VISIT_NO_CHILDREN) 228 return (0); 229 if (err == ERESTART) 230 pause = B_TRUE; /* handle pausing at a common point */ 231 if (err != 0) 232 goto post; 233 } 234 235 if (BP_GET_LEVEL(bp) > 0) { 236 uint32_t flags = ARC_WAIT; 237 int i; 238 blkptr_t *cbp; 239 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 240 241 err = dsl_read(NULL, td->td_spa, bp, pbuf, 242 arc_getbuf_func, &buf, 243 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 244 if (err) 245 return (err); 246 247 /* recursively visitbp() blocks below this */ 248 cbp = buf->b_data; 249 for (i = 0; i < epb; i++, cbp++) { 250 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 251 zb->zb_level - 1, 252 zb->zb_blkid * epb + i); 253 err = traverse_visitbp(td, dnp, buf, cbp, &czb); 254 if (err) { 255 if (!hard) 256 break; 257 lasterr = err; 258 } 259 } 260 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 261 uint32_t flags = ARC_WAIT; 262 int i; 263 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 264 265 err = dsl_read(NULL, td->td_spa, bp, pbuf, 266 arc_getbuf_func, &buf, 267 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 268 if (err) 269 return (err); 270 271 /* recursively visitbp() blocks below this */ 272 dnp = buf->b_data; 273 for (i = 0; i < epb; i++, dnp++) { 274 err = traverse_dnode(td, dnp, buf, zb->zb_objset, 275 zb->zb_blkid * epb + i); 276 if (err) { 277 if (!hard) 278 break; 279 lasterr = err; 280 } 281 } 282 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 283 uint32_t flags = ARC_WAIT; 284 objset_phys_t *osp; 285 dnode_phys_t *dnp; 286 287 err = dsl_read_nolock(NULL, td->td_spa, bp, 288 arc_getbuf_func, &buf, 289 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 290 if (err) 291 return (err); 292 293 osp = buf->b_data; 294 dnp = &osp->os_meta_dnode; 295 err = traverse_dnode(td, dnp, buf, zb->zb_objset, 296 DMU_META_DNODE_OBJECT); 297 if (err && hard) { 298 lasterr = err; 299 err = 0; 300 } 301 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { 302 dnp = &osp->os_userused_dnode; 303 err = traverse_dnode(td, dnp, buf, zb->zb_objset, 304 DMU_USERUSED_OBJECT); 305 } 306 if (err && hard) { 307 lasterr = err; 308 err = 0; 309 } 310 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { 311 dnp = &osp->os_groupused_dnode; 312 err = traverse_dnode(td, dnp, buf, zb->zb_objset, 313 DMU_GROUPUSED_OBJECT); 314 } 315 } 316 317 if (buf) 318 (void) arc_buf_remove_ref(buf, &buf); 319 320 post: 321 if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) { 322 err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp, 323 td->td_arg); 324 if (err == ERESTART) 325 pause = B_TRUE; 326 } 327 328 if (pause && td->td_resume != NULL) { 329 ASSERT3U(err, ==, ERESTART); 330 ASSERT(!hard); 331 traverse_pause(td, zb); 332 } 333 334 return (err != 0 ? err : lasterr); 335 } 336 337 static int 338 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, 339 arc_buf_t *buf, uint64_t objset, uint64_t object) 340 { 341 int j, err = 0, lasterr = 0; 342 zbookmark_t czb; 343 boolean_t hard = (td->td_flags & TRAVERSE_HARD); 344 345 for (j = 0; j < dnp->dn_nblkptr; j++) { 346 SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); 347 err = traverse_visitbp(td, dnp, buf, 348 (blkptr_t *)&dnp->dn_blkptr[j], &czb); 349 if (err) { 350 if (!hard) 351 break; 352 lasterr = err; 353 } 354 } 355 356 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 357 SET_BOOKMARK(&czb, objset, 358 object, 0, DMU_SPILL_BLKID); 359 err = traverse_visitbp(td, dnp, buf, 360 (blkptr_t *)&dnp->dn_spill, &czb); 361 if (err) { 362 if (!hard) 363 return (err); 364 lasterr = err; 365 } 366 } 367 return (err != 0 ? err : lasterr); 368 } 369 370 /* ARGSUSED */ 371 static int 372 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 373 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, 374 void *arg) 375 { 376 prefetch_data_t *pfd = arg; 377 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 378 379 ASSERT(pfd->pd_blks_fetched >= 0); 380 if (pfd->pd_cancel) 381 return (EINTR); 382 383 if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || 384 BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || 385 BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) 386 return (0); 387 388 mutex_enter(&pfd->pd_mtx); 389 while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max) 390 cv_wait(&pfd->pd_cv, &pfd->pd_mtx); 391 pfd->pd_blks_fetched++; 392 cv_broadcast(&pfd->pd_cv); 393 mutex_exit(&pfd->pd_mtx); 394 395 (void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL, 396 ZIO_PRIORITY_ASYNC_READ, 397 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 398 &aflags, zb); 399 400 return (0); 401 } 402 403 static void 404 traverse_prefetch_thread(void *arg) 405 { 406 traverse_data_t *td_main = arg; 407 traverse_data_t td = *td_main; 408 zbookmark_t czb; 409 410 td.td_func = traverse_prefetcher; 411 td.td_arg = td_main->td_pfd; 412 td.td_pfd = NULL; 413 414 SET_BOOKMARK(&czb, td.td_objset, 415 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 416 (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); 417 418 mutex_enter(&td_main->td_pfd->pd_mtx); 419 td_main->td_pfd->pd_exited = B_TRUE; 420 cv_broadcast(&td_main->td_pfd->pd_cv); 421 mutex_exit(&td_main->td_pfd->pd_mtx); 422 } 423 424 /* 425 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are 426 * in syncing context). 427 */ 428 static int 429 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, 430 uint64_t txg_start, zbookmark_t *resume, int flags, 431 blkptr_cb_t func, void *arg) 432 { 433 traverse_data_t td; 434 prefetch_data_t pd = { 0 }; 435 zbookmark_t czb; 436 int err; 437 438 ASSERT(ds == NULL || objset == ds->ds_object); 439 ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); 440 441 td.td_spa = spa; 442 td.td_objset = objset; 443 td.td_rootbp = rootbp; 444 td.td_min_txg = txg_start; 445 td.td_resume = resume; 446 td.td_func = func; 447 td.td_arg = arg; 448 td.td_pfd = &pd; 449 td.td_flags = flags; 450 451 pd.pd_blks_max = zfs_pd_blks_max; 452 pd.pd_flags = flags; 453 mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); 454 cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); 455 456 /* See comment on ZIL traversal in dsl_scan_visitds. */ 457 if (ds != NULL && !dsl_dataset_is_snapshot(ds)) { 458 objset_t *os; 459 460 err = dmu_objset_from_ds(ds, &os); 461 if (err) 462 return (err); 463 464 traverse_zil(&td, &os->os_zil_header); 465 } 466 467 if (!(flags & TRAVERSE_PREFETCH) || 468 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, 469 &td, TQ_NOQUEUE)) 470 pd.pd_exited = B_TRUE; 471 472 SET_BOOKMARK(&czb, td.td_objset, 473 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 474 err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); 475 476 mutex_enter(&pd.pd_mtx); 477 pd.pd_cancel = B_TRUE; 478 cv_broadcast(&pd.pd_cv); 479 while (!pd.pd_exited) 480 cv_wait(&pd.pd_cv, &pd.pd_mtx); 481 mutex_exit(&pd.pd_mtx); 482 483 mutex_destroy(&pd.pd_mtx); 484 cv_destroy(&pd.pd_cv); 485 486 return (err); 487 } 488 489 /* 490 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are 491 * in syncing context). 492 */ 493 int 494 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, 495 blkptr_cb_t func, void *arg) 496 { 497 return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, 498 &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg)); 499 } 500 501 int 502 traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, 503 uint64_t txg_start, zbookmark_t *resume, int flags, 504 blkptr_cb_t func, void *arg) 505 { 506 return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, 507 blkptr, txg_start, resume, flags, func, arg)); 508 } 509 510 /* 511 * NB: pool must not be changing on-disk (eg, from zdb or sync context). 512 */ 513 int 514 traverse_pool(spa_t *spa, uint64_t txg_start, int flags, 515 blkptr_cb_t func, void *arg) 516 { 517 int err, lasterr = 0; 518 uint64_t obj; 519 dsl_pool_t *dp = spa_get_dsl(spa); 520 objset_t *mos = dp->dp_meta_objset; 521 boolean_t hard = (flags & TRAVERSE_HARD); 522 523 /* visit the MOS */ 524 err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), 525 txg_start, NULL, flags, func, arg); 526 if (err) 527 return (err); 528 529 /* visit each dataset */ 530 for (obj = 1; err == 0 || (err != ESRCH && hard); 531 err = dmu_object_next(mos, &obj, FALSE, txg_start)) { 532 dmu_object_info_t doi; 533 534 err = dmu_object_info(mos, obj, &doi); 535 if (err) { 536 if (!hard) 537 return (err); 538 lasterr = err; 539 continue; 540 } 541 542 if (doi.doi_type == DMU_OT_DSL_DATASET) { 543 dsl_dataset_t *ds; 544 uint64_t txg = txg_start; 545 546 rw_enter(&dp->dp_config_rwlock, RW_READER); 547 err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); 548 rw_exit(&dp->dp_config_rwlock); 549 if (err) { 550 if (!hard) 551 return (err); 552 lasterr = err; 553 continue; 554 } 555 if (ds->ds_phys->ds_prev_snap_txg > txg) 556 txg = ds->ds_phys->ds_prev_snap_txg; 557 err = traverse_dataset(ds, txg, flags, func, arg); 558 dsl_dataset_rele(ds, FTAG); 559 if (err) { 560 if (!hard) 561 return (err); 562 lasterr = err; 563 } 564 } 565 } 566 if (err == ESRCH) 567 err = 0; 568 return (err != 0 ? err : lasterr); 569 } 570