/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/callb.h>

struct prefetch_data {
	kmutex_t pd_mtx;
	kcondvar_t pd_cv;
	int pd_blks_max;
	int pd_blks_fetched;
	int pd_flags;
	boolean_t pd_cancel;
	boolean_t pd_exited;
};

struct traverse_data {
	spa_t *td_spa;
	uint64_t td_objset;
	blkptr_t *td_rootbp;
	uint64_t td_min_txg;
	int td_flags;
	struct prefetch_data *td_pfd;
	blkptr_cb_t *td_func;
	void *td_arg;
};

static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object);

/* ARGSUSED */
static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	struct traverse_data *td = arg;
	zbookmark_t zb;

	if (bp->blk_birth == 0)
		return (0);

	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
		return (0);

	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);

	return (0);
}

/* ARGSUSED */
static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	struct traverse_data *td = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_t zb;

		if (bp->blk_birth == 0)
			return (0);

		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
		    lr->lr_offset / BP_GET_LSIZE(bp));

		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
		    td->td_arg);
	}
	return (0);
}
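
/*
 * Walk the intent log for this objset: zil_parse() invokes
 * traverse_zil_block() on each log block and traverse_zil_record() on
 * each log record (only TX_WRITE records carry a block pointer), and
 * both forward the blocks they find to the caller's td_func under a
 * ZIL bookmark.
 */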
static void
traverse_zil(struct traverse_data *td, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;
	zilog_t *zilog;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed; plus, in read-only mode, blocks that are already stable.
	 */
	if (claim_txg == 0 && spa_writeable(td->td_spa))
		return;

	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);

	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
	    claim_txg);

	zil_free(zilog);
}

static int
traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
	zbookmark_t czb;
	int err = 0, lasterr = 0;
	arc_buf_t *buf = NULL;
	struct prefetch_data *pd = td->td_pfd;
	boolean_t hard = td->td_flags & TRAVERSE_HARD;

	if (bp->blk_birth == 0) {
		err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
		return (err);
	}

	if (bp->blk_birth <= td->td_min_txg)
		return (0);

	if (pd && !pd->pd_exited &&
	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_blks_fetched >= 0);
		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
			cv_wait(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_blks_fetched--;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err)
			return (err);
	}

	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, td->td_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);

		/* recursively visitbp() blocks below this */
		cbp = buf->b_data;
		for (i = 0; i < epb; i++, cbp++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
			if (err) {
				if (!hard)
					break;
				lasterr = err;
			}
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		uint32_t flags = ARC_WAIT;
		int i;
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

		err = arc_read(NULL, td->td_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);

		/* recursively visitbp() blocks below this */
		dnp = buf->b_data;
		for (i = 0; i < epb; i++, dnp++) {
			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
			    zb->zb_blkid * epb + i);
			if (err) {
				if (!hard)
					break;
				lasterr = err;
			}
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		uint32_t flags = ARC_WAIT;
		objset_phys_t *osp;
		dnode_phys_t *dnp;

		err = arc_read_nolock(NULL, td->td_spa, bp,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);

		osp = buf->b_data;
		traverse_zil(td, &osp->os_zil_header);

		dnp = &osp->os_meta_dnode;
		err = traverse_dnode(td, dnp, buf, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		if (err && hard) {
			lasterr = err;
			err = 0;
		}
		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
			dnp = &osp->os_userused_dnode;
			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
			    DMU_USERUSED_OBJECT);
		}
		if (err && hard) {
			lasterr = err;
			err = 0;
		}
		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
			dnp = &osp->os_groupused_dnode;
			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
			    DMU_GROUPUSED_OBJECT);
		}
	}

	if (buf)
		(void) arc_buf_remove_ref(buf, &buf);

	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST))
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);

	return (err != 0 ? err : lasterr);
}
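
/*
 * Visit each of a dnode's block pointers in order.  With TRAVERSE_HARD
 * set, an error is remembered in lasterr and the walk continues to the
 * remaining block pointers; otherwise the first error stops the loop.
 */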
static int
traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object)
{
	int j, err = 0, lasterr = 0;
	zbookmark_t czb;
	boolean_t hard = (td->td_flags & TRAVERSE_HARD);

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		err = traverse_visitbp(td, dnp, buf,
		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
		if (err) {
			if (!hard)
				break;
			lasterr = err;
		}
	}
	return (err != 0 ? err : lasterr);
}

/* ARGSUSED */
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct prefetch_data *pfd = arg;
	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;

	ASSERT(pfd->pd_blks_fetched >= 0);
	if (pfd->pd_cancel)
		return (EINTR);

	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
		return (0);

	mutex_enter(&pfd->pd_mtx);
	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
	pfd->pd_blks_fetched++;
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);

	(void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
	    &aflags, zb);

	return (0);
}

static void
traverse_prefetch_thread(void *arg)
{
	struct traverse_data *td_main = arg;
	struct traverse_data td = *td_main;
	zbookmark_t czb;

	td.td_func = traverse_prefetcher;
	td.td_arg = td_main->td_pfd;
	td.td_pfd = NULL;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);

	mutex_enter(&td_main->td_pfd->pd_mtx);
	td_main->td_pfd->pd_exited = B_TRUE;
	cv_broadcast(&td_main->td_pfd->pd_cv);
	mutex_exit(&td_main->td_pfd->pd_mtx);
}
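
/*
 * The prefetch thread runs the same traversal ahead of the main thread,
 * issuing speculative arc_read_nolock() calls.  The two threads are
 * coupled through prefetch_data: the prefetcher blocks once
 * pd_blks_fetched reaches pd_blks_max, and traverse_visitbp() consumes
 * one credit per prefetchable block it visits, so the prefetcher stays
 * a bounded window ahead of the consumer.
 */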
336 */ 337 static int 338 traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, 339 uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) 340 { 341 struct traverse_data td; 342 struct prefetch_data pd = { 0 }; 343 zbookmark_t czb; 344 int err; 345 346 td.td_spa = spa; 347 td.td_objset = objset; 348 td.td_rootbp = rootbp; 349 td.td_min_txg = txg_start; 350 td.td_func = func; 351 td.td_arg = arg; 352 td.td_pfd = &pd; 353 td.td_flags = flags; 354 355 pd.pd_blks_max = 100; 356 pd.pd_flags = flags; 357 mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); 358 cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); 359 360 if (!(flags & TRAVERSE_PREFETCH) || 361 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, 362 &td, TQ_NOQUEUE)) 363 pd.pd_exited = B_TRUE; 364 365 SET_BOOKMARK(&czb, objset, 366 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 367 err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); 368 369 mutex_enter(&pd.pd_mtx); 370 pd.pd_cancel = B_TRUE; 371 cv_broadcast(&pd.pd_cv); 372 while (!pd.pd_exited) 373 cv_wait(&pd.pd_cv, &pd.pd_mtx); 374 mutex_exit(&pd.pd_mtx); 375 376 mutex_destroy(&pd.pd_mtx); 377 cv_destroy(&pd.pd_cv); 378 379 return (err); 380 } 381 382 /* 383 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are 384 * in syncing context). 385 */ 386 int 387 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, 388 blkptr_cb_t func, void *arg) 389 { 390 return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object, 391 &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); 392 } 393 394 /* 395 * NB: pool must not be changing on-disk (eg, from zdb or sync context). 396 */ 397 int 398 traverse_pool(spa_t *spa, uint64_t txg_start, int flags, 399 blkptr_cb_t func, void *arg) 400 { 401 int err, lasterr = 0; 402 uint64_t obj; 403 dsl_pool_t *dp = spa_get_dsl(spa); 404 objset_t *mos = dp->dp_meta_objset; 405 boolean_t hard = (flags & TRAVERSE_HARD); 406 407 /* visit the MOS */ 408 err = traverse_impl(spa, 0, spa_get_rootblkptr(spa), 409 txg_start, flags, func, arg); 410 if (err) 411 return (err); 412 413 /* visit each dataset */ 414 for (obj = 1; err == 0 || (err != ESRCH && hard); 415 err = dmu_object_next(mos, &obj, FALSE, txg_start)) { 416 dmu_object_info_t doi; 417 418 err = dmu_object_info(mos, obj, &doi); 419 if (err) { 420 if (!hard) 421 return (err); 422 lasterr = err; 423 continue; 424 } 425 426 if (doi.doi_type == DMU_OT_DSL_DATASET) { 427 dsl_dataset_t *ds; 428 uint64_t txg = txg_start; 429 430 rw_enter(&dp->dp_config_rwlock, RW_READER); 431 err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); 432 rw_exit(&dp->dp_config_rwlock); 433 if (err) { 434 if (!hard) 435 return (err); 436 lasterr = err; 437 continue; 438 } 439 if (ds->ds_phys->ds_prev_snap_txg > txg) 440 txg = ds->ds_phys->ds_prev_snap_txg; 441 err = traverse_dataset(ds, txg, flags, func, arg); 442 dsl_dataset_rele(ds, FTAG); 443 if (err) { 444 if (!hard) 445 return (err); 446 lasterr = err; 447 } 448 } 449 } 450 if (err == ESRCH) 451 err = 0; 452 return (err != 0 ? err : lasterr); 453 } 454