xref: /titanic_52/usr/src/uts/common/fs/zfs/dmu_traverse.c (revision 913a902882f7ced52ed7832cccd9062aff4c27f4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/zfs_context.h>
26 #include <sys/dmu_objset.h>
27 #include <sys/dmu_traverse.h>
28 #include <sys/dsl_dataset.h>
29 #include <sys/dsl_dir.h>
30 #include <sys/dsl_pool.h>
31 #include <sys/dnode.h>
32 #include <sys/spa.h>
33 #include <sys/zio.h>
34 #include <sys/dmu_impl.h>
35 #include <sys/sa.h>
36 #include <sys/sa_impl.h>
37 #include <sys/callb.h>
38 
/*
 * Limit on how many prefetched blocks may be outstanding (issued by the
 * prefetcher but not yet consumed by the main traversal).  traverse_impl()
 * copies this value into prefetch_data.pd_blks_max when a traversal starts.
 */
int zfs_pd_blks_max = 100;
40 
41 struct prefetch_data {
42 	kmutex_t pd_mtx;
43 	kcondvar_t pd_cv;
44 	int pd_blks_max;
45 	int pd_blks_fetched;
46 	int pd_flags;
47 	boolean_t pd_cancel;
48 	boolean_t pd_exited;
49 };
50 
51 struct traverse_data {
52 	spa_t *td_spa;
53 	uint64_t td_objset;
54 	blkptr_t *td_rootbp;
55 	uint64_t td_min_txg;
56 	int td_flags;
57 	struct prefetch_data *td_pfd;
58 	blkptr_cb_t *td_func;
59 	void *td_arg;
60 };
61 
62 static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
63     arc_buf_t *buf, uint64_t objset, uint64_t object);
64 
65 /* ARGSUSED */
66 static int
67 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
68 {
69 	struct traverse_data *td = arg;
70 	zbookmark_t zb;
71 
72 	if (bp->blk_birth == 0)
73 		return (0);
74 
75 	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
76 		return (0);
77 
78 	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
79 	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
80 
81 	(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);
82 
83 	return (0);
84 }
85 
86 /* ARGSUSED */
87 static int
88 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
89 {
90 	struct traverse_data *td = arg;
91 
92 	if (lrc->lrc_txtype == TX_WRITE) {
93 		lr_write_t *lr = (lr_write_t *)lrc;
94 		blkptr_t *bp = &lr->lr_blkptr;
95 		zbookmark_t zb;
96 
97 		if (bp->blk_birth == 0)
98 			return (0);
99 
100 		if (claim_txg == 0 || bp->blk_birth < claim_txg)
101 			return (0);
102 
103 		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL,
104 		    lr->lr_offset / BP_GET_LSIZE(bp));
105 
106 		(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
107 		    td->td_arg);
108 	}
109 	return (0);
110 }
111 
112 static void
113 traverse_zil(struct traverse_data *td, zil_header_t *zh)
114 {
115 	uint64_t claim_txg = zh->zh_claim_txg;
116 	zilog_t *zilog;
117 
118 	/*
119 	 * We only want to visit blocks that have been claimed but not yet
120 	 * replayed; plus, in read-only mode, blocks that are already stable.
121 	 */
122 	if (claim_txg == 0 && spa_writeable(td->td_spa))
123 		return;
124 
125 	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
126 
127 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
128 	    claim_txg);
129 
130 	zil_free(zilog);
131 }
132 
133 static int
134 traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
135     arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
136 {
137 	zbookmark_t czb;
138 	int err = 0, lasterr = 0;
139 	arc_buf_t *buf = NULL;
140 	struct prefetch_data *pd = td->td_pfd;
141 	boolean_t hard = td->td_flags & TRAVERSE_HARD;
142 
143 	if (bp->blk_birth == 0) {
144 		err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
145 		    td->td_arg);
146 		return (err);
147 	}
148 
149 	if (bp->blk_birth <= td->td_min_txg)
150 		return (0);
151 
152 	if (pd && !pd->pd_exited &&
153 	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
154 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
155 		mutex_enter(&pd->pd_mtx);
156 		ASSERT(pd->pd_blks_fetched >= 0);
157 		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
158 			cv_wait(&pd->pd_cv, &pd->pd_mtx);
159 		pd->pd_blks_fetched--;
160 		cv_broadcast(&pd->pd_cv);
161 		mutex_exit(&pd->pd_mtx);
162 	}
163 
164 	if (td->td_flags & TRAVERSE_PRE) {
165 		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
166 		    td->td_arg);
167 		if (err)
168 			return (err);
169 	}
170 
171 	if (BP_GET_LEVEL(bp) > 0) {
172 		uint32_t flags = ARC_WAIT;
173 		int i;
174 		blkptr_t *cbp;
175 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
176 
177 		err = dsl_read(NULL, td->td_spa, bp, pbuf,
178 		    arc_getbuf_func, &buf,
179 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
180 		if (err)
181 			return (err);
182 
183 		/* recursively visitbp() blocks below this */
184 		cbp = buf->b_data;
185 		for (i = 0; i < epb; i++, cbp++) {
186 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
187 			    zb->zb_level - 1,
188 			    zb->zb_blkid * epb + i);
189 			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
190 			if (err) {
191 				if (!hard)
192 					break;
193 				lasterr = err;
194 			}
195 		}
196 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
197 		uint32_t flags = ARC_WAIT;
198 		int i;
199 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
200 
201 		err = dsl_read(NULL, td->td_spa, bp, pbuf,
202 		    arc_getbuf_func, &buf,
203 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
204 		if (err)
205 			return (err);
206 
207 		/* recursively visitbp() blocks below this */
208 		dnp = buf->b_data;
209 		for (i = 0; i < epb; i++, dnp++) {
210 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
211 			    zb->zb_blkid * epb + i);
212 			if (err) {
213 				if (!hard)
214 					break;
215 				lasterr = err;
216 			}
217 		}
218 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
219 		uint32_t flags = ARC_WAIT;
220 		objset_phys_t *osp;
221 		dnode_phys_t *dnp;
222 
223 		err = dsl_read_nolock(NULL, td->td_spa, bp,
224 		    arc_getbuf_func, &buf,
225 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
226 		if (err)
227 			return (err);
228 
229 		osp = buf->b_data;
230 		traverse_zil(td, &osp->os_zil_header);
231 
232 		dnp = &osp->os_meta_dnode;
233 		err = traverse_dnode(td, dnp, buf, zb->zb_objset,
234 		    DMU_META_DNODE_OBJECT);
235 		if (err && hard) {
236 			lasterr = err;
237 			err = 0;
238 		}
239 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
240 			dnp = &osp->os_userused_dnode;
241 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
242 			    DMU_USERUSED_OBJECT);
243 		}
244 		if (err && hard) {
245 			lasterr = err;
246 			err = 0;
247 		}
248 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
249 			dnp = &osp->os_groupused_dnode;
250 			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
251 			    DMU_GROUPUSED_OBJECT);
252 		}
253 	}
254 
255 	if (buf)
256 		(void) arc_buf_remove_ref(buf, &buf);
257 
258 	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
259 		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
260 		    td->td_arg);
261 	}
262 
263 	return (err != 0 ? err : lasterr);
264 }
265 
266 static int
267 traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp,
268     arc_buf_t *buf, uint64_t objset, uint64_t object)
269 {
270 	int j, err = 0, lasterr = 0;
271 	zbookmark_t czb;
272 	boolean_t hard = (td->td_flags & TRAVERSE_HARD);
273 
274 	for (j = 0; j < dnp->dn_nblkptr; j++) {
275 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
276 		err = traverse_visitbp(td, dnp, buf,
277 		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
278 		if (err) {
279 			if (!hard)
280 				break;
281 			lasterr = err;
282 		}
283 	}
284 
285 	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
286 		SET_BOOKMARK(&czb, objset,
287 		    object, 0, DMU_SPILL_BLKID);
288 		err = traverse_visitbp(td, dnp, buf,
289 		    (blkptr_t *)&dnp->dn_spill, &czb);
290 		if (err) {
291 			if (!hard)
292 				return (err);
293 			lasterr = err;
294 		}
295 	}
296 	return (err != 0 ? err : lasterr);
297 }
298 
299 /* ARGSUSED */
300 static int
301 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
302     arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
303     void *arg)
304 {
305 	struct prefetch_data *pfd = arg;
306 	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
307 
308 	ASSERT(pfd->pd_blks_fetched >= 0);
309 	if (pfd->pd_cancel)
310 		return (EINTR);
311 
312 	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
313 	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
314 	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
315 		return (0);
316 
317 	mutex_enter(&pfd->pd_mtx);
318 	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
319 		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
320 	pfd->pd_blks_fetched++;
321 	cv_broadcast(&pfd->pd_cv);
322 	mutex_exit(&pfd->pd_mtx);
323 
324 	(void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
325 	    ZIO_PRIORITY_ASYNC_READ,
326 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
327 	    &aflags, zb);
328 
329 	return (0);
330 }
331 
332 static void
333 traverse_prefetch_thread(void *arg)
334 {
335 	struct traverse_data *td_main = arg;
336 	struct traverse_data td = *td_main;
337 	zbookmark_t czb;
338 
339 	td.td_func = traverse_prefetcher;
340 	td.td_arg = td_main->td_pfd;
341 	td.td_pfd = NULL;
342 
343 	SET_BOOKMARK(&czb, td.td_objset,
344 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
345 	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
346 
347 	mutex_enter(&td_main->td_pfd->pd_mtx);
348 	td_main->td_pfd->pd_exited = B_TRUE;
349 	cv_broadcast(&td_main->td_pfd->pd_cv);
350 	mutex_exit(&td_main->td_pfd->pd_mtx);
351 }
352 
353 /*
354  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
355  * in syncing context).
356  */
357 static int
358 traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
359     uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
360 {
361 	struct traverse_data td;
362 	struct prefetch_data pd = { 0 };
363 	zbookmark_t czb;
364 	int err;
365 
366 	td.td_spa = spa;
367 	td.td_objset = objset;
368 	td.td_rootbp = rootbp;
369 	td.td_min_txg = txg_start;
370 	td.td_func = func;
371 	td.td_arg = arg;
372 	td.td_pfd = &pd;
373 	td.td_flags = flags;
374 
375 	pd.pd_blks_max = zfs_pd_blks_max;
376 	pd.pd_flags = flags;
377 	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
378 	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
379 
380 	if (!(flags & TRAVERSE_PREFETCH) ||
381 	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
382 	    &td, TQ_NOQUEUE))
383 		pd.pd_exited = B_TRUE;
384 
385 	SET_BOOKMARK(&czb, objset,
386 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
387 	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
388 
389 	mutex_enter(&pd.pd_mtx);
390 	pd.pd_cancel = B_TRUE;
391 	cv_broadcast(&pd.pd_cv);
392 	while (!pd.pd_exited)
393 		cv_wait(&pd.pd_cv, &pd.pd_mtx);
394 	mutex_exit(&pd.pd_mtx);
395 
396 	mutex_destroy(&pd.pd_mtx);
397 	cv_destroy(&pd.pd_cv);
398 
399 	return (err);
400 }
401 
402 /*
403  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
404  * in syncing context).
405  */
406 int
407 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
408     blkptr_cb_t func, void *arg)
409 {
410 	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
411 	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
412 }
413 
414 /*
415  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
416  */
417 int
418 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
419     blkptr_cb_t func, void *arg)
420 {
421 	int err, lasterr = 0;
422 	uint64_t obj;
423 	dsl_pool_t *dp = spa_get_dsl(spa);
424 	objset_t *mos = dp->dp_meta_objset;
425 	boolean_t hard = (flags & TRAVERSE_HARD);
426 
427 	/* visit the MOS */
428 	err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
429 	    txg_start, flags, func, arg);
430 	if (err)
431 		return (err);
432 
433 	/* visit each dataset */
434 	for (obj = 1; err == 0 || (err != ESRCH && hard);
435 	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
436 		dmu_object_info_t doi;
437 
438 		err = dmu_object_info(mos, obj, &doi);
439 		if (err) {
440 			if (!hard)
441 				return (err);
442 			lasterr = err;
443 			continue;
444 		}
445 
446 		if (doi.doi_type == DMU_OT_DSL_DATASET) {
447 			dsl_dataset_t *ds;
448 			uint64_t txg = txg_start;
449 
450 			rw_enter(&dp->dp_config_rwlock, RW_READER);
451 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
452 			rw_exit(&dp->dp_config_rwlock);
453 			if (err) {
454 				if (!hard)
455 					return (err);
456 				lasterr = err;
457 				continue;
458 			}
459 			if (ds->ds_phys->ds_prev_snap_txg > txg)
460 				txg = ds->ds_phys->ds_prev_snap_txg;
461 			err = traverse_dataset(ds, txg, flags, func, arg);
462 			dsl_dataset_rele(ds, FTAG);
463 			if (err) {
464 				if (!hard)
465 					return (err);
466 				lasterr = err;
467 			}
468 		}
469 	}
470 	if (err == ESRCH)
471 		err = 0;
472 	return (err != 0 ? err : lasterr);
473 }
474