xref: /freebsd/sys/contrib/openzfs/module/zfs/bptree.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1*61145dc2SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
2eda14cbcSMatt Macy /*
3eda14cbcSMatt Macy  * CDDL HEADER START
4eda14cbcSMatt Macy  *
5eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
6eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
7eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
8eda14cbcSMatt Macy  *
9eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10271171e0SMartin Matuska  * or https://opensource.org/licenses/CDDL-1.0.
11eda14cbcSMatt Macy  * See the License for the specific language governing permissions
12eda14cbcSMatt Macy  * and limitations under the License.
13eda14cbcSMatt Macy  *
14eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
15eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
17eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
18eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
19eda14cbcSMatt Macy  *
20eda14cbcSMatt Macy  * CDDL HEADER END
21eda14cbcSMatt Macy  */
22eda14cbcSMatt Macy 
23eda14cbcSMatt Macy /*
24eda14cbcSMatt Macy  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25eda14cbcSMatt Macy  */
26eda14cbcSMatt Macy 
27eda14cbcSMatt Macy #include <sys/arc.h>
28eda14cbcSMatt Macy #include <sys/bptree.h>
29eda14cbcSMatt Macy #include <sys/dmu.h>
30eda14cbcSMatt Macy #include <sys/dmu_objset.h>
31eda14cbcSMatt Macy #include <sys/dmu_tx.h>
32eda14cbcSMatt Macy #include <sys/dmu_traverse.h>
33eda14cbcSMatt Macy #include <sys/dsl_dataset.h>
34eda14cbcSMatt Macy #include <sys/dsl_dir.h>
35eda14cbcSMatt Macy #include <sys/dsl_pool.h>
36eda14cbcSMatt Macy #include <sys/dnode.h>
37eda14cbcSMatt Macy #include <sys/spa.h>
38eda14cbcSMatt Macy 
39eda14cbcSMatt Macy /*
40eda14cbcSMatt Macy  * A bptree is a queue of root block pointers from destroyed datasets. When a
41eda14cbcSMatt Macy  * dataset is destroyed its root block pointer is put on the end of the pool's
42eda14cbcSMatt Macy  * bptree queue so the dataset's blocks can be freed asynchronously by
43eda14cbcSMatt Macy  * dsl_scan_sync. This allows the delete operation to finish without traversing
44eda14cbcSMatt Macy  * all the dataset's blocks.
45eda14cbcSMatt Macy  *
46eda14cbcSMatt Macy  * Note that while bt_begin and bt_end are only ever incremented in this code,
47eda14cbcSMatt Macy  * they are effectively reset to 0 every time the entire bptree is freed because
48eda14cbcSMatt Macy  * the bptree's object is destroyed and re-created.
49eda14cbcSMatt Macy  */
50eda14cbcSMatt Macy 
51eda14cbcSMatt Macy struct bptree_args {
52eda14cbcSMatt Macy 	bptree_phys_t *ba_phys;	/* data in bonus buffer, dirtied if freeing */
53eda14cbcSMatt Macy 	boolean_t ba_free;	/* true if freeing during traversal */
54eda14cbcSMatt Macy 
55eda14cbcSMatt Macy 	bptree_itor_t *ba_func;	/* function to call for each blockpointer */
56eda14cbcSMatt Macy 	void *ba_arg;		/* caller supplied argument to ba_func */
57eda14cbcSMatt Macy 	dmu_tx_t *ba_tx;	/* caller supplied tx, NULL if not freeing */
58eda14cbcSMatt Macy } bptree_args_t;
59eda14cbcSMatt Macy 
60eda14cbcSMatt Macy uint64_t
bptree_alloc(objset_t * os,dmu_tx_t * tx)61eda14cbcSMatt Macy bptree_alloc(objset_t *os, dmu_tx_t *tx)
62eda14cbcSMatt Macy {
63eda14cbcSMatt Macy 	uint64_t obj;
64eda14cbcSMatt Macy 	dmu_buf_t *db;
65eda14cbcSMatt Macy 	bptree_phys_t *bt;
66eda14cbcSMatt Macy 
67eda14cbcSMatt Macy 	obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
68eda14cbcSMatt Macy 	    SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
69eda14cbcSMatt Macy 	    sizeof (bptree_phys_t), tx);
70eda14cbcSMatt Macy 
71eda14cbcSMatt Macy 	/*
72eda14cbcSMatt Macy 	 * Bonus buffer contents are already initialized to 0, but for
73eda14cbcSMatt Macy 	 * readability we make it explicit.
74eda14cbcSMatt Macy 	 */
75eda14cbcSMatt Macy 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
76eda14cbcSMatt Macy 	dmu_buf_will_dirty(db, tx);
77eda14cbcSMatt Macy 	bt = db->db_data;
78eda14cbcSMatt Macy 	bt->bt_begin = 0;
79eda14cbcSMatt Macy 	bt->bt_end = 0;
80eda14cbcSMatt Macy 	bt->bt_bytes = 0;
81eda14cbcSMatt Macy 	bt->bt_comp = 0;
82eda14cbcSMatt Macy 	bt->bt_uncomp = 0;
83eda14cbcSMatt Macy 	dmu_buf_rele(db, FTAG);
84eda14cbcSMatt Macy 
85eda14cbcSMatt Macy 	return (obj);
86eda14cbcSMatt Macy }
87eda14cbcSMatt Macy 
88eda14cbcSMatt Macy int
bptree_free(objset_t * os,uint64_t obj,dmu_tx_t * tx)89eda14cbcSMatt Macy bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
90eda14cbcSMatt Macy {
91eda14cbcSMatt Macy 	dmu_buf_t *db;
92eda14cbcSMatt Macy 	bptree_phys_t *bt;
93eda14cbcSMatt Macy 
94eda14cbcSMatt Macy 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
95eda14cbcSMatt Macy 	bt = db->db_data;
96eda14cbcSMatt Macy 	ASSERT3U(bt->bt_begin, ==, bt->bt_end);
97eda14cbcSMatt Macy 	ASSERT0(bt->bt_bytes);
98eda14cbcSMatt Macy 	ASSERT0(bt->bt_comp);
99eda14cbcSMatt Macy 	ASSERT0(bt->bt_uncomp);
100eda14cbcSMatt Macy 	dmu_buf_rele(db, FTAG);
101eda14cbcSMatt Macy 
102eda14cbcSMatt Macy 	return (dmu_object_free(os, obj, tx));
103eda14cbcSMatt Macy }
104eda14cbcSMatt Macy 
105eda14cbcSMatt Macy boolean_t
bptree_is_empty(objset_t * os,uint64_t obj)106eda14cbcSMatt Macy bptree_is_empty(objset_t *os, uint64_t obj)
107eda14cbcSMatt Macy {
108eda14cbcSMatt Macy 	dmu_buf_t *db;
109eda14cbcSMatt Macy 	bptree_phys_t *bt;
110eda14cbcSMatt Macy 	boolean_t rv;
111eda14cbcSMatt Macy 
112eda14cbcSMatt Macy 	VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
113eda14cbcSMatt Macy 	bt = db->db_data;
114eda14cbcSMatt Macy 	rv = (bt->bt_begin == bt->bt_end);
115eda14cbcSMatt Macy 	dmu_buf_rele(db, FTAG);
116eda14cbcSMatt Macy 	return (rv);
117eda14cbcSMatt Macy }
118eda14cbcSMatt Macy 
119eda14cbcSMatt Macy void
bptree_add(objset_t * os,uint64_t obj,blkptr_t * bp,uint64_t birth_txg,uint64_t bytes,uint64_t comp,uint64_t uncomp,dmu_tx_t * tx)120eda14cbcSMatt Macy bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
121eda14cbcSMatt Macy     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
122eda14cbcSMatt Macy {
123eda14cbcSMatt Macy 	dmu_buf_t *db;
124eda14cbcSMatt Macy 	bptree_phys_t *bt;
125eda14cbcSMatt Macy 	bptree_entry_phys_t *bte;
126eda14cbcSMatt Macy 
127eda14cbcSMatt Macy 	/*
128eda14cbcSMatt Macy 	 * bptree objects are in the pool mos, therefore they can only be
129eda14cbcSMatt Macy 	 * modified in syncing context. Furthermore, this is only modified
130eda14cbcSMatt Macy 	 * by the sync thread, so no locking is necessary.
131eda14cbcSMatt Macy 	 */
132eda14cbcSMatt Macy 	ASSERT(dmu_tx_is_syncing(tx));
133eda14cbcSMatt Macy 
134eda14cbcSMatt Macy 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
135eda14cbcSMatt Macy 	bt = db->db_data;
136eda14cbcSMatt Macy 
137eda14cbcSMatt Macy 	bte = kmem_zalloc(sizeof (*bte), KM_SLEEP);
138eda14cbcSMatt Macy 	bte->be_birth_txg = birth_txg;
139eda14cbcSMatt Macy 	bte->be_bp = *bp;
140eda14cbcSMatt Macy 	dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
141eda14cbcSMatt Macy 	kmem_free(bte, sizeof (*bte));
142eda14cbcSMatt Macy 
143eda14cbcSMatt Macy 	dmu_buf_will_dirty(db, tx);
144eda14cbcSMatt Macy 	bt->bt_end++;
145eda14cbcSMatt Macy 	bt->bt_bytes += bytes;
146eda14cbcSMatt Macy 	bt->bt_comp += comp;
147eda14cbcSMatt Macy 	bt->bt_uncomp += uncomp;
148eda14cbcSMatt Macy 	dmu_buf_rele(db, FTAG);
149eda14cbcSMatt Macy }
150eda14cbcSMatt Macy 
151eda14cbcSMatt Macy static int
bptree_visit_cb(spa_t * spa,zilog_t * zilog,const blkptr_t * bp,const zbookmark_phys_t * zb,const dnode_phys_t * dnp,void * arg)152eda14cbcSMatt Macy bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
153eda14cbcSMatt Macy     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
154eda14cbcSMatt Macy {
155e92ffd9bSMartin Matuska 	(void) zilog, (void) dnp;
156eda14cbcSMatt Macy 	int err;
157eda14cbcSMatt Macy 	struct bptree_args *ba = arg;
158eda14cbcSMatt Macy 
159eda14cbcSMatt Macy 	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
160eda14cbcSMatt Macy 	    BP_IS_REDACTED(bp))
161eda14cbcSMatt Macy 		return (0);
162eda14cbcSMatt Macy 
163eda14cbcSMatt Macy 	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
164eda14cbcSMatt Macy 	if (err == 0 && ba->ba_free) {
165eda14cbcSMatt Macy 		ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
166eda14cbcSMatt Macy 		ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
167eda14cbcSMatt Macy 		ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
168eda14cbcSMatt Macy 	}
169eda14cbcSMatt Macy 	return (err);
170eda14cbcSMatt Macy }
171eda14cbcSMatt Macy 
172eda14cbcSMatt Macy /*
173eda14cbcSMatt Macy  * If "free" is set:
174eda14cbcSMatt Macy  *  - It is assumed that "func" will be freeing the block pointers.
175eda14cbcSMatt Macy  *  - If "func" returns nonzero, the bookmark will be remembered and
176eda14cbcSMatt Macy  *    iteration will be restarted from this point on next invocation.
177eda14cbcSMatt Macy  *  - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
178eda14cbcSMatt Macy  *    bptree_iterate will remember the bookmark, continue traversing
179eda14cbcSMatt Macy  *    any additional entries, and return 0.
180eda14cbcSMatt Macy  *
181eda14cbcSMatt Macy  * If "free" is not set, traversal will stop and return an error if
182eda14cbcSMatt Macy  * an i/o error is encountered.
183eda14cbcSMatt Macy  *
184eda14cbcSMatt Macy  * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
185eda14cbcSMatt Macy  * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
186eda14cbcSMatt Macy  * traverse_dataset_destroyed()).
187eda14cbcSMatt Macy  */
188eda14cbcSMatt Macy int
bptree_iterate(objset_t * os,uint64_t obj,boolean_t free,bptree_itor_t func,void * arg,dmu_tx_t * tx)189eda14cbcSMatt Macy bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
190eda14cbcSMatt Macy     void *arg, dmu_tx_t *tx)
191eda14cbcSMatt Macy {
192eda14cbcSMatt Macy 	boolean_t ioerr = B_FALSE;
193eda14cbcSMatt Macy 	int err;
194eda14cbcSMatt Macy 	uint64_t i;
195eda14cbcSMatt Macy 	dmu_buf_t *db;
196eda14cbcSMatt Macy 	struct bptree_args ba;
197eda14cbcSMatt Macy 
198eda14cbcSMatt Macy 	ASSERT(!free || dmu_tx_is_syncing(tx));
199eda14cbcSMatt Macy 
200eda14cbcSMatt Macy 	err = dmu_bonus_hold(os, obj, FTAG, &db);
201eda14cbcSMatt Macy 	if (err != 0)
202eda14cbcSMatt Macy 		return (err);
203eda14cbcSMatt Macy 
204eda14cbcSMatt Macy 	if (free)
205eda14cbcSMatt Macy 		dmu_buf_will_dirty(db, tx);
206eda14cbcSMatt Macy 
207eda14cbcSMatt Macy 	ba.ba_phys = db->db_data;
208eda14cbcSMatt Macy 	ba.ba_free = free;
209eda14cbcSMatt Macy 	ba.ba_func = func;
210eda14cbcSMatt Macy 	ba.ba_arg = arg;
211eda14cbcSMatt Macy 	ba.ba_tx = tx;
212eda14cbcSMatt Macy 
213eda14cbcSMatt Macy 	err = 0;
214eda14cbcSMatt Macy 	for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
215eda14cbcSMatt Macy 		bptree_entry_phys_t bte;
216eda14cbcSMatt Macy 		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST |
217eda14cbcSMatt Macy 		    TRAVERSE_NO_DECRYPT;
218eda14cbcSMatt Macy 
219eda14cbcSMatt Macy 		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
220eda14cbcSMatt Macy 		    &bte, DMU_READ_NO_PREFETCH);
221eda14cbcSMatt Macy 		if (err != 0)
222eda14cbcSMatt Macy 			break;
223eda14cbcSMatt Macy 
224eda14cbcSMatt Macy 		if (zfs_free_leak_on_eio)
225eda14cbcSMatt Macy 			flags |= TRAVERSE_HARD;
226eda14cbcSMatt Macy 		zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
227eda14cbcSMatt Macy 		    "bookmark %lld/%lld/%lld/%lld",
228eda14cbcSMatt Macy 		    (longlong_t)i,
229eda14cbcSMatt Macy 		    (longlong_t)bte.be_birth_txg,
230eda14cbcSMatt Macy 		    (longlong_t)bte.be_zb.zb_objset,
231eda14cbcSMatt Macy 		    (longlong_t)bte.be_zb.zb_object,
232eda14cbcSMatt Macy 		    (longlong_t)bte.be_zb.zb_level,
233eda14cbcSMatt Macy 		    (longlong_t)bte.be_zb.zb_blkid);
234eda14cbcSMatt Macy 		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
235eda14cbcSMatt Macy 		    bte.be_birth_txg, &bte.be_zb, flags,
236eda14cbcSMatt Macy 		    bptree_visit_cb, &ba);
237eda14cbcSMatt Macy 		if (free) {
238eda14cbcSMatt Macy 			/*
239eda14cbcSMatt Macy 			 * The callback has freed the visited block pointers.
240eda14cbcSMatt Macy 			 * Record our traversal progress on disk, either by
241eda14cbcSMatt Macy 			 * updating this record's bookmark, or by logically
242eda14cbcSMatt Macy 			 * removing this record by advancing bt_begin.
243eda14cbcSMatt Macy 			 */
244eda14cbcSMatt Macy 			if (err != 0) {
245eda14cbcSMatt Macy 				/* save bookmark for future resume */
246eda14cbcSMatt Macy 				ASSERT3U(bte.be_zb.zb_objset, ==,
247eda14cbcSMatt Macy 				    ZB_DESTROYED_OBJSET);
248eda14cbcSMatt Macy 				ASSERT0(bte.be_zb.zb_level);
249eda14cbcSMatt Macy 				dmu_write(os, obj, i * sizeof (bte),
250eda14cbcSMatt Macy 				    sizeof (bte), &bte, tx);
251eda14cbcSMatt Macy 				if (err == EIO || err == ECKSUM ||
252eda14cbcSMatt Macy 				    err == ENXIO) {
253eda14cbcSMatt Macy 					/*
254eda14cbcSMatt Macy 					 * Skip the rest of this tree and
255eda14cbcSMatt Macy 					 * continue on to the next entry.
256eda14cbcSMatt Macy 					 */
257eda14cbcSMatt Macy 					err = 0;
258eda14cbcSMatt Macy 					ioerr = B_TRUE;
259eda14cbcSMatt Macy 				} else {
260eda14cbcSMatt Macy 					break;
261eda14cbcSMatt Macy 				}
262eda14cbcSMatt Macy 			} else if (ioerr) {
263eda14cbcSMatt Macy 				/*
264eda14cbcSMatt Macy 				 * This entry is finished, but there were
265eda14cbcSMatt Macy 				 * i/o errors on previous entries, so we
266eda14cbcSMatt Macy 				 * can't adjust bt_begin.  Set this entry's
267eda14cbcSMatt Macy 				 * be_birth_txg such that it will be
268eda14cbcSMatt Macy 				 * treated as a no-op in future traversals.
269eda14cbcSMatt Macy 				 */
270eda14cbcSMatt Macy 				bte.be_birth_txg = UINT64_MAX;
271eda14cbcSMatt Macy 				dmu_write(os, obj, i * sizeof (bte),
272eda14cbcSMatt Macy 				    sizeof (bte), &bte, tx);
273eda14cbcSMatt Macy 			}
274eda14cbcSMatt Macy 
275eda14cbcSMatt Macy 			if (!ioerr) {
276eda14cbcSMatt Macy 				ba.ba_phys->bt_begin++;
277eda14cbcSMatt Macy 				(void) dmu_free_range(os, obj,
278eda14cbcSMatt Macy 				    i * sizeof (bte), sizeof (bte), tx);
279eda14cbcSMatt Macy 			}
280eda14cbcSMatt Macy 		} else if (err != 0) {
281eda14cbcSMatt Macy 			break;
282eda14cbcSMatt Macy 		}
283eda14cbcSMatt Macy 	}
284eda14cbcSMatt Macy 
285eda14cbcSMatt Macy 	ASSERT(!free || err != 0 || ioerr ||
286eda14cbcSMatt Macy 	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
287eda14cbcSMatt Macy 
288eda14cbcSMatt Macy 	/* if all blocks are free there should be no used space */
289eda14cbcSMatt Macy 	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
290eda14cbcSMatt Macy 		if (zfs_free_leak_on_eio) {
291eda14cbcSMatt Macy 			ba.ba_phys->bt_bytes = 0;
292eda14cbcSMatt Macy 			ba.ba_phys->bt_comp = 0;
293eda14cbcSMatt Macy 			ba.ba_phys->bt_uncomp = 0;
294eda14cbcSMatt Macy 		}
295eda14cbcSMatt Macy 
296eda14cbcSMatt Macy 		ASSERT0(ba.ba_phys->bt_bytes);
297eda14cbcSMatt Macy 		ASSERT0(ba.ba_phys->bt_comp);
298eda14cbcSMatt Macy 		ASSERT0(ba.ba_phys->bt_uncomp);
299eda14cbcSMatt Macy 	}
300eda14cbcSMatt Macy 
301eda14cbcSMatt Macy 	dmu_buf_rele(db, FTAG);
302eda14cbcSMatt Macy 
303eda14cbcSMatt Macy 	return (err);
304eda14cbcSMatt Macy }
305