1*61145dc2SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
2eda14cbcSMatt Macy /*
3eda14cbcSMatt Macy * CDDL HEADER START
4eda14cbcSMatt Macy *
5eda14cbcSMatt Macy * The contents of this file are subject to the terms of the
6eda14cbcSMatt Macy * Common Development and Distribution License (the "License").
7eda14cbcSMatt Macy * You may not use this file except in compliance with the License.
8eda14cbcSMatt Macy *
9eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0.
11eda14cbcSMatt Macy * See the License for the specific language governing permissions
12eda14cbcSMatt Macy * and limitations under the License.
13eda14cbcSMatt Macy *
14eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each
15eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the
17eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying
18eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner]
19eda14cbcSMatt Macy *
20eda14cbcSMatt Macy * CDDL HEADER END
21eda14cbcSMatt Macy */
22eda14cbcSMatt Macy
23eda14cbcSMatt Macy /*
24eda14cbcSMatt Macy * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25eda14cbcSMatt Macy */
26eda14cbcSMatt Macy
27eda14cbcSMatt Macy #include <sys/arc.h>
28eda14cbcSMatt Macy #include <sys/bptree.h>
29eda14cbcSMatt Macy #include <sys/dmu.h>
30eda14cbcSMatt Macy #include <sys/dmu_objset.h>
31eda14cbcSMatt Macy #include <sys/dmu_tx.h>
32eda14cbcSMatt Macy #include <sys/dmu_traverse.h>
33eda14cbcSMatt Macy #include <sys/dsl_dataset.h>
34eda14cbcSMatt Macy #include <sys/dsl_dir.h>
35eda14cbcSMatt Macy #include <sys/dsl_pool.h>
36eda14cbcSMatt Macy #include <sys/dnode.h>
37eda14cbcSMatt Macy #include <sys/spa.h>
38eda14cbcSMatt Macy
39eda14cbcSMatt Macy /*
40eda14cbcSMatt Macy * A bptree is a queue of root block pointers from destroyed datasets. When a
41eda14cbcSMatt Macy * dataset is destroyed its root block pointer is put on the end of the pool's
42eda14cbcSMatt Macy * bptree queue so the dataset's blocks can be freed asynchronously by
43eda14cbcSMatt Macy * dsl_scan_sync. This allows the delete operation to finish without traversing
44eda14cbcSMatt Macy * all the dataset's blocks.
45eda14cbcSMatt Macy *
46eda14cbcSMatt Macy * Note that while bt_begin and bt_end are only ever incremented in this code,
47eda14cbcSMatt Macy * they are effectively reset to 0 every time the entire bptree is freed because
48eda14cbcSMatt Macy * the bptree's object is destroyed and re-created.
49eda14cbcSMatt Macy */
50eda14cbcSMatt Macy
51eda14cbcSMatt Macy struct bptree_args {
52eda14cbcSMatt Macy bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
53eda14cbcSMatt Macy boolean_t ba_free; /* true if freeing during traversal */
54eda14cbcSMatt Macy
55eda14cbcSMatt Macy bptree_itor_t *ba_func; /* function to call for each blockpointer */
56eda14cbcSMatt Macy void *ba_arg; /* caller supplied argument to ba_func */
57eda14cbcSMatt Macy dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */
58eda14cbcSMatt Macy } bptree_args_t;
59eda14cbcSMatt Macy
60eda14cbcSMatt Macy uint64_t
bptree_alloc(objset_t * os,dmu_tx_t * tx)61eda14cbcSMatt Macy bptree_alloc(objset_t *os, dmu_tx_t *tx)
62eda14cbcSMatt Macy {
63eda14cbcSMatt Macy uint64_t obj;
64eda14cbcSMatt Macy dmu_buf_t *db;
65eda14cbcSMatt Macy bptree_phys_t *bt;
66eda14cbcSMatt Macy
67eda14cbcSMatt Macy obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
68eda14cbcSMatt Macy SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
69eda14cbcSMatt Macy sizeof (bptree_phys_t), tx);
70eda14cbcSMatt Macy
71eda14cbcSMatt Macy /*
72eda14cbcSMatt Macy * Bonus buffer contents are already initialized to 0, but for
73eda14cbcSMatt Macy * readability we make it explicit.
74eda14cbcSMatt Macy */
75eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
76eda14cbcSMatt Macy dmu_buf_will_dirty(db, tx);
77eda14cbcSMatt Macy bt = db->db_data;
78eda14cbcSMatt Macy bt->bt_begin = 0;
79eda14cbcSMatt Macy bt->bt_end = 0;
80eda14cbcSMatt Macy bt->bt_bytes = 0;
81eda14cbcSMatt Macy bt->bt_comp = 0;
82eda14cbcSMatt Macy bt->bt_uncomp = 0;
83eda14cbcSMatt Macy dmu_buf_rele(db, FTAG);
84eda14cbcSMatt Macy
85eda14cbcSMatt Macy return (obj);
86eda14cbcSMatt Macy }
87eda14cbcSMatt Macy
88eda14cbcSMatt Macy int
bptree_free(objset_t * os,uint64_t obj,dmu_tx_t * tx)89eda14cbcSMatt Macy bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
90eda14cbcSMatt Macy {
91eda14cbcSMatt Macy dmu_buf_t *db;
92eda14cbcSMatt Macy bptree_phys_t *bt;
93eda14cbcSMatt Macy
94eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
95eda14cbcSMatt Macy bt = db->db_data;
96eda14cbcSMatt Macy ASSERT3U(bt->bt_begin, ==, bt->bt_end);
97eda14cbcSMatt Macy ASSERT0(bt->bt_bytes);
98eda14cbcSMatt Macy ASSERT0(bt->bt_comp);
99eda14cbcSMatt Macy ASSERT0(bt->bt_uncomp);
100eda14cbcSMatt Macy dmu_buf_rele(db, FTAG);
101eda14cbcSMatt Macy
102eda14cbcSMatt Macy return (dmu_object_free(os, obj, tx));
103eda14cbcSMatt Macy }
104eda14cbcSMatt Macy
105eda14cbcSMatt Macy boolean_t
bptree_is_empty(objset_t * os,uint64_t obj)106eda14cbcSMatt Macy bptree_is_empty(objset_t *os, uint64_t obj)
107eda14cbcSMatt Macy {
108eda14cbcSMatt Macy dmu_buf_t *db;
109eda14cbcSMatt Macy bptree_phys_t *bt;
110eda14cbcSMatt Macy boolean_t rv;
111eda14cbcSMatt Macy
112eda14cbcSMatt Macy VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
113eda14cbcSMatt Macy bt = db->db_data;
114eda14cbcSMatt Macy rv = (bt->bt_begin == bt->bt_end);
115eda14cbcSMatt Macy dmu_buf_rele(db, FTAG);
116eda14cbcSMatt Macy return (rv);
117eda14cbcSMatt Macy }
118eda14cbcSMatt Macy
119eda14cbcSMatt Macy void
bptree_add(objset_t * os,uint64_t obj,blkptr_t * bp,uint64_t birth_txg,uint64_t bytes,uint64_t comp,uint64_t uncomp,dmu_tx_t * tx)120eda14cbcSMatt Macy bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
121eda14cbcSMatt Macy uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
122eda14cbcSMatt Macy {
123eda14cbcSMatt Macy dmu_buf_t *db;
124eda14cbcSMatt Macy bptree_phys_t *bt;
125eda14cbcSMatt Macy bptree_entry_phys_t *bte;
126eda14cbcSMatt Macy
127eda14cbcSMatt Macy /*
128eda14cbcSMatt Macy * bptree objects are in the pool mos, therefore they can only be
129eda14cbcSMatt Macy * modified in syncing context. Furthermore, this is only modified
130eda14cbcSMatt Macy * by the sync thread, so no locking is necessary.
131eda14cbcSMatt Macy */
132eda14cbcSMatt Macy ASSERT(dmu_tx_is_syncing(tx));
133eda14cbcSMatt Macy
134eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
135eda14cbcSMatt Macy bt = db->db_data;
136eda14cbcSMatt Macy
137eda14cbcSMatt Macy bte = kmem_zalloc(sizeof (*bte), KM_SLEEP);
138eda14cbcSMatt Macy bte->be_birth_txg = birth_txg;
139eda14cbcSMatt Macy bte->be_bp = *bp;
140eda14cbcSMatt Macy dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
141eda14cbcSMatt Macy kmem_free(bte, sizeof (*bte));
142eda14cbcSMatt Macy
143eda14cbcSMatt Macy dmu_buf_will_dirty(db, tx);
144eda14cbcSMatt Macy bt->bt_end++;
145eda14cbcSMatt Macy bt->bt_bytes += bytes;
146eda14cbcSMatt Macy bt->bt_comp += comp;
147eda14cbcSMatt Macy bt->bt_uncomp += uncomp;
148eda14cbcSMatt Macy dmu_buf_rele(db, FTAG);
149eda14cbcSMatt Macy }
150eda14cbcSMatt Macy
151eda14cbcSMatt Macy static int
bptree_visit_cb(spa_t * spa,zilog_t * zilog,const blkptr_t * bp,const zbookmark_phys_t * zb,const dnode_phys_t * dnp,void * arg)152eda14cbcSMatt Macy bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
153eda14cbcSMatt Macy const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
154eda14cbcSMatt Macy {
155e92ffd9bSMartin Matuska (void) zilog, (void) dnp;
156eda14cbcSMatt Macy int err;
157eda14cbcSMatt Macy struct bptree_args *ba = arg;
158eda14cbcSMatt Macy
159eda14cbcSMatt Macy if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
160eda14cbcSMatt Macy BP_IS_REDACTED(bp))
161eda14cbcSMatt Macy return (0);
162eda14cbcSMatt Macy
163eda14cbcSMatt Macy err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
164eda14cbcSMatt Macy if (err == 0 && ba->ba_free) {
165eda14cbcSMatt Macy ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
166eda14cbcSMatt Macy ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
167eda14cbcSMatt Macy ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
168eda14cbcSMatt Macy }
169eda14cbcSMatt Macy return (err);
170eda14cbcSMatt Macy }
171eda14cbcSMatt Macy
172eda14cbcSMatt Macy /*
173eda14cbcSMatt Macy * If "free" is set:
174eda14cbcSMatt Macy * - It is assumed that "func" will be freeing the block pointers.
175eda14cbcSMatt Macy * - If "func" returns nonzero, the bookmark will be remembered and
176eda14cbcSMatt Macy * iteration will be restarted from this point on next invocation.
177eda14cbcSMatt Macy * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
178eda14cbcSMatt Macy * bptree_iterate will remember the bookmark, continue traversing
179eda14cbcSMatt Macy * any additional entries, and return 0.
180eda14cbcSMatt Macy *
181eda14cbcSMatt Macy * If "free" is not set, traversal will stop and return an error if
182eda14cbcSMatt Macy * an i/o error is encountered.
183eda14cbcSMatt Macy *
184eda14cbcSMatt Macy * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
185eda14cbcSMatt Macy * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
186eda14cbcSMatt Macy * traverse_dataset_destroyed()).
187eda14cbcSMatt Macy */
188eda14cbcSMatt Macy int
bptree_iterate(objset_t * os,uint64_t obj,boolean_t free,bptree_itor_t func,void * arg,dmu_tx_t * tx)189eda14cbcSMatt Macy bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
190eda14cbcSMatt Macy void *arg, dmu_tx_t *tx)
191eda14cbcSMatt Macy {
192eda14cbcSMatt Macy boolean_t ioerr = B_FALSE;
193eda14cbcSMatt Macy int err;
194eda14cbcSMatt Macy uint64_t i;
195eda14cbcSMatt Macy dmu_buf_t *db;
196eda14cbcSMatt Macy struct bptree_args ba;
197eda14cbcSMatt Macy
198eda14cbcSMatt Macy ASSERT(!free || dmu_tx_is_syncing(tx));
199eda14cbcSMatt Macy
200eda14cbcSMatt Macy err = dmu_bonus_hold(os, obj, FTAG, &db);
201eda14cbcSMatt Macy if (err != 0)
202eda14cbcSMatt Macy return (err);
203eda14cbcSMatt Macy
204eda14cbcSMatt Macy if (free)
205eda14cbcSMatt Macy dmu_buf_will_dirty(db, tx);
206eda14cbcSMatt Macy
207eda14cbcSMatt Macy ba.ba_phys = db->db_data;
208eda14cbcSMatt Macy ba.ba_free = free;
209eda14cbcSMatt Macy ba.ba_func = func;
210eda14cbcSMatt Macy ba.ba_arg = arg;
211eda14cbcSMatt Macy ba.ba_tx = tx;
212eda14cbcSMatt Macy
213eda14cbcSMatt Macy err = 0;
214eda14cbcSMatt Macy for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
215eda14cbcSMatt Macy bptree_entry_phys_t bte;
216eda14cbcSMatt Macy int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST |
217eda14cbcSMatt Macy TRAVERSE_NO_DECRYPT;
218eda14cbcSMatt Macy
219eda14cbcSMatt Macy err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
220eda14cbcSMatt Macy &bte, DMU_READ_NO_PREFETCH);
221eda14cbcSMatt Macy if (err != 0)
222eda14cbcSMatt Macy break;
223eda14cbcSMatt Macy
224eda14cbcSMatt Macy if (zfs_free_leak_on_eio)
225eda14cbcSMatt Macy flags |= TRAVERSE_HARD;
226eda14cbcSMatt Macy zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
227eda14cbcSMatt Macy "bookmark %lld/%lld/%lld/%lld",
228eda14cbcSMatt Macy (longlong_t)i,
229eda14cbcSMatt Macy (longlong_t)bte.be_birth_txg,
230eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_objset,
231eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_object,
232eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_level,
233eda14cbcSMatt Macy (longlong_t)bte.be_zb.zb_blkid);
234eda14cbcSMatt Macy err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
235eda14cbcSMatt Macy bte.be_birth_txg, &bte.be_zb, flags,
236eda14cbcSMatt Macy bptree_visit_cb, &ba);
237eda14cbcSMatt Macy if (free) {
238eda14cbcSMatt Macy /*
239eda14cbcSMatt Macy * The callback has freed the visited block pointers.
240eda14cbcSMatt Macy * Record our traversal progress on disk, either by
241eda14cbcSMatt Macy * updating this record's bookmark, or by logically
242eda14cbcSMatt Macy * removing this record by advancing bt_begin.
243eda14cbcSMatt Macy */
244eda14cbcSMatt Macy if (err != 0) {
245eda14cbcSMatt Macy /* save bookmark for future resume */
246eda14cbcSMatt Macy ASSERT3U(bte.be_zb.zb_objset, ==,
247eda14cbcSMatt Macy ZB_DESTROYED_OBJSET);
248eda14cbcSMatt Macy ASSERT0(bte.be_zb.zb_level);
249eda14cbcSMatt Macy dmu_write(os, obj, i * sizeof (bte),
250eda14cbcSMatt Macy sizeof (bte), &bte, tx);
251eda14cbcSMatt Macy if (err == EIO || err == ECKSUM ||
252eda14cbcSMatt Macy err == ENXIO) {
253eda14cbcSMatt Macy /*
254eda14cbcSMatt Macy * Skip the rest of this tree and
255eda14cbcSMatt Macy * continue on to the next entry.
256eda14cbcSMatt Macy */
257eda14cbcSMatt Macy err = 0;
258eda14cbcSMatt Macy ioerr = B_TRUE;
259eda14cbcSMatt Macy } else {
260eda14cbcSMatt Macy break;
261eda14cbcSMatt Macy }
262eda14cbcSMatt Macy } else if (ioerr) {
263eda14cbcSMatt Macy /*
264eda14cbcSMatt Macy * This entry is finished, but there were
265eda14cbcSMatt Macy * i/o errors on previous entries, so we
266eda14cbcSMatt Macy * can't adjust bt_begin. Set this entry's
267eda14cbcSMatt Macy * be_birth_txg such that it will be
268eda14cbcSMatt Macy * treated as a no-op in future traversals.
269eda14cbcSMatt Macy */
270eda14cbcSMatt Macy bte.be_birth_txg = UINT64_MAX;
271eda14cbcSMatt Macy dmu_write(os, obj, i * sizeof (bte),
272eda14cbcSMatt Macy sizeof (bte), &bte, tx);
273eda14cbcSMatt Macy }
274eda14cbcSMatt Macy
275eda14cbcSMatt Macy if (!ioerr) {
276eda14cbcSMatt Macy ba.ba_phys->bt_begin++;
277eda14cbcSMatt Macy (void) dmu_free_range(os, obj,
278eda14cbcSMatt Macy i * sizeof (bte), sizeof (bte), tx);
279eda14cbcSMatt Macy }
280eda14cbcSMatt Macy } else if (err != 0) {
281eda14cbcSMatt Macy break;
282eda14cbcSMatt Macy }
283eda14cbcSMatt Macy }
284eda14cbcSMatt Macy
285eda14cbcSMatt Macy ASSERT(!free || err != 0 || ioerr ||
286eda14cbcSMatt Macy ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
287eda14cbcSMatt Macy
288eda14cbcSMatt Macy /* if all blocks are free there should be no used space */
289eda14cbcSMatt Macy if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
290eda14cbcSMatt Macy if (zfs_free_leak_on_eio) {
291eda14cbcSMatt Macy ba.ba_phys->bt_bytes = 0;
292eda14cbcSMatt Macy ba.ba_phys->bt_comp = 0;
293eda14cbcSMatt Macy ba.ba_phys->bt_uncomp = 0;
294eda14cbcSMatt Macy }
295eda14cbcSMatt Macy
296eda14cbcSMatt Macy ASSERT0(ba.ba_phys->bt_bytes);
297eda14cbcSMatt Macy ASSERT0(ba.ba_phys->bt_comp);
298eda14cbcSMatt Macy ASSERT0(ba.ba_phys->bt_uncomp);
299eda14cbcSMatt Macy }
300eda14cbcSMatt Macy
301eda14cbcSMatt Macy dmu_buf_rele(db, FTAG);
302eda14cbcSMatt Macy
303eda14cbcSMatt Macy return (err);
304eda14cbcSMatt Macy }
305