/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
2153089ab7Seschrock
/*
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 */
2553089ab7Seschrock
2653089ab7Seschrock #include <sys/arc.h>
2753089ab7Seschrock #include <sys/bptree.h>
2853089ab7Seschrock #include <sys/dmu.h>
2953089ab7Seschrock #include <sys/dmu_objset.h>
3053089ab7Seschrock #include <sys/dmu_tx.h>
3153089ab7Seschrock #include <sys/dmu_traverse.h>
3253089ab7Seschrock #include <sys/dsl_dataset.h>
3353089ab7Seschrock #include <sys/dsl_dir.h>
3453089ab7Seschrock #include <sys/dsl_pool.h>
3553089ab7Seschrock #include <sys/dnode.h>
3653089ab7Seschrock #include <sys/refcount.h>
3753089ab7Seschrock #include <sys/spa.h>
3853089ab7Seschrock
/*
 * A bptree is a queue of root block pointers from destroyed datasets. When a
 * dataset is destroyed its root block pointer is put on the end of the pool's
 * bptree queue so the dataset's blocks can be freed asynchronously by
 * dsl_scan_sync. This allows the delete operation to finish without traversing
 * all the dataset's blocks.
 *
 * Note that while bt_begin and bt_end are only ever incremented in this code,
 * they are effectively reset to 0 every time the entire bptree is freed because
 * the bptree's object is destroyed and re-created.
 */
5053089ab7Seschrock
5153089ab7Seschrock struct bptree_args {
5253089ab7Seschrock bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
5353089ab7Seschrock boolean_t ba_free; /* true if freeing during traversal */
5453089ab7Seschrock
5553089ab7Seschrock bptree_itor_t *ba_func; /* function to call for each blockpointer */
5653089ab7Seschrock void *ba_arg; /* caller supplied argument to ba_func */
5753089ab7Seschrock dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */
5853089ab7Seschrock } bptree_args_t;
5953089ab7Seschrock
6053089ab7Seschrock uint64_t
bptree_alloc(objset_t * os,dmu_tx_t * tx)6153089ab7Seschrock bptree_alloc(objset_t *os, dmu_tx_t *tx)
6253089ab7Seschrock {
6353089ab7Seschrock uint64_t obj;
6453089ab7Seschrock dmu_buf_t *db;
6553089ab7Seschrock bptree_phys_t *bt;
6653089ab7Seschrock
6753089ab7Seschrock obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
68d1a98260SMatthew Ahrens SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
6953089ab7Seschrock sizeof (bptree_phys_t), tx);
7053089ab7Seschrock
7153089ab7Seschrock /*
7253089ab7Seschrock * Bonus buffer contents are already initialized to 0, but for
7353089ab7Seschrock * readability we make it explicit.
7453089ab7Seschrock */
75b420f3adSRichard Lowe VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
7653089ab7Seschrock dmu_buf_will_dirty(db, tx);
7753089ab7Seschrock bt = db->db_data;
7853089ab7Seschrock bt->bt_begin = 0;
7953089ab7Seschrock bt->bt_end = 0;
8053089ab7Seschrock bt->bt_bytes = 0;
8153089ab7Seschrock bt->bt_comp = 0;
8253089ab7Seschrock bt->bt_uncomp = 0;
8353089ab7Seschrock dmu_buf_rele(db, FTAG);
8453089ab7Seschrock
8553089ab7Seschrock return (obj);
8653089ab7Seschrock }
8753089ab7Seschrock
8853089ab7Seschrock int
bptree_free(objset_t * os,uint64_t obj,dmu_tx_t * tx)8953089ab7Seschrock bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
9053089ab7Seschrock {
9153089ab7Seschrock dmu_buf_t *db;
9253089ab7Seschrock bptree_phys_t *bt;
9353089ab7Seschrock
94b420f3adSRichard Lowe VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
9553089ab7Seschrock bt = db->db_data;
9653089ab7Seschrock ASSERT3U(bt->bt_begin, ==, bt->bt_end);
97fb09f5aaSMadhav Suresh ASSERT0(bt->bt_bytes);
98fb09f5aaSMadhav Suresh ASSERT0(bt->bt_comp);
99fb09f5aaSMadhav Suresh ASSERT0(bt->bt_uncomp);
10053089ab7Seschrock dmu_buf_rele(db, FTAG);
10153089ab7Seschrock
10253089ab7Seschrock return (dmu_object_free(os, obj, tx));
10353089ab7Seschrock }
10453089ab7Seschrock
105994617e7SMatthew Ahrens boolean_t
bptree_is_empty(objset_t * os,uint64_t obj)106994617e7SMatthew Ahrens bptree_is_empty(objset_t *os, uint64_t obj)
107994617e7SMatthew Ahrens {
108994617e7SMatthew Ahrens dmu_buf_t *db;
109994617e7SMatthew Ahrens bptree_phys_t *bt;
110994617e7SMatthew Ahrens boolean_t rv;
111994617e7SMatthew Ahrens
112994617e7SMatthew Ahrens VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
113994617e7SMatthew Ahrens bt = db->db_data;
114994617e7SMatthew Ahrens rv = (bt->bt_begin == bt->bt_end);
115994617e7SMatthew Ahrens dmu_buf_rele(db, FTAG);
116994617e7SMatthew Ahrens return (rv);
117994617e7SMatthew Ahrens }
118994617e7SMatthew Ahrens
11953089ab7Seschrock void
bptree_add(objset_t * os,uint64_t obj,blkptr_t * bp,uint64_t birth_txg,uint64_t bytes,uint64_t comp,uint64_t uncomp,dmu_tx_t * tx)12053089ab7Seschrock bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
12153089ab7Seschrock uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
12253089ab7Seschrock {
12353089ab7Seschrock dmu_buf_t *db;
12453089ab7Seschrock bptree_phys_t *bt;
125994617e7SMatthew Ahrens bptree_entry_phys_t bte = { 0 };
12653089ab7Seschrock
12753089ab7Seschrock /*
12853089ab7Seschrock * bptree objects are in the pool mos, therefore they can only be
12953089ab7Seschrock * modified in syncing context. Furthermore, this is only modified
13053089ab7Seschrock * by the sync thread, so no locking is necessary.
13153089ab7Seschrock */
13253089ab7Seschrock ASSERT(dmu_tx_is_syncing(tx));
13353089ab7Seschrock
134b420f3adSRichard Lowe VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
13553089ab7Seschrock bt = db->db_data;
13653089ab7Seschrock
13753089ab7Seschrock bte.be_birth_txg = birth_txg;
13853089ab7Seschrock bte.be_bp = *bp;
13953089ab7Seschrock dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
14053089ab7Seschrock
14153089ab7Seschrock dmu_buf_will_dirty(db, tx);
14253089ab7Seschrock bt->bt_end++;
14353089ab7Seschrock bt->bt_bytes += bytes;
14453089ab7Seschrock bt->bt_comp += comp;
14553089ab7Seschrock bt->bt_uncomp += uncomp;
14653089ab7Seschrock dmu_buf_rele(db, FTAG);
14753089ab7Seschrock }
14853089ab7Seschrock
14953089ab7Seschrock /* ARGSUSED */
15053089ab7Seschrock static int
bptree_visit_cb(spa_t * spa,zilog_t * zilog,const blkptr_t * bp,const zbookmark_phys_t * zb,const dnode_phys_t * dnp,void * arg)1511b912ec7SGeorge Wilson bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
15205d95d03SMatthew Ahrens const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
15353089ab7Seschrock {
15453089ab7Seschrock int err;
15553089ab7Seschrock struct bptree_args *ba = arg;
15653089ab7Seschrock
157*5f9bb2f3SPaul Dagnelie if (bp == NULL || BP_IS_HOLE(bp))
15853089ab7Seschrock return (0);
15953089ab7Seschrock
16053089ab7Seschrock err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
16153089ab7Seschrock if (err == 0 && ba->ba_free) {
16253089ab7Seschrock ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
16353089ab7Seschrock ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
16453089ab7Seschrock ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
16553089ab7Seschrock }
16653089ab7Seschrock return (err);
16753089ab7Seschrock }
16853089ab7Seschrock
169994617e7SMatthew Ahrens /*
170994617e7SMatthew Ahrens * If "free" is set:
171994617e7SMatthew Ahrens * - It is assumed that "func" will be freeing the block pointers.
172994617e7SMatthew Ahrens * - If "func" returns nonzero, the bookmark will be remembered and
173994617e7SMatthew Ahrens * iteration will be restarted from this point on next invocation.
174994617e7SMatthew Ahrens * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
175994617e7SMatthew Ahrens * bptree_iterate will remember the bookmark, continue traversing
176994617e7SMatthew Ahrens * any additional entries, and return 0.
177994617e7SMatthew Ahrens *
178994617e7SMatthew Ahrens * If "free" is not set, traversal will stop and return an error if
179994617e7SMatthew Ahrens * an i/o error is encountered.
180994617e7SMatthew Ahrens *
181994617e7SMatthew Ahrens * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
182994617e7SMatthew Ahrens * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
183994617e7SMatthew Ahrens * traverse_dataset_destroyed()).
184994617e7SMatthew Ahrens */
18553089ab7Seschrock int
bptree_iterate(objset_t * os,uint64_t obj,boolean_t free,bptree_itor_t func,void * arg,dmu_tx_t * tx)18653089ab7Seschrock bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
18753089ab7Seschrock void *arg, dmu_tx_t *tx)
18853089ab7Seschrock {
189994617e7SMatthew Ahrens boolean_t ioerr = B_FALSE;
19053089ab7Seschrock int err;
19153089ab7Seschrock uint64_t i;
19253089ab7Seschrock dmu_buf_t *db;
19353089ab7Seschrock struct bptree_args ba;
19453089ab7Seschrock
19553089ab7Seschrock ASSERT(!free || dmu_tx_is_syncing(tx));
19653089ab7Seschrock
19753089ab7Seschrock err = dmu_bonus_hold(os, obj, FTAG, &db);
19853089ab7Seschrock if (err != 0)
19953089ab7Seschrock return (err);
20053089ab7Seschrock
20153089ab7Seschrock if (free)
20253089ab7Seschrock dmu_buf_will_dirty(db, tx);
20353089ab7Seschrock
20453089ab7Seschrock ba.ba_phys = db->db_data;
20553089ab7Seschrock ba.ba_free = free;
20653089ab7Seschrock ba.ba_func = func;
20753089ab7Seschrock ba.ba_arg = arg;
20853089ab7Seschrock ba.ba_tx = tx;
20953089ab7Seschrock
21053089ab7Seschrock err = 0;
21153089ab7Seschrock for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
21253089ab7Seschrock bptree_entry_phys_t bte;
213d61a30d6SMatthew Ahrens int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
21453089ab7Seschrock
21553089ab7Seschrock err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
21653089ab7Seschrock &bte, DMU_READ_NO_PREFETCH);
21753089ab7Seschrock if (err != 0)
21853089ab7Seschrock break;
21953089ab7Seschrock
220994617e7SMatthew Ahrens if (zfs_free_leak_on_eio)
221d61a30d6SMatthew Ahrens flags |= TRAVERSE_HARD;
222994617e7SMatthew Ahrens zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
223994617e7SMatthew Ahrens "bookmark %lld/%lld/%lld/%lld",
224994617e7SMatthew Ahrens i, (longlong_t)bte.be_birth_txg,
225994617e7SMatthew Ahrens (longlong_t)bte.be_zb.zb_objset,
226994617e7SMatthew Ahrens (longlong_t)bte.be_zb.zb_object,
227994617e7SMatthew Ahrens (longlong_t)bte.be_zb.zb_level,
228994617e7SMatthew Ahrens (longlong_t)bte.be_zb.zb_blkid);
22953089ab7Seschrock err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
230d61a30d6SMatthew Ahrens bte.be_birth_txg, &bte.be_zb, flags,
23153089ab7Seschrock bptree_visit_cb, &ba);
23253089ab7Seschrock if (free) {
233994617e7SMatthew Ahrens /*
234994617e7SMatthew Ahrens * The callback has freed the visited block pointers.
235994617e7SMatthew Ahrens * Record our traversal progress on disk, either by
236994617e7SMatthew Ahrens * updating this record's bookmark, or by logically
237994617e7SMatthew Ahrens * removing this record by advancing bt_begin.
238994617e7SMatthew Ahrens */
239994617e7SMatthew Ahrens if (err != 0) {
24053089ab7Seschrock /* save bookmark for future resume */
24153089ab7Seschrock ASSERT3U(bte.be_zb.zb_objset, ==,
24253089ab7Seschrock ZB_DESTROYED_OBJSET);
243fb09f5aaSMadhav Suresh ASSERT0(bte.be_zb.zb_level);
24453089ab7Seschrock dmu_write(os, obj, i * sizeof (bte),
24553089ab7Seschrock sizeof (bte), &bte, tx);
246994617e7SMatthew Ahrens if (err == EIO || err == ECKSUM ||
247994617e7SMatthew Ahrens err == ENXIO) {
248994617e7SMatthew Ahrens /*
249994617e7SMatthew Ahrens * Skip the rest of this tree and
250994617e7SMatthew Ahrens * continue on to the next entry.
251994617e7SMatthew Ahrens */
252994617e7SMatthew Ahrens err = 0;
253994617e7SMatthew Ahrens ioerr = B_TRUE;
254994617e7SMatthew Ahrens } else {
25553089ab7Seschrock break;
256d61a30d6SMatthew Ahrens }
257994617e7SMatthew Ahrens } else if (ioerr) {
258d61a30d6SMatthew Ahrens /*
259994617e7SMatthew Ahrens * This entry is finished, but there were
260994617e7SMatthew Ahrens * i/o errors on previous entries, so we
261994617e7SMatthew Ahrens * can't adjust bt_begin. Set this entry's
262994617e7SMatthew Ahrens * be_birth_txg such that it will be
263994617e7SMatthew Ahrens * treated as a no-op in future traversals.
264d61a30d6SMatthew Ahrens */
265994617e7SMatthew Ahrens bte.be_birth_txg = UINT64_MAX;
266994617e7SMatthew Ahrens dmu_write(os, obj, i * sizeof (bte),
267994617e7SMatthew Ahrens sizeof (bte), &bte, tx);
268d61a30d6SMatthew Ahrens }
269d61a30d6SMatthew Ahrens
270994617e7SMatthew Ahrens if (!ioerr) {
27153089ab7Seschrock ba.ba_phys->bt_begin++;
27253089ab7Seschrock (void) dmu_free_range(os, obj,
27353089ab7Seschrock i * sizeof (bte), sizeof (bte), tx);
27453089ab7Seschrock }
275994617e7SMatthew Ahrens } else if (err != 0) {
276994617e7SMatthew Ahrens break;
277994617e7SMatthew Ahrens }
27853089ab7Seschrock }
27953089ab7Seschrock
280994617e7SMatthew Ahrens ASSERT(!free || err != 0 || ioerr ||
281994617e7SMatthew Ahrens ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
28253089ab7Seschrock
28353089ab7Seschrock /* if all blocks are free there should be no used space */
28453089ab7Seschrock if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
285994617e7SMatthew Ahrens if (zfs_free_leak_on_eio) {
286994617e7SMatthew Ahrens ba.ba_phys->bt_bytes = 0;
287994617e7SMatthew Ahrens ba.ba_phys->bt_comp = 0;
288994617e7SMatthew Ahrens ba.ba_phys->bt_uncomp = 0;
289994617e7SMatthew Ahrens }
290994617e7SMatthew Ahrens
291fb09f5aaSMadhav Suresh ASSERT0(ba.ba_phys->bt_bytes);
292fb09f5aaSMadhav Suresh ASSERT0(ba.ba_phys->bt_comp);
293fb09f5aaSMadhav Suresh ASSERT0(ba.ba_phys->bt_uncomp);
29453089ab7Seschrock }
29553089ab7Seschrock
29653089ab7Seschrock dmu_buf_rele(db, FTAG);
29753089ab7Seschrock
29853089ab7Seschrock return (err);
29953089ab7Seschrock }
300