1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 25 */ 26 27 #include <sys/arc.h> 28 #include <sys/bptree.h> 29 #include <sys/dmu.h> 30 #include <sys/dmu_objset.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/dmu_traverse.h> 33 #include <sys/dsl_dataset.h> 34 #include <sys/dsl_dir.h> 35 #include <sys/dsl_pool.h> 36 #include <sys/dnode.h> 37 #include <sys/spa.h> 38 39 /* 40 * A bptree is a queue of root block pointers from destroyed datasets. When a 41 * dataset is destroyed its root block pointer is put on the end of the pool's 42 * bptree queue so the dataset's blocks can be freed asynchronously by 43 * dsl_scan_sync. This allows the delete operation to finish without traversing 44 * all the dataset's blocks. 45 * 46 * Note that while bt_begin and bt_end are only ever incremented in this code, 47 * they are effectively reset to 0 every time the entire bptree is freed because 48 * the bptree's object is destroyed and re-created. 
49 */ 50 51 struct bptree_args { 52 bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ 53 boolean_t ba_free; /* true if freeing during traversal */ 54 55 bptree_itor_t *ba_func; /* function to call for each blockpointer */ 56 void *ba_arg; /* caller supplied argument to ba_func */ 57 dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ 58 } bptree_args_t; 59 60 uint64_t 61 bptree_alloc(objset_t *os, dmu_tx_t *tx) 62 { 63 uint64_t obj; 64 dmu_buf_t *db; 65 bptree_phys_t *bt; 66 67 obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, 68 SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, 69 sizeof (bptree_phys_t), tx); 70 71 /* 72 * Bonus buffer contents are already initialized to 0, but for 73 * readability we make it explicit. 74 */ 75 VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 76 dmu_buf_will_dirty(db, tx); 77 bt = db->db_data; 78 bt->bt_begin = 0; 79 bt->bt_end = 0; 80 bt->bt_bytes = 0; 81 bt->bt_comp = 0; 82 bt->bt_uncomp = 0; 83 dmu_buf_rele(db, FTAG); 84 85 return (obj); 86 } 87 88 int 89 bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) 90 { 91 dmu_buf_t *db; 92 bptree_phys_t *bt; 93 94 VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 95 bt = db->db_data; 96 ASSERT3U(bt->bt_begin, ==, bt->bt_end); 97 ASSERT0(bt->bt_bytes); 98 ASSERT0(bt->bt_comp); 99 ASSERT0(bt->bt_uncomp); 100 dmu_buf_rele(db, FTAG); 101 102 return (dmu_object_free(os, obj, tx)); 103 } 104 105 boolean_t 106 bptree_is_empty(objset_t *os, uint64_t obj) 107 { 108 dmu_buf_t *db; 109 bptree_phys_t *bt; 110 boolean_t rv; 111 112 VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); 113 bt = db->db_data; 114 rv = (bt->bt_begin == bt->bt_end); 115 dmu_buf_rele(db, FTAG); 116 return (rv); 117 } 118 119 void 120 bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, 121 uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) 122 { 123 dmu_buf_t *db; 124 bptree_phys_t *bt; 125 bptree_entry_phys_t *bte; 126 127 /* 128 * bptree objects are in the 
pool mos, therefore they can only be 129 * modified in syncing context. Furthermore, this is only modified 130 * by the sync thread, so no locking is necessary. 131 */ 132 ASSERT(dmu_tx_is_syncing(tx)); 133 134 VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 135 bt = db->db_data; 136 137 bte = kmem_zalloc(sizeof (*bte), KM_SLEEP); 138 bte->be_birth_txg = birth_txg; 139 bte->be_bp = *bp; 140 dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx); 141 kmem_free(bte, sizeof (*bte)); 142 143 dmu_buf_will_dirty(db, tx); 144 bt->bt_end++; 145 bt->bt_bytes += bytes; 146 bt->bt_comp += comp; 147 bt->bt_uncomp += uncomp; 148 dmu_buf_rele(db, FTAG); 149 } 150 151 static int 152 bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 153 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 154 { 155 (void) zilog, (void) dnp; 156 int err; 157 struct bptree_args *ba = arg; 158 159 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 160 BP_IS_REDACTED(bp)) 161 return (0); 162 163 err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); 164 if (err == 0 && ba->ba_free) { 165 ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); 166 ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); 167 ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); 168 } 169 return (err); 170 } 171 172 /* 173 * If "free" is set: 174 * - It is assumed that "func" will be freeing the block pointers. 175 * - If "func" returns nonzero, the bookmark will be remembered and 176 * iteration will be restarted from this point on next invocation. 177 * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), 178 * bptree_iterate will remember the bookmark, continue traversing 179 * any additional entries, and return 0. 180 * 181 * If "free" is not set, traversal will stop and return an error if 182 * an i/o error is encountered. 183 * 184 * In either case, if zfs_free_leak_on_eio is set, i/o errors will be 185 * ignored and traversal will continue (i.e. 
TRAVERSE_HARD will be passed to
 * traverse_dataset_destroyed()).
 */
int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
    void *arg, dmu_tx_t *tx)
{
	boolean_t ioerr = B_FALSE;	/* saw (and skipped) an i/o error */
	int err;
	uint64_t i;
	dmu_buf_t *db;
	struct bptree_args ba;

	/* freeing mutates on-disk state, so it must be in syncing context */
	ASSERT(!free || dmu_tx_is_syncing(tx));

	err = dmu_bonus_hold(os, obj, FTAG, &db);
	if (err != 0)
		return (err);

	if (free)
		dmu_buf_will_dirty(db, tx);

	ba.ba_phys = db->db_data;
	ba.ba_free = free;
	ba.ba_func = func;
	ba.ba_arg = arg;
	ba.ba_tx = tx;

	/* walk each queued entry from the oldest (bt_begin) to the newest */
	err = 0;
	for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
		bptree_entry_phys_t bte;
		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST |
		    TRAVERSE_NO_DECRYPT;

		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
		    &bte, DMU_READ_NO_PREFETCH);
		if (err != 0)
			break;

		if (zfs_free_leak_on_eio)
			flags |= TRAVERSE_HARD;
		zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
		    "bookmark %lld/%lld/%lld/%lld",
		    (longlong_t)i,
		    (longlong_t)bte.be_birth_txg,
		    (longlong_t)bte.be_zb.zb_objset,
		    (longlong_t)bte.be_zb.zb_object,
		    (longlong_t)bte.be_zb.zb_level,
		    (longlong_t)bte.be_zb.zb_blkid);
		/* resume from this entry's saved bookmark (be_zb) */
		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
		    bte.be_birth_txg, &bte.be_zb, flags,
		    bptree_visit_cb, &ba);
		if (free) {
			/*
			 * The callback has freed the visited block pointers.
			 * Record our traversal progress on disk, either by
			 * updating this record's bookmark, or by logically
			 * removing this record by advancing bt_begin.
			 */
			if (err != 0) {
				/* save bookmark for future resume */
				ASSERT3U(bte.be_zb.zb_objset, ==,
				    ZB_DESTROYED_OBJSET);
				ASSERT0(bte.be_zb.zb_level);
				dmu_write(os, obj, i * sizeof (bte),
				    sizeof (bte), &bte, tx);
				if (err == EIO || err == ECKSUM ||
				    err == ENXIO) {
					/*
					 * Skip the rest of this tree and
					 * continue on to the next entry.
					 */
					err = 0;
					ioerr = B_TRUE;
				} else {
					break;
				}
			} else if (ioerr) {
				/*
				 * This entry is finished, but there were
				 * i/o errors on previous entries, so we
				 * can't adjust bt_begin. Set this entry's
				 * be_birth_txg such that it will be
				 * treated as a no-op in future traversals.
				 */
				bte.be_birth_txg = UINT64_MAX;
				dmu_write(os, obj, i * sizeof (bte),
				    sizeof (bte), &bte, tx);
			}

			/*
			 * No errors so far: the queue head advances and the
			 * fully-processed entry's record can be punched out.
			 */
			if (!ioerr) {
				ba.ba_phys->bt_begin++;
				(void) dmu_free_range(os, obj,
				    i * sizeof (bte), sizeof (bte), tx);
			}
		} else if (err != 0) {
			break;
		}
	}

	ASSERT(!free || err != 0 || ioerr ||
	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);

	/* if all blocks are free there should be no used space */
	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
		if (zfs_free_leak_on_eio) {
			/*
			 * Skipped-on-ioerr blocks were never deducted by
			 * bptree_visit_cb; zero the totals (leaking the
			 * space) rather than tripping the asserts below.
			 */
			ba.ba_phys->bt_bytes = 0;
			ba.ba_phys->bt_comp = 0;
			ba.ba_phys->bt_uncomp = 0;
		}

		ASSERT0(ba.ba_phys->bt_bytes);
		ASSERT0(ba.ba_phys->bt_comp);
		ASSERT0(ba.ba_phys->bt_uncomp);
	}

	dmu_buf_rele(db, FTAG);

	return (err);
}