/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#include <sys/arc.h>
#include <sys/bptree.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/refcount.h>
#include <sys/spa.h>

/*
 * A bptree is a queue of root block pointers from destroyed datasets. When a
 * dataset is destroyed its root block pointer is put on the end of the pool's
 * bptree queue so the dataset's blocks can be freed asynchronously by
 * dsl_scan_sync. This allows the delete operation to finish without
 * traversing all the dataset's blocks.
 *
 * Note that while bt_begin and bt_end are only ever incremented in this code,
 * they are effectively reset to 0 every time the entire bptree is freed
 * because the bptree's object is destroyed and re-created.
 */

typedef struct bptree_args {
	bptree_phys_t *ba_phys;	/* data in bonus buffer, dirtied if freeing */
	boolean_t ba_free;	/* true if freeing during traversal */

	bptree_itor_t *ba_func;	/* function to call for each blockpointer */
	void *ba_arg;		/* caller supplied argument to ba_func */
	dmu_tx_t *ba_tx;	/* caller supplied tx, NULL if not freeing */
} bptree_args_t;

uint64_t
bptree_alloc(objset_t *os, dmu_tx_t *tx)
{
	uint64_t obj;
	dmu_buf_t *db;
	bptree_phys_t *bt;

	obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
	    SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
	    sizeof (bptree_phys_t), tx);

	/*
	 * Bonus buffer contents are already initialized to 0, but for
	 * readability we make it explicit.
	 */
	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	bt = db->db_data;
	bt->bt_begin = 0;
	bt->bt_end = 0;
	bt->bt_bytes = 0;
	bt->bt_comp = 0;
	bt->bt_uncomp = 0;
	dmu_buf_rele(db, FTAG);

	return (obj);
}

int
bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	bptree_phys_t *bt;

	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
	bt = db->db_data;
	ASSERT3U(bt->bt_begin, ==, bt->bt_end);
	ASSERT0(bt->bt_bytes);
	ASSERT0(bt->bt_comp);
	ASSERT0(bt->bt_uncomp);
	dmu_buf_rele(db, FTAG);

	return (dmu_object_free(os, obj, tx));
}

void
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
    uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	bptree_phys_t *bt;
	bptree_entry_phys_t bte;

	/*
	 * bptree objects are in the pool mos, therefore they can only be
	 * modified in syncing context. Furthermore, this is only modified
	 * by the sync thread, so no locking is necessary.
	 */
	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
	bt = db->db_data;

	bte.be_birth_txg = birth_txg;
	bte.be_bp = *bp;
	bzero(&bte.be_zb, sizeof (bte.be_zb));
	dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);

	dmu_buf_will_dirty(db, tx);
	bt->bt_end++;
	bt->bt_bytes += bytes;
	bt->bt_comp += comp;
	bt->bt_uncomp += uncomp;
	dmu_buf_rele(db, FTAG);
}

/* ARGSUSED */
static int
bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	int err;
	struct bptree_args *ba = arg;

	if (BP_IS_HOLE(bp))
		return (0);

	err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
	if (err == 0 && ba->ba_free) {
		ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
		ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
		ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
	}
	return (err);
}

int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
    void *arg, dmu_tx_t *tx)
{
	int err;
	uint64_t i;
	dmu_buf_t *db;
	struct bptree_args ba;

	ASSERT(!free || dmu_tx_is_syncing(tx));

	err = dmu_bonus_hold(os, obj, FTAG, &db);
	if (err != 0)
		return (err);

	if (free)
		dmu_buf_will_dirty(db, tx);

	ba.ba_phys = db->db_data;
	ba.ba_free = free;
	ba.ba_func = func;
	ba.ba_arg = arg;
	ba.ba_tx = tx;

	err = 0;
	for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
		bptree_entry_phys_t bte;
		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;

		ASSERT(!free || i == ba.ba_phys->bt_begin);

		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
		    &bte, DMU_READ_NO_PREFETCH);
		if (err != 0)
			break;

		if (zfs_recover)
			flags |= TRAVERSE_HARD;
		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
		    bte.be_birth_txg, &bte.be_zb, flags,
		    bptree_visit_cb, &ba);
		if (free) {
			if (err == ERESTART) {
				/* save bookmark for future resume */
				ASSERT3U(bte.be_zb.zb_objset, ==,
				    ZB_DESTROYED_OBJSET);
				ASSERT0(bte.be_zb.zb_level);
				dmu_write(os, obj, i * sizeof (bte),
				    sizeof (bte), &bte, tx);
				break;
			}
			if (err != 0) {
				/*
				 * We can not properly handle an i/o
				 * error, because the traversal code
				 * does not know how to resume from an
				 * arbitrary bookmark.
				 */
				zfs_panic_recover("error %u from "
				    "traverse_dataset_destroyed()", err);
			}

			ba.ba_phys->bt_begin++;
			(void) dmu_free_range(os, obj,
			    i * sizeof (bte), sizeof (bte), tx);
		}
	}

	ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);

	/* if all blocks are free there should be no used space */
	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
		ASSERT0(ba.ba_phys->bt_bytes);
		ASSERT0(ba.ba_phys->bt_comp);
		ASSERT0(ba.ba_phys->bt_uncomp);
	}

	dmu_buf_rele(db, FTAG);

	return (err);
}
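
/*
 * A minimal usage sketch of the interface above, assuming a caller running
 * in syncing context.  The names example_free_cb and example_bptree_usage
 * are hypothetical and exist only for illustration; in the pool this role
 * is played by the async-destroy code, which enqueues a destroyed dataset's
 * root block pointer with bptree_add() and later drains the queue from
 * dsl_scan_sync() with bptree_iterate().  Kept under #if 0 so it is not
 * compiled into the module.
 */
#if 0
/* Hypothetical per-block callback; matches the bptree_itor_t signature. */
static int
example_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	uint64_t *freed = arg;

	/* A real callback would free the block; here we only account for it. */
	*freed += BP_GET_ASIZE(bp);
	return (0);
}

/* Hypothetical driver: enqueue one destroyed dataset's root bp, then drain. */
static void
example_bptree_usage(objset_t *mos, blkptr_t *root_bp, uint64_t birth_txg,
    uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
{
	uint64_t obj;
	uint64_t freed = 0;

	/* Create the queue object (normally referenced from the pool). */
	obj = bptree_alloc(mos, tx);

	/* Put the destroyed dataset's root block pointer on the queue. */
	bptree_add(mos, obj, root_bp, birth_txg, bytes, comp, uncomp, tx);

	/*
	 * Visit (and, with free == B_TRUE, free) everything reachable from
	 * the queued roots.  This may return ERESTART and be resumed in a
	 * later sync pass from the bookmark saved in the entry.
	 */
	(void) bptree_iterate(mos, obj, B_TRUE, example_free_cb, &freed, tx);

	/* Once the queue is fully drained the object may be destroyed. */
	(void) bptree_free(mos, obj, tx);
}
#endif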