/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/dsl_dataset.h>
#include <sys/dmu.h>
#include <sys/refcount.h>
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/dsl_pool.h>

/*
 * Deadlist concurrency:
 *
 * Deadlists can only be modified from the syncing thread.
 *
 * Except for dsl_deadlist_insert(), it can only be modified with the
 * dp_config_rwlock held with RW_WRITER.
 *
 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
 * be called concurrently, from open context, with the dp_config_rwlock held
 * with RW_READER.
 *
 * Therefore, we only need to provide locking between dsl_deadlist_insert() and
 * the accessors, protecting:
 *     dl_phys->dl_used,comp,uncomp
 *     and protecting the dl_tree from being loaded.
 * The locking is provided by dl_lock.  Note that locking on the bpobj_t
 * provides its own locking, and dl_oldfmt is immutable.
51 */ 52 53 static int 54 dsl_deadlist_compare(const void *arg1, const void *arg2) 55 { 56 const dsl_deadlist_entry_t *dle1 = arg1; 57 const dsl_deadlist_entry_t *dle2 = arg2; 58 59 if (dle1->dle_mintxg < dle2->dle_mintxg) 60 return (-1); 61 else if (dle1->dle_mintxg > dle2->dle_mintxg) 62 return (+1); 63 else 64 return (0); 65 } 66 67 static void 68 dsl_deadlist_load_tree(dsl_deadlist_t *dl) 69 { 70 zap_cursor_t zc; 71 zap_attribute_t za; 72 73 ASSERT(!dl->dl_oldfmt); 74 if (dl->dl_havetree) 75 return; 76 77 avl_create(&dl->dl_tree, dsl_deadlist_compare, 78 sizeof (dsl_deadlist_entry_t), 79 offsetof(dsl_deadlist_entry_t, dle_node)); 80 for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 81 zap_cursor_retrieve(&zc, &za) == 0; 82 zap_cursor_advance(&zc)) { 83 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 84 dle->dle_mintxg = strtonum(za.za_name, NULL); 85 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, 86 za.za_first_integer)); 87 avl_add(&dl->dl_tree, dle); 88 } 89 zap_cursor_fini(&zc); 90 dl->dl_havetree = B_TRUE; 91 } 92 93 void 94 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) 95 { 96 dmu_object_info_t doi; 97 98 mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); 99 dl->dl_os = os; 100 dl->dl_object = object; 101 VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); 102 dmu_object_info_from_db(dl->dl_dbuf, &doi); 103 if (doi.doi_type == DMU_OT_BPOBJ) { 104 dmu_buf_rele(dl->dl_dbuf, dl); 105 dl->dl_dbuf = NULL; 106 dl->dl_oldfmt = B_TRUE; 107 VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); 108 return; 109 } 110 111 dl->dl_oldfmt = B_FALSE; 112 dl->dl_phys = dl->dl_dbuf->db_data; 113 dl->dl_havetree = B_FALSE; 114 } 115 116 void 117 dsl_deadlist_close(dsl_deadlist_t *dl) 118 { 119 void *cookie = NULL; 120 dsl_deadlist_entry_t *dle; 121 122 if (dl->dl_oldfmt) { 123 dl->dl_oldfmt = B_FALSE; 124 bpobj_close(&dl->dl_bpobj); 125 return; 126 } 127 128 if (dl->dl_havetree) { 129 while ((dle = 
avl_destroy_nodes(&dl->dl_tree, &cookie)) 130 != NULL) { 131 bpobj_close(&dle->dle_bpobj); 132 kmem_free(dle, sizeof (*dle)); 133 } 134 avl_destroy(&dl->dl_tree); 135 } 136 dmu_buf_rele(dl->dl_dbuf, dl); 137 mutex_destroy(&dl->dl_lock); 138 dl->dl_dbuf = NULL; 139 dl->dl_phys = NULL; 140 } 141 142 uint64_t 143 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) 144 { 145 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 146 return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx)); 147 return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, 148 sizeof (dsl_deadlist_phys_t), tx)); 149 } 150 151 void 152 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) 153 { 154 dmu_object_info_t doi; 155 zap_cursor_t zc; 156 zap_attribute_t za; 157 158 VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); 159 if (doi.doi_type == DMU_OT_BPOBJ) { 160 bpobj_free(os, dlobj, tx); 161 return; 162 } 163 164 for (zap_cursor_init(&zc, os, dlobj); 165 zap_cursor_retrieve(&zc, &za) == 0; 166 zap_cursor_advance(&zc)) { 167 uint64_t obj = za.za_first_integer; 168 if (obj == dmu_objset_pool(os)->dp_empty_bpobj) 169 bpobj_decr_empty(os, tx); 170 else 171 bpobj_free(os, obj, tx); 172 } 173 zap_cursor_fini(&zc); 174 VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); 175 } 176 177 static void 178 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 179 const blkptr_t *bp, dmu_tx_t *tx) 180 { 181 if (dle->dle_bpobj.bpo_object == 182 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 183 uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx); 184 bpobj_close(&dle->dle_bpobj); 185 bpobj_decr_empty(dl->dl_os, tx); 186 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 187 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 188 dle->dle_mintxg, obj, tx)); 189 } 190 bpobj_enqueue(&dle->dle_bpobj, bp, tx); 191 } 192 193 static void 194 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 195 uint64_t obj, dmu_tx_t *tx) 196 { 197 if (dle->dle_bpobj.bpo_object != 
198 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 199 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); 200 } else { 201 bpobj_close(&dle->dle_bpobj); 202 bpobj_decr_empty(dl->dl_os, tx); 203 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 204 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 205 dle->dle_mintxg, obj, tx)); 206 } 207 } 208 209 void 210 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) 211 { 212 dsl_deadlist_entry_t dle_tofind; 213 dsl_deadlist_entry_t *dle; 214 avl_index_t where; 215 216 if (dl->dl_oldfmt) { 217 bpobj_enqueue(&dl->dl_bpobj, bp, tx); 218 return; 219 } 220 221 dsl_deadlist_load_tree(dl); 222 223 dmu_buf_will_dirty(dl->dl_dbuf, tx); 224 mutex_enter(&dl->dl_lock); 225 dl->dl_phys->dl_used += 226 bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); 227 dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); 228 dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); 229 mutex_exit(&dl->dl_lock); 230 231 dle_tofind.dle_mintxg = bp->blk_birth; 232 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 233 if (dle == NULL) 234 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 235 else 236 dle = AVL_PREV(&dl->dl_tree, dle); 237 dle_enqueue(dl, dle, bp, tx); 238 } 239 240 /* 241 * Insert new key in deadlist, which must be > all current entries. 242 * mintxg is not inclusive. 243 */ 244 void 245 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 246 { 247 uint64_t obj; 248 dsl_deadlist_entry_t *dle; 249 250 if (dl->dl_oldfmt) 251 return; 252 253 dsl_deadlist_load_tree(dl); 254 255 dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 256 dle->dle_mintxg = mintxg; 257 obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx); 258 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 259 avl_add(&dl->dl_tree, dle); 260 261 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, 262 mintxg, obj, tx)); 263 } 264 265 /* 266 * Remove this key, merging its entries into the previous key. 
267 */ 268 void 269 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 270 { 271 dsl_deadlist_entry_t dle_tofind; 272 dsl_deadlist_entry_t *dle, *dle_prev; 273 274 if (dl->dl_oldfmt) 275 return; 276 277 dsl_deadlist_load_tree(dl); 278 279 dle_tofind.dle_mintxg = mintxg; 280 dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); 281 dle_prev = AVL_PREV(&dl->dl_tree, dle); 282 283 dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); 284 285 avl_remove(&dl->dl_tree, dle); 286 bpobj_close(&dle->dle_bpobj); 287 kmem_free(dle, sizeof (*dle)); 288 289 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); 290 } 291 292 /* 293 * Walk ds's snapshots to regenerate generate ZAP & AVL. 294 */ 295 static void 296 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, 297 uint64_t mrs_obj, dmu_tx_t *tx) 298 { 299 dsl_deadlist_t dl; 300 dsl_pool_t *dp = dmu_objset_pool(os); 301 302 dsl_deadlist_open(&dl, os, dlobj); 303 if (dl.dl_oldfmt) { 304 dsl_deadlist_close(&dl); 305 return; 306 } 307 308 while (mrs_obj != 0) { 309 dsl_dataset_t *ds; 310 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); 311 dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx); 312 mrs_obj = ds->ds_phys->ds_prev_snap_obj; 313 dsl_dataset_rele(ds, FTAG); 314 } 315 dsl_deadlist_close(&dl); 316 } 317 318 uint64_t 319 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, 320 uint64_t mrs_obj, dmu_tx_t *tx) 321 { 322 dsl_deadlist_entry_t *dle; 323 uint64_t newobj; 324 325 newobj = dsl_deadlist_alloc(dl->dl_os, tx); 326 327 if (dl->dl_oldfmt) { 328 dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); 329 return (newobj); 330 } 331 332 dsl_deadlist_load_tree(dl); 333 334 for (dle = avl_first(&dl->dl_tree); dle; 335 dle = AVL_NEXT(&dl->dl_tree, dle)) { 336 uint64_t obj; 337 338 if (dle->dle_mintxg >= maxtxg) 339 break; 340 341 obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx); 342 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, 343 
dle->dle_mintxg, obj, tx)); 344 } 345 return (newobj); 346 } 347 348 void 349 dsl_deadlist_space(dsl_deadlist_t *dl, 350 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 351 { 352 if (dl->dl_oldfmt) { 353 VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, 354 usedp, compp, uncompp)); 355 return; 356 } 357 358 mutex_enter(&dl->dl_lock); 359 *usedp = dl->dl_phys->dl_used; 360 *compp = dl->dl_phys->dl_comp; 361 *uncompp = dl->dl_phys->dl_uncomp; 362 mutex_exit(&dl->dl_lock); 363 } 364 365 /* 366 * return space used in the range (mintxg, maxtxg]. 367 * Includes maxtxg, does not include mintxg. 368 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is 369 * larger than any bp in the deadlist (eg. UINT64_MAX)). 370 */ 371 void 372 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, 373 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 374 { 375 dsl_deadlist_entry_t *dle; 376 dsl_deadlist_entry_t dle_tofind; 377 avl_index_t where; 378 379 if (dl->dl_oldfmt) { 380 VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, 381 mintxg, maxtxg, usedp, compp, uncompp)); 382 return; 383 } 384 385 *usedp = *compp = *uncompp = 0; 386 387 mutex_enter(&dl->dl_lock); 388 dsl_deadlist_load_tree(dl); 389 dle_tofind.dle_mintxg = mintxg; 390 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 391 /* 392 * If we don't find this mintxg, there shouldn't be anything 393 * after it either. 
394 */ 395 ASSERT(dle != NULL || 396 avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); 397 398 for (; dle && dle->dle_mintxg < maxtxg; 399 dle = AVL_NEXT(&dl->dl_tree, dle)) { 400 uint64_t used, comp, uncomp; 401 402 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 403 &used, &comp, &uncomp)); 404 405 *usedp += used; 406 *compp += comp; 407 *uncompp += uncomp; 408 } 409 mutex_exit(&dl->dl_lock); 410 } 411 412 static void 413 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, 414 dmu_tx_t *tx) 415 { 416 dsl_deadlist_entry_t dle_tofind; 417 dsl_deadlist_entry_t *dle; 418 avl_index_t where; 419 uint64_t used, comp, uncomp; 420 bpobj_t bpo; 421 422 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 423 VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); 424 bpobj_close(&bpo); 425 426 dsl_deadlist_load_tree(dl); 427 428 dmu_buf_will_dirty(dl->dl_dbuf, tx); 429 mutex_enter(&dl->dl_lock); 430 dl->dl_phys->dl_used += used; 431 dl->dl_phys->dl_comp += comp; 432 dl->dl_phys->dl_uncomp += uncomp; 433 mutex_exit(&dl->dl_lock); 434 435 dle_tofind.dle_mintxg = birth; 436 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 437 if (dle == NULL) 438 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 439 dle_enqueue_subobj(dl, dle, obj, tx); 440 } 441 442 static int 443 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 444 { 445 dsl_deadlist_t *dl = arg; 446 dsl_deadlist_insert(dl, bp, tx); 447 return (0); 448 } 449 450 /* 451 * Merge the deadlist pointed to by 'obj' into dl. obj will be left as 452 * an empty deadlist. 
453 */ 454 void 455 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) 456 { 457 zap_cursor_t zc; 458 zap_attribute_t za; 459 dmu_buf_t *bonus; 460 dsl_deadlist_phys_t *dlp; 461 dmu_object_info_t doi; 462 463 VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); 464 if (doi.doi_type == DMU_OT_BPOBJ) { 465 bpobj_t bpo; 466 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 467 VERIFY3U(0, ==, bpobj_iterate(&bpo, 468 dsl_deadlist_insert_cb, dl, tx)); 469 bpobj_close(&bpo); 470 return; 471 } 472 473 for (zap_cursor_init(&zc, dl->dl_os, obj); 474 zap_cursor_retrieve(&zc, &za) == 0; 475 zap_cursor_advance(&zc)) { 476 uint64_t mintxg = strtonum(za.za_name, NULL); 477 dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); 478 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); 479 } 480 zap_cursor_fini(&zc); 481 482 VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); 483 dlp = bonus->db_data; 484 dmu_buf_will_dirty(bonus, tx); 485 bzero(dlp, sizeof (*dlp)); 486 dmu_buf_rele(bonus, FTAG); 487 } 488 489 /* 490 * Remove entries on dl that are >= mintxg, and put them on the bpobj. 
491 */ 492 void 493 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, 494 dmu_tx_t *tx) 495 { 496 dsl_deadlist_entry_t dle_tofind; 497 dsl_deadlist_entry_t *dle; 498 avl_index_t where; 499 500 ASSERT(!dl->dl_oldfmt); 501 dmu_buf_will_dirty(dl->dl_dbuf, tx); 502 dsl_deadlist_load_tree(dl); 503 504 dle_tofind.dle_mintxg = mintxg; 505 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 506 if (dle == NULL) 507 dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); 508 while (dle) { 509 uint64_t used, comp, uncomp; 510 dsl_deadlist_entry_t *dle_next; 511 512 bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); 513 514 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 515 &used, &comp, &uncomp)); 516 mutex_enter(&dl->dl_lock); 517 ASSERT3U(dl->dl_phys->dl_used, >=, used); 518 ASSERT3U(dl->dl_phys->dl_comp, >=, comp); 519 ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); 520 dl->dl_phys->dl_used -= used; 521 dl->dl_phys->dl_comp -= comp; 522 dl->dl_phys->dl_uncomp -= uncomp; 523 mutex_exit(&dl->dl_lock); 524 525 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, 526 dle->dle_mintxg, tx)); 527 528 dle_next = AVL_NEXT(&dl->dl_tree, dle); 529 avl_remove(&dl->dl_tree, dle); 530 bpobj_close(&dle->dle_bpobj); 531 kmem_free(dle, sizeof (*dle)); 532 dle = dle_next; 533 } 534 } 535