1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 25 */ 26 27 #include <sys/dsl_dataset.h> 28 #include <sys/dmu.h> 29 #include <sys/refcount.h> 30 #include <sys/zap.h> 31 #include <sys/zfs_context.h> 32 #include <sys/dsl_pool.h> 33 34 /* 35 * Deadlist concurrency: 36 * 37 * Deadlists can only be modified from the syncing thread. 38 * 39 * Except for dsl_deadlist_insert(), it can only be modified with the 40 * dp_config_rwlock held with RW_WRITER. 41 * 42 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can 43 * be called concurrently, from open context, with the dl_config_rwlock held 44 * with RW_READER. 45 * 46 * Therefore, we only need to provide locking between dsl_deadlist_insert() and 47 * the accessors, protecting: 48 * dl_phys->dl_used,comp,uncomp 49 * and protecting the dl_tree from being loaded. 50 * The locking is provided by dl_lock. Note that locking on the bpobj_t 51 * provides its own locking, and dl_oldfmt is immutable. 52 */ 53 54 static int 55 dsl_deadlist_compare(const void *arg1, const void *arg2) 56 { 57 const dsl_deadlist_entry_t *dle1 = arg1; 58 const dsl_deadlist_entry_t *dle2 = arg2; 59 60 if (dle1->dle_mintxg < dle2->dle_mintxg) 61 return (-1); 62 else if (dle1->dle_mintxg > dle2->dle_mintxg) 63 return (+1); 64 else 65 return (0); 66 } 67 68 static void 69 dsl_deadlist_load_tree(dsl_deadlist_t *dl) 70 { 71 zap_cursor_t zc; 72 zap_attribute_t za; 73 74 ASSERT(!dl->dl_oldfmt); 75 if (dl->dl_havetree) 76 return; 77 78 avl_create(&dl->dl_tree, dsl_deadlist_compare, 79 sizeof (dsl_deadlist_entry_t), 80 offsetof(dsl_deadlist_entry_t, dle_node)); 81 for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 82 zap_cursor_retrieve(&zc, &za) == 0; 83 zap_cursor_advance(&zc)) { 84 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 85 dle->dle_mintxg = strtonum(za.za_name, NULL); 86 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, 87 za.za_first_integer)); 88 avl_add(&dl->dl_tree, dle); 89 } 90 zap_cursor_fini(&zc); 91 dl->dl_havetree = B_TRUE; 92 } 93 94 void 95 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) 96 { 97 dmu_object_info_t doi; 98 99 mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); 100 dl->dl_os = os; 101 dl->dl_object = object; 102 VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); 103 dmu_object_info_from_db(dl->dl_dbuf, &doi); 104 if (doi.doi_type == DMU_OT_BPOBJ) { 105 dmu_buf_rele(dl->dl_dbuf, dl); 106 dl->dl_dbuf = NULL; 107 dl->dl_oldfmt = B_TRUE; 108 VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); 109 return; 110 } 111 112 dl->dl_oldfmt = B_FALSE; 113 dl->dl_phys = dl->dl_dbuf->db_data; 114 dl->dl_havetree = B_FALSE; 115 } 116 117 void 118 dsl_deadlist_close(dsl_deadlist_t *dl) 119 { 120 void *cookie = NULL; 121 dsl_deadlist_entry_t *dle; 122 123 dl->dl_os = NULL; 124 125 if (dl->dl_oldfmt) { 126 dl->dl_oldfmt = B_FALSE; 127 bpobj_close(&dl->dl_bpobj); 128 return; 129 } 130 131 if (dl->dl_havetree) { 132 while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) 133 != NULL) { 134 bpobj_close(&dle->dle_bpobj); 135 kmem_free(dle, sizeof (*dle)); 136 } 137 avl_destroy(&dl->dl_tree); 138 } 139 dmu_buf_rele(dl->dl_dbuf, dl); 140 mutex_destroy(&dl->dl_lock); 141 dl->dl_dbuf = NULL; 142 dl->dl_phys = NULL; 143 } 144 145 uint64_t 146 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) 147 { 148 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 149 return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); 150 return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, 151 sizeof (dsl_deadlist_phys_t), tx)); 152 } 153 154 void 155 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) 156 { 157 dmu_object_info_t doi; 158 zap_cursor_t zc; 159 zap_attribute_t za; 160 161 VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); 162 if (doi.doi_type == DMU_OT_BPOBJ) { 163 bpobj_free(os, dlobj, tx); 164 return; 165 } 166 167 for (zap_cursor_init(&zc, os, dlobj); 168 zap_cursor_retrieve(&zc, &za) == 0; 169 zap_cursor_advance(&zc)) { 170 uint64_t obj = za.za_first_integer; 171 if (obj == dmu_objset_pool(os)->dp_empty_bpobj) 172 bpobj_decr_empty(os, tx); 173 else 174 bpobj_free(os, obj, tx); 175 } 176 zap_cursor_fini(&zc); 177 VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); 178 } 179 180 static void 181 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 182 const blkptr_t *bp, dmu_tx_t *tx) 183 { 184 if (dle->dle_bpobj.bpo_object == 185 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 186 uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 187 bpobj_close(&dle->dle_bpobj); 188 bpobj_decr_empty(dl->dl_os, tx); 189 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 190 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 191 dle->dle_mintxg, obj, tx)); 192 } 193 bpobj_enqueue(&dle->dle_bpobj, bp, tx); 194 } 195 196 static void 197 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 198 uint64_t obj, dmu_tx_t *tx) 199 { 200 if (dle->dle_bpobj.bpo_object != 201 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 202 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); 203 } else { 204 bpobj_close(&dle->dle_bpobj); 205 bpobj_decr_empty(dl->dl_os, tx); 206 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 207 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 208 dle->dle_mintxg, obj, tx)); 209 } 210 } 211 212 void 213 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) 214 { 215 dsl_deadlist_entry_t dle_tofind; 216 dsl_deadlist_entry_t *dle; 217 avl_index_t where; 218 219 if (dl->dl_oldfmt) { 220 bpobj_enqueue(&dl->dl_bpobj, bp, tx); 221 return; 222 } 223 224 dsl_deadlist_load_tree(dl); 225 226 dmu_buf_will_dirty(dl->dl_dbuf, tx); 227 mutex_enter(&dl->dl_lock); 228 dl->dl_phys->dl_used += 229 bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); 230 dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); 231 dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); 232 mutex_exit(&dl->dl_lock); 233 234 dle_tofind.dle_mintxg = bp->blk_birth; 235 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 236 if (dle == NULL) 237 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 238 else 239 dle = AVL_PREV(&dl->dl_tree, dle); 240 dle_enqueue(dl, dle, bp, tx); 241 } 242 243 /* 244 * Insert new key in deadlist, which must be > all current entries. 245 * mintxg is not inclusive. 246 */ 247 void 248 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 249 { 250 uint64_t obj; 251 dsl_deadlist_entry_t *dle; 252 253 if (dl->dl_oldfmt) 254 return; 255 256 dsl_deadlist_load_tree(dl); 257 258 dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 259 dle->dle_mintxg = mintxg; 260 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 261 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 262 avl_add(&dl->dl_tree, dle); 263 264 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, 265 mintxg, obj, tx)); 266 } 267 268 /* 269 * Remove this key, merging its entries into the previous key. 270 */ 271 void 272 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 273 { 274 dsl_deadlist_entry_t dle_tofind; 275 dsl_deadlist_entry_t *dle, *dle_prev; 276 277 if (dl->dl_oldfmt) 278 return; 279 280 dsl_deadlist_load_tree(dl); 281 282 dle_tofind.dle_mintxg = mintxg; 283 dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); 284 dle_prev = AVL_PREV(&dl->dl_tree, dle); 285 286 dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); 287 288 avl_remove(&dl->dl_tree, dle); 289 bpobj_close(&dle->dle_bpobj); 290 kmem_free(dle, sizeof (*dle)); 291 292 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); 293 } 294 295 /* 296 * Walk ds's snapshots to regenerate generate ZAP & AVL. 297 */ 298 static void 299 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, 300 uint64_t mrs_obj, dmu_tx_t *tx) 301 { 302 dsl_deadlist_t dl; 303 dsl_pool_t *dp = dmu_objset_pool(os); 304 305 dsl_deadlist_open(&dl, os, dlobj); 306 if (dl.dl_oldfmt) { 307 dsl_deadlist_close(&dl); 308 return; 309 } 310 311 while (mrs_obj != 0) { 312 dsl_dataset_t *ds; 313 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); 314 dsl_deadlist_add_key(&dl, 315 dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); 316 mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; 317 dsl_dataset_rele(ds, FTAG); 318 } 319 dsl_deadlist_close(&dl); 320 } 321 322 uint64_t 323 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, 324 uint64_t mrs_obj, dmu_tx_t *tx) 325 { 326 dsl_deadlist_entry_t *dle; 327 uint64_t newobj; 328 329 newobj = dsl_deadlist_alloc(dl->dl_os, tx); 330 331 if (dl->dl_oldfmt) { 332 dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); 333 return (newobj); 334 } 335 336 dsl_deadlist_load_tree(dl); 337 338 for (dle = avl_first(&dl->dl_tree); dle; 339 dle = AVL_NEXT(&dl->dl_tree, dle)) { 340 uint64_t obj; 341 342 if (dle->dle_mintxg >= maxtxg) 343 break; 344 345 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 346 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, 347 dle->dle_mintxg, obj, tx)); 348 } 349 return (newobj); 350 } 351 352 void 353 dsl_deadlist_space(dsl_deadlist_t *dl, 354 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 355 { 356 if (dl->dl_oldfmt) { 357 VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, 358 usedp, compp, uncompp)); 359 return; 360 } 361 362 mutex_enter(&dl->dl_lock); 363 *usedp = dl->dl_phys->dl_used; 364 *compp = dl->dl_phys->dl_comp; 365 *uncompp = dl->dl_phys->dl_uncomp; 366 mutex_exit(&dl->dl_lock); 367 } 368 369 /* 370 * return space used in the range (mintxg, maxtxg]. 371 * Includes maxtxg, does not include mintxg. 372 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is 373 * larger than any bp in the deadlist (eg. UINT64_MAX)). 374 */ 375 void 376 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, 377 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 378 { 379 dsl_deadlist_entry_t *dle; 380 dsl_deadlist_entry_t dle_tofind; 381 avl_index_t where; 382 383 if (dl->dl_oldfmt) { 384 VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, 385 mintxg, maxtxg, usedp, compp, uncompp)); 386 return; 387 } 388 389 *usedp = *compp = *uncompp = 0; 390 391 mutex_enter(&dl->dl_lock); 392 dsl_deadlist_load_tree(dl); 393 dle_tofind.dle_mintxg = mintxg; 394 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 395 /* 396 * If we don't find this mintxg, there shouldn't be anything 397 * after it either. 398 */ 399 ASSERT(dle != NULL || 400 avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); 401 402 for (; dle && dle->dle_mintxg < maxtxg; 403 dle = AVL_NEXT(&dl->dl_tree, dle)) { 404 uint64_t used, comp, uncomp; 405 406 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 407 &used, &comp, &uncomp)); 408 409 *usedp += used; 410 *compp += comp; 411 *uncompp += uncomp; 412 } 413 mutex_exit(&dl->dl_lock); 414 } 415 416 static void 417 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, 418 dmu_tx_t *tx) 419 { 420 dsl_deadlist_entry_t dle_tofind; 421 dsl_deadlist_entry_t *dle; 422 avl_index_t where; 423 uint64_t used, comp, uncomp; 424 bpobj_t bpo; 425 426 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 427 VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); 428 bpobj_close(&bpo); 429 430 dsl_deadlist_load_tree(dl); 431 432 dmu_buf_will_dirty(dl->dl_dbuf, tx); 433 mutex_enter(&dl->dl_lock); 434 dl->dl_phys->dl_used += used; 435 dl->dl_phys->dl_comp += comp; 436 dl->dl_phys->dl_uncomp += uncomp; 437 mutex_exit(&dl->dl_lock); 438 439 dle_tofind.dle_mintxg = birth; 440 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 441 if (dle == NULL) 442 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 443 dle_enqueue_subobj(dl, dle, obj, tx); 444 } 445 446 static int 447 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 448 { 449 dsl_deadlist_t *dl = arg; 450 dsl_deadlist_insert(dl, bp, tx); 451 return (0); 452 } 453 454 /* 455 * Merge the deadlist pointed to by 'obj' into dl. obj will be left as 456 * an empty deadlist. 457 */ 458 void 459 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) 460 { 461 zap_cursor_t zc; 462 zap_attribute_t za; 463 dmu_buf_t *bonus; 464 dsl_deadlist_phys_t *dlp; 465 dmu_object_info_t doi; 466 467 VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); 468 if (doi.doi_type == DMU_OT_BPOBJ) { 469 bpobj_t bpo; 470 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 471 VERIFY3U(0, ==, bpobj_iterate(&bpo, 472 dsl_deadlist_insert_cb, dl, tx)); 473 bpobj_close(&bpo); 474 return; 475 } 476 477 for (zap_cursor_init(&zc, dl->dl_os, obj); 478 zap_cursor_retrieve(&zc, &za) == 0; 479 zap_cursor_advance(&zc)) { 480 uint64_t mintxg = strtonum(za.za_name, NULL); 481 dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); 482 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); 483 } 484 zap_cursor_fini(&zc); 485 486 VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); 487 dlp = bonus->db_data; 488 dmu_buf_will_dirty(bonus, tx); 489 bzero(dlp, sizeof (*dlp)); 490 dmu_buf_rele(bonus, FTAG); 491 } 492 493 /* 494 * Remove entries on dl that are >= mintxg, and put them on the bpobj. 495 */ 496 void 497 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, 498 dmu_tx_t *tx) 499 { 500 dsl_deadlist_entry_t dle_tofind; 501 dsl_deadlist_entry_t *dle; 502 avl_index_t where; 503 504 ASSERT(!dl->dl_oldfmt); 505 dmu_buf_will_dirty(dl->dl_dbuf, tx); 506 dsl_deadlist_load_tree(dl); 507 508 dle_tofind.dle_mintxg = mintxg; 509 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 510 if (dle == NULL) 511 dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); 512 while (dle) { 513 uint64_t used, comp, uncomp; 514 dsl_deadlist_entry_t *dle_next; 515 516 bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); 517 518 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 519 &used, &comp, &uncomp)); 520 mutex_enter(&dl->dl_lock); 521 ASSERT3U(dl->dl_phys->dl_used, >=, used); 522 ASSERT3U(dl->dl_phys->dl_comp, >=, comp); 523 ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); 524 dl->dl_phys->dl_used -= used; 525 dl->dl_phys->dl_comp -= comp; 526 dl->dl_phys->dl_uncomp -= uncomp; 527 mutex_exit(&dl->dl_lock); 528 529 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, 530 dle->dle_mintxg, tx)); 531 532 dle_next = AVL_NEXT(&dl->dl_tree, dle); 533 avl_remove(&dl->dl_tree, dle); 534 bpobj_close(&dle->dle_bpobj); 535 kmem_free(dle, sizeof (*dle)); 536 dle = dle_next; 537 } 538 } 539