1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28 #include <sys/dsl_dataset.h> 29 #include <sys/dmu.h> 30 #include <sys/refcount.h> 31 #include <sys/zap.h> 32 #include <sys/zfs_context.h> 33 #include <sys/dsl_pool.h> 34 35 /* 36 * Deadlist concurrency: 37 * 38 * Deadlists can only be modified from the syncing thread. 39 * 40 * Except for dsl_deadlist_insert(), it can only be modified with the 41 * dp_config_rwlock held with RW_WRITER. 42 * 43 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can 44 * be called concurrently, from open context, with the dl_config_rwlock held 45 * with RW_READER. 46 * 47 * Therefore, we only need to provide locking between dsl_deadlist_insert() and 48 * the accessors, protecting: 49 * dl_phys->dl_used,comp,uncomp 50 * and protecting the dl_tree from being loaded. 51 * The locking is provided by dl_lock. Note that locking on the bpobj_t 52 * provides its own locking, and dl_oldfmt is immutable. 53 */ 54 55 static int 56 dsl_deadlist_compare(const void *arg1, const void *arg2) 57 { 58 const dsl_deadlist_entry_t *dle1 = arg1; 59 const dsl_deadlist_entry_t *dle2 = arg2; 60 61 if (dle1->dle_mintxg < dle2->dle_mintxg) 62 return (-1); 63 else if (dle1->dle_mintxg > dle2->dle_mintxg) 64 return (+1); 65 else 66 return (0); 67 } 68 69 static void 70 dsl_deadlist_load_tree(dsl_deadlist_t *dl) 71 { 72 zap_cursor_t zc; 73 zap_attribute_t za; 74 75 ASSERT(!dl->dl_oldfmt); 76 if (dl->dl_havetree) 77 return; 78 79 avl_create(&dl->dl_tree, dsl_deadlist_compare, 80 sizeof (dsl_deadlist_entry_t), 81 offsetof(dsl_deadlist_entry_t, dle_node)); 82 for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 83 zap_cursor_retrieve(&zc, &za) == 0; 84 zap_cursor_advance(&zc)) { 85 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 86 dle->dle_mintxg = strtonum(za.za_name, NULL); 87 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, 88 za.za_first_integer)); 89 avl_add(&dl->dl_tree, dle); 90 } 91 zap_cursor_fini(&zc); 92 dl->dl_havetree = B_TRUE; 93 } 94 95 void 96 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) 97 { 98 dmu_object_info_t doi; 99 100 mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); 101 dl->dl_os = os; 102 dl->dl_object = object; 103 VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); 104 dmu_object_info_from_db(dl->dl_dbuf, &doi); 105 if (doi.doi_type == DMU_OT_BPOBJ) { 106 dmu_buf_rele(dl->dl_dbuf, dl); 107 dl->dl_dbuf = NULL; 108 dl->dl_oldfmt = B_TRUE; 109 VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); 110 return; 111 } 112 113 dl->dl_oldfmt = B_FALSE; 114 dl->dl_phys = dl->dl_dbuf->db_data; 115 dl->dl_havetree = B_FALSE; 116 } 117 118 void 119 dsl_deadlist_close(dsl_deadlist_t *dl) 120 { 121 void *cookie = NULL; 122 dsl_deadlist_entry_t *dle; 123 124 dl->dl_os = NULL; 125 126 if (dl->dl_oldfmt) { 127 dl->dl_oldfmt = B_FALSE; 128 bpobj_close(&dl->dl_bpobj); 129 return; 130 } 131 132 if (dl->dl_havetree) { 133 while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) 134 != NULL) { 135 bpobj_close(&dle->dle_bpobj); 136 kmem_free(dle, sizeof (*dle)); 137 } 138 avl_destroy(&dl->dl_tree); 139 } 140 dmu_buf_rele(dl->dl_dbuf, dl); 141 mutex_destroy(&dl->dl_lock); 142 dl->dl_dbuf = NULL; 143 dl->dl_phys = NULL; 144 } 145 146 uint64_t 147 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) 148 { 149 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 150 return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); 151 return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, 152 sizeof (dsl_deadlist_phys_t), tx)); 153 } 154 155 void 156 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) 157 { 158 dmu_object_info_t doi; 159 zap_cursor_t zc; 160 zap_attribute_t za; 161 162 VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); 163 if (doi.doi_type == DMU_OT_BPOBJ) { 164 bpobj_free(os, dlobj, tx); 165 return; 166 } 167 168 for (zap_cursor_init(&zc, os, dlobj); 169 zap_cursor_retrieve(&zc, &za) == 0; 170 zap_cursor_advance(&zc)) { 171 uint64_t obj = za.za_first_integer; 172 if (obj == dmu_objset_pool(os)->dp_empty_bpobj) 173 bpobj_decr_empty(os, tx); 174 else 175 bpobj_free(os, obj, tx); 176 } 177 zap_cursor_fini(&zc); 178 VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); 179 } 180 181 static void 182 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 183 const blkptr_t *bp, dmu_tx_t *tx) 184 { 185 if (dle->dle_bpobj.bpo_object == 186 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 187 uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 188 bpobj_close(&dle->dle_bpobj); 189 bpobj_decr_empty(dl->dl_os, tx); 190 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 191 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 192 dle->dle_mintxg, obj, tx)); 193 } 194 bpobj_enqueue(&dle->dle_bpobj, bp, tx); 195 } 196 197 static void 198 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 199 uint64_t obj, dmu_tx_t *tx) 200 { 201 if (dle->dle_bpobj.bpo_object != 202 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 203 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); 204 } else { 205 bpobj_close(&dle->dle_bpobj); 206 bpobj_decr_empty(dl->dl_os, tx); 207 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 208 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 209 dle->dle_mintxg, obj, tx)); 210 } 211 } 212 213 void 214 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) 215 { 216 dsl_deadlist_entry_t dle_tofind; 217 dsl_deadlist_entry_t *dle; 218 avl_index_t where; 219 220 if (dl->dl_oldfmt) { 221 bpobj_enqueue(&dl->dl_bpobj, bp, tx); 222 return; 223 } 224 225 dsl_deadlist_load_tree(dl); 226 227 dmu_buf_will_dirty(dl->dl_dbuf, tx); 228 mutex_enter(&dl->dl_lock); 229 dl->dl_phys->dl_used += 230 bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); 231 dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); 232 dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); 233 mutex_exit(&dl->dl_lock); 234 235 dle_tofind.dle_mintxg = bp->blk_birth; 236 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 237 if (dle == NULL) 238 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 239 else 240 dle = AVL_PREV(&dl->dl_tree, dle); 241 dle_enqueue(dl, dle, bp, tx); 242 } 243 244 /* 245 * Insert new key in deadlist, which must be > all current entries. 246 * mintxg is not inclusive. 247 */ 248 void 249 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 250 { 251 uint64_t obj; 252 dsl_deadlist_entry_t *dle; 253 254 if (dl->dl_oldfmt) 255 return; 256 257 dsl_deadlist_load_tree(dl); 258 259 dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 260 dle->dle_mintxg = mintxg; 261 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 262 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 263 avl_add(&dl->dl_tree, dle); 264 265 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, 266 mintxg, obj, tx)); 267 } 268 269 /* 270 * Remove this key, merging its entries into the previous key. 271 */ 272 void 273 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 274 { 275 dsl_deadlist_entry_t dle_tofind; 276 dsl_deadlist_entry_t *dle, *dle_prev; 277 278 if (dl->dl_oldfmt) 279 return; 280 281 dsl_deadlist_load_tree(dl); 282 283 dle_tofind.dle_mintxg = mintxg; 284 dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); 285 dle_prev = AVL_PREV(&dl->dl_tree, dle); 286 287 dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); 288 289 avl_remove(&dl->dl_tree, dle); 290 bpobj_close(&dle->dle_bpobj); 291 kmem_free(dle, sizeof (*dle)); 292 293 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); 294 } 295 296 /* 297 * Walk ds's snapshots to regenerate generate ZAP & AVL. 298 */ 299 static void 300 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, 301 uint64_t mrs_obj, dmu_tx_t *tx) 302 { 303 dsl_deadlist_t dl; 304 dsl_pool_t *dp = dmu_objset_pool(os); 305 306 dsl_deadlist_open(&dl, os, dlobj); 307 if (dl.dl_oldfmt) { 308 dsl_deadlist_close(&dl); 309 return; 310 } 311 312 while (mrs_obj != 0) { 313 dsl_dataset_t *ds; 314 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); 315 dsl_deadlist_add_key(&dl, 316 dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); 317 mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; 318 dsl_dataset_rele(ds, FTAG); 319 } 320 dsl_deadlist_close(&dl); 321 } 322 323 uint64_t 324 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, 325 uint64_t mrs_obj, dmu_tx_t *tx) 326 { 327 dsl_deadlist_entry_t *dle; 328 uint64_t newobj; 329 330 newobj = dsl_deadlist_alloc(dl->dl_os, tx); 331 332 if (dl->dl_oldfmt) { 333 dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); 334 return (newobj); 335 } 336 337 dsl_deadlist_load_tree(dl); 338 339 for (dle = avl_first(&dl->dl_tree); dle; 340 dle = AVL_NEXT(&dl->dl_tree, dle)) { 341 uint64_t obj; 342 343 if (dle->dle_mintxg >= maxtxg) 344 break; 345 346 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 347 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, 348 dle->dle_mintxg, obj, tx)); 349 } 350 return (newobj); 351 } 352 353 void 354 dsl_deadlist_space(dsl_deadlist_t *dl, 355 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 356 { 357 if (dl->dl_oldfmt) { 358 VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, 359 usedp, compp, uncompp)); 360 return; 361 } 362 363 mutex_enter(&dl->dl_lock); 364 *usedp = dl->dl_phys->dl_used; 365 *compp = dl->dl_phys->dl_comp; 366 *uncompp = dl->dl_phys->dl_uncomp; 367 mutex_exit(&dl->dl_lock); 368 } 369 370 /* 371 * return space used in the range (mintxg, maxtxg]. 372 * Includes maxtxg, does not include mintxg. 373 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is 374 * larger than any bp in the deadlist (eg. UINT64_MAX)). 375 */ 376 void 377 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, 378 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 379 { 380 dsl_deadlist_entry_t *dle; 381 dsl_deadlist_entry_t dle_tofind; 382 avl_index_t where; 383 384 if (dl->dl_oldfmt) { 385 VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, 386 mintxg, maxtxg, usedp, compp, uncompp)); 387 return; 388 } 389 390 *usedp = *compp = *uncompp = 0; 391 392 mutex_enter(&dl->dl_lock); 393 dsl_deadlist_load_tree(dl); 394 dle_tofind.dle_mintxg = mintxg; 395 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 396 /* 397 * If we don't find this mintxg, there shouldn't be anything 398 * after it either. 399 */ 400 ASSERT(dle != NULL || 401 avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); 402 403 for (; dle && dle->dle_mintxg < maxtxg; 404 dle = AVL_NEXT(&dl->dl_tree, dle)) { 405 uint64_t used, comp, uncomp; 406 407 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 408 &used, &comp, &uncomp)); 409 410 *usedp += used; 411 *compp += comp; 412 *uncompp += uncomp; 413 } 414 mutex_exit(&dl->dl_lock); 415 } 416 417 static void 418 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, 419 dmu_tx_t *tx) 420 { 421 dsl_deadlist_entry_t dle_tofind; 422 dsl_deadlist_entry_t *dle; 423 avl_index_t where; 424 uint64_t used, comp, uncomp; 425 bpobj_t bpo; 426 427 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 428 VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); 429 bpobj_close(&bpo); 430 431 dsl_deadlist_load_tree(dl); 432 433 dmu_buf_will_dirty(dl->dl_dbuf, tx); 434 mutex_enter(&dl->dl_lock); 435 dl->dl_phys->dl_used += used; 436 dl->dl_phys->dl_comp += comp; 437 dl->dl_phys->dl_uncomp += uncomp; 438 mutex_exit(&dl->dl_lock); 439 440 dle_tofind.dle_mintxg = birth; 441 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 442 if (dle == NULL) 443 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 444 dle_enqueue_subobj(dl, dle, obj, tx); 445 } 446 447 static int 448 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 449 { 450 dsl_deadlist_t *dl = arg; 451 dsl_deadlist_insert(dl, bp, tx); 452 return (0); 453 } 454 455 /* 456 * Merge the deadlist pointed to by 'obj' into dl. obj will be left as 457 * an empty deadlist. 458 */ 459 void 460 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) 461 { 462 zap_cursor_t zc; 463 zap_attribute_t za; 464 dmu_buf_t *bonus; 465 dsl_deadlist_phys_t *dlp; 466 dmu_object_info_t doi; 467 468 VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); 469 if (doi.doi_type == DMU_OT_BPOBJ) { 470 bpobj_t bpo; 471 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 472 VERIFY3U(0, ==, bpobj_iterate(&bpo, 473 dsl_deadlist_insert_cb, dl, tx)); 474 bpobj_close(&bpo); 475 return; 476 } 477 478 for (zap_cursor_init(&zc, dl->dl_os, obj); 479 zap_cursor_retrieve(&zc, &za) == 0; 480 zap_cursor_advance(&zc)) { 481 uint64_t mintxg = strtonum(za.za_name, NULL); 482 dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); 483 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); 484 } 485 zap_cursor_fini(&zc); 486 487 VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); 488 dlp = bonus->db_data; 489 dmu_buf_will_dirty(bonus, tx); 490 bzero(dlp, sizeof (*dlp)); 491 dmu_buf_rele(bonus, FTAG); 492 } 493 494 /* 495 * Remove entries on dl that are >= mintxg, and put them on the bpobj. 496 */ 497 void 498 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, 499 dmu_tx_t *tx) 500 { 501 dsl_deadlist_entry_t dle_tofind; 502 dsl_deadlist_entry_t *dle; 503 avl_index_t where; 504 505 ASSERT(!dl->dl_oldfmt); 506 dmu_buf_will_dirty(dl->dl_dbuf, tx); 507 dsl_deadlist_load_tree(dl); 508 509 dle_tofind.dle_mintxg = mintxg; 510 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 511 if (dle == NULL) 512 dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); 513 while (dle) { 514 uint64_t used, comp, uncomp; 515 dsl_deadlist_entry_t *dle_next; 516 517 bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); 518 519 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 520 &used, &comp, &uncomp)); 521 mutex_enter(&dl->dl_lock); 522 ASSERT3U(dl->dl_phys->dl_used, >=, used); 523 ASSERT3U(dl->dl_phys->dl_comp, >=, comp); 524 ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); 525 dl->dl_phys->dl_used -= used; 526 dl->dl_phys->dl_comp -= comp; 527 dl->dl_phys->dl_uncomp -= uncomp; 528 mutex_exit(&dl->dl_lock); 529 530 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, 531 dle->dle_mintxg, tx)); 532 533 dle_next = AVL_NEXT(&dl->dl_tree, dle); 534 avl_remove(&dl->dl_tree, dle); 535 bpobj_close(&dle->dle_bpobj); 536 kmem_free(dle, sizeof (*dle)); 537 dle = dle_next; 538 } 539 } 540