1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2009 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/slab.h> 8 #include <linux/sort.h> 9 #include "messages.h" 10 #include "ctree.h" 11 #include "delayed-ref.h" 12 #include "transaction.h" 13 #include "qgroup.h" 14 #include "space-info.h" 15 #include "tree-mod-log.h" 16 #include "fs.h" 17 18 struct kmem_cache *btrfs_delayed_ref_head_cachep; 19 struct kmem_cache *btrfs_delayed_ref_node_cachep; 20 struct kmem_cache *btrfs_delayed_extent_op_cachep; 21 /* 22 * delayed back reference update tracking. For subvolume trees 23 * we queue up extent allocations and backref maintenance for 24 * delayed processing. This avoids deep call chains where we 25 * add extents in the middle of btrfs_search_slot, and it allows 26 * us to buffer up frequently modified backrefs in an rb tree instead 27 * of hammering updates on the extent allocation tree. 28 */ 29 30 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) 31 { 32 struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; 33 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 34 bool ret = false; 35 u64 reserved; 36 37 spin_lock(&global_rsv->lock); 38 reserved = global_rsv->reserved; 39 spin_unlock(&global_rsv->lock); 40 41 /* 42 * Since the global reserve is just kind of magic we don't really want 43 * to rely on it to save our bacon, so if our size is more than the 44 * delayed_refs_rsv and the global rsv then it's time to think about 45 * bailing. 46 */ 47 spin_lock(&delayed_refs_rsv->lock); 48 reserved += delayed_refs_rsv->reserved; 49 if (delayed_refs_rsv->size >= reserved) 50 ret = true; 51 spin_unlock(&delayed_refs_rsv->lock); 52 return ret; 53 } 54 55 /* 56 * Release a ref head's reservation. 57 * 58 * @fs_info: the filesystem 59 * @nr_refs: number of delayed refs to drop 60 * @nr_csums: number of csum items to drop 61 * 62 * Drops the delayed ref head's count from the delayed refs rsv and free any 63 * excess reservation we had. 64 */ 65 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums) 66 { 67 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 68 u64 num_bytes; 69 u64 released; 70 71 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs); 72 num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); 73 74 released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); 75 if (released) 76 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 77 0, released, 0); 78 } 79 80 /* 81 * Adjust the size of the delayed refs rsv. 82 * 83 * This is to be called anytime we may have adjusted trans->delayed_ref_updates 84 * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and 85 * add it to the delayed_refs_rsv. 86 */ 87 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) 88 { 89 struct btrfs_fs_info *fs_info = trans->fs_info; 90 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 91 struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv; 92 u64 num_bytes; 93 u64 reserved_bytes; 94 95 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); 96 num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, 97 trans->delayed_ref_csum_deletions); 98 99 if (num_bytes == 0) 100 return; 101 102 /* 103 * Try to take num_bytes from the transaction's local delayed reserve. 104 * If not possible, try to take as much as it's available. If the local 105 * reserve doesn't have enough reserved space, the delayed refs reserve 106 * will be refilled next time btrfs_delayed_refs_rsv_refill() is called 107 * by someone or if a transaction commit is triggered before that, the 108 * global block reserve will be used. We want to minimize using the 109 * global block reserve for cases we can account for in advance, to 110 * avoid exhausting it and reach -ENOSPC during a transaction commit. 111 */ 112 spin_lock(&local_rsv->lock); 113 reserved_bytes = min(num_bytes, local_rsv->reserved); 114 local_rsv->reserved -= reserved_bytes; 115 local_rsv->full = (local_rsv->reserved >= local_rsv->size); 116 spin_unlock(&local_rsv->lock); 117 118 spin_lock(&delayed_rsv->lock); 119 delayed_rsv->size += num_bytes; 120 delayed_rsv->reserved += reserved_bytes; 121 delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size); 122 spin_unlock(&delayed_rsv->lock); 123 trans->delayed_ref_updates = 0; 124 trans->delayed_ref_csum_deletions = 0; 125 } 126 127 /* 128 * Adjust the size of the delayed refs block reserve for 1 block group item 129 * insertion, used after allocating a block group. 130 */ 131 void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) 132 { 133 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 134 135 spin_lock(&delayed_rsv->lock); 136 /* 137 * Inserting a block group item does not require changing the free space 138 * tree, only the extent tree or the block group tree, so this is all we 139 * need. 140 */ 141 delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1); 142 delayed_rsv->full = false; 143 spin_unlock(&delayed_rsv->lock); 144 } 145 146 /* 147 * Adjust the size of the delayed refs block reserve to release space for 1 148 * block group item insertion. 149 */ 150 void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) 151 { 152 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 153 const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 154 u64 released; 155 156 released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); 157 if (released > 0) 158 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 159 0, released, 0); 160 } 161 162 /* 163 * Adjust the size of the delayed refs block reserve for 1 block group item 164 * update. 165 */ 166 void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) 167 { 168 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 169 170 spin_lock(&delayed_rsv->lock); 171 /* 172 * Updating a block group item does not result in new nodes/leaves and 173 * does not require changing the free space tree, only the extent tree 174 * or the block group tree, so this is all we need. 175 */ 176 delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1); 177 delayed_rsv->full = false; 178 spin_unlock(&delayed_rsv->lock); 179 } 180 181 /* 182 * Adjust the size of the delayed refs block reserve to release space for 1 183 * block group item update. 184 */ 185 void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) 186 { 187 struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; 188 const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1); 189 u64 released; 190 191 released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); 192 if (released > 0) 193 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 194 0, released, 0); 195 } 196 197 /* 198 * Refill based on our delayed refs usage. 199 * 200 * @fs_info: the filesystem 201 * @flush: control how we can flush for this reservation. 202 * 203 * This will refill the delayed block_rsv up to 1 items size worth of space and 204 * will return -ENOSPC if we can't make the reservation. 205 */ 206 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, 207 enum btrfs_reserve_flush_enum flush) 208 { 209 struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 210 struct btrfs_space_info *space_info = block_rsv->space_info; 211 u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); 212 u64 num_bytes = 0; 213 u64 refilled_bytes; 214 u64 to_free; 215 int ret = -ENOSPC; 216 217 spin_lock(&block_rsv->lock); 218 if (block_rsv->reserved < block_rsv->size) { 219 num_bytes = block_rsv->size - block_rsv->reserved; 220 num_bytes = min(num_bytes, limit); 221 } 222 spin_unlock(&block_rsv->lock); 223 224 if (!num_bytes) 225 return 0; 226 227 ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); 228 if (ret) 229 return ret; 230 231 /* 232 * We may have raced with someone else, so check again if we the block 233 * reserve is still not full and release any excess space. 234 */ 235 spin_lock(&block_rsv->lock); 236 if (block_rsv->reserved < block_rsv->size) { 237 u64 needed = block_rsv->size - block_rsv->reserved; 238 239 if (num_bytes >= needed) { 240 block_rsv->reserved += needed; 241 block_rsv->full = true; 242 to_free = num_bytes - needed; 243 refilled_bytes = needed; 244 } else { 245 block_rsv->reserved += num_bytes; 246 to_free = 0; 247 refilled_bytes = num_bytes; 248 } 249 } else { 250 to_free = num_bytes; 251 refilled_bytes = 0; 252 } 253 spin_unlock(&block_rsv->lock); 254 255 if (to_free > 0) 256 btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); 257 258 if (refilled_bytes > 0) 259 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, 260 refilled_bytes, 1); 261 return 0; 262 } 263 264 /* 265 * compare two delayed data backrefs with same bytenr and type 266 */ 267 static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, 268 struct btrfs_delayed_ref_node *ref2) 269 { 270 if (ref1->data_ref.objectid < ref2->data_ref.objectid) 271 return -1; 272 if (ref1->data_ref.objectid > ref2->data_ref.objectid) 273 return 1; 274 if (ref1->data_ref.offset < ref2->data_ref.offset) 275 return -1; 276 if (ref1->data_ref.offset > ref2->data_ref.offset) 277 return 1; 278 return 0; 279 } 280 281 static int comp_refs(struct btrfs_delayed_ref_node *ref1, 282 struct btrfs_delayed_ref_node *ref2, 283 bool check_seq) 284 { 285 int ret = 0; 286 287 if (ref1->type < ref2->type) 288 return -1; 289 if (ref1->type > ref2->type) 290 return 1; 291 if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY || 292 ref1->type == BTRFS_SHARED_DATA_REF_KEY) { 293 if (ref1->parent < ref2->parent) 294 return -1; 295 if (ref1->parent > ref2->parent) 296 return 1; 297 } else { 298 if (ref1->ref_root < ref2->ref_root) 299 return -1; 300 if (ref1->ref_root > ref2->ref_root) 301 return -1; 302 if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY) 303 ret = comp_data_refs(ref1, ref2); 304 } 305 if (ret) 306 return ret; 307 if (check_seq) { 308 if (ref1->seq < ref2->seq) 309 return -1; 310 if (ref1->seq > ref2->seq) 311 return 1; 312 } 313 return 0; 314 } 315 316 /* insert a new ref to head ref rbtree */ 317 static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root, 318 struct rb_node *node) 319 { 320 struct rb_node **p = &root->rb_root.rb_node; 321 struct rb_node *parent_node = NULL; 322 struct btrfs_delayed_ref_head *entry; 323 struct btrfs_delayed_ref_head *ins; 324 u64 bytenr; 325 bool leftmost = true; 326 327 ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); 328 bytenr = ins->bytenr; 329 while (*p) { 330 parent_node = *p; 331 entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, 332 href_node); 333 334 if (bytenr < entry->bytenr) { 335 p = &(*p)->rb_left; 336 } else if (bytenr > entry->bytenr) { 337 p = &(*p)->rb_right; 338 leftmost = false; 339 } else { 340 return entry; 341 } 342 } 343 344 rb_link_node(node, parent_node, p); 345 rb_insert_color_cached(node, root, leftmost); 346 return NULL; 347 } 348 349 static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, 350 struct btrfs_delayed_ref_node *ins) 351 { 352 struct rb_node **p = &root->rb_root.rb_node; 353 struct rb_node *node = &ins->ref_node; 354 struct rb_node *parent_node = NULL; 355 struct btrfs_delayed_ref_node *entry; 356 bool leftmost = true; 357 358 while (*p) { 359 int comp; 360 361 parent_node = *p; 362 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, 363 ref_node); 364 comp = comp_refs(ins, entry, true); 365 if (comp < 0) { 366 p = &(*p)->rb_left; 367 } else if (comp > 0) { 368 p = &(*p)->rb_right; 369 leftmost = false; 370 } else { 371 return entry; 372 } 373 } 374 375 rb_link_node(node, parent_node, p); 376 rb_insert_color_cached(node, root, leftmost); 377 return NULL; 378 } 379 380 static struct btrfs_delayed_ref_head *find_first_ref_head( 381 struct btrfs_delayed_ref_root *dr) 382 { 383 struct rb_node *n; 384 struct btrfs_delayed_ref_head *entry; 385 386 n = rb_first_cached(&dr->href_root); 387 if (!n) 388 return NULL; 389 390 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 391 392 return entry; 393 } 394 395 /* 396 * Find a head entry based on bytenr. This returns the delayed ref head if it 397 * was able to find one, or NULL if nothing was in that spot. If return_bigger 398 * is given, the next bigger entry is returned if no exact match is found. 399 */ 400 static struct btrfs_delayed_ref_head *find_ref_head( 401 struct btrfs_delayed_ref_root *dr, u64 bytenr, 402 bool return_bigger) 403 { 404 struct rb_root *root = &dr->href_root.rb_root; 405 struct rb_node *n; 406 struct btrfs_delayed_ref_head *entry; 407 408 n = root->rb_node; 409 entry = NULL; 410 while (n) { 411 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 412 413 if (bytenr < entry->bytenr) 414 n = n->rb_left; 415 else if (bytenr > entry->bytenr) 416 n = n->rb_right; 417 else 418 return entry; 419 } 420 if (entry && return_bigger) { 421 if (bytenr > entry->bytenr) { 422 n = rb_next(&entry->href_node); 423 if (!n) 424 return NULL; 425 entry = rb_entry(n, struct btrfs_delayed_ref_head, 426 href_node); 427 } 428 return entry; 429 } 430 return NULL; 431 } 432 433 int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, 434 struct btrfs_delayed_ref_head *head) 435 { 436 lockdep_assert_held(&delayed_refs->lock); 437 if (mutex_trylock(&head->mutex)) 438 return 0; 439 440 refcount_inc(&head->refs); 441 spin_unlock(&delayed_refs->lock); 442 443 mutex_lock(&head->mutex); 444 spin_lock(&delayed_refs->lock); 445 if (RB_EMPTY_NODE(&head->href_node)) { 446 mutex_unlock(&head->mutex); 447 btrfs_put_delayed_ref_head(head); 448 return -EAGAIN; 449 } 450 btrfs_put_delayed_ref_head(head); 451 return 0; 452 } 453 454 static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, 455 struct btrfs_delayed_ref_root *delayed_refs, 456 struct btrfs_delayed_ref_head *head, 457 struct btrfs_delayed_ref_node *ref) 458 { 459 lockdep_assert_held(&head->lock); 460 rb_erase_cached(&ref->ref_node, &head->ref_tree); 461 RB_CLEAR_NODE(&ref->ref_node); 462 if (!list_empty(&ref->add_list)) 463 list_del(&ref->add_list); 464 btrfs_put_delayed_ref(ref); 465 atomic_dec(&delayed_refs->num_entries); 466 btrfs_delayed_refs_rsv_release(fs_info, 1, 0); 467 } 468 469 static bool merge_ref(struct btrfs_fs_info *fs_info, 470 struct btrfs_delayed_ref_root *delayed_refs, 471 struct btrfs_delayed_ref_head *head, 472 struct btrfs_delayed_ref_node *ref, 473 u64 seq) 474 { 475 struct btrfs_delayed_ref_node *next; 476 struct rb_node *node = rb_next(&ref->ref_node); 477 bool done = false; 478 479 while (!done && node) { 480 int mod; 481 482 next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 483 node = rb_next(node); 484 if (seq && next->seq >= seq) 485 break; 486 if (comp_refs(ref, next, false)) 487 break; 488 489 if (ref->action == next->action) { 490 mod = next->ref_mod; 491 } else { 492 if (ref->ref_mod < next->ref_mod) { 493 swap(ref, next); 494 done = true; 495 } 496 mod = -next->ref_mod; 497 } 498 499 drop_delayed_ref(fs_info, delayed_refs, head, next); 500 ref->ref_mod += mod; 501 if (ref->ref_mod == 0) { 502 drop_delayed_ref(fs_info, delayed_refs, head, ref); 503 done = true; 504 } else { 505 /* 506 * Can't have multiples of the same ref on a tree block. 507 */ 508 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || 509 ref->type == BTRFS_SHARED_BLOCK_REF_KEY); 510 } 511 } 512 513 return done; 514 } 515 516 void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info, 517 struct btrfs_delayed_ref_root *delayed_refs, 518 struct btrfs_delayed_ref_head *head) 519 { 520 struct btrfs_delayed_ref_node *ref; 521 struct rb_node *node; 522 u64 seq = 0; 523 524 lockdep_assert_held(&head->lock); 525 526 if (RB_EMPTY_ROOT(&head->ref_tree.rb_root)) 527 return; 528 529 /* We don't have too many refs to merge for data. */ 530 if (head->is_data) 531 return; 532 533 seq = btrfs_tree_mod_log_lowest_seq(fs_info); 534 again: 535 for (node = rb_first_cached(&head->ref_tree); node; 536 node = rb_next(node)) { 537 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 538 if (seq && ref->seq >= seq) 539 continue; 540 if (merge_ref(fs_info, delayed_refs, head, ref, seq)) 541 goto again; 542 } 543 } 544 545 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) 546 { 547 int ret = 0; 548 u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info); 549 550 if (min_seq != 0 && seq >= min_seq) { 551 btrfs_debug(fs_info, 552 "holding back delayed_ref %llu, lowest is %llu", 553 seq, min_seq); 554 ret = 1; 555 } 556 557 return ret; 558 } 559 560 struct btrfs_delayed_ref_head *btrfs_select_ref_head( 561 struct btrfs_delayed_ref_root *delayed_refs) 562 { 563 struct btrfs_delayed_ref_head *head; 564 565 lockdep_assert_held(&delayed_refs->lock); 566 again: 567 head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, 568 true); 569 if (!head && delayed_refs->run_delayed_start != 0) { 570 delayed_refs->run_delayed_start = 0; 571 head = find_first_ref_head(delayed_refs); 572 } 573 if (!head) 574 return NULL; 575 576 while (head->processing) { 577 struct rb_node *node; 578 579 node = rb_next(&head->href_node); 580 if (!node) { 581 if (delayed_refs->run_delayed_start == 0) 582 return NULL; 583 delayed_refs->run_delayed_start = 0; 584 goto again; 585 } 586 head = rb_entry(node, struct btrfs_delayed_ref_head, 587 href_node); 588 } 589 590 head->processing = true; 591 WARN_ON(delayed_refs->num_heads_ready == 0); 592 delayed_refs->num_heads_ready--; 593 delayed_refs->run_delayed_start = head->bytenr + 594 head->num_bytes; 595 return head; 596 } 597 598 void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, 599 struct btrfs_delayed_ref_head *head) 600 { 601 lockdep_assert_held(&delayed_refs->lock); 602 lockdep_assert_held(&head->lock); 603 604 rb_erase_cached(&head->href_node, &delayed_refs->href_root); 605 RB_CLEAR_NODE(&head->href_node); 606 atomic_dec(&delayed_refs->num_entries); 607 delayed_refs->num_heads--; 608 if (!head->processing) 609 delayed_refs->num_heads_ready--; 610 } 611 612 /* 613 * Helper to insert the ref_node to the tail or merge with tail. 614 * 615 * Return false if the ref was inserted. 616 * Return true if the ref was merged into an existing one (and therefore can be 617 * freed by the caller). 618 */ 619 static bool insert_delayed_ref(struct btrfs_trans_handle *trans, 620 struct btrfs_delayed_ref_head *href, 621 struct btrfs_delayed_ref_node *ref) 622 { 623 struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs; 624 struct btrfs_delayed_ref_node *exist; 625 int mod; 626 627 spin_lock(&href->lock); 628 exist = tree_insert(&href->ref_tree, ref); 629 if (!exist) { 630 if (ref->action == BTRFS_ADD_DELAYED_REF) 631 list_add_tail(&ref->add_list, &href->ref_add_list); 632 atomic_inc(&root->num_entries); 633 spin_unlock(&href->lock); 634 trans->delayed_ref_updates++; 635 return false; 636 } 637 638 /* Now we are sure we can merge */ 639 if (exist->action == ref->action) { 640 mod = ref->ref_mod; 641 } else { 642 /* Need to change action */ 643 if (exist->ref_mod < ref->ref_mod) { 644 exist->action = ref->action; 645 mod = -exist->ref_mod; 646 exist->ref_mod = ref->ref_mod; 647 if (ref->action == BTRFS_ADD_DELAYED_REF) 648 list_add_tail(&exist->add_list, 649 &href->ref_add_list); 650 else if (ref->action == BTRFS_DROP_DELAYED_REF) { 651 ASSERT(!list_empty(&exist->add_list)); 652 list_del(&exist->add_list); 653 } else { 654 ASSERT(0); 655 } 656 } else 657 mod = -ref->ref_mod; 658 } 659 exist->ref_mod += mod; 660 661 /* remove existing tail if its ref_mod is zero */ 662 if (exist->ref_mod == 0) 663 drop_delayed_ref(trans->fs_info, root, href, exist); 664 spin_unlock(&href->lock); 665 return true; 666 } 667 668 /* 669 * helper function to update the accounting in the head ref 670 * existing and update must have the same bytenr 671 */ 672 static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, 673 struct btrfs_delayed_ref_head *existing, 674 struct btrfs_delayed_ref_head *update) 675 { 676 struct btrfs_delayed_ref_root *delayed_refs = 677 &trans->transaction->delayed_refs; 678 struct btrfs_fs_info *fs_info = trans->fs_info; 679 int old_ref_mod; 680 681 BUG_ON(existing->is_data != update->is_data); 682 683 spin_lock(&existing->lock); 684 685 /* 686 * When freeing an extent, we may not know the owning root when we 687 * first create the head_ref. However, some deref before the last deref 688 * will know it, so we just need to update the head_ref accordingly. 689 */ 690 if (!existing->owning_root) 691 existing->owning_root = update->owning_root; 692 693 if (update->must_insert_reserved) { 694 /* if the extent was freed and then 695 * reallocated before the delayed ref 696 * entries were processed, we can end up 697 * with an existing head ref without 698 * the must_insert_reserved flag set. 699 * Set it again here 700 */ 701 existing->must_insert_reserved = update->must_insert_reserved; 702 existing->owning_root = update->owning_root; 703 704 /* 705 * update the num_bytes so we make sure the accounting 706 * is done correctly 707 */ 708 existing->num_bytes = update->num_bytes; 709 710 } 711 712 if (update->extent_op) { 713 if (!existing->extent_op) { 714 existing->extent_op = update->extent_op; 715 } else { 716 if (update->extent_op->update_key) { 717 memcpy(&existing->extent_op->key, 718 &update->extent_op->key, 719 sizeof(update->extent_op->key)); 720 existing->extent_op->update_key = true; 721 } 722 if (update->extent_op->update_flags) { 723 existing->extent_op->flags_to_set |= 724 update->extent_op->flags_to_set; 725 existing->extent_op->update_flags = true; 726 } 727 btrfs_free_delayed_extent_op(update->extent_op); 728 } 729 } 730 /* 731 * update the reference mod on the head to reflect this new operation, 732 * only need the lock for this case cause we could be processing it 733 * currently, for refs we just added we know we're a-ok. 734 */ 735 old_ref_mod = existing->total_ref_mod; 736 existing->ref_mod += update->ref_mod; 737 existing->total_ref_mod += update->ref_mod; 738 739 /* 740 * If we are going to from a positive ref mod to a negative or vice 741 * versa we need to make sure to adjust pending_csums accordingly. 742 * We reserve bytes for csum deletion when adding or updating a ref head 743 * see add_delayed_ref_head() for more details. 744 */ 745 if (existing->is_data) { 746 u64 csum_leaves = 747 btrfs_csum_bytes_to_leaves(fs_info, 748 existing->num_bytes); 749 750 if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { 751 delayed_refs->pending_csums -= existing->num_bytes; 752 btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves); 753 } 754 if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { 755 delayed_refs->pending_csums += existing->num_bytes; 756 trans->delayed_ref_csum_deletions += csum_leaves; 757 } 758 } 759 760 spin_unlock(&existing->lock); 761 } 762 763 static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, 764 struct btrfs_ref *generic_ref, 765 struct btrfs_qgroup_extent_record *qrecord, 766 u64 reserved) 767 { 768 int count_mod = 1; 769 bool must_insert_reserved = false; 770 771 /* If reserved is provided, it must be a data extent. */ 772 BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved); 773 774 switch (generic_ref->action) { 775 case BTRFS_ADD_DELAYED_REF: 776 /* count_mod is already set to 1. */ 777 break; 778 case BTRFS_UPDATE_DELAYED_HEAD: 779 count_mod = 0; 780 break; 781 case BTRFS_DROP_DELAYED_REF: 782 /* 783 * The head node stores the sum of all the mods, so dropping a ref 784 * should drop the sum in the head node by one. 785 */ 786 count_mod = -1; 787 break; 788 case BTRFS_ADD_DELAYED_EXTENT: 789 /* 790 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the 791 * reserved accounting when the extent is finally added, or if a 792 * later modification deletes the delayed ref without ever 793 * inserting the extent into the extent allocation tree. 794 * ref->must_insert_reserved is the flag used to record that 795 * accounting mods are required. 796 * 797 * Once we record must_insert_reserved, switch the action to 798 * BTRFS_ADD_DELAYED_REF because other special casing is not 799 * required. 800 */ 801 must_insert_reserved = true; 802 break; 803 } 804 805 refcount_set(&head_ref->refs, 1); 806 head_ref->bytenr = generic_ref->bytenr; 807 head_ref->num_bytes = generic_ref->num_bytes; 808 head_ref->ref_mod = count_mod; 809 head_ref->reserved_bytes = reserved; 810 head_ref->must_insert_reserved = must_insert_reserved; 811 head_ref->owning_root = generic_ref->owning_root; 812 head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA); 813 head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); 814 head_ref->ref_tree = RB_ROOT_CACHED; 815 INIT_LIST_HEAD(&head_ref->ref_add_list); 816 RB_CLEAR_NODE(&head_ref->href_node); 817 head_ref->processing = false; 818 head_ref->total_ref_mod = count_mod; 819 spin_lock_init(&head_ref->lock); 820 mutex_init(&head_ref->mutex); 821 822 /* If not metadata set an impossible level to help debugging. */ 823 if (generic_ref->type == BTRFS_REF_METADATA) 824 head_ref->level = generic_ref->tree_ref.level; 825 else 826 head_ref->level = U8_MAX; 827 828 if (qrecord) { 829 if (generic_ref->ref_root && reserved) { 830 qrecord->data_rsv = reserved; 831 qrecord->data_rsv_refroot = generic_ref->ref_root; 832 } 833 qrecord->bytenr = generic_ref->bytenr; 834 qrecord->num_bytes = generic_ref->num_bytes; 835 qrecord->old_roots = NULL; 836 } 837 } 838 839 /* 840 * helper function to actually insert a head node into the rbtree. 841 * this does all the dirty work in terms of maintaining the correct 842 * overall modification count. 843 */ 844 static noinline struct btrfs_delayed_ref_head * 845 add_delayed_ref_head(struct btrfs_trans_handle *trans, 846 struct btrfs_delayed_ref_head *head_ref, 847 struct btrfs_qgroup_extent_record *qrecord, 848 int action, bool *qrecord_inserted_ret) 849 { 850 struct btrfs_delayed_ref_head *existing; 851 struct btrfs_delayed_ref_root *delayed_refs; 852 bool qrecord_inserted = false; 853 854 delayed_refs = &trans->transaction->delayed_refs; 855 856 /* Record qgroup extent info if provided */ 857 if (qrecord) { 858 int ret; 859 860 ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info, 861 delayed_refs, qrecord); 862 if (ret) { 863 /* Clean up if insertion fails or item exists. */ 864 xa_release(&delayed_refs->dirty_extents, qrecord->bytenr); 865 kfree(qrecord); 866 } else { 867 qrecord_inserted = true; 868 } 869 } 870 871 trace_add_delayed_ref_head(trans->fs_info, head_ref, action); 872 873 existing = htree_insert(&delayed_refs->href_root, 874 &head_ref->href_node); 875 if (existing) { 876 update_existing_head_ref(trans, existing, head_ref); 877 /* 878 * we've updated the existing ref, free the newly 879 * allocated ref 880 */ 881 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); 882 head_ref = existing; 883 } else { 884 /* 885 * We reserve the amount of bytes needed to delete csums when 886 * adding the ref head and not when adding individual drop refs 887 * since the csum items are deleted only after running the last 888 * delayed drop ref (the data extent's ref count drops to 0). 889 */ 890 if (head_ref->is_data && head_ref->ref_mod < 0) { 891 delayed_refs->pending_csums += head_ref->num_bytes; 892 trans->delayed_ref_csum_deletions += 893 btrfs_csum_bytes_to_leaves(trans->fs_info, 894 head_ref->num_bytes); 895 } 896 delayed_refs->num_heads++; 897 delayed_refs->num_heads_ready++; 898 atomic_inc(&delayed_refs->num_entries); 899 } 900 if (qrecord_inserted_ret) 901 *qrecord_inserted_ret = qrecord_inserted; 902 903 return head_ref; 904 } 905 906 /* 907 * Initialize the structure which represents a modification to a an extent. 908 * 909 * @fs_info: Internal to the mounted filesystem mount structure. 910 * 911 * @ref: The structure which is going to be initialized. 912 * 913 * @bytenr: The logical address of the extent for which a modification is 914 * going to be recorded. 915 * 916 * @num_bytes: Size of the extent whose modification is being recorded. 917 * 918 * @ref_root: The id of the root where this modification has originated, this 919 * can be either one of the well-known metadata trees or the 920 * subvolume id which references this extent. 921 * 922 * @action: Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or 923 * BTRFS_ADD_DELAYED_EXTENT 924 * 925 * @ref_type: Holds the type of the extent which is being recorded, can be 926 * one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY 927 * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/ 928 * BTRFS_EXTENT_DATA_REF_KEY when recording data extent 929 */ 930 static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, 931 struct btrfs_delayed_ref_node *ref, 932 struct btrfs_ref *generic_ref) 933 { 934 int action = generic_ref->action; 935 u64 seq = 0; 936 937 if (action == BTRFS_ADD_DELAYED_EXTENT) 938 action = BTRFS_ADD_DELAYED_REF; 939 940 if (is_fstree(generic_ref->ref_root)) 941 seq = atomic64_read(&fs_info->tree_mod_seq); 942 943 refcount_set(&ref->refs, 1); 944 ref->bytenr = generic_ref->bytenr; 945 ref->num_bytes = generic_ref->num_bytes; 946 ref->ref_mod = 1; 947 ref->action = action; 948 ref->seq = seq; 949 ref->type = btrfs_ref_type(generic_ref); 950 ref->ref_root = generic_ref->ref_root; 951 ref->parent = generic_ref->parent; 952 RB_CLEAR_NODE(&ref->ref_node); 953 INIT_LIST_HEAD(&ref->add_list); 954 955 if (generic_ref->type == BTRFS_REF_DATA) 956 ref->data_ref = generic_ref->data_ref; 957 else 958 ref->tree_ref = generic_ref->tree_ref; 959 } 960 961 void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, 962 bool skip_qgroup) 963 { 964 #ifdef CONFIG_BTRFS_FS_REF_VERIFY 965 /* If @real_root not set, use @root as fallback */ 966 generic_ref->real_root = mod_root ?: generic_ref->ref_root; 967 #endif 968 generic_ref->tree_ref.level = level; 969 generic_ref->type = BTRFS_REF_METADATA; 970 if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && 971 (!mod_root || is_fstree(mod_root)))) 972 generic_ref->skip_qgroup = true; 973 else 974 generic_ref->skip_qgroup = false; 975 976 } 977 978 void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, 979 u64 mod_root, bool skip_qgroup) 980 { 981 #ifdef CONFIG_BTRFS_FS_REF_VERIFY 982 /* If @real_root not set, use @root as fallback */ 983 generic_ref->real_root = mod_root ?: generic_ref->ref_root; 984 #endif 985 generic_ref->data_ref.objectid = ino; 986 generic_ref->data_ref.offset = offset; 987 generic_ref->type = BTRFS_REF_DATA; 988 if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && 989 (!mod_root || is_fstree(mod_root)))) 990 generic_ref->skip_qgroup = true; 991 else 992 generic_ref->skip_qgroup = false; 993 } 994 995 static int add_delayed_ref(struct btrfs_trans_handle *trans, 996 struct btrfs_ref *generic_ref, 997 struct btrfs_delayed_extent_op *extent_op, 998 u64 reserved) 999 { 1000 struct btrfs_fs_info *fs_info = trans->fs_info; 1001 struct btrfs_delayed_ref_node *node; 1002 struct btrfs_delayed_ref_head *head_ref; 1003 struct btrfs_delayed_ref_root *delayed_refs; 1004 struct btrfs_qgroup_extent_record *record = NULL; 1005 bool qrecord_inserted; 1006 int action = generic_ref->action; 1007 bool merged; 1008 1009 node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS); 1010 if (!node) 1011 return -ENOMEM; 1012 1013 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); 1014 if (!head_ref) 1015 goto free_node; 1016 1017 if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { 1018 record = kzalloc(sizeof(*record), GFP_NOFS); 1019 if (!record) 1020 goto free_head_ref; 1021 if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, 1022 generic_ref->bytenr, GFP_NOFS)) 1023 goto free_record; 1024 } 1025 1026 init_delayed_ref_common(fs_info, node, generic_ref); 1027 init_delayed_ref_head(head_ref, generic_ref, record, reserved); 1028 head_ref->extent_op = extent_op; 1029 1030 delayed_refs = &trans->transaction->delayed_refs; 1031 spin_lock(&delayed_refs->lock); 1032 1033 /* 1034 * insert both the head node and the new ref without dropping 1035 * the spin lock 1036 */ 1037 head_ref = add_delayed_ref_head(trans, head_ref, record, 1038 action, &qrecord_inserted); 1039 1040 merged = insert_delayed_ref(trans, head_ref, node); 1041 spin_unlock(&delayed_refs->lock); 1042 1043 /* 1044 * Need to update the delayed_refs_rsv with any changes we may have 1045 * made. 1046 */ 1047 btrfs_update_delayed_refs_rsv(trans); 1048 1049 if (generic_ref->type == BTRFS_REF_DATA) 1050 trace_add_delayed_data_ref(trans->fs_info, node); 1051 else 1052 trace_add_delayed_tree_ref(trans->fs_info, node); 1053 if (merged) 1054 kmem_cache_free(btrfs_delayed_ref_node_cachep, node); 1055 1056 if (qrecord_inserted) 1057 return btrfs_qgroup_trace_extent_post(trans, record); 1058 return 0; 1059 1060 free_record: 1061 kfree(record); 1062 free_head_ref: 1063 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); 1064 free_node: 1065 kmem_cache_free(btrfs_delayed_ref_node_cachep, node); 1066 return -ENOMEM; 1067 } 1068 1069 /* 1070 * Add a delayed tree ref. This does all of the accounting required to make sure 1071 * the delayed ref is eventually processed before this transaction commits. 1072 */ 1073 int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 1074 struct btrfs_ref *generic_ref, 1075 struct btrfs_delayed_extent_op *extent_op) 1076 { 1077 ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); 1078 return add_delayed_ref(trans, generic_ref, extent_op, 0); 1079 } 1080 1081 /* 1082 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 1083 */ 1084 int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 1085 struct btrfs_ref *generic_ref, 1086 u64 reserved) 1087 { 1088 ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action); 1089 return add_delayed_ref(trans, generic_ref, NULL, reserved); 1090 } 1091 1092 int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 1093 u64 bytenr, u64 num_bytes, u8 level, 1094 struct btrfs_delayed_extent_op *extent_op) 1095 { 1096 struct btrfs_delayed_ref_head *head_ref; 1097 struct btrfs_delayed_ref_root *delayed_refs; 1098 struct btrfs_ref generic_ref = { 1099 .type = BTRFS_REF_METADATA, 1100 .action = BTRFS_UPDATE_DELAYED_HEAD, 1101 .bytenr = bytenr, 1102 .num_bytes = num_bytes, 1103 .tree_ref.level = level, 1104 }; 1105 1106 head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); 1107 if (!head_ref) 1108 return -ENOMEM; 1109 1110 init_delayed_ref_head(head_ref, &generic_ref, NULL, 0); 1111 head_ref->extent_op = extent_op; 1112 1113 delayed_refs = &trans->transaction->delayed_refs; 1114 spin_lock(&delayed_refs->lock); 1115 1116 add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, 1117 NULL); 1118 1119 spin_unlock(&delayed_refs->lock); 1120 1121 /* 1122 * Need to update the delayed_refs_rsv with any changes we may have 1123 * made. 1124 */ 1125 btrfs_update_delayed_refs_rsv(trans); 1126 return 0; 1127 } 1128 1129 void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 1130 { 1131 if (refcount_dec_and_test(&ref->refs)) { 1132 WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); 1133 kmem_cache_free(btrfs_delayed_ref_node_cachep, ref); 1134 } 1135 } 1136 1137 /* 1138 * This does a simple search for the head node for a given extent. Returns the 1139 * head node if found, or NULL if not. 1140 */ 1141 struct btrfs_delayed_ref_head * 1142 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) 1143 { 1144 lockdep_assert_held(&delayed_refs->lock); 1145 1146 return find_ref_head(delayed_refs, bytenr, false); 1147 } 1148 1149 static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) 1150 { 1151 int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY; 1152 1153 if (type < entry->type) 1154 return -1; 1155 if (type > entry->type) 1156 return 1; 1157 1158 if (type == BTRFS_TREE_BLOCK_REF_KEY) { 1159 if (root < entry->ref_root) 1160 return -1; 1161 if (root > entry->ref_root) 1162 return 1; 1163 } else { 1164 if (parent < entry->parent) 1165 return -1; 1166 if (parent > entry->parent) 1167 return 1; 1168 } 1169 return 0; 1170 } 1171 1172 /* 1173 * Check to see if a given root/parent reference is attached to the head. This 1174 * only checks for BTRFS_ADD_DELAYED_REF references that match, as that 1175 * indicates the reference exists for the given root or parent. This is for 1176 * tree blocks only. 1177 * 1178 * @head: the head of the bytenr we're searching. 1179 * @root: the root objectid of the reference if it is a normal reference. 1180 * @parent: the parent if this is a shared backref. 1181 */ 1182 bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, 1183 u64 root, u64 parent) 1184 { 1185 struct rb_node *node; 1186 bool found = false; 1187 1188 lockdep_assert_held(&head->mutex); 1189 1190 spin_lock(&head->lock); 1191 node = head->ref_tree.rb_root.rb_node; 1192 while (node) { 1193 struct btrfs_delayed_ref_node *entry; 1194 int ret; 1195 1196 entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); 1197 ret = find_comp(entry, root, parent); 1198 if (ret < 0) { 1199 node = node->rb_left; 1200 } else if (ret > 0) { 1201 node = node->rb_right; 1202 } else { 1203 /* 1204 * We only want to count ADD actions, as drops mean the 1205 * ref doesn't exist. 1206 */ 1207 if (entry->action == BTRFS_ADD_DELAYED_REF) 1208 found = true; 1209 break; 1210 } 1211 } 1212 spin_unlock(&head->lock); 1213 return found; 1214 } 1215 1216 void __cold btrfs_delayed_ref_exit(void) 1217 { 1218 kmem_cache_destroy(btrfs_delayed_ref_head_cachep); 1219 kmem_cache_destroy(btrfs_delayed_ref_node_cachep); 1220 kmem_cache_destroy(btrfs_delayed_extent_op_cachep); 1221 } 1222 1223 int __init btrfs_delayed_ref_init(void) 1224 { 1225 btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); 1226 if (!btrfs_delayed_ref_head_cachep) 1227 goto fail; 1228 1229 btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0); 1230 if (!btrfs_delayed_ref_node_cachep) 1231 goto fail; 1232 1233 btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0); 1234 if (!btrfs_delayed_extent_op_cachep) 1235 goto fail; 1236 1237 return 0; 1238 fail: 1239 btrfs_delayed_ref_exit(); 1240 return -ENOMEM; 1241 } 1242