// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "messages.h"
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
#include "tree-mod-log.h"
#include "fs.h"

struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_ref_node_cachep;
struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
 * delayed back reference update tracking. For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing. This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 */

bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	bool ret = false;
	u64 reserved;

	spin_lock(&global_rsv->lock);
	reserved = global_rsv->reserved;
	spin_unlock(&global_rsv->lock);

	/*
	 * Since the global reserve is just kind of magic we don't really want
	 * to rely on it to save our bacon, so if our size is more than the
	 * delayed_refs_rsv and the global rsv then it's time to think about
	 * bailing.
	 */
	spin_lock(&delayed_refs_rsv->lock);
	reserved += delayed_refs_rsv->reserved;
	if (delayed_refs_rsv->size >= reserved)
		ret = true;
	spin_unlock(&delayed_refs_rsv->lock);
	return ret;
}

/*
 * Release a ref head's reservation.
 *
 * @fs_info:  the filesystem
 * @nr_refs:  number of delayed refs to drop
 * @nr_csums: number of csum items to drop
 *
 * Drops the delayed ref head's count from the delayed refs rsv and frees any
 * excess reservation we had.
 */
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
	u64 num_bytes;
	u64 released;

	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs);
	num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);

	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
	if (released)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}
/*
 * Adjust the size of the delayed refs rsv.
 *
 * This is to be called any time we may have adjusted trans->delayed_ref_updates
 * or trans->delayed_ref_csum_deletions; it will calculate the additional size
 * and add it to the delayed_refs_rsv.
 */
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv;
	u64 num_bytes;
	u64 reserved_bytes;

	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
	num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
						       trans->delayed_ref_csum_deletions);

	if (num_bytes == 0)
		return;

	/*
	 * Try to take num_bytes from the transaction's local delayed reserve.
	 * If not possible, try to take as much as is available. If the local
	 * reserve doesn't have enough reserved space, the delayed refs reserve
	 * will be refilled next time btrfs_delayed_refs_rsv_refill() is called
	 * by someone or if a transaction commit is triggered before that, the
	 * global block reserve will be used. We want to minimize using the
	 * global block reserve for cases we can account for in advance, to
	 * avoid exhausting it and reaching -ENOSPC during a transaction commit.
	 */
	spin_lock(&local_rsv->lock);
	reserved_bytes = min(num_bytes, local_rsv->reserved);
	local_rsv->reserved -= reserved_bytes;
	local_rsv->full = (local_rsv->reserved >= local_rsv->size);
	spin_unlock(&local_rsv->lock);

	spin_lock(&delayed_rsv->lock);
	delayed_rsv->size += num_bytes;
	delayed_rsv->reserved += reserved_bytes;
	delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size);
	spin_unlock(&delayed_rsv->lock);
	trans->delayed_ref_updates = 0;
	trans->delayed_ref_csum_deletions = 0;
}

/*
 * Adjust the size of the delayed refs block reserve for 1 block group item
 * insertion, used after allocating a block group.
 */
void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;

	spin_lock(&delayed_rsv->lock);
	/*
	 * Inserting a block group item does not require changing the free space
	 * tree, only the extent tree or the block group tree, so this is all we
	 * need.
	 */
	delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1);
	delayed_rsv->full = false;
	spin_unlock(&delayed_rsv->lock);
}

/*
 * Adjust the size of the delayed refs block reserve to release space for 1
 * block group item insertion.
 */
void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	u64 released;

	released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
	if (released > 0)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}

/*
 * Adjust the size of the delayed refs block reserve for 1 block group item
 * update.
 */
void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;

	spin_lock(&delayed_rsv->lock);
	/*
	 * Updating a block group item does not result in new nodes/leaves and
	 * does not require changing the free space tree, only the extent tree
	 * or the block group tree, so this is all we need.
	 */
	delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1);
	delayed_rsv->full = false;
	spin_unlock(&delayed_rsv->lock);
}

/*
 * Adjust the size of the delayed refs block reserve to release space for 1
 * block group item update.
 */
void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1);
	u64 released;

	released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
	if (released > 0)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}
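
/*
 * Note on the bookkeeping in the helpers above: a block reserve's "size"
 * tracks how much space we think the queued delayed ref work needs, while
 * "reserved" tracks how much space has actually been reserved from the
 * space_info. The inc helpers only grow "size"; actually reserving the space
 * is left to btrfs_update_delayed_refs_rsv(), btrfs_delayed_refs_rsv_refill()
 * or, as a last resort, the global block reserve at transaction commit time.
 */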
/*
 * Refill based on our delayed refs usage.
 *
 * @fs_info: the filesystem
 * @flush:   control how we can flush for this reservation.
 *
 * This will refill the delayed block_rsv up to 1 item's worth of space and
 * will return -ENOSPC if we can't make the reservation.
 */
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
				  enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_space_info *space_info = block_rsv->space_info;
	u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
	u64 num_bytes = 0;
	u64 refilled_bytes;
	u64 to_free;
	int ret = -ENOSPC;

	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved < block_rsv->size) {
		num_bytes = block_rsv->size - block_rsv->reserved;
		num_bytes = min(num_bytes, limit);
	}
	spin_unlock(&block_rsv->lock);

	if (!num_bytes)
		return 0;

	ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
	if (ret)
		return ret;

	/*
	 * We may have raced with someone else, so check again whether the
	 * block reserve is still not full and release any excess space.
	 */
	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved < block_rsv->size) {
		u64 needed = block_rsv->size - block_rsv->reserved;

		if (num_bytes >= needed) {
			block_rsv->reserved += needed;
			block_rsv->full = true;
			to_free = num_bytes - needed;
			refilled_bytes = needed;
		} else {
			block_rsv->reserved += num_bytes;
			to_free = 0;
			refilled_bytes = num_bytes;
		}
	} else {
		to_free = num_bytes;
		refilled_bytes = 0;
	}
	spin_unlock(&block_rsv->lock);

	if (to_free > 0)
		btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);

	if (refilled_bytes > 0)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
					      refilled_bytes, 1);
	return 0;
}

/*
 * Compare two delayed data backrefs with the same bytenr and type.
 */
static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
			  struct btrfs_delayed_ref_node *ref2)
{
	if (ref1->data_ref.objectid < ref2->data_ref.objectid)
		return -1;
	if (ref1->data_ref.objectid > ref2->data_ref.objectid)
		return 1;
	if (ref1->data_ref.offset < ref2->data_ref.offset)
		return -1;
	if (ref1->data_ref.offset > ref2->data_ref.offset)
		return 1;
	return 0;
}

static int comp_refs(struct btrfs_delayed_ref_node *ref1,
		     struct btrfs_delayed_ref_node *ref2,
		     bool check_seq)
{
	int ret = 0;

	if (ref1->type < ref2->type)
		return -1;
	if (ref1->type > ref2->type)
		return 1;
	if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	} else {
		if (ref1->ref_root < ref2->ref_root)
			return -1;
		if (ref1->ref_root > ref2->ref_root)
			return 1;
		if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY)
			ret = comp_data_refs(ref1, ref2);
	}
	if (ret)
		return ret;
	if (check_seq) {
		if (ref1->seq < ref2->seq)
			return -1;
		if (ref1->seq > ref2->seq)
			return 1;
	}
	return 0;
}
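
/*
 * Delayed refs are kept in two levels of rbtrees: delayed_refs->href_root
 * holds one btrfs_delayed_ref_head per extent, keyed by bytenr, and each
 * head's ref_tree holds the individual btrfs_delayed_ref_node entries,
 * ordered by comp_refs() (type, parent/root, data ref and seq). Both insert
 * helpers below return the existing entry if one already matches, or NULL
 * after linking the new node.
 */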
/* Insert a new ref head into the rbtree of ref heads. */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
						   struct rb_node *node)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_head *entry;
	struct btrfs_delayed_ref_head *ins;
	u64 bytenr;
	bool leftmost = true;

	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
	bytenr = ins->bytenr;
	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
				 href_node);

		if (bytenr < entry->bytenr) {
			p = &(*p)->rb_left;
		} else if (bytenr > entry->bytenr) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return entry;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);
	return NULL;
}

static struct btrfs_delayed_ref_node *tree_insert(struct rb_root_cached *root,
						  struct btrfs_delayed_ref_node *ins)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *node = &ins->ref_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	bool leftmost = true;

	while (*p) {
		int comp;

		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 ref_node);
		comp = comp_refs(ins, entry, true);
		if (comp < 0) {
			p = &(*p)->rb_left;
		} else if (comp > 0) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return entry;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);
	return NULL;
}

static struct btrfs_delayed_ref_head *find_first_ref_head(
		struct btrfs_delayed_ref_root *dr)
{
	struct rb_node *n;
	struct btrfs_delayed_ref_head *entry;

	n = rb_first_cached(&dr->href_root);
	if (!n)
		return NULL;

	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);

	return entry;
}

/*
 * Find a head entry based on bytenr. This returns the delayed ref head if it
 * was able to find one, or NULL if nothing was in that spot. If return_bigger
 * is given, the next bigger entry is returned if no exact match is found.
 */
static struct btrfs_delayed_ref_head *find_ref_head(
		struct btrfs_delayed_ref_root *dr, u64 bytenr,
		bool return_bigger)
{
	struct rb_root *root = &dr->href_root.rb_root;
	struct rb_node *n;
	struct btrfs_delayed_ref_head *entry;

	n = root->rb_node;
	entry = NULL;
	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);

		if (bytenr < entry->bytenr)
			n = n->rb_left;
		else if (bytenr > entry->bytenr)
			n = n->rb_right;
		else
			return entry;
	}
	if (entry && return_bigger) {
		if (bytenr > entry->bytenr) {
			n = rb_next(&entry->href_node);
			if (!n)
				return NULL;
			entry = rb_entry(n, struct btrfs_delayed_ref_head,
					 href_node);
		}
		return entry;
	}
	return NULL;
}
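
/*
 * Try to acquire the mutex of a delayed ref head while holding the delayed
 * refs spinlock. If the mutex is contended, the spinlock is dropped while we
 * sleep on the mutex and is re-taken afterwards. Returns -EAGAIN if the head
 * was removed from the rbtree in the meantime (the caller must retry), or 0
 * with the head's mutex held.
 */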
int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
			   struct btrfs_delayed_ref_head *head)
{
	lockdep_assert_held(&delayed_refs->lock);
	if (mutex_trylock(&head->mutex))
		return 0;

	refcount_inc(&head->refs);
	spin_unlock(&delayed_refs->lock);

	mutex_lock(&head->mutex);
	spin_lock(&delayed_refs->lock);
	if (RB_EMPTY_NODE(&head->href_node)) {
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		return -EAGAIN;
	}
	btrfs_put_delayed_ref_head(head);
	return 0;
}

static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
				    struct btrfs_delayed_ref_root *delayed_refs,
				    struct btrfs_delayed_ref_head *head,
				    struct btrfs_delayed_ref_node *ref)
{
	lockdep_assert_held(&head->lock);
	rb_erase_cached(&ref->ref_node, &head->ref_tree);
	RB_CLEAR_NODE(&ref->ref_node);
	if (!list_empty(&ref->add_list))
		list_del(&ref->add_list);
	btrfs_put_delayed_ref(ref);
	atomic_dec(&delayed_refs->num_entries);
	btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
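
/*
 * Try to merge @ref with the refs that follow it in the head's ref_tree.
 * Refs that compare equal with comp_refs() (ignoring seq) and whose seq is
 * below @seq (when @seq is non-zero) have their ref_mod counts combined:
 * same actions accumulate, opposite actions cancel against each other.
 * Returns true if @ref itself was dropped or swapped, in which case the
 * caller restarts its scan of the ref tree.
 */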
static bool merge_ref(struct btrfs_fs_info *fs_info,
		      struct btrfs_delayed_ref_root *delayed_refs,
		      struct btrfs_delayed_ref_head *head,
		      struct btrfs_delayed_ref_node *ref,
		      u64 seq)
{
	struct btrfs_delayed_ref_node *next;
	struct rb_node *node = rb_next(&ref->ref_node);
	bool done = false;

	while (!done && node) {
		int mod;

		next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		node = rb_next(node);
		if (seq && next->seq >= seq)
			break;
		if (comp_refs(ref, next, false))
			break;

		if (ref->action == next->action) {
			mod = next->ref_mod;
		} else {
			if (ref->ref_mod < next->ref_mod) {
				swap(ref, next);
				done = true;
			}
			mod = -next->ref_mod;
		}

		drop_delayed_ref(fs_info, delayed_refs, head, next);
		ref->ref_mod += mod;
		if (ref->ref_mod == 0) {
			drop_delayed_ref(fs_info, delayed_refs, head, ref);
			done = true;
		} else {
			/*
			 * Can't have multiples of the same ref on a tree block.
			 */
			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
		}
	}

	return done;
}

void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
			      struct btrfs_delayed_ref_root *delayed_refs,
			      struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_node *ref;
	struct rb_node *node;
	u64 seq = 0;

	lockdep_assert_held(&head->lock);

	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		return;

	/* We don't have too many refs to merge for data. */
	if (head->is_data)
		return;

	seq = btrfs_tree_mod_log_lowest_seq(fs_info);
again:
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		if (seq && ref->seq >= seq)
			continue;
		if (merge_ref(fs_info, delayed_refs, head, ref, seq))
			goto again;
	}
}

int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
	int ret = 0;
	u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);

	if (min_seq != 0 && seq >= min_seq) {
		btrfs_debug(fs_info,
			    "holding back delayed_ref %llu, lowest is %llu",
			    seq, min_seq);
		ret = 1;
	}

	return ret;
}
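
/*
 * Select the next delayed ref head to process. The search starts at
 * run_delayed_start and wraps around to the first head once, skipping heads
 * that are already being processed. The returned head is marked as processing
 * and run_delayed_start is advanced past it; NULL is returned when there is
 * nothing left to pick.
 */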
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
		struct btrfs_delayed_ref_root *delayed_refs)
{
	struct btrfs_delayed_ref_head *head;

	lockdep_assert_held(&delayed_refs->lock);
again:
	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
			     true);
	if (!head && delayed_refs->run_delayed_start != 0) {
		delayed_refs->run_delayed_start = 0;
		head = find_first_ref_head(delayed_refs);
	}
	if (!head)
		return NULL;

	while (head->processing) {
		struct rb_node *node;

		node = rb_next(&head->href_node);
		if (!node) {
			if (delayed_refs->run_delayed_start == 0)
				return NULL;
			delayed_refs->run_delayed_start = 0;
			goto again;
		}
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
	}

	head->processing = true;
	WARN_ON(delayed_refs->num_heads_ready == 0);
	delayed_refs->num_heads_ready--;
	delayed_refs->run_delayed_start = head->bytenr +
		head->num_bytes;
	return head;
}

void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
			   struct btrfs_delayed_ref_head *head)
{
	lockdep_assert_held(&delayed_refs->lock);
	lockdep_assert_held(&head->lock);

	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
	RB_CLEAR_NODE(&head->href_node);
	atomic_dec(&delayed_refs->num_entries);
	delayed_refs->num_heads--;
	if (!head->processing)
		delayed_refs->num_heads_ready--;
}

/*
 * Helper to insert the ref_node to the tail or merge with tail.
 *
 * Return false if the ref was inserted.
 * Return true if the ref was merged into an existing one (and therefore can be
 * freed by the caller).
 */
static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_head *href,
			       struct btrfs_delayed_ref_node *ref)
{
	struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_node *exist;
	int mod;

	spin_lock(&href->lock);
	exist = tree_insert(&href->ref_tree, ref);
	if (!exist) {
		if (ref->action == BTRFS_ADD_DELAYED_REF)
			list_add_tail(&ref->add_list, &href->ref_add_list);
		atomic_inc(&root->num_entries);
		spin_unlock(&href->lock);
		trans->delayed_ref_updates++;
		return false;
	}

	/* Now we are sure we can merge */
	if (exist->action == ref->action) {
		mod = ref->ref_mod;
	} else {
		/* Need to change action */
		if (exist->ref_mod < ref->ref_mod) {
			exist->action = ref->action;
			mod = -exist->ref_mod;
			exist->ref_mod = ref->ref_mod;
			if (ref->action == BTRFS_ADD_DELAYED_REF)
				list_add_tail(&exist->add_list,
					      &href->ref_add_list);
			else if (ref->action == BTRFS_DROP_DELAYED_REF) {
				ASSERT(!list_empty(&exist->add_list));
				list_del(&exist->add_list);
			} else {
				ASSERT(0);
			}
		} else
			mod = -ref->ref_mod;
	}
	exist->ref_mod += mod;

	/* remove existing tail if its ref_mod is zero */
	if (exist->ref_mod == 0)
		drop_delayed_ref(trans->fs_info, root, href, exist);
	spin_unlock(&href->lock);
	return true;
}
/*
 * Helper function to update the accounting in the head ref.
 * @existing and @update must have the same bytenr.
 */
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
					      struct btrfs_delayed_ref_head *existing,
					      struct btrfs_delayed_ref_head *update)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int old_ref_mod;

	BUG_ON(existing->is_data != update->is_data);

	spin_lock(&existing->lock);

	/*
	 * When freeing an extent, we may not know the owning root when we
	 * first create the head_ref. However, some deref before the last deref
	 * will know it, so we just need to update the head_ref accordingly.
	 */
	if (!existing->owning_root)
		existing->owning_root = update->owning_root;

	if (update->must_insert_reserved) {
		/*
		 * If the extent was freed and then reallocated before the
		 * delayed ref entries were processed, we can end up with an
		 * existing head ref without the must_insert_reserved flag set.
		 * Set it again here.
		 */
		existing->must_insert_reserved = update->must_insert_reserved;
		existing->owning_root = update->owning_root;

		/*
		 * update the num_bytes so we make sure the accounting
		 * is done correctly
		 */
		existing->num_bytes = update->num_bytes;

	}

	if (update->extent_op) {
		if (!existing->extent_op) {
			existing->extent_op = update->extent_op;
		} else {
			if (update->extent_op->update_key) {
				memcpy(&existing->extent_op->key,
				       &update->extent_op->key,
				       sizeof(update->extent_op->key));
				existing->extent_op->update_key = true;
			}
			if (update->extent_op->update_flags) {
				existing->extent_op->flags_to_set |=
					update->extent_op->flags_to_set;
				existing->extent_op->update_flags = true;
			}
			btrfs_free_delayed_extent_op(update->extent_op);
		}
	}
	/*
	 * Update the reference mod on the head to reflect this new operation.
	 * We only need the lock in this case because the head could be
	 * undergoing processing right now; for refs we just added we know
	 * we're fine.
	 */
	old_ref_mod = existing->total_ref_mod;
	existing->ref_mod += update->ref_mod;
	existing->total_ref_mod += update->ref_mod;

	/*
	 * If we are going from a positive ref mod to a negative or vice
	 * versa we need to make sure to adjust pending_csums accordingly.
	 * We reserve bytes for csum deletion when adding or updating a ref
	 * head, see add_delayed_ref_head() for more details.
	 */
	if (existing->is_data) {
		u64 csum_leaves =
			btrfs_csum_bytes_to_leaves(fs_info,
						   existing->num_bytes);

		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
			delayed_refs->pending_csums -= existing->num_bytes;
			btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves);
		}
		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
			delayed_refs->pending_csums += existing->num_bytes;
			trans->delayed_ref_csum_deletions += csum_leaves;
		}
	}

	spin_unlock(&existing->lock);
}
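
/*
 * Initialize a newly allocated ref head from @generic_ref. @reserved must be
 * zero unless this is a data extent (see the BUG_ON below). If @qrecord is
 * provided it is initialized too, so that the extent can later be traced for
 * qgroup accounting.
 */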
static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
				  struct btrfs_ref *generic_ref,
				  struct btrfs_qgroup_extent_record *qrecord,
				  u64 reserved)
{
	int count_mod = 1;
	bool must_insert_reserved = false;

	/* If reserved is provided, it must be a data extent. */
	BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved);

	switch (generic_ref->action) {
	case BTRFS_ADD_DELAYED_REF:
		/* count_mod is already set to 1. */
		break;
	case BTRFS_UPDATE_DELAYED_HEAD:
		count_mod = 0;
		break;
	case BTRFS_DROP_DELAYED_REF:
		/*
		 * The head node stores the sum of all the mods, so dropping a ref
		 * should drop the sum in the head node by one.
		 */
		count_mod = -1;
		break;
	case BTRFS_ADD_DELAYED_EXTENT:
		/*
		 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the
		 * reserved accounting when the extent is finally added, or if a
		 * later modification deletes the delayed ref without ever
		 * inserting the extent into the extent allocation tree.
		 * ref->must_insert_reserved is the flag used to record that
		 * accounting mods are required.
		 *
		 * Once we record must_insert_reserved, switch the action to
		 * BTRFS_ADD_DELAYED_REF because other special casing is not
		 * required.
		 */
		must_insert_reserved = true;
		break;
	}

	refcount_set(&head_ref->refs, 1);
	head_ref->bytenr = generic_ref->bytenr;
	head_ref->num_bytes = generic_ref->num_bytes;
	head_ref->ref_mod = count_mod;
	head_ref->reserved_bytes = reserved;
	head_ref->must_insert_reserved = must_insert_reserved;
	head_ref->owning_root = generic_ref->owning_root;
	head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA);
	head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);
	head_ref->ref_tree = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&head_ref->ref_add_list);
	RB_CLEAR_NODE(&head_ref->href_node);
	head_ref->processing = false;
	head_ref->total_ref_mod = count_mod;
	spin_lock_init(&head_ref->lock);
	mutex_init(&head_ref->mutex);

	/* If not metadata set an impossible level to help debugging. */
	if (generic_ref->type == BTRFS_REF_METADATA)
		head_ref->level = generic_ref->tree_ref.level;
	else
		head_ref->level = U8_MAX;

	if (qrecord) {
		if (generic_ref->ref_root && reserved) {
			qrecord->data_rsv = reserved;
			qrecord->data_rsv_refroot = generic_ref->ref_root;
		}
		qrecord->bytenr = generic_ref->bytenr;
		qrecord->num_bytes = generic_ref->num_bytes;
		qrecord->old_roots = NULL;
	}
}
/*
 * Helper function to actually insert a head node into the rbtree.
 * This does all the dirty work in terms of maintaining the correct
 * overall modification count.
 *
 * Returns an error pointer in case of an error.
 */
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
		     struct btrfs_delayed_ref_head *head_ref,
		     struct btrfs_qgroup_extent_record *qrecord,
		     int action, bool *qrecord_inserted_ret)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_head *existing;
	struct btrfs_delayed_ref_root *delayed_refs;
	bool qrecord_inserted = false;

	delayed_refs = &trans->transaction->delayed_refs;

	/* Record qgroup extent info if provided */
	if (qrecord) {
		int ret;

		ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord);
		if (ret) {
			/* Clean up if insertion fails or item exists. */
			xa_release(&delayed_refs->dirty_extents,
				   qrecord->bytenr >> fs_info->sectorsize_bits);
			/* Caller responsible for freeing qrecord on error. */
			if (ret < 0)
				return ERR_PTR(ret);
			kfree(qrecord);
		} else {
			qrecord_inserted = true;
		}
	}

	trace_add_delayed_ref_head(fs_info, head_ref, action);

	existing = htree_insert(&delayed_refs->href_root,
				&head_ref->href_node);
	if (existing) {
		update_existing_head_ref(trans, existing, head_ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
		head_ref = existing;
	} else {
		/*
		 * We reserve the amount of bytes needed to delete csums when
		 * adding the ref head and not when adding individual drop refs
		 * since the csum items are deleted only after running the last
		 * delayed drop ref (the data extent's ref count drops to 0).
		 */
		if (head_ref->is_data && head_ref->ref_mod < 0) {
			delayed_refs->pending_csums += head_ref->num_bytes;
			trans->delayed_ref_csum_deletions +=
				btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes);
		}
		delayed_refs->num_heads++;
		delayed_refs->num_heads_ready++;
		atomic_inc(&delayed_refs->num_entries);
	}
	if (qrecord_inserted_ret)
		*qrecord_inserted_ret = qrecord_inserted;

	return head_ref;
}

/*
 * Initialize the structure which represents a modification to an extent.
 *
 * @fs_info:   Internal to the mounted filesystem mount structure.
 *
 * @ref:       The structure which is going to be initialized.
 *
 * @bytenr:    The logical address of the extent for which a modification is
 *             going to be recorded.
 *
 * @num_bytes: Size of the extent whose modification is being recorded.
 *
 * @ref_root:  The id of the root where this modification has originated, this
 *             can be either one of the well-known metadata trees or the
 *             subvolume id which references this extent.
 *
 * @action:    Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
 *             BTRFS_ADD_DELAYED_EXTENT
 *
 * @ref_type:  Holds the type of the extent which is being recorded, can be
 *             one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
 *             when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
 *             BTRFS_EXTENT_DATA_REF_KEY when recording a data extent
 */
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
				    struct btrfs_delayed_ref_node *ref,
				    struct btrfs_ref *generic_ref)
{
	int action = generic_ref->action;
	u64 seq = 0;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	if (is_fstree(generic_ref->ref_root))
		seq = atomic64_read(&fs_info->tree_mod_seq);

	refcount_set(&ref->refs, 1);
	ref->bytenr = generic_ref->bytenr;
	ref->num_bytes = generic_ref->num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->seq = seq;
	ref->type = btrfs_ref_type(generic_ref);
	ref->ref_root = generic_ref->ref_root;
	ref->parent = generic_ref->parent;
	RB_CLEAR_NODE(&ref->ref_node);
	INIT_LIST_HEAD(&ref->add_list);

	if (generic_ref->type == BTRFS_REF_DATA)
		ref->data_ref = generic_ref->data_ref;
	else
		ref->tree_ref = generic_ref->tree_ref;
}

void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
			 bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	/* If @real_root not set, use @root as fallback */
	generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
	generic_ref->tree_ref.level = level;
	generic_ref->type = BTRFS_REF_METADATA;
	if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
			     (!mod_root || is_fstree(mod_root))))
		generic_ref->skip_qgroup = true;
	else
		generic_ref->skip_qgroup = false;
}

void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
			 u64 mod_root, bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	/* If @real_root not set, use @root as fallback */
	generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
	generic_ref->data_ref.objectid = ino;
	generic_ref->data_ref.offset = offset;
	generic_ref->type = BTRFS_REF_DATA;
	if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
			     (!mod_root || is_fstree(mod_root))))
		generic_ref->skip_qgroup = true;
	else
		generic_ref->skip_qgroup = false;
}
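
/*
 * Common helper to queue a delayed ref: it allocates the ref node and the ref
 * head (plus a qgroup extent record when qgroup accounting applies), inserts
 * the head into the per-transaction rbtree (merging with an existing head if
 * there is one), inserts the ref into the head's ref tree (merging with an
 * equivalent ref if possible) and finally updates the delayed refs block
 * reserve.
 */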
static int add_delayed_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_ref *generic_ref,
			   struct btrfs_delayed_extent_op *extent_op,
			   u64 reserved)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_node *node;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_head *new_head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_qgroup_extent_record *record = NULL;
	bool qrecord_inserted;
	int action = generic_ref->action;
	bool merged;
	int ret;

	node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS);
	if (!node)
		return -ENOMEM;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref) {
		ret = -ENOMEM;
		goto free_node;
	}

	if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
		record = kzalloc(sizeof(*record), GFP_NOFS);
		if (!record) {
			ret = -ENOMEM;
			goto free_head_ref;
		}
		if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents,
			       generic_ref->bytenr >> fs_info->sectorsize_bits,
			       GFP_NOFS)) {
			ret = -ENOMEM;
			goto free_record;
		}
	}

	init_delayed_ref_common(fs_info, node, generic_ref);
	init_delayed_ref_head(head_ref, generic_ref, record, reserved);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	new_head_ref = add_delayed_ref_head(trans, head_ref, record,
					    action, &qrecord_inserted);
	if (IS_ERR(new_head_ref)) {
		spin_unlock(&delayed_refs->lock);
		ret = PTR_ERR(new_head_ref);
		goto free_record;
	}
	head_ref = new_head_ref;

	merged = insert_delayed_ref(trans, head_ref, node);
	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);

	if (generic_ref->type == BTRFS_REF_DATA)
		trace_add_delayed_data_ref(trans->fs_info, node);
	else
		trace_add_delayed_tree_ref(trans->fs_info, node);
	if (merged)
		kmem_cache_free(btrfs_delayed_ref_node_cachep, node);

	if (qrecord_inserted)
		return btrfs_qgroup_trace_extent_post(trans, record);
	return 0;

free_record:
	kfree(record);
free_head_ref:
	kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
free_node:
	kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
	return ret;
}

/*
 * Add a delayed tree ref. This does all of the accounting required to make sure
 * the delayed ref is eventually processed before this transaction commits.
 */
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       struct btrfs_delayed_extent_op *extent_op)
{
	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
	return add_delayed_ref(trans, generic_ref, extent_op, 0);
}
/*
 * Add a delayed data ref. It's similar to btrfs_add_delayed_tree_ref.
 */
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       u64 reserved)
{
	ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action);
	return add_delayed_ref(trans, generic_ref, NULL, reserved);
}

int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes, u8 level,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_head *head_ref_ret;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_ref generic_ref = {
		.type = BTRFS_REF_METADATA,
		.action = BTRFS_UPDATE_DELAYED_HEAD,
		.bytenr = bytenr,
		.num_bytes = num_bytes,
		.tree_ref.level = level,
	};

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref)
		return -ENOMEM;

	init_delayed_ref_head(head_ref, &generic_ref, NULL, 0);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL,
					    BTRFS_UPDATE_DELAYED_HEAD, NULL);
	spin_unlock(&delayed_refs->lock);

	if (IS_ERR(head_ref_ret)) {
		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
		return PTR_ERR(head_ref_ret);
	}

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);
	return 0;
}

void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
	if (refcount_dec_and_test(&ref->refs)) {
		WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
		kmem_cache_free(btrfs_delayed_ref_node_cachep, ref);
	}
}

/*
 * This does a simple search for the head node for a given extent. Returns the
 * head node if found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
	lockdep_assert_held(&delayed_refs->lock);

	return find_ref_head(delayed_refs, bytenr, false);
}

static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
{
	int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY;

	if (type < entry->type)
		return -1;
	if (type > entry->type)
		return 1;

	if (type == BTRFS_TREE_BLOCK_REF_KEY) {
		if (root < entry->ref_root)
			return -1;
		if (root > entry->ref_root)
			return 1;
	} else {
		if (parent < entry->parent)
			return -1;
		if (parent > entry->parent)
			return 1;
	}
	return 0;
}
/*
 * Check to see if a given root/parent reference is attached to the head. This
 * only checks for BTRFS_ADD_DELAYED_REF references that match, as that
 * indicates the reference exists for the given root or parent. This is for
 * tree blocks only.
 *
 * @head:   the head of the bytenr we're searching.
 * @root:   the root objectid of the reference if it is a normal reference.
 * @parent: the parent if this is a shared backref.
 */
bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
				 u64 root, u64 parent)
{
	struct rb_node *node;
	bool found = false;

	lockdep_assert_held(&head->mutex);

	spin_lock(&head->lock);
	node = head->ref_tree.rb_root.rb_node;
	while (node) {
		struct btrfs_delayed_ref_node *entry;
		int ret;

		entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		ret = find_comp(entry, root, parent);
		if (ret < 0) {
			node = node->rb_left;
		} else if (ret > 0) {
			node = node->rb_right;
		} else {
			/*
			 * We only want to count ADD actions, as drops mean the
			 * ref doesn't exist.
			 */
			if (entry->action == BTRFS_ADD_DELAYED_REF)
				found = true;
			break;
		}
	}
	spin_unlock(&head->lock);
	return found;
}

void __cold btrfs_delayed_ref_exit(void)
{
	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
	kmem_cache_destroy(btrfs_delayed_ref_node_cachep);
	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}

int __init btrfs_delayed_ref_init(void)
{
	btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0);
	if (!btrfs_delayed_ref_head_cachep)
		goto fail;

	btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0);
	if (!btrfs_delayed_ref_node_cachep)
		goto fail;

	btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0);
	if (!btrfs_delayed_extent_op_cachep)
		goto fail;

	return 0;
fail:
	btrfs_delayed_ref_exit();
	return -ENOMEM;
}