// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "messages.h"
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
#include "tree-mod-log.h"
#include "fs.h"

struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_ref_node_cachep;
struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
 * delayed back reference update tracking. For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing. This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 */

bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	bool ret = false;
	u64 reserved;

	spin_lock(&global_rsv->lock);
	reserved = global_rsv->reserved;
	spin_unlock(&global_rsv->lock);

	/*
	 * Since the global reserve is just kind of magic we don't really want
	 * to rely on it to save our bacon, so if our size is more than the
	 * delayed_refs_rsv and the global rsv then it's time to think about
	 * bailing.
	 */
	spin_lock(&delayed_refs_rsv->lock);
	reserved += delayed_refs_rsv->reserved;
	if (delayed_refs_rsv->size >= reserved)
		ret = true;
	spin_unlock(&delayed_refs_rsv->lock);
	return ret;
}

/*
 * Release a ref head's reservation.
 *
 * @fs_info:  the filesystem
 * @nr_refs:  number of delayed refs to drop
 * @nr_csums: number of csum items to drop
 *
 * Drops the delayed ref head's count from the delayed refs rsv and frees any
 * excess reservation we had.
 */
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
	u64 num_bytes;
	u64 released;

	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs);
	num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);

	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
	if (released)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}

/*
 * Adjust the size of the delayed refs rsv.
 *
 * This is to be called anytime we may have adjusted trans->delayed_ref_updates
 * or trans->delayed_ref_csum_deletions. It will calculate the additional size
 * and add it to the delayed_refs_rsv.
 */
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv;
	u64 num_bytes;
	u64 reserved_bytes;

	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
	num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
						       trans->delayed_ref_csum_deletions);

	if (num_bytes == 0)
		return;

	/*
	 * Try to take num_bytes from the transaction's local delayed reserve.
	 * If not possible, try to take as much as is available.
	 * If the local reserve doesn't have enough reserved space, the delayed
	 * refs reserve will be refilled next time btrfs_delayed_refs_rsv_refill()
	 * is called by someone or if a transaction commit is triggered before
	 * that, the global block reserve will be used. We want to minimize using
	 * the global block reserve for cases we can account for in advance, to
	 * avoid exhausting it and reaching -ENOSPC during a transaction commit.
	 */
	spin_lock(&local_rsv->lock);
	reserved_bytes = min(num_bytes, local_rsv->reserved);
	local_rsv->reserved -= reserved_bytes;
	local_rsv->full = (local_rsv->reserved >= local_rsv->size);
	spin_unlock(&local_rsv->lock);

	spin_lock(&delayed_rsv->lock);
	delayed_rsv->size += num_bytes;
	delayed_rsv->reserved += reserved_bytes;
	delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size);
	spin_unlock(&delayed_rsv->lock);
	trans->delayed_ref_updates = 0;
	trans->delayed_ref_csum_deletions = 0;
}

/*
 * Adjust the size of the delayed refs block reserve for 1 block group item
 * insertion, used after allocating a block group.
 */
void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;

	spin_lock(&delayed_rsv->lock);
	/*
	 * Inserting a block group item does not require changing the free space
	 * tree, only the extent tree or the block group tree, so this is all we
	 * need.
	 */
	delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1);
	delayed_rsv->full = false;
	spin_unlock(&delayed_rsv->lock);
}

/*
 * Adjust the size of the delayed refs block reserve to release space for 1
 * block group item insertion.
 */
void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	u64 released;

	released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
	if (released > 0)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}

/*
 * Adjust the size of the delayed refs block reserve for 1 block group item
 * update.
 */
void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;

	spin_lock(&delayed_rsv->lock);
	/*
	 * Updating a block group item does not result in new nodes/leaves and
	 * does not require changing the free space tree, only the extent tree
	 * or the block group tree, so this is all we need.
	 */
	delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1);
	delayed_rsv->full = false;
	spin_unlock(&delayed_rsv->lock);
}

/*
 * Adjust the size of the delayed refs block reserve to release space for 1
 * block group item update.
 */
void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1);
	u64 released;

	released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
	if (released > 0)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}

/*
 * Transfer bytes to our delayed refs rsv.
 *
 * @fs_info:   the filesystem
 * @num_bytes: number of bytes to transfer
 *
 * This transfers up to the num_bytes amount, previously reserved, to the
 * delayed_refs_rsv. Any extra bytes are returned to the space info.
 */
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
				       u64 num_bytes)
{
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	u64 to_free = 0;

	spin_lock(&delayed_refs_rsv->lock);
	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
		u64 delta = delayed_refs_rsv->size -
			delayed_refs_rsv->reserved;
		if (num_bytes > delta) {
			to_free = num_bytes - delta;
			num_bytes = delta;
		}
	} else {
		to_free = num_bytes;
		num_bytes = 0;
	}

	if (num_bytes)
		delayed_refs_rsv->reserved += num_bytes;
	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
		delayed_refs_rsv->full = true;
	spin_unlock(&delayed_refs_rsv->lock);

	if (num_bytes)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, num_bytes, 1);
	if (to_free)
		btrfs_space_info_free_bytes_may_use(fs_info,
				delayed_refs_rsv->space_info, to_free);
}

/*
 * Refill based on our delayed refs usage.
 *
 * @fs_info: the filesystem
 * @flush:   control how we can flush for this reservation.
 *
 * This will refill the delayed block_rsv up to 1 item's size worth of space
 * and will return -ENOSPC if we can't make the reservation.
 */
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
				  enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_space_info *space_info = block_rsv->space_info;
	u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
	u64 num_bytes = 0;
	u64 refilled_bytes;
	u64 to_free;
	int ret = -ENOSPC;

	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved < block_rsv->size) {
		num_bytes = block_rsv->size - block_rsv->reserved;
		num_bytes = min(num_bytes, limit);
	}
	spin_unlock(&block_rsv->lock);

	if (!num_bytes)
		return 0;

	ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
	if (ret)
		return ret;

	/*
	 * We may have raced with someone else, so check again if the block
	 * reserve is still not full and release any excess space.
	 */
	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved < block_rsv->size) {
		u64 needed = block_rsv->size - block_rsv->reserved;

		if (num_bytes >= needed) {
			block_rsv->reserved += needed;
			block_rsv->full = true;
			to_free = num_bytes - needed;
			refilled_bytes = needed;
		} else {
			block_rsv->reserved += num_bytes;
			to_free = 0;
			refilled_bytes = num_bytes;
		}
	} else {
		to_free = num_bytes;
		refilled_bytes = 0;
	}
	spin_unlock(&block_rsv->lock);

	if (to_free > 0)
		btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);

	if (refilled_bytes > 0)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
					      refilled_bytes, 1);
	return 0;
}

/*
 * compare two delayed data backrefs with same bytenr and type
 */
static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
			  struct btrfs_delayed_ref_node *ref2)
{
	if (ref1->data_ref.objectid < ref2->data_ref.objectid)
		return -1;
	if (ref1->data_ref.objectid > ref2->data_ref.objectid)
		return 1;
	if (ref1->data_ref.offset < ref2->data_ref.offset)
		return -1;
	if (ref1->data_ref.offset > ref2->data_ref.offset)
		return 1;
	return 0;
}
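
/*
 * Compare two delayed ref nodes belonging to the same head. Nodes are ordered
 * by type, then by parent (for shared back refs) or by root and data ref
 * details (for non-shared back refs), and finally by sequence number when
 * check_seq is true.
 */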
static int comp_refs(struct btrfs_delayed_ref_node *ref1,
		     struct btrfs_delayed_ref_node *ref2,
		     bool check_seq)
{
	int ret = 0;

	if (ref1->type < ref2->type)
		return -1;
	if (ref1->type > ref2->type)
		return 1;
	if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	} else {
		if (ref1->ref_root < ref2->ref_root)
			return -1;
		if (ref1->ref_root > ref2->ref_root)
			return 1;
		if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY)
			ret = comp_data_refs(ref1, ref2);
	}
	if (ret)
		return ret;
	if (check_seq) {
		if (ref1->seq < ref2->seq)
			return -1;
		if (ref1->seq > ref2->seq)
			return 1;
	}
	return 0;
}

/* insert a new ref to head ref rbtree */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
						   struct rb_node *node)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_head *entry;
	struct btrfs_delayed_ref_head *ins;
	u64 bytenr;
	bool leftmost = true;

	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
	bytenr = ins->bytenr;
	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
				 href_node);

		if (bytenr < entry->bytenr) {
			p = &(*p)->rb_left;
		} else if (bytenr > entry->bytenr) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return entry;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);
	return NULL;
}

static struct btrfs_delayed_ref_node *tree_insert(struct rb_root_cached *root,
						  struct btrfs_delayed_ref_node *ins)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *node = &ins->ref_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	bool leftmost = true;

	while (*p) {
		int comp;

		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 ref_node);
		comp = comp_refs(ins, entry, true);
		if (comp < 0) {
			p = &(*p)->rb_left;
		} else if (comp > 0) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return entry;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);
	return NULL;
}

static struct btrfs_delayed_ref_head *find_first_ref_head(
		struct btrfs_delayed_ref_root *dr)
{
	struct rb_node *n;
	struct btrfs_delayed_ref_head *entry;

	n = rb_first_cached(&dr->href_root);
	if (!n)
		return NULL;

	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);

	return entry;
}

/*
 * Find a head entry based on bytenr. This returns the delayed ref head if it
 * was able to find one, or NULL if nothing was in that spot. If return_bigger
 * is given, the next bigger entry is returned if no exact match is found.
 */
static struct btrfs_delayed_ref_head *find_ref_head(
		struct btrfs_delayed_ref_root *dr, u64 bytenr,
		bool return_bigger)
{
	struct rb_root *root = &dr->href_root.rb_root;
	struct rb_node *n;
	struct btrfs_delayed_ref_head *entry;

	n = root->rb_node;
	entry = NULL;
	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);

		if (bytenr < entry->bytenr)
			n = n->rb_left;
		else if (bytenr > entry->bytenr)
			n = n->rb_right;
		else
			return entry;
	}
	if (entry && return_bigger) {
		if (bytenr > entry->bytenr) {
			n = rb_next(&entry->href_node);
			if (!n)
				return NULL;
			entry = rb_entry(n, struct btrfs_delayed_ref_head,
					 href_node);
		}
		return entry;
	}
	return NULL;
}
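
/*
 * Lock the given delayed ref head's mutex. The delayed refs lock is dropped
 * while waiting for a contended mutex, so the head may have been run and
 * removed from the rbtree in the meantime; in that case -EAGAIN is returned
 * and the caller must retry.
 */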
int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
			   struct btrfs_delayed_ref_head *head)
{
	lockdep_assert_held(&delayed_refs->lock);
	if (mutex_trylock(&head->mutex))
		return 0;

	refcount_inc(&head->refs);
	spin_unlock(&delayed_refs->lock);

	mutex_lock(&head->mutex);
	spin_lock(&delayed_refs->lock);
	if (RB_EMPTY_NODE(&head->href_node)) {
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		return -EAGAIN;
	}
	btrfs_put_delayed_ref_head(head);
	return 0;
}

static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
				    struct btrfs_delayed_ref_root *delayed_refs,
				    struct btrfs_delayed_ref_head *head,
				    struct btrfs_delayed_ref_node *ref)
{
	lockdep_assert_held(&head->lock);
	rb_erase_cached(&ref->ref_node, &head->ref_tree);
	RB_CLEAR_NODE(&ref->ref_node);
	if (!list_empty(&ref->add_list))
		list_del(&ref->add_list);
	btrfs_put_delayed_ref(ref);
	atomic_dec(&delayed_refs->num_entries);
	btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
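
/*
 * Try to merge @ref with the refs that follow it in the head's ref tree and
 * refer to the same extent with the same root or parent. Opposite actions
 * cancel each other's ref_mod and refs whose ref_mod drops to zero are
 * removed. Refs with a sequence number at or above @seq are not touched.
 *
 * Returns true if @ref itself was removed from the tree (either because its
 * ref_mod reached zero or because it was replaced by a following ref), in
 * which case the caller must restart its scan.
 */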
static bool merge_ref(struct btrfs_fs_info *fs_info,
		      struct btrfs_delayed_ref_root *delayed_refs,
		      struct btrfs_delayed_ref_head *head,
		      struct btrfs_delayed_ref_node *ref,
		      u64 seq)
{
	struct btrfs_delayed_ref_node *next;
	struct rb_node *node = rb_next(&ref->ref_node);
	bool done = false;

	while (!done && node) {
		int mod;

		next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		node = rb_next(node);
		if (seq && next->seq >= seq)
			break;
		if (comp_refs(ref, next, false))
			break;

		if (ref->action == next->action) {
			mod = next->ref_mod;
		} else {
			if (ref->ref_mod < next->ref_mod) {
				swap(ref, next);
				done = true;
			}
			mod = -next->ref_mod;
		}

		drop_delayed_ref(fs_info, delayed_refs, head, next);
		ref->ref_mod += mod;
		if (ref->ref_mod == 0) {
			drop_delayed_ref(fs_info, delayed_refs, head, ref);
			done = true;
		} else {
			/*
			 * Can't have multiples of the same ref on a tree block.
			 */
			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
		}
	}

	return done;
}

void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
			      struct btrfs_delayed_ref_root *delayed_refs,
			      struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_node *ref;
	struct rb_node *node;
	u64 seq = 0;

	lockdep_assert_held(&head->lock);

	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		return;

	/* We don't have too many refs to merge for data. */
	if (head->is_data)
		return;

	seq = btrfs_tree_mod_log_lowest_seq(fs_info);
again:
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		if (seq && ref->seq >= seq)
			continue;
		if (merge_ref(fs_info, delayed_refs, head, ref, seq))
			goto again;
	}
}

int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
	int ret = 0;
	u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);

	if (min_seq != 0 && seq >= min_seq) {
		btrfs_debug(fs_info,
			    "holding back delayed_ref %llu, lowest is %llu",
			    seq, min_seq);
		ret = 1;
	}

	return ret;
}
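
/*
 * Select the next delayed ref head to process, starting at run_delayed_start
 * and wrapping around to the beginning of the rbtree if nothing is found
 * there. Heads that are already being processed are skipped. The returned
 * head is marked as processing and run_delayed_start is advanced past it;
 * NULL is returned if no suitable head exists.
 */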
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
		struct btrfs_delayed_ref_root *delayed_refs)
{
	struct btrfs_delayed_ref_head *head;

	lockdep_assert_held(&delayed_refs->lock);
again:
	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
			     true);
	if (!head && delayed_refs->run_delayed_start != 0) {
		delayed_refs->run_delayed_start = 0;
		head = find_first_ref_head(delayed_refs);
	}
	if (!head)
		return NULL;

	while (head->processing) {
		struct rb_node *node;

		node = rb_next(&head->href_node);
		if (!node) {
			if (delayed_refs->run_delayed_start == 0)
				return NULL;
			delayed_refs->run_delayed_start = 0;
			goto again;
		}
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
	}

	head->processing = true;
	WARN_ON(delayed_refs->num_heads_ready == 0);
	delayed_refs->num_heads_ready--;
	delayed_refs->run_delayed_start = head->bytenr +
		head->num_bytes;
	return head;
}

void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
			   struct btrfs_delayed_ref_head *head)
{
	lockdep_assert_held(&delayed_refs->lock);
	lockdep_assert_held(&head->lock);

	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
	RB_CLEAR_NODE(&head->href_node);
	atomic_dec(&delayed_refs->num_entries);
	delayed_refs->num_heads--;
	if (!head->processing)
		delayed_refs->num_heads_ready--;
}

/*
 * Helper to insert the ref_node to the tail or merge with tail.
 *
 * Return false if the ref was inserted.
 * Return true if the ref was merged into an existing one (and therefore can be
 * freed by the caller).
 */
static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_head *href,
			       struct btrfs_delayed_ref_node *ref)
{
	struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_node *exist;
	int mod;

	spin_lock(&href->lock);
	exist = tree_insert(&href->ref_tree, ref);
	if (!exist) {
		if (ref->action == BTRFS_ADD_DELAYED_REF)
			list_add_tail(&ref->add_list, &href->ref_add_list);
		atomic_inc(&root->num_entries);
		spin_unlock(&href->lock);
		trans->delayed_ref_updates++;
		return false;
	}

	/* Now we are sure we can merge */
	if (exist->action == ref->action) {
		mod = ref->ref_mod;
	} else {
		/* Need to change action */
		if (exist->ref_mod < ref->ref_mod) {
			exist->action = ref->action;
			mod = -exist->ref_mod;
			exist->ref_mod = ref->ref_mod;
			if (ref->action == BTRFS_ADD_DELAYED_REF)
				list_add_tail(&exist->add_list,
					      &href->ref_add_list);
			else if (ref->action == BTRFS_DROP_DELAYED_REF) {
				ASSERT(!list_empty(&exist->add_list));
				list_del(&exist->add_list);
			} else {
				ASSERT(0);
			}
		} else
			mod = -ref->ref_mod;
	}
	exist->ref_mod += mod;

	/* remove existing tail if its ref_mod is zero */
	if (exist->ref_mod == 0)
		drop_delayed_ref(trans->fs_info, root, href, exist);
	spin_unlock(&href->lock);
	return true;
}

/*
 * helper function to update the accounting in the head ref
 * existing and update must have the same bytenr
 */
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
					      struct btrfs_delayed_ref_head *existing,
					      struct btrfs_delayed_ref_head *update)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int old_ref_mod;

	BUG_ON(existing->is_data != update->is_data);

	spin_lock(&existing->lock);

	/*
	 * When freeing an extent, we may not know the owning root when we
	 * first create the head_ref. However, some deref before the last deref
	 * will know it, so we just need to update the head_ref accordingly.
	 */
	if (!existing->owning_root)
		existing->owning_root = update->owning_root;

	if (update->must_insert_reserved) {
		/* if the extent was freed and then
		 * reallocated before the delayed ref
		 * entries were processed, we can end up
		 * with an existing head ref without
		 * the must_insert_reserved flag set.
		 * Set it again here
		 */
		existing->must_insert_reserved = update->must_insert_reserved;
		existing->owning_root = update->owning_root;

		/*
		 * update the num_bytes so we make sure the accounting
		 * is done correctly
		 */
		existing->num_bytes = update->num_bytes;

	}

	if (update->extent_op) {
		if (!existing->extent_op) {
			existing->extent_op = update->extent_op;
		} else {
			if (update->extent_op->update_key) {
				memcpy(&existing->extent_op->key,
				       &update->extent_op->key,
				       sizeof(update->extent_op->key));
				existing->extent_op->update_key = true;
			}
			if (update->extent_op->update_flags) {
				existing->extent_op->flags_to_set |=
					update->extent_op->flags_to_set;
				existing->extent_op->update_flags = true;
			}
			btrfs_free_delayed_extent_op(update->extent_op);
		}
	}
	/*
	 * update the reference mod on the head to reflect this new operation,
	 * only need the lock for this case because we could be processing it
	 * currently, for refs we just added we know we're a-ok.
	 */
	old_ref_mod = existing->total_ref_mod;
	existing->ref_mod += update->ref_mod;
	existing->total_ref_mod += update->ref_mod;

	/*
	 * If we are going from a positive ref mod to a negative or vice
	 * versa we need to make sure to adjust pending_csums accordingly.
	 * We reserve bytes for csum deletion when adding or updating a ref
	 * head, see add_delayed_ref_head() for more details.
	 */
	if (existing->is_data) {
		u64 csum_leaves =
			btrfs_csum_bytes_to_leaves(fs_info,
						   existing->num_bytes);

		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
			delayed_refs->pending_csums -= existing->num_bytes;
			btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves);
		}
		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
			delayed_refs->pending_csums += existing->num_bytes;
			trans->delayed_ref_csum_deletions += csum_leaves;
		}
	}

	spin_unlock(&existing->lock);
}
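
/*
 * Initialize the fields of a newly allocated delayed ref head and, when
 * @qrecord is given, the matching qgroup extent record, based on the
 * modification described by @generic_ref. @reserved may only be non-zero for
 * data extents.
 */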
static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
				  struct btrfs_ref *generic_ref,
				  struct btrfs_qgroup_extent_record *qrecord,
				  u64 reserved)
{
	int count_mod = 1;
	bool must_insert_reserved = false;

	/* If reserved is provided, it must be a data extent. */
	BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved);

	switch (generic_ref->action) {
	case BTRFS_ADD_DELAYED_REF:
		/* count_mod is already set to 1. */
		break;
	case BTRFS_UPDATE_DELAYED_HEAD:
		count_mod = 0;
		break;
	case BTRFS_DROP_DELAYED_REF:
		/*
		 * The head node stores the sum of all the mods, so dropping a ref
		 * should drop the sum in the head node by one.
		 */
		count_mod = -1;
		break;
	case BTRFS_ADD_DELAYED_EXTENT:
		/*
		 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the
		 * reserved accounting when the extent is finally added, or if a
		 * later modification deletes the delayed ref without ever
		 * inserting the extent into the extent allocation tree.
		 * ref->must_insert_reserved is the flag used to record that
		 * accounting mods are required.
		 *
		 * Once we record must_insert_reserved, switch the action to
		 * BTRFS_ADD_DELAYED_REF because other special casing is not
		 * required.
		 */
		must_insert_reserved = true;
		break;
	}

	refcount_set(&head_ref->refs, 1);
	head_ref->bytenr = generic_ref->bytenr;
	head_ref->num_bytes = generic_ref->num_bytes;
	head_ref->ref_mod = count_mod;
	head_ref->reserved_bytes = reserved;
	head_ref->must_insert_reserved = must_insert_reserved;
	head_ref->owning_root = generic_ref->owning_root;
	head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA);
	head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);
	head_ref->ref_tree = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&head_ref->ref_add_list);
	RB_CLEAR_NODE(&head_ref->href_node);
	head_ref->processing = false;
	head_ref->total_ref_mod = count_mod;
	spin_lock_init(&head_ref->lock);
	mutex_init(&head_ref->mutex);

	if (qrecord) {
		if (generic_ref->ref_root && reserved) {
			qrecord->data_rsv = reserved;
			qrecord->data_rsv_refroot = generic_ref->ref_root;
		}
		qrecord->bytenr = generic_ref->bytenr;
		qrecord->num_bytes = generic_ref->num_bytes;
		qrecord->old_roots = NULL;
	}
}

/*
 * helper function to actually insert a head node into the rbtree.
 * this does all the dirty work in terms of maintaining the correct
 * overall modification count.
 */
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
		     struct btrfs_delayed_ref_head *head_ref,
		     struct btrfs_qgroup_extent_record *qrecord,
		     int action, bool *qrecord_inserted_ret)
{
	struct btrfs_delayed_ref_head *existing;
	struct btrfs_delayed_ref_root *delayed_refs;
	bool qrecord_inserted = false;

	delayed_refs = &trans->transaction->delayed_refs;

	/* Record qgroup extent info if provided */
	if (qrecord) {
		if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
						     delayed_refs, qrecord))
			kfree(qrecord);
		else
			qrecord_inserted = true;
	}

	trace_add_delayed_ref_head(trans->fs_info, head_ref, action);

	existing = htree_insert(&delayed_refs->href_root,
				&head_ref->href_node);
	if (existing) {
		update_existing_head_ref(trans, existing, head_ref);
		/*
		 * we've updated the existing ref, free the newly
		 * allocated ref
		 */
		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
		head_ref = existing;
	} else {
		/*
		 * We reserve the amount of bytes needed to delete csums when
		 * adding the ref head and not when adding individual drop refs
		 * since the csum items are deleted only after running the last
		 * delayed drop ref (the data extent's ref count drops to 0).
		 */
		if (head_ref->is_data && head_ref->ref_mod < 0) {
			delayed_refs->pending_csums += head_ref->num_bytes;
			trans->delayed_ref_csum_deletions +=
				btrfs_csum_bytes_to_leaves(trans->fs_info,
							   head_ref->num_bytes);
		}
		delayed_refs->num_heads++;
		delayed_refs->num_heads_ready++;
		atomic_inc(&delayed_refs->num_entries);
	}
	if (qrecord_inserted_ret)
		*qrecord_inserted_ret = qrecord_inserted;

	return head_ref;
}

/*
 * Initialize the structure which represents a modification to an extent.
 *
 * @fs_info:    Internal to the mounted filesystem mount structure.
 *
 * @ref:        The structure which is going to be initialized.
 *
 * @bytenr:     The logical address of the extent for which a modification is
 *              going to be recorded.
 *
 * @num_bytes:  Size of the extent whose modification is being recorded.
 *
 * @ref_root:   The id of the root where this modification has originated, this
 *              can be either one of the well-known metadata trees or the
 *              subvolume id which references this extent.
 *
 * @action:     Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
 *              BTRFS_ADD_DELAYED_EXTENT
 *
 * @ref_type:   Holds the type of the extent which is being recorded, can be
 *              one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
 *              when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
 *              BTRFS_EXTENT_DATA_REF_KEY when recording a data extent
 */
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
				    struct btrfs_delayed_ref_node *ref,
				    struct btrfs_ref *generic_ref)
{
	int action = generic_ref->action;
	u64 seq = 0;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	if (is_fstree(generic_ref->ref_root))
		seq = atomic64_read(&fs_info->tree_mod_seq);

	refcount_set(&ref->refs, 1);
	ref->bytenr = generic_ref->bytenr;
	ref->num_bytes = generic_ref->num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->seq = seq;
	ref->type = btrfs_ref_type(generic_ref);
	ref->ref_root = generic_ref->ref_root;
	ref->parent = generic_ref->parent;
	RB_CLEAR_NODE(&ref->ref_node);
	INIT_LIST_HEAD(&ref->add_list);

	if (generic_ref->type == BTRFS_REF_DATA)
		ref->data_ref = generic_ref->data_ref;
	else
		ref->tree_ref = generic_ref->tree_ref;
}

void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
			 bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	/* If @real_root not set, use @root as fallback */
	generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
	generic_ref->tree_ref.level = level;
	generic_ref->type = BTRFS_REF_METADATA;
	if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
			     (!mod_root || is_fstree(mod_root))))
		generic_ref->skip_qgroup = true;
	else
		generic_ref->skip_qgroup = false;
}

void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
			 u64 mod_root, bool skip_qgroup)
{
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	/* If @real_root not set, use @root as fallback */
	generic_ref->real_root = mod_root ?: generic_ref->ref_root;
#endif
	generic_ref->data_ref.objectid = ino;
	generic_ref->data_ref.offset = offset;
	generic_ref->type = BTRFS_REF_DATA;
	if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
			     (!mod_root || is_fstree(mod_root))))
		generic_ref->skip_qgroup = true;
	else
		generic_ref->skip_qgroup = false;
}
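
/*
 * Add a delayed ref for @generic_ref to the current transaction: allocate the
 * ref node and head, insert them (or merge with existing ones) under the
 * delayed refs lock, and update the delayed refs block reserve and the qgroup
 * tracking accordingly.
 */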
static int add_delayed_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_ref *generic_ref,
			   struct btrfs_delayed_extent_op *extent_op,
			   u64 reserved)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_node *node;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_qgroup_extent_record *record = NULL;
	bool qrecord_inserted;
	int action = generic_ref->action;
	bool merged;

	node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS);
	if (!node)
		return -ENOMEM;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref) {
		kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
		return -ENOMEM;
	}

	if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
		record = kzalloc(sizeof(*record), GFP_NOFS);
		if (!record) {
			kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
			kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
			return -ENOMEM;
		}
	}

	init_delayed_ref_common(fs_info, node, generic_ref);
	init_delayed_ref_head(head_ref, generic_ref, record, reserved);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	head_ref = add_delayed_ref_head(trans, head_ref, record,
					action, &qrecord_inserted);

	merged = insert_delayed_ref(trans, head_ref, node);
	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);

	if (generic_ref->type == BTRFS_REF_DATA)
		trace_add_delayed_data_ref(trans->fs_info, node);
	else
		trace_add_delayed_tree_ref(trans->fs_info, node);
	if (merged)
		kmem_cache_free(btrfs_delayed_ref_node_cachep, node);

	if (qrecord_inserted)
		return btrfs_qgroup_trace_extent_post(trans, record);
	return 0;
}

/*
 * Add a delayed tree ref. This does all of the accounting required to make sure
 * the delayed ref is eventually processed before this transaction commits.
 */
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       struct btrfs_delayed_extent_op *extent_op)
{
	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
	return add_delayed_ref(trans, generic_ref, extent_op, 0);
}

/*
 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
 */
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       u64 reserved)
{
	ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action);
	return add_delayed_ref(trans, generic_ref, NULL, reserved);
}
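
/*
 * Record a delayed extent op (a key and/or flags update) for the given extent
 * by adding a delayed ref head with the BTRFS_UPDATE_DELAYED_HEAD action, or
 * by attaching the op to an already existing head.
 */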
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_ref generic_ref = {
		.type = BTRFS_REF_METADATA,
		.action = BTRFS_UPDATE_DELAYED_HEAD,
		.bytenr = bytenr,
		.num_bytes = num_bytes,
	};

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref)
		return -ENOMEM;

	init_delayed_ref_head(head_ref, &generic_ref, NULL, 0);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
			     NULL);

	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);
	return 0;
}

void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
{
	if (refcount_dec_and_test(&ref->refs)) {
		WARN_ON(!RB_EMPTY_NODE(&ref->ref_node));
		kmem_cache_free(btrfs_delayed_ref_node_cachep, ref);
	}
}

/*
 * This does a simple search for the head node for a given extent. Returns the
 * head node if found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
	lockdep_assert_held(&delayed_refs->lock);

	return find_ref_head(delayed_refs, bytenr, false);
}

void __cold btrfs_delayed_ref_exit(void)
{
	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
	kmem_cache_destroy(btrfs_delayed_ref_node_cachep);
	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}

int __init btrfs_delayed_ref_init(void)
{
	btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0);
	if (!btrfs_delayed_ref_head_cachep)
		goto fail;

	btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0);
	if (!btrfs_delayed_ref_node_cachep)
		goto fail;

	btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0);
	if (!btrfs_delayed_extent_op_cachep)
		goto fail;

	return 0;
fail:
	btrfs_delayed_ref_exit();
	return -ENOMEM;
}