// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "messages.h"
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
#include "tree-mod-log.h"
#include "fs.h"

struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
struct kmem_cache *btrfs_delayed_data_ref_cachep;
struct kmem_cache *btrfs_delayed_extent_op_cachep;
/*
 * delayed back reference update tracking.  For subvolume trees
 * we queue up extent allocations and backref maintenance for
 * delayed processing.  This avoids deep call chains where we
 * add extents in the middle of btrfs_search_slot, and it allows
 * us to buffer up frequently modified backrefs in an rb tree instead
 * of hammering updates on the extent allocation tree.
 */

bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	bool ret = false;
	u64 reserved;

	spin_lock(&global_rsv->lock);
	reserved = global_rsv->reserved;
	spin_unlock(&global_rsv->lock);

	/*
	 * Since the global reserve is just kind of magic we don't really want
	 * to rely on it to save our bacon, so if our size is more than the
	 * delayed_refs_rsv and the global rsv then it's time to think about
	 * bailing.
	 */
	spin_lock(&delayed_refs_rsv->lock);
	reserved += delayed_refs_rsv->reserved;
	if (delayed_refs_rsv->size >= reserved)
		ret = true;
	spin_unlock(&delayed_refs_rsv->lock);
	return ret;
}

/*
 * Release a ref head's reservation.
 *
 * @fs_info:  the filesystem
 * @nr_refs:  number of delayed refs to drop
 * @nr_csums: number of csum items to drop
 *
 * Drops the delayed ref head's count from the delayed refs rsv and frees any
 * excess reservation we had.
 */
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
	u64 num_bytes;
	u64 released;

	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs);
	num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums);

	released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
	if (released)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}

/*
 * Adjust the size of the delayed refs rsv.
 *
 * This is to be called any time we may have adjusted trans->delayed_ref_updates
 * or trans->delayed_ref_csum_deletions; it calculates the additional size and
 * adds it to the delayed_refs_rsv.
 */
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv;
	u64 num_bytes;
	u64 reserved_bytes;

	num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
	num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
						       trans->delayed_ref_csum_deletions);

	if (num_bytes == 0)
		return;

	/*
	 * Try to take num_bytes from the transaction's local delayed reserve.
	 * If not possible, try to take as much as is available. If the local
	 * reserve doesn't have enough reserved space, the delayed refs reserve
	 * will be refilled the next time btrfs_delayed_refs_rsv_refill() is
	 * called by someone, or, if a transaction commit is triggered before
	 * that, the global block reserve will be used. We want to minimize
	 * using the global block reserve for cases we can account for in
	 * advance, to avoid exhausting it and reaching -ENOSPC during a
	 * transaction commit.
	 */
	spin_lock(&local_rsv->lock);
	reserved_bytes = min(num_bytes, local_rsv->reserved);
	local_rsv->reserved -= reserved_bytes;
	local_rsv->full = (local_rsv->reserved >= local_rsv->size);
	spin_unlock(&local_rsv->lock);

	spin_lock(&delayed_rsv->lock);
	delayed_rsv->size += num_bytes;
	delayed_rsv->reserved += reserved_bytes;
	delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size);
	spin_unlock(&delayed_rsv->lock);
	trans->delayed_ref_updates = 0;
	trans->delayed_ref_csum_deletions = 0;
}

/*
 * Adjust the size of the delayed refs block reserve for 1 block group item
 * insertion, used after allocating a block group.
 */
void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;

	spin_lock(&delayed_rsv->lock);
	/*
	 * Inserting a block group item does not require changing the free space
	 * tree, only the extent tree or the block group tree, so this is all we
	 * need.
	 */
	delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1);
	delayed_rsv->full = false;
	spin_unlock(&delayed_rsv->lock);
}

/*
 * Adjust the size of the delayed refs block reserve to release space for 1
 * block group item insertion.
 */
void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	u64 released;

	released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
	if (released > 0)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}

/*
 * Adjust the size of the delayed refs block reserve for 1 block group item
 * update.
 */
void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;

	spin_lock(&delayed_rsv->lock);
	/*
	 * Updating a block group item does not result in new nodes/leaves and
	 * does not require changing the free space tree, only the extent tree
	 * or the block group tree, so this is all we need.
	 */
	delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1);
	delayed_rsv->full = false;
	spin_unlock(&delayed_rsv->lock);
}

/*
 * Adjust the size of the delayed refs block reserve to release space for 1
 * block group item update.
 */
void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1);
	u64 released;

	released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL);
	if (released > 0)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, released, 0);
}

/*
 * Transfer bytes to our delayed refs rsv.
 *
 * @fs_info:   the filesystem
 * @num_bytes: number of bytes to transfer
 *
 * This transfers up to the num_bytes amount, previously reserved, to the
 * delayed_refs_rsv.  Any extra bytes are returned to the space info.
 */
void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
				       u64 num_bytes)
{
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	u64 to_free = 0;

	spin_lock(&delayed_refs_rsv->lock);
	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
		u64 delta = delayed_refs_rsv->size -
			    delayed_refs_rsv->reserved;

		if (num_bytes > delta) {
			to_free = num_bytes - delta;
			num_bytes = delta;
		}
	} else {
		to_free = num_bytes;
		num_bytes = 0;
	}

	if (num_bytes)
		delayed_refs_rsv->reserved += num_bytes;
	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
		delayed_refs_rsv->full = true;
	spin_unlock(&delayed_refs_rsv->lock);

	if (num_bytes)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
					      0, num_bytes, 1);
	if (to_free)
		btrfs_space_info_free_bytes_may_use(fs_info,
				delayed_refs_rsv->space_info, to_free);
}

/*
 * Refill based on our delayed refs usage.
 *
 * @fs_info: the filesystem
 * @flush:   control how we can flush for this reservation.
 *
 * This will refill the delayed block_rsv up to 1 item's worth of space and
 * will return -ENOSPC if we can't make the reservation.
 */
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
				  enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_space_info *space_info = block_rsv->space_info;
	u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1);
	u64 num_bytes = 0;
	u64 refilled_bytes;
	u64 to_free;
	int ret = -ENOSPC;

	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved < block_rsv->size) {
		num_bytes = block_rsv->size - block_rsv->reserved;
		num_bytes = min(num_bytes, limit);
	}
	spin_unlock(&block_rsv->lock);

	if (!num_bytes)
		return 0;

	ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush);
	if (ret)
		return ret;

	/*
	 * We may have raced with someone else, so check again if the block
	 * reserve is still not full and release any excess space.
	 */
	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved < block_rsv->size) {
		u64 needed = block_rsv->size - block_rsv->reserved;

		if (num_bytes >= needed) {
			block_rsv->reserved += needed;
			block_rsv->full = true;
			to_free = num_bytes - needed;
			refilled_bytes = needed;
		} else {
			block_rsv->reserved += num_bytes;
			to_free = 0;
			refilled_bytes = num_bytes;
		}
	} else {
		to_free = num_bytes;
		refilled_bytes = 0;
	}
	spin_unlock(&block_rsv->lock);

	if (to_free > 0)
		btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);

	if (refilled_bytes > 0)
		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
					      refilled_bytes, 1);
	return 0;
}

/*
 * compare two delayed tree backrefs with same bytenr and type
 */
static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
			  struct btrfs_delayed_tree_ref *ref2)
{
	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}

/*
 * compare two delayed data backrefs with same bytenr and type
 */
static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
			  struct btrfs_delayed_data_ref *ref2)
{
	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (ref1->root < ref2->root)
			return -1;
		if (ref1->root > ref2->root)
			return 1;
		if (ref1->objectid < ref2->objectid)
			return -1;
		if (ref1->objectid > ref2->objectid)
			return 1;
		if (ref1->offset < ref2->offset)
			return -1;
		if (ref1->offset > ref2->offset)
			return 1;
	} else {
		if (ref1->parent < ref2->parent)
			return -1;
		if (ref1->parent > ref2->parent)
			return 1;
	}
	return 0;
}

static int comp_refs(struct btrfs_delayed_ref_node *ref1,
		     struct btrfs_delayed_ref_node *ref2,
		     bool check_seq)
{
	int ret = 0;

	if (ref1->type < ref2->type)
		return -1;
	if (ref1->type > ref2->type)
		return 1;
	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
				     btrfs_delayed_node_to_tree_ref(ref2));
	else
		ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
				     btrfs_delayed_node_to_data_ref(ref2));
	if (ret)
		return ret;
	if (check_seq) {
		if (ref1->seq < ref2->seq)
			return -1;
		if (ref1->seq > ref2->seq)
			return 1;
	}
	return 0;
}

/* insert a new ref to head ref rbtree */
static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
						   struct rb_node *node)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_head *entry;
	struct btrfs_delayed_ref_head *ins;
	u64 bytenr;
	bool leftmost = true;

	ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
	bytenr = ins->bytenr;
	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
				 href_node);

		if (bytenr < entry->bytenr) {
			p = &(*p)->rb_left;
		} else if (bytenr > entry->bytenr) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return entry;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);
	return NULL;
}

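/*
 * Insert a new delayed ref node into a head's ref tree, ordered by
 * comp_refs() (including the seq).  Returns the already existing node if an
 * equal ref is found, or NULL if the new node was inserted.
 */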
static struct btrfs_delayed_ref_node *tree_insert(struct rb_root_cached *root,
						  struct btrfs_delayed_ref_node *ins)
{
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *node = &ins->ref_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_delayed_ref_node *entry;
	bool leftmost = true;

	while (*p) {
		int comp;

		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
				 ref_node);
		comp = comp_refs(ins, entry, true);
		if (comp < 0) {
			p = &(*p)->rb_left;
		} else if (comp > 0) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			return entry;
		}
	}

	rb_link_node(node, parent_node, p);
	rb_insert_color_cached(node, root, leftmost);
	return NULL;
}

static struct btrfs_delayed_ref_head *find_first_ref_head(
		struct btrfs_delayed_ref_root *dr)
{
	struct rb_node *n;
	struct btrfs_delayed_ref_head *entry;

	n = rb_first_cached(&dr->href_root);
	if (!n)
		return NULL;

	entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);

	return entry;
}

/*
 * Find a head entry based on bytenr. This returns the delayed ref head if it
 * was able to find one, or NULL if nothing was in that spot.  If return_bigger
 * is given, the next bigger entry is returned if no exact match is found.
 */
static struct btrfs_delayed_ref_head *find_ref_head(
		struct btrfs_delayed_ref_root *dr, u64 bytenr,
		bool return_bigger)
{
	struct rb_root *root = &dr->href_root.rb_root;
	struct rb_node *n;
	struct btrfs_delayed_ref_head *entry;

	n = root->rb_node;
	entry = NULL;
	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);

		if (bytenr < entry->bytenr)
			n = n->rb_left;
		else if (bytenr > entry->bytenr)
			n = n->rb_right;
		else
			return entry;
	}
	if (entry && return_bigger) {
		if (bytenr > entry->bytenr) {
			n = rb_next(&entry->href_node);
			if (!n)
				return NULL;
			entry = rb_entry(n, struct btrfs_delayed_ref_head,
					 href_node);
		}
		return entry;
	}
	return NULL;
}

int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
			   struct btrfs_delayed_ref_head *head)
{
	lockdep_assert_held(&delayed_refs->lock);
	if (mutex_trylock(&head->mutex))
		return 0;

	refcount_inc(&head->refs);
	spin_unlock(&delayed_refs->lock);

	mutex_lock(&head->mutex);
	spin_lock(&delayed_refs->lock);
	if (RB_EMPTY_NODE(&head->href_node)) {
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		return -EAGAIN;
	}
	btrfs_put_delayed_ref_head(head);
	return 0;
}

static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
				    struct btrfs_delayed_ref_root *delayed_refs,
				    struct btrfs_delayed_ref_head *head,
				    struct btrfs_delayed_ref_node *ref)
{
	lockdep_assert_held(&head->lock);
	rb_erase_cached(&ref->ref_node, &head->ref_tree);
	RB_CLEAR_NODE(&ref->ref_node);
	if (!list_empty(&ref->add_list))
		list_del(&ref->add_list);
	btrfs_put_delayed_ref(ref);
	atomic_dec(&delayed_refs->num_entries);
	btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}

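/*
 * Try to merge @ref with the refs that follow it in the head's ref tree,
 * dropping refs whose modifications cancel out.  Refs with a seq not older
 * than @seq are skipped, as they may still be needed by tree mod log users.
 * Returns true if @ref itself was dropped or replaced, in which case the
 * caller must restart its scan of the ref tree.
 */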
static bool merge_ref(struct btrfs_fs_info *fs_info,
		      struct btrfs_delayed_ref_root *delayed_refs,
		      struct btrfs_delayed_ref_head *head,
		      struct btrfs_delayed_ref_node *ref,
		      u64 seq)
{
	struct btrfs_delayed_ref_node *next;
	struct rb_node *node = rb_next(&ref->ref_node);
	bool done = false;

	while (!done && node) {
		int mod;

		next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		node = rb_next(node);
		if (seq && next->seq >= seq)
			break;
		if (comp_refs(ref, next, false))
			break;

		if (ref->action == next->action) {
			mod = next->ref_mod;
		} else {
			if (ref->ref_mod < next->ref_mod) {
				swap(ref, next);
				done = true;
			}
			mod = -next->ref_mod;
		}

		drop_delayed_ref(fs_info, delayed_refs, head, next);
		ref->ref_mod += mod;
		if (ref->ref_mod == 0) {
			drop_delayed_ref(fs_info, delayed_refs, head, ref);
			done = true;
		} else {
			/*
			 * Can't have multiples of the same ref on a tree block.
			 */
			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
		}
	}

	return done;
}

void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
			      struct btrfs_delayed_ref_root *delayed_refs,
			      struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_node *ref;
	struct rb_node *node;
	u64 seq = 0;

	lockdep_assert_held(&head->lock);

	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		return;

	/* We don't have too many refs to merge for data. */
	if (head->is_data)
		return;

	seq = btrfs_tree_mod_log_lowest_seq(fs_info);
again:
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		if (seq && ref->seq >= seq)
			continue;
		if (merge_ref(fs_info, delayed_refs, head, ref, seq))
			goto again;
	}
}

int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
	int ret = 0;
	u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);

	if (min_seq != 0 && seq >= min_seq) {
		btrfs_debug(fs_info,
			    "holding back delayed_ref %llu, lowest is %llu",
			    seq, min_seq);
		ret = 1;
	}

	return ret;
}

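/*
 * Select the next delayed ref head to run.  The search starts at
 * delayed_refs->run_delayed_start and wraps around once, skipping heads that
 * are already being processed.  The returned head is marked as processing and
 * run_delayed_start is advanced past it.  Returns NULL when there is nothing
 * left to run.  Must be called with delayed_refs->lock held.
 */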
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
		struct btrfs_delayed_ref_root *delayed_refs)
{
	struct btrfs_delayed_ref_head *head;

	lockdep_assert_held(&delayed_refs->lock);
again:
	head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
			     true);
	if (!head && delayed_refs->run_delayed_start != 0) {
		delayed_refs->run_delayed_start = 0;
		head = find_first_ref_head(delayed_refs);
	}
	if (!head)
		return NULL;

	while (head->processing) {
		struct rb_node *node;

		node = rb_next(&head->href_node);
		if (!node) {
			if (delayed_refs->run_delayed_start == 0)
				return NULL;
			delayed_refs->run_delayed_start = 0;
			goto again;
		}
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
	}

	head->processing = true;
	WARN_ON(delayed_refs->num_heads_ready == 0);
	delayed_refs->num_heads_ready--;
	delayed_refs->run_delayed_start = head->bytenr + head->num_bytes;
	return head;
}

void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
			   struct btrfs_delayed_ref_head *head)
{
	lockdep_assert_held(&delayed_refs->lock);
	lockdep_assert_held(&head->lock);

	rb_erase_cached(&head->href_node, &delayed_refs->href_root);
	RB_CLEAR_NODE(&head->href_node);
	atomic_dec(&delayed_refs->num_entries);
	delayed_refs->num_heads--;
	if (!head->processing)
		delayed_refs->num_heads_ready--;
}

/*
 * Helper to insert the ref_node into the head's ref tree or merge it with an
 * existing ref.
 *
 * Return false if the ref was inserted.
 * Return true if the ref was merged into an existing one (and therefore can be
 * freed by the caller).
 */
static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_head *href,
			       struct btrfs_delayed_ref_node *ref)
{
	struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_node *exist;
	int mod;

	spin_lock(&href->lock);
	exist = tree_insert(&href->ref_tree, ref);
	if (!exist) {
		if (ref->action == BTRFS_ADD_DELAYED_REF)
			list_add_tail(&ref->add_list, &href->ref_add_list);
		atomic_inc(&root->num_entries);
		spin_unlock(&href->lock);
		trans->delayed_ref_updates++;
		return false;
	}

	/* Now we are sure we can merge */
	if (exist->action == ref->action) {
		mod = ref->ref_mod;
	} else {
		/* Need to change action */
		if (exist->ref_mod < ref->ref_mod) {
			exist->action = ref->action;
			mod = -exist->ref_mod;
			exist->ref_mod = ref->ref_mod;
			if (ref->action == BTRFS_ADD_DELAYED_REF)
				list_add_tail(&exist->add_list,
					      &href->ref_add_list);
			else if (ref->action == BTRFS_DROP_DELAYED_REF) {
				ASSERT(!list_empty(&exist->add_list));
				list_del(&exist->add_list);
			} else {
				ASSERT(0);
			}
		} else
			mod = -ref->ref_mod;
	}
	exist->ref_mod += mod;

	/* Remove the existing ref if its ref_mod drops to zero. */
	if (exist->ref_mod == 0)
		drop_delayed_ref(trans->fs_info, root, href, exist);
	spin_unlock(&href->lock);
	return true;
}

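/*
 * Worked example of the merging done by insert_delayed_ref(): if the tree
 * already holds a BTRFS_ADD_DELAYED_REF with ref_mod == 2 and a matching
 * BTRFS_DROP_DELAYED_REF with ref_mod == 3 arrives, the existing ref is
 * flipped to BTRFS_DROP_DELAYED_REF with ref_mod == 1, which is the net
 * effect of +2 followed by -3.
 */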

/*
 * Helper function to update the accounting in the head ref.  The existing and
 * update head refs must have the same bytenr.
 */
static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
					      struct btrfs_delayed_ref_head *existing,
					      struct btrfs_delayed_ref_head *update)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int old_ref_mod;

	BUG_ON(existing->is_data != update->is_data);

	spin_lock(&existing->lock);

	/*
	 * When freeing an extent, we may not know the owning root when we
	 * first create the head_ref. However, some deref before the last deref
	 * will know it, so we just need to update the head_ref accordingly.
	 */
	if (!existing->owning_root)
		existing->owning_root = update->owning_root;

	if (update->must_insert_reserved) {
		/*
		 * If the extent was freed and then reallocated before the
		 * delayed ref entries were processed, we can end up with an
		 * existing head ref without the must_insert_reserved flag set.
		 * Set it again here.
		 */
		existing->must_insert_reserved = update->must_insert_reserved;
		existing->owning_root = update->owning_root;

		/*
		 * Update the num_bytes so we make sure the accounting is done
		 * correctly.
		 */
		existing->num_bytes = update->num_bytes;

	}

	if (update->extent_op) {
		if (!existing->extent_op) {
			existing->extent_op = update->extent_op;
		} else {
			if (update->extent_op->update_key) {
				memcpy(&existing->extent_op->key,
				       &update->extent_op->key,
				       sizeof(update->extent_op->key));
				existing->extent_op->update_key = true;
			}
			if (update->extent_op->update_flags) {
				existing->extent_op->flags_to_set |=
					update->extent_op->flags_to_set;
				existing->extent_op->update_flags = true;
			}
			btrfs_free_delayed_extent_op(update->extent_op);
		}
	}
	/*
	 * Update the reference mod on the head to reflect this new operation.
	 * We only need the lock for this case because we could be processing
	 * it currently; for refs we just added we know we're ok.
	 */
	old_ref_mod = existing->total_ref_mod;
	existing->ref_mod += update->ref_mod;
	existing->total_ref_mod += update->ref_mod;

	/*
	 * If we are going to go from a positive ref mod to a negative one, or
	 * vice versa, we need to make sure to adjust pending_csums accordingly.
	 * We reserve bytes for csum deletion when adding or updating a ref
	 * head, see add_delayed_ref_head() for more details.
	 */
	if (existing->is_data) {
		u64 csum_leaves =
			btrfs_csum_bytes_to_leaves(fs_info,
						   existing->num_bytes);

		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
			delayed_refs->pending_csums -= existing->num_bytes;
			btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves);
		}
		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
			delayed_refs->pending_csums += existing->num_bytes;
			trans->delayed_ref_csum_deletions += csum_leaves;
		}
	}

	spin_unlock(&existing->lock);
}

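/*
 * Initialize a freshly allocated delayed ref head (and the optional qgroup
 * record that goes with it) before it is inserted into the href rbtree.
 * A drop starts the head's ref_mod at -1, a head-only update starts it at 0,
 * and BTRFS_ADD_DELAYED_EXTENT additionally sets must_insert_reserved.
 */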
static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
				  struct btrfs_qgroup_extent_record *qrecord,
				  u64 bytenr, u64 num_bytes, u64 ref_root,
				  u64 reserved, int action, bool is_data,
				  bool is_system, u64 owning_root)
{
	int count_mod = 1;
	bool must_insert_reserved = false;

	/* If reserved is provided, it must be a data extent. */
	BUG_ON(!is_data && reserved);

	switch (action) {
	case BTRFS_UPDATE_DELAYED_HEAD:
		count_mod = 0;
		break;
	case BTRFS_DROP_DELAYED_REF:
		/*
		 * The head node stores the sum of all the mods, so dropping a
		 * ref should drop the sum in the head node by one.
		 */
		count_mod = -1;
		break;
	case BTRFS_ADD_DELAYED_EXTENT:
		/*
		 * BTRFS_ADD_DELAYED_EXTENT means that we need to update the
		 * reserved accounting when the extent is finally added, or if a
		 * later modification deletes the delayed ref without ever
		 * inserting the extent into the extent allocation tree.
		 * ref->must_insert_reserved is the flag used to record that
		 * accounting mods are required.
		 *
		 * Once we record must_insert_reserved, switch the action to
		 * BTRFS_ADD_DELAYED_REF because other special casing is not
		 * required.
		 */
		must_insert_reserved = true;
		break;
	}

	refcount_set(&head_ref->refs, 1);
	head_ref->bytenr = bytenr;
	head_ref->num_bytes = num_bytes;
	head_ref->ref_mod = count_mod;
	head_ref->reserved_bytes = reserved;
	head_ref->must_insert_reserved = must_insert_reserved;
	head_ref->owning_root = owning_root;
	head_ref->is_data = is_data;
	head_ref->is_system = is_system;
	head_ref->ref_tree = RB_ROOT_CACHED;
	INIT_LIST_HEAD(&head_ref->ref_add_list);
	RB_CLEAR_NODE(&head_ref->href_node);
	head_ref->processing = false;
	head_ref->total_ref_mod = count_mod;
	spin_lock_init(&head_ref->lock);
	mutex_init(&head_ref->mutex);

	if (qrecord) {
		if (ref_root && reserved) {
			qrecord->data_rsv = reserved;
			qrecord->data_rsv_refroot = ref_root;
		}
		qrecord->bytenr = bytenr;
		qrecord->num_bytes = num_bytes;
		qrecord->old_roots = NULL;
	}
}

/*
 * Helper function to actually insert a head node into the rbtree.  This does
 * all the dirty work in terms of maintaining the correct overall modification
 * count.
 */
static noinline struct btrfs_delayed_ref_head *
add_delayed_ref_head(struct btrfs_trans_handle *trans,
		     struct btrfs_delayed_ref_head *head_ref,
		     struct btrfs_qgroup_extent_record *qrecord,
		     int action, bool *qrecord_inserted_ret)
{
	struct btrfs_delayed_ref_head *existing;
	struct btrfs_delayed_ref_root *delayed_refs;
	bool qrecord_inserted = false;

	delayed_refs = &trans->transaction->delayed_refs;

	/* Record qgroup extent info if provided */
	if (qrecord) {
		if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
						     delayed_refs, qrecord))
			kfree(qrecord);
		else
			qrecord_inserted = true;
	}

	trace_add_delayed_ref_head(trans->fs_info, head_ref, action);

	existing = htree_insert(&delayed_refs->href_root,
				&head_ref->href_node);
	if (existing) {
		update_existing_head_ref(trans, existing, head_ref);
		/*
		 * We've updated the existing ref, free the newly allocated
		 * ref.
		 */
		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
		head_ref = existing;
	} else {
		/*
		 * We reserve the amount of bytes needed to delete csums when
		 * adding the ref head and not when adding individual drop refs
		 * since the csum items are deleted only after running the last
		 * delayed drop ref (the data extent's ref count drops to 0).
		 */
		if (head_ref->is_data && head_ref->ref_mod < 0) {
			delayed_refs->pending_csums += head_ref->num_bytes;
			trans->delayed_ref_csum_deletions +=
				btrfs_csum_bytes_to_leaves(trans->fs_info,
							   head_ref->num_bytes);
		}
		delayed_refs->num_heads++;
		delayed_refs->num_heads_ready++;
		atomic_inc(&delayed_refs->num_entries);
	}
	if (qrecord_inserted_ret)
		*qrecord_inserted_ret = qrecord_inserted;

	return head_ref;
}

/*
 * Initialize the structure which represents a modification to an extent.
 *
 * @fs_info:    Internal to the mounted filesystem mount structure.
 *
 * @ref:	The structure which is going to be initialized.
 *
 * @bytenr:	The logical address of the extent for which a modification is
 *		going to be recorded.
 *
 * @num_bytes:  Size of the extent whose modification is being recorded.
 *
 * @ref_root:	The id of the root where this modification has originated, this
 *		can be either one of the well-known metadata trees or the
 *		subvolume id which references this extent.
 *
 * @action:	Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
 *		BTRFS_ADD_DELAYED_EXTENT
 *
 * @ref_type:	Holds the type of the extent which is being recorded, can be
 *		one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
 *		when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
 *		BTRFS_EXTENT_DATA_REF_KEY when recording a data extent
 */
static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
				    struct btrfs_delayed_ref_node *ref,
				    u64 bytenr, u64 num_bytes, u64 ref_root,
				    int action, u8 ref_type)
{
	u64 seq = 0;

	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	if (is_fstree(ref_root))
		seq = atomic64_read(&fs_info->tree_mod_seq);

	refcount_set(&ref->refs, 1);
	ref->bytenr = bytenr;
	ref->num_bytes = num_bytes;
	ref->ref_mod = 1;
	ref->action = action;
	ref->seq = seq;
	ref->type = ref_type;
	RB_CLEAR_NODE(&ref->ref_node);
	INIT_LIST_HEAD(&ref->add_list);
}

/*
 * add a delayed tree ref.  This does all of the accounting required to make
 * sure the delayed ref is eventually processed before this transaction
 * commits.
 */
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_tree_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_qgroup_extent_record *record = NULL;
	bool qrecord_inserted;
	bool is_system;
	bool merged;
	int action = generic_ref->action;
	int level = generic_ref->tree_ref.level;
	u64 bytenr = generic_ref->bytenr;
	u64 num_bytes = generic_ref->len;
	u64 parent = generic_ref->parent;
	u8 ref_type;

	is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID);

	ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref) {
		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
		return -ENOMEM;
	}

	if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
		record = kzalloc(sizeof(*record), GFP_NOFS);
		if (!record) {
			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
			kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
			return -ENOMEM;
		}
	}

	if (parent)
		ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
	else
		ref_type = BTRFS_TREE_BLOCK_REF_KEY;

	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
				generic_ref->tree_ref.ref_root, action,
				ref_type);
	ref->root = generic_ref->tree_ref.ref_root;
	ref->parent = parent;
	ref->level = level;

	init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
			      generic_ref->tree_ref.ref_root, 0, action,
			      false, is_system, generic_ref->owning_root);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	head_ref = add_delayed_ref_head(trans, head_ref, record,
					action, &qrecord_inserted);

	merged = insert_delayed_ref(trans, head_ref, &ref->node);
	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);

	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
				   action == BTRFS_ADD_DELAYED_EXTENT ?
				   BTRFS_ADD_DELAYED_REF : action);
	if (merged)
		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);

	if (qrecord_inserted)
		btrfs_qgroup_trace_extent_post(trans, record);

	return 0;
}

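/*
 * Illustrative sketch (not lifted from any real caller) of how a metadata ref
 * is typically described before calling btrfs_add_delayed_tree_ref().  Only
 * the fields consumed by this file are shown and all values are placeholders;
 * callers normally go through the btrfs_init_generic_ref() and
 * btrfs_init_tree_ref() helpers instead of open coding this:
 *
 *	struct btrfs_ref ref = { 0 };
 *
 *	ref.action = BTRFS_ADD_DELAYED_REF;
 *	ref.type = BTRFS_REF_METADATA;
 *	ref.bytenr = bytenr;
 *	ref.len = num_bytes;
 *	ref.parent = 0;		(0 means a keyed ref, non-zero a shared one)
 *	ref.owning_root = root_id;
 *	ref.tree_ref.ref_root = root_id;
 *	ref.tree_ref.level = level;
 *	ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
 */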

/*
 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
 */
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_ref *generic_ref,
			       u64 reserved)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_qgroup_extent_record *record = NULL;
	bool qrecord_inserted;
	int action = generic_ref->action;
	bool merged;
	u64 bytenr = generic_ref->bytenr;
	u64 num_bytes = generic_ref->len;
	u64 parent = generic_ref->parent;
	u64 ref_root = generic_ref->data_ref.ref_root;
	u64 owner = generic_ref->data_ref.ino;
	u64 offset = generic_ref->data_ref.offset;
	u8 ref_type;

	ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
	if (!ref)
		return -ENOMEM;

	if (parent)
		ref_type = BTRFS_SHARED_DATA_REF_KEY;
	else
		ref_type = BTRFS_EXTENT_DATA_REF_KEY;
	init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
				ref_root, action, ref_type);
	ref->root = ref_root;
	ref->parent = parent;
	ref->objectid = owner;
	ref->offset = offset;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref) {
		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
		return -ENOMEM;
	}

	if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
		record = kzalloc(sizeof(*record), GFP_NOFS);
		if (!record) {
			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
			kmem_cache_free(btrfs_delayed_ref_head_cachep,
					head_ref);
			return -ENOMEM;
		}
	}

	init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
			      reserved, action, true, false, generic_ref->owning_root);
	head_ref->extent_op = NULL;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	/*
	 * insert both the head node and the new ref without dropping
	 * the spin lock
	 */
	head_ref = add_delayed_ref_head(trans, head_ref, record,
					action, &qrecord_inserted);

	merged = insert_delayed_ref(trans, head_ref, &ref->node);
	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);

	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
				   action == BTRFS_ADD_DELAYED_EXTENT ?
				   BTRFS_ADD_DELAYED_REF : action);
	if (merged)
		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);

	if (qrecord_inserted)
		return btrfs_qgroup_trace_extent_post(trans, record);
	return 0;
}

int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes,
				struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
	if (!head_ref)
		return -ENOMEM;

	init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
			      BTRFS_UPDATE_DELAYED_HEAD, false, false, 0);
	head_ref->extent_op = extent_op;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);

	add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
			     NULL);

	spin_unlock(&delayed_refs->lock);

	/*
	 * Need to update the delayed_refs_rsv with any changes we may have
	 * made.
	 */
	btrfs_update_delayed_refs_rsv(trans);
	return 0;
}

/*
 * This does a simple search for the head node for a given extent. Returns the
 * head node if found, or NULL if not.
 */
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
	lockdep_assert_held(&delayed_refs->lock);

	return find_ref_head(delayed_refs, bytenr, false);
}

void __cold btrfs_delayed_ref_exit(void)
{
	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
	kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
	kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
	kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}

int __init btrfs_delayed_ref_init(void)
{
	btrfs_delayed_ref_head_cachep = kmem_cache_create(
				"btrfs_delayed_ref_head",
				sizeof(struct btrfs_delayed_ref_head), 0,
				SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_ref_head_cachep)
		goto fail;

	btrfs_delayed_tree_ref_cachep = kmem_cache_create(
				"btrfs_delayed_tree_ref",
				sizeof(struct btrfs_delayed_tree_ref), 0,
				SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_tree_ref_cachep)
		goto fail;

	btrfs_delayed_data_ref_cachep = kmem_cache_create(
				"btrfs_delayed_data_ref",
				sizeof(struct btrfs_delayed_data_ref), 0,
				SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_data_ref_cachep)
		goto fail;

	btrfs_delayed_extent_op_cachep = kmem_cache_create(
				"btrfs_delayed_extent_op",
				sizeof(struct btrfs_delayed_extent_op), 0,
				SLAB_MEM_SPREAD, NULL);
	if (!btrfs_delayed_extent_op_cachep)
		goto fail;

	return 0;
fail:
	btrfs_delayed_ref_exit();
	return -ENOMEM;
}