// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space compaction,
 * and the balance routine to spread chunks over devices.
 *
 *		Before		|		After
 * ------------------------------------------------------------------
 * BG A: 10 data extents	| BG A: deleted
 * BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 * BG C:  1 data extent		| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
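 *
 * As an illustration (the subvolume id is made up, not from any particular
 * filesystem): the reloc tree built for subvolume 257 lives in the root tree
 * under the key (BTRFS_TREE_RELOC_OBJECTID, BTRFS_ROOT_ITEM_KEY, 257), i.e.
 * the objectid of the source tree becomes the offset of the reloc root key.
 * See create_reloc_root() below.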
 */

#define RELOCATION_RESERVED_NODES	256
/*
 * map address of tree root to tree
 */
struct mapping_node {
	union {
		/* Use rb_simple_node for search/insert */
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};

		struct rb_simple_node simple_node;
	};
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * present a tree block to process
 */
struct tree_block {
	union {
		/* Use rb_simple_node for search/insert */
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};

		struct rb_simple_node simple_node;
	};
	u64 owner;
	struct btrfs_key key;
	u8 level;
	bool key_ready;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
	u64 owning_root;
};

/* Stages of data relocation. */
enum reloc_stage {
	MOVE_DATA_EXTENTS,
	UPDATE_DATA_PTRS
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	enum reloc_stage stage;
	bool create_reloc_tree;
	bool merge_reloc_tree;
	bool found_file_extent;
};

static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr,
				     node->bytenr + blocksize - 1, EXTENT_DIRTY,
				     NULL);
	}
	node->processed = 1;
}

/*
 * walk up backref nodes until we reach a node that represents a tree root
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
					list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find the start of the next reference path
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge,
					list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}

static bool reloc_root_is_dead(const struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after swap is considered dead, thus not considered as valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root. But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(const struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return true;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return false;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return false;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the backref
	 * node for the fs tree root is useless for relocation.
	 */
	return true;
}

/*
 * find reloc tree by address of tree root
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

/*
 * For useless nodes, do two major cleanups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeping nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
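 *
 * For example, when a useless subtree root at level 1 is handled here, its
 * edges down to the level 0 children are freed; any child whose ->upper list
 * becomes empty is queued on @useless_node and cleaned up by the same loop.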
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_first_entry(&cur->lower, struct btrfs_backref_edge,
						list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build backref tree for a given tree block. The root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond to
 * roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when a tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
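 *
 * An illustrative shape of the result, for a target block B that is pointed
 * to by node N, which in turn is pointed to by roots R1 and R2:
 *
 *	R1    R2
 *	  \  /
 *	   N
 *	   |
 *	   B
 *
 * B is the root of the backref tree; R1 and R2 are its leaves.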
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct btrfs_trans_handle *trans,
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		ret = -ENOMEM;
		goto out;
	}

	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
						  node_key, cur);
		if (ret < 0)
			goto out;

		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0)
		goto out;

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_free_path(iter->path);
	kfree(iter);
	btrfs_free_path(path);
	if (ret) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(ret);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			  "Duplicate root found for start=%llu while inserting into relocation tree",
			  node->bytenr);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list. There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (btrfs_root_id(root) == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) or when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * Called by btrfs_reloc_post_snapshot_hook.
		 * The source tree is a reloc tree, all tree blocks
		 * modified after it was created have RELOC flag
		 * set in their headers. So it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (btrfs_root_id(root) == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	btrfs_set_root_last_trans(reloc_root, trans->transid);
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}

/*
 * Create a reloc tree for a given fs tree. The reloc tree is just a
 * snapshot of the fs tree with a special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important. We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root. This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees. Also
	 * reloc trees never need their own reloc tree.
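	 * (The latter is why the btrfs_root_id() check below treats
	 * BTRFS_TREE_RELOC_OBJECTID roots the same way as the merge case.)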
	 */
	if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root. We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * update file extent items in the tree leaf to point to
 * the new locations.
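 *
 * A sketch of the reference bookkeeping done below for each rewritten
 * extent item, assuming the extent moved from @bytenr to @new_bytenr:
 *
 *	btrfs_inc_extent_ref()	on new_bytenr	(BTRFS_ADD_DELAYED_REF)
 *	btrfs_free_extent()	on bytenr	(BTRFS_DROP_DELAYED_REF)
 *
 * so the new location gains its reference before the old one is dropped.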
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * if we are modifying a block in the fs tree, wait for
		 * read_folio to complete and drop the extent cache
		 */
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = btrfs_find_first_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(inode) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = btrfs_find_first_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(inode) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				/* Take mmap lock to serialize with reflinks. */
				if (!down_read_trylock(&inode->i_mmap_lock))
					continue;
				ret = btrfs_try_lock_extent(&inode->io_tree, key.offset,
							    end, &cached_state);
				if (!ret) {
					up_read(&inode->i_mmap_lock);
					continue;
				}

				btrfs_drop_extent_map_range(inode, key.offset, end, true);
				btrfs_unlock_extent(&inode->io_tree, key.offset, end,
						    &cached_state);
				up_read(&inode->i_mmap_lock);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
					       int slot, const struct btrfs_path *path,
					       int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;

	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * if a block was replaced, the level of the block + 1 is returned.
 * if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
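 *
 * For example, swapping a level 0 leaf pointer inside its level 1 parent
 * returns 1; merge_reloc_root() uses that value to re-read the node key at
 * that level and continue walking.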
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, 0, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							 path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Info qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = btrfs_root_id(src);
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(dest);
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * helper to find the next relocated block in reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * walk down reloc tree to find relocated block of lowest level
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * invalidate extent cache for file extents whose keys are in the range
 * [min_key, max_key)
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   const struct btrfs_key *min_key,
				   const struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		if (inode)
			iput(&inode->vfs_inode);

		if (objectid > max_key->objectid)
			break;

		inode = btrfs_find_first_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(inode);

		if (ino > max_key->objectid) {
			iput(&inode->vfs_inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->vfs_inode.i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for read_folio to complete */
		btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(inode, start, end, true);
		btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
	}
	return 0;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert current subvolume into reloc_control::dirty_subvol_roots
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
	       sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}

static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root. If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}

/*
 * merge the relocated tree blocks in reloc tree with corresponding
 * fs tree.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		refcount_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between reloc tree and subvolume tree. Thus for tree
	 * block COW, we COW at most from level 1 to root level for each tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * and * 2 since we have two trees to COW.
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list. At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * save the merging progress in the drop_progress.
		 * this is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * handle the case where only one block in the fs tree needs to be
	 * relocated and that block is the tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}

static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_first_entry(&rc->reloc_roots,
					      struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
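		/*
		 * The reloc root's key offset is the objectid of its source
		 * subvolume, so the lookup above returns the fs root this
		 * reloc tree was created for.
		 */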
		if (IS_ERR(root)) {
			/*
			 * Even if we have an error we need this reloc root
			 * back on our list so we can clean up properly.
			 */
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
			if (!err)
				err = PTR_ERR(root);
			break;
		}

		if (unlikely(root->reloc_root != reloc_root)) {
			if (root->reloc_root) {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(root->reloc_root),
					  root->reloc_root->root_key.type,
					  root->reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &root->reloc_root->root_item),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			} else {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			}
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(root);
			btrfs_abort_transaction(trans, -EUCLEAN);
			if (!err)
				err = -EUCLEAN;
			break;
		}

		/*
		 * set reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		ret = btrfs_update_reloc_root(trans, root);

		/*
		 * Even if we have an error we need this reloc root back on our
		 * list so we can clean up properly.
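		 *
		 * The list membership also stands for the reference that the
		 * cleanup paths (see __del_reloc_root()) expect to drop.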
		 */
		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (!err)
				err = ret;
			break;
		}
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		err = btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}

static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
	struct btrfs_root *reloc_root, *tmp;

	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
		__del_reloc_root(reloc_root);
}

static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_root *root;
	struct btrfs_root *reloc_root;
	LIST_HEAD(reloc_roots);
	int found = 0;
	int ret = 0;
again:
	root = rc->extent_root;

	/*
	 * this serializes us with btrfs_record_root_in_transaction,
	 * we have to make sure nobody is in the middle of
	 * adding their roots to the list while we are
	 * doing this splice
	 */
	mutex_lock(&fs_info->reloc_mutex);
	list_splice_init(&rc->reloc_roots, &reloc_roots);
	mutex_unlock(&fs_info->reloc_mutex);

	while (!list_empty(&reloc_roots)) {
		found = 1;
		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			if (WARN_ON(IS_ERR(root))) {
				/*
				 * For recovery we read the fs roots on mount,
				 * and if we didn't find the root then we marked
				 * the reloc root as a garbage root. For normal
				 * relocation obviously the root should exist in
				 * memory. However there's no reason we can't
				 * handle the error properly here just in case.
				 */
				ret = PTR_ERR(root);
				goto out;
			}
			if (WARN_ON(root->reloc_root != reloc_root)) {
				/*
				 * This can happen if on-disk metadata has some
				 * corruption, e.g. bad reloc tree key offset.
				 */
				ret = -EINVAL;
				goto out;
			}
			ret = merge_reloc_root(rc, root);
			btrfs_put_root(root);
			if (ret) {
				if (list_empty(&reloc_root->root_list))
					list_add_tail(&reloc_root->root_list,
						      &reloc_roots);
				goto out;
			}
		} else {
			if (!IS_ERR(root)) {
				if (root->reloc_root == reloc_root) {
					root->reloc_root = NULL;
					btrfs_put_root(reloc_root);
				}
				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
					  &root->state);
				btrfs_put_root(root);
			}

			list_del_init(&reloc_root->root_list);
			/* Don't forget to queue this reloc root for cleanup */
			list_add_tail(&reloc_root->reloc_dirty_list,
				      &rc->dirty_subvol_roots);
		}
	}

	if (found) {
		found = 0;
		goto again;
	}
out:
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret, NULL);
		free_reloc_roots(&reloc_roots);

		/* new reloc root may be added */
		mutex_lock(&fs_info->reloc_mutex);
		list_splice_init(&rc->reloc_roots, &reloc_roots);
		mutex_unlock(&fs_info->reloc_mutex);
		free_reloc_roots(&reloc_roots);
	}

	/*
	 * We used to have
	 *
	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
	 *
	 * here, but it's wrong.
	 * If we fail to start the transaction in prepare_to_merge() we will
	 * have only 0 ref reloc roots, none of which have actually been
	 * removed from the reloc_root_tree rb tree. This is fine because
	 * we're bailing here, and we hold a reference on the root for the
	 * list that holds it, so these roots will be cleaned up when we do
	 * the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
	 * will be cleaned up on unmount.
	 *
	 * The remaining nodes will be cleaned up by free_reloc_control.
	 */
}

static void free_block_list(struct rb_root *blocks)
{
	struct tree_block *block;
	struct rb_node *rb_node;

	while ((rb_node = rb_first(blocks))) {
		block = rb_entry(rb_node, struct tree_block, rb_node);
		rb_erase(rb_node, blocks);
		kfree(block);
	}
}

static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
				      struct btrfs_root *reloc_root)
{
	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
	struct btrfs_root *root;
	int ret;

	if (btrfs_get_root_last_trans(reloc_root) == trans->transid)
		return 0;

	root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);

	/*
	 * This should succeed, since we can't have a reloc root without having
	 * already looked up the actual root and created the reloc root for this
	 * root.
	 *
	 * However if there's some sort of corruption where we have a ref to a
	 * reloc root without a corresponding root this could return ENOENT.
	 */
	if (IS_ERR(root)) {
		DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root));
		return PTR_ERR(root);
	}
	if (root->reloc_root != reloc_root) {
		DEBUG_WARN("unexpected reloc root found");
		btrfs_err(fs_info,
			  "root %llu has two reloc roots associated with it",
			  reloc_root->root_key.offset);
		btrfs_put_root(root);
		return -EUCLEAN;
	}
	ret = btrfs_record_root_in_trans(trans, root);
	btrfs_put_root(root);

	return ret;
}

static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
				     struct reloc_control *rc,
				     struct btrfs_backref_node *node,
				     struct btrfs_backref_edge *edges[])
{
	struct btrfs_backref_node *next;
	struct btrfs_root *root;
	int index = 0;
	int ret;

	next = walk_up_backref(node, edges, &index);
	root = next->root;

	/*
	 * If there is no root, then our references for this block are
	 * incomplete, as we should be able to walk all the way up to a block
	 * that is owned by a root.
	 *
	 * This path is only for SHAREABLE roots, so if we come upon a
	 * non-SHAREABLE root then we have backrefs that resolve improperly.
	 *
	 * Both of these cases indicate file system corruption, or a bug in the
	 * backref walking code.
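	 *
	 * Either way, return -EUCLEAN (filesystem corruption) below rather
	 * than trying to relocate through a bogus reference path.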
1982 */ 1983 if (unlikely(!root)) { 1984 btrfs_err(trans->fs_info, 1985 "bytenr %llu doesn't have a backref path ending in a root", 1986 node->bytenr); 1987 return ERR_PTR(-EUCLEAN); 1988 } 1989 if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { 1990 btrfs_err(trans->fs_info, 1991 "bytenr %llu has multiple refs with one ending in a non-shareable root", 1992 node->bytenr); 1993 return ERR_PTR(-EUCLEAN); 1994 } 1995 1996 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { 1997 ret = record_reloc_root_in_trans(trans, root); 1998 if (ret) 1999 return ERR_PTR(ret); 2000 goto found; 2001 } 2002 2003 ret = btrfs_record_root_in_trans(trans, root); 2004 if (ret) 2005 return ERR_PTR(ret); 2006 root = root->reloc_root; 2007 2008 /* 2009 * We could have raced with another thread which failed, so 2010 * root->reloc_root may not be set, return ENOENT in this case. 2011 */ 2012 if (!root) 2013 return ERR_PTR(-ENOENT); 2014 2015 if (next->new_bytenr) { 2016 /* 2017 * We just created the reloc root, so we shouldn't have 2018 * ->new_bytenr set yet. If it is then we have multiple roots 2019 * pointing at the same bytenr which indicates corruption, or 2020 * we've made a mistake in the backref walking code. 2021 */ 2022 ASSERT(next->new_bytenr == 0); 2023 btrfs_err(trans->fs_info, 2024 "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", 2025 node->bytenr, next->bytenr); 2026 return ERR_PTR(-EUCLEAN); 2027 } 2028 2029 next->new_bytenr = root->node->start; 2030 btrfs_put_root(next->root); 2031 next->root = btrfs_grab_root(root); 2032 ASSERT(next->root); 2033 mark_block_processed(rc, next); 2034 found: 2035 next = node; 2036 /* setup backref node path for btrfs_reloc_cow_block */ 2037 while (1) { 2038 rc->backref_cache.path[next->level] = next; 2039 if (--index < 0) 2040 break; 2041 next = edges[index]->node[UPPER]; 2042 } 2043 return root; 2044 } 2045 2046 /* 2047 * Select a tree root for relocation. 2048 * 2049 * Return NULL if the block is not shareable. We should use do_relocation() in 2050 * this case. 2051 * 2052 * Return a tree root pointer if the block is shareable. 2053 * Return -ENOENT if the block is root of reloc tree. 2054 */ 2055 static noinline_for_stack 2056 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2057 { 2058 struct btrfs_backref_node *next; 2059 struct btrfs_root *root; 2060 struct btrfs_root *fs_root = NULL; 2061 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2062 int index = 0; 2063 2064 next = node; 2065 while (1) { 2066 cond_resched(); 2067 next = walk_up_backref(next, edges, &index); 2068 root = next->root; 2069 2070 /* 2071 * This can occur if we have incomplete extent refs leading all 2072 * the way up a particular path, in this case return -EUCLEAN. 
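		 *
		 * The caller, relocate_tree_block(), only expects -ENOENT from
		 * us and asserts on anything else, so the -EUCLEAN trips the
		 * assert for developers while still erroring out cleanly for
		 * users.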
		 */
		if (!root)
			return ERR_PTR(-EUCLEAN);

		/* No other choice for non-shareable tree */
		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
			return root;

		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID)
			fs_root = root;

		if (next != node)
			return NULL;

		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}

	if (!fs_root)
		return ERR_PTR(-ENOENT);
	return fs_root;
}

static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
						  struct btrfs_backref_node *node)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_backref_node *next = node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	u64 num_bytes = 0;
	int index = 0;

	BUG_ON(node->processed);

	while (next) {
		cond_resched();
		while (1) {
			if (next->processed)
				break;

			num_bytes += fs_info->nodesize;

			if (list_empty(&next->upper))
				break;

			edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
						list[LOWER]);
			edges[index++] = edge;
			next = edge->node[UPPER];
		}
		next = walk_down_backref(edges, &index);
	}
	return num_bytes;
}

static int refill_metadata_space(struct btrfs_trans_handle *trans,
				 struct reloc_control *rc, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	trans->block_rsv = rc->block_rsv;
	rc->reserved_bytes += num_bytes;

	/*
	 * We are under a transaction here so we can only do limited flushing.
	 * If we get an enospc just kick back -EAGAIN so we know to drop the
	 * transaction and try to refill when we can flush all the things.
	 */
	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
				     BTRFS_RESERVE_FLUSH_LIMIT);
	if (ret) {
		u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;

		while (tmp <= rc->reserved_bytes)
			tmp <<= 1;
		/*
		 * Only one thread can access block_rsv at this point, so we
		 * don't need to hold a lock to protect it. We expand the
		 * reservation size here to allow enough space for relocation,
		 * and we will return early in the ENOSPC case.
		 */
		rc->block_rsv->size = tmp + fs_info->nodesize *
				      RELOCATION_RESERVED_NODES;
		return -EAGAIN;
	}

	return 0;
}

static int reserve_metadata_space(struct btrfs_trans_handle *trans,
				  struct reloc_control *rc,
				  struct btrfs_backref_node *node)
{
	u64 num_bytes;

	num_bytes = calcu_metadata_size(rc, node) * 2;
	return refill_metadata_space(trans, rc, num_bytes);
}

/*
 * relocate a tree block, and then update pointers in upper level
 * blocks that reference the block to point to the new location.
 *
 * if called by link_to_upper, the block has already been relocated.
 * in that case this function just updates pointers.
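 *
 * The @lowest flag distinguishes the two cases: when set, the block itself
 * still has to be COWed into its new location; when clear, only the parent
 * pointers referring to it are rewritten.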
2182 */ 2183 static int do_relocation(struct btrfs_trans_handle *trans, 2184 struct reloc_control *rc, 2185 struct btrfs_backref_node *node, 2186 struct btrfs_key *key, 2187 struct btrfs_path *path, int lowest) 2188 { 2189 struct btrfs_backref_node *upper; 2190 struct btrfs_backref_edge *edge; 2191 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2192 struct btrfs_root *root; 2193 struct extent_buffer *eb; 2194 u32 blocksize; 2195 u64 bytenr; 2196 int slot; 2197 int ret = 0; 2198 2199 /* 2200 * If we are lowest then this is the first time we're processing this 2201 * block, and thus shouldn't have an eb associated with it yet. 2202 */ 2203 ASSERT(!lowest || !node->eb); 2204 2205 path->lowest_level = node->level + 1; 2206 rc->backref_cache.path[node->level] = node; 2207 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2208 cond_resched(); 2209 2210 upper = edge->node[UPPER]; 2211 root = select_reloc_root(trans, rc, upper, edges); 2212 if (IS_ERR(root)) { 2213 ret = PTR_ERR(root); 2214 goto next; 2215 } 2216 2217 if (upper->eb && !upper->locked) { 2218 if (!lowest) { 2219 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2220 if (ret < 0) 2221 goto next; 2222 BUG_ON(ret); 2223 bytenr = btrfs_node_blockptr(upper->eb, slot); 2224 if (node->eb->start == bytenr) 2225 goto next; 2226 } 2227 btrfs_backref_drop_node_buffer(upper); 2228 } 2229 2230 if (!upper->eb) { 2231 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2232 if (ret) { 2233 if (ret > 0) 2234 ret = -ENOENT; 2235 2236 btrfs_release_path(path); 2237 break; 2238 } 2239 2240 if (!upper->eb) { 2241 upper->eb = path->nodes[upper->level]; 2242 path->nodes[upper->level] = NULL; 2243 } else { 2244 BUG_ON(upper->eb != path->nodes[upper->level]); 2245 } 2246 2247 upper->locked = 1; 2248 path->locks[upper->level] = 0; 2249 2250 slot = path->slots[upper->level]; 2251 btrfs_release_path(path); 2252 } else { 2253 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2254 if (ret < 0) 2255 goto next; 2256 BUG_ON(ret); 2257 } 2258 2259 bytenr = btrfs_node_blockptr(upper->eb, slot); 2260 if (lowest) { 2261 if (bytenr != node->bytenr) { 2262 btrfs_err(root->fs_info, 2263 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2264 bytenr, node->bytenr, slot, 2265 upper->eb->start); 2266 ret = -EIO; 2267 goto next; 2268 } 2269 } else { 2270 if (node->eb->start == bytenr) 2271 goto next; 2272 } 2273 2274 blocksize = root->fs_info->nodesize; 2275 eb = btrfs_read_node_slot(upper->eb, slot); 2276 if (IS_ERR(eb)) { 2277 ret = PTR_ERR(eb); 2278 goto next; 2279 } 2280 btrfs_tree_lock(eb); 2281 2282 if (!node->eb) { 2283 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2284 slot, &eb, BTRFS_NESTING_COW); 2285 btrfs_tree_unlock(eb); 2286 free_extent_buffer(eb); 2287 if (ret < 0) 2288 goto next; 2289 /* 2290 * We've just COWed this block, it should have updated 2291 * the correct backref node entry. 
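			 * select_reloc_root() stored this node in
			 * rc->backref_cache.path so that the COW callback
			 * (btrfs_reloc_cow_block()) could find it and point
			 * node->eb at the new buffer, which the assertion
			 * below relies on.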
2292 */ 2293 ASSERT(node->eb == eb); 2294 } else { 2295 struct btrfs_ref ref = { 2296 .action = BTRFS_ADD_DELAYED_REF, 2297 .bytenr = node->eb->start, 2298 .num_bytes = blocksize, 2299 .parent = upper->eb->start, 2300 .owning_root = btrfs_header_owner(upper->eb), 2301 .ref_root = btrfs_header_owner(upper->eb), 2302 }; 2303 2304 btrfs_set_node_blockptr(upper->eb, slot, 2305 node->eb->start); 2306 btrfs_set_node_ptr_generation(upper->eb, slot, 2307 trans->transid); 2308 btrfs_mark_buffer_dirty(trans, upper->eb); 2309 2310 btrfs_init_tree_ref(&ref, node->level, 2311 btrfs_root_id(root), false); 2312 ret = btrfs_inc_extent_ref(trans, &ref); 2313 if (!ret) 2314 ret = btrfs_drop_subtree(trans, root, eb, 2315 upper->eb); 2316 if (ret) 2317 btrfs_abort_transaction(trans, ret); 2318 } 2319 next: 2320 if (!upper->pending) 2321 btrfs_backref_drop_node_buffer(upper); 2322 else 2323 btrfs_backref_unlock_node_buffer(upper); 2324 if (ret) 2325 break; 2326 } 2327 2328 if (!ret && node->pending) { 2329 btrfs_backref_drop_node_buffer(node); 2330 list_del_init(&node->list); 2331 node->pending = 0; 2332 } 2333 2334 path->lowest_level = 0; 2335 2336 /* 2337 * We should have allocated all of our space in the block rsv and thus 2338 * shouldn't ENOSPC. 2339 */ 2340 ASSERT(ret != -ENOSPC); 2341 return ret; 2342 } 2343 2344 static int link_to_upper(struct btrfs_trans_handle *trans, 2345 struct reloc_control *rc, 2346 struct btrfs_backref_node *node, 2347 struct btrfs_path *path) 2348 { 2349 struct btrfs_key key; 2350 2351 btrfs_node_key_to_cpu(node->eb, &key, 0); 2352 return do_relocation(trans, rc, node, &key, path, 0); 2353 } 2354 2355 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2356 struct reloc_control *rc, 2357 struct btrfs_path *path, int err) 2358 { 2359 LIST_HEAD(list); 2360 struct btrfs_backref_cache *cache = &rc->backref_cache; 2361 struct btrfs_backref_node *node; 2362 int level; 2363 int ret; 2364 2365 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2366 while (!list_empty(&cache->pending[level])) { 2367 node = list_first_entry(&cache->pending[level], 2368 struct btrfs_backref_node, list); 2369 list_move_tail(&node->list, &list); 2370 BUG_ON(!node->pending); 2371 2372 if (!err) { 2373 ret = link_to_upper(trans, rc, node, path); 2374 if (ret < 0) 2375 err = ret; 2376 } 2377 } 2378 list_splice_init(&list, &cache->pending[level]); 2379 } 2380 return err; 2381 } 2382 2383 /* 2384 * mark a block and all blocks directly/indirectly reference the block 2385 * as processed. 
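 *
 * The walk is the same one calcu_metadata_size() performs, so every block
 * counted for the metadata reservation is also marked here.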
2386 */ 2387 static void update_processed_blocks(struct reloc_control *rc, 2388 struct btrfs_backref_node *node) 2389 { 2390 struct btrfs_backref_node *next = node; 2391 struct btrfs_backref_edge *edge; 2392 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2393 int index = 0; 2394 2395 while (next) { 2396 cond_resched(); 2397 while (1) { 2398 if (next->processed) 2399 break; 2400 2401 mark_block_processed(rc, next); 2402 2403 if (list_empty(&next->upper)) 2404 break; 2405 2406 edge = list_first_entry(&next->upper, struct btrfs_backref_edge, 2407 list[LOWER]); 2408 edges[index++] = edge; 2409 next = edge->node[UPPER]; 2410 } 2411 next = walk_down_backref(edges, &index); 2412 } 2413 } 2414 2415 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2416 { 2417 u32 blocksize = rc->extent_root->fs_info->nodesize; 2418 2419 if (btrfs_test_range_bit(&rc->processed_blocks, bytenr, 2420 bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) 2421 return 1; 2422 return 0; 2423 } 2424 2425 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2426 struct tree_block *block) 2427 { 2428 struct btrfs_tree_parent_check check = { 2429 .level = block->level, 2430 .owner_root = block->owner, 2431 .transid = block->key.offset 2432 }; 2433 struct extent_buffer *eb; 2434 2435 eb = read_tree_block(fs_info, block->bytenr, &check); 2436 if (IS_ERR(eb)) 2437 return PTR_ERR(eb); 2438 if (!extent_buffer_uptodate(eb)) { 2439 free_extent_buffer(eb); 2440 return -EIO; 2441 } 2442 if (block->level == 0) 2443 btrfs_item_key_to_cpu(eb, &block->key, 0); 2444 else 2445 btrfs_node_key_to_cpu(eb, &block->key, 0); 2446 free_extent_buffer(eb); 2447 block->key_ready = true; 2448 return 0; 2449 } 2450 2451 /* 2452 * helper function to relocate a tree block 2453 */ 2454 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2455 struct reloc_control *rc, 2456 struct btrfs_backref_node *node, 2457 struct btrfs_key *key, 2458 struct btrfs_path *path) 2459 { 2460 struct btrfs_root *root; 2461 int ret = 0; 2462 2463 if (!node) 2464 return 0; 2465 2466 /* 2467 * If we fail here we want to drop our backref_node because we are going 2468 * to start over and regenerate the tree for it. 2469 */ 2470 ret = reserve_metadata_space(trans, rc, node); 2471 if (ret) 2472 goto out; 2473 2474 BUG_ON(node->processed); 2475 root = select_one_root(node); 2476 if (IS_ERR(root)) { 2477 ret = PTR_ERR(root); 2478 2479 /* See explanation in select_one_root for the -EUCLEAN case. */ 2480 ASSERT(ret == -ENOENT); 2481 if (ret == -ENOENT) { 2482 ret = 0; 2483 update_processed_blocks(rc, node); 2484 } 2485 goto out; 2486 } 2487 2488 if (root) { 2489 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2490 /* 2491 * This block was the root block of a root, and this is 2492 * the first time we're processing the block and thus it 2493 * should not have had the ->new_bytenr modified. 2494 * 2495 * However in the case of corruption we could have 2496 * multiple refs pointing to the same block improperly, 2497 * and thus we would trip over these checks. ASSERT() 2498 * for the developer case, because it could indicate a 2499 * bug in the backref code, however error out for a 2500 * normal user in the case of corruption. 
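			 *
			 * select_reloc_root() performs the same new_bytenr
			 * sanity check for the non-lowest case.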
2501 */ 2502 ASSERT(node->new_bytenr == 0); 2503 if (node->new_bytenr) { 2504 btrfs_err(root->fs_info, 2505 "bytenr %llu has improper references to it", 2506 node->bytenr); 2507 ret = -EUCLEAN; 2508 goto out; 2509 } 2510 ret = btrfs_record_root_in_trans(trans, root); 2511 if (ret) 2512 goto out; 2513 /* 2514 * Another thread could have failed, need to check if we 2515 * have reloc_root actually set. 2516 */ 2517 if (!root->reloc_root) { 2518 ret = -ENOENT; 2519 goto out; 2520 } 2521 root = root->reloc_root; 2522 node->new_bytenr = root->node->start; 2523 btrfs_put_root(node->root); 2524 node->root = btrfs_grab_root(root); 2525 ASSERT(node->root); 2526 } else { 2527 btrfs_err(root->fs_info, 2528 "bytenr %llu resolved to a non-shareable root", 2529 node->bytenr); 2530 ret = -EUCLEAN; 2531 goto out; 2532 } 2533 if (!ret) 2534 update_processed_blocks(rc, node); 2535 } else { 2536 ret = do_relocation(trans, rc, node, key, path, 1); 2537 } 2538 out: 2539 if (ret || node->level == 0) 2540 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2541 return ret; 2542 } 2543 2544 static int relocate_cowonly_block(struct btrfs_trans_handle *trans, 2545 struct reloc_control *rc, struct tree_block *block, 2546 struct btrfs_path *path) 2547 { 2548 struct btrfs_fs_info *fs_info = trans->fs_info; 2549 struct btrfs_root *root; 2550 u64 num_bytes; 2551 int nr_levels; 2552 int ret; 2553 2554 root = btrfs_get_fs_root(fs_info, block->owner, true); 2555 if (IS_ERR(root)) 2556 return PTR_ERR(root); 2557 2558 nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; 2559 2560 num_bytes = fs_info->nodesize * nr_levels; 2561 ret = refill_metadata_space(trans, rc, num_bytes); 2562 if (ret) { 2563 btrfs_put_root(root); 2564 return ret; 2565 } 2566 path->lowest_level = block->level; 2567 if (root == root->fs_info->chunk_root) 2568 btrfs_reserve_chunk_metadata(trans, false); 2569 2570 ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); 2571 path->lowest_level = 0; 2572 btrfs_release_path(path); 2573 2574 if (root == root->fs_info->chunk_root) 2575 btrfs_trans_release_chunk_metadata(trans); 2576 if (ret > 0) 2577 ret = 0; 2578 btrfs_put_root(root); 2579 2580 return ret; 2581 } 2582 2583 /* 2584 * relocate a list of blocks 2585 */ 2586 static noinline_for_stack 2587 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2588 struct reloc_control *rc, struct rb_root *blocks) 2589 { 2590 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2591 struct btrfs_backref_node *node; 2592 struct btrfs_path *path; 2593 struct tree_block *block; 2594 struct tree_block *next; 2595 int ret = 0; 2596 2597 path = btrfs_alloc_path(); 2598 if (!path) { 2599 ret = -ENOMEM; 2600 goto out_free_blocks; 2601 } 2602 2603 /* Kick in readahead for tree blocks with missing keys */ 2604 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2605 if (!block->key_ready) 2606 btrfs_readahead_tree_block(fs_info, block->bytenr, 2607 block->owner, 0, 2608 block->level); 2609 } 2610 2611 /* Get first keys */ 2612 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2613 if (!block->key_ready) { 2614 ret = get_tree_block_key(fs_info, block); 2615 if (ret) 2616 goto out_free_path; 2617 } 2618 } 2619 2620 /* Do tree relocation */ 2621 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2622 /* 2623 * For COWonly blocks, or the data reloc tree, we only need to 2624 * COW down to the block, there's no need to generate a backref 2625 * tree. 
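		 *
		 * relocate_cowonly_block() handles that case: a
		 * btrfs_search_slot() with path->lowest_level set and cow == 1
		 * COWs every node on the way down to the target block.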
2626 */ 2627 if (block->owner && 2628 (!btrfs_is_fstree(block->owner) || 2629 block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { 2630 ret = relocate_cowonly_block(trans, rc, block, path); 2631 if (ret) 2632 break; 2633 continue; 2634 } 2635 2636 node = build_backref_tree(trans, rc, &block->key, 2637 block->level, block->bytenr); 2638 if (IS_ERR(node)) { 2639 ret = PTR_ERR(node); 2640 goto out; 2641 } 2642 2643 ret = relocate_tree_block(trans, rc, node, &block->key, 2644 path); 2645 if (ret < 0) 2646 break; 2647 } 2648 out: 2649 ret = finish_pending_nodes(trans, rc, path, ret); 2650 2651 out_free_path: 2652 btrfs_free_path(path); 2653 out_free_blocks: 2654 free_block_list(blocks); 2655 return ret; 2656 } 2657 2658 static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc) 2659 { 2660 const struct file_extent_cluster *cluster = &rc->cluster; 2661 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2662 u64 alloc_hint = 0; 2663 u64 start; 2664 u64 end; 2665 u64 offset = inode->reloc_block_group_start; 2666 u64 num_bytes; 2667 int nr; 2668 int ret = 0; 2669 u64 prealloc_start = cluster->start - offset; 2670 u64 prealloc_end = cluster->end - offset; 2671 u64 cur_offset = prealloc_start; 2672 2673 /* 2674 * For blocksize < folio size case (either bs < page size or large folios), 2675 * beyond i_size, all blocks are filled with zero. 2676 * 2677 * If the current cluster covers the above range, btrfs_do_readpage() 2678 * will skip the read, and relocate_one_folio() will later writeback 2679 * the padding zeros as new data, causing data corruption. 2680 * 2681 * Here we have to invalidate the cache covering our cluster. 2682 */ 2683 ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start, 2684 prealloc_end); 2685 if (ret < 0) 2686 return ret; 2687 2688 BUG_ON(cluster->start != cluster->boundary[0]); 2689 ret = btrfs_alloc_data_chunk_ondemand(inode, 2690 prealloc_end + 1 - prealloc_start); 2691 if (ret) 2692 return ret; 2693 2694 btrfs_inode_lock(inode, 0); 2695 for (nr = 0; nr < cluster->nr; nr++) { 2696 struct extent_state *cached_state = NULL; 2697 2698 start = cluster->boundary[nr] - offset; 2699 if (nr + 1 < cluster->nr) 2700 end = cluster->boundary[nr + 1] - 1 - offset; 2701 else 2702 end = cluster->end - offset; 2703 2704 btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); 2705 num_bytes = end + 1 - start; 2706 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2707 num_bytes, num_bytes, 2708 end + 1, &alloc_hint); 2709 cur_offset = end + 1; 2710 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 2711 if (ret) 2712 break; 2713 } 2714 btrfs_inode_unlock(inode, 0); 2715 2716 if (cur_offset < prealloc_end) 2717 btrfs_free_reserved_data_space_noquota(inode, 2718 prealloc_end + 1 - cur_offset); 2719 return ret; 2720 } 2721 2722 static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc) 2723 { 2724 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2725 struct extent_map *em; 2726 struct extent_state *cached_state = NULL; 2727 u64 offset = inode->reloc_block_group_start; 2728 u64 start = rc->cluster.start - offset; 2729 u64 end = rc->cluster.end - offset; 2730 int ret = 0; 2731 2732 em = btrfs_alloc_extent_map(); 2733 if (!em) 2734 return -ENOMEM; 2735 2736 em->start = start; 2737 em->len = end + 1 - start; 2738 em->disk_bytenr = rc->cluster.start; 2739 em->disk_num_bytes = em->len; 2740 em->ram_bytes = em->len; 2741 em->flags |= EXTENT_FLAG_PINNED; 2742 2743 
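	/*
	 * The new mapping points this file range at the old cluster, so
	 * reading the reloc inode pulls in the data being relocated, while
	 * writeback then fills the preallocated extents at the new location.
	 * EXTENT_FLAG_PINNED keeps the mapping from being merged or dropped
	 * underneath us in the meantime.
	 */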
	btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
	ret = btrfs_replace_extent_map_range(inode, em, false);
	btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
	btrfs_free_extent_map(em);

	return ret;
}

/*
 * Allow error injection to test balance/relocation cancellation
 */
noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
{
	return atomic_read(&fs_info->balance_cancel_req) ||
		atomic_read(&fs_info->reloc_cancel_req) ||
		fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);

static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
				    int cluster_nr)
{
	/* Last extent, use cluster end directly */
	if (cluster_nr >= cluster->nr - 1)
		return cluster->end;

	/* Use next boundary start */
	return cluster->boundary[cluster_nr + 1] - 1;
}

static int relocate_one_folio(struct reloc_control *rc,
			      struct file_ra_state *ra,
			      int *cluster_nr, u64 *file_offset_ret)
{
	const struct file_extent_cluster *cluster = &rc->cluster;
	struct inode *inode = rc->data_inode;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	const u64 orig_file_offset = *file_offset_ret;
	u64 offset = BTRFS_I(inode)->reloc_block_group_start;
	const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT;
	const pgoff_t index = orig_file_offset >> PAGE_SHIFT;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	struct folio *folio;
	u64 folio_start;
	u64 folio_end;
	u64 cur;
	int ret;
	const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);

	ASSERT(index <= last_index);
again:
	folio = filemap_lock_folio(inode->i_mapping, index);
	if (IS_ERR(folio)) {
		/*
		 * On relocation we're doing readahead on the relocation inode,
		 * but if the filesystem is backed by a RAID stripe tree we can
		 * get ENOENT (e.g. due to preallocated extents not being
		 * mapped in the RST) from the lookup.
		 *
		 * But readahead doesn't handle the error and submits invalid
		 * reads to the device, causing assertion failures.
		 */
		if (!use_rst)
			page_cache_sync_readahead(inode->i_mapping, ra, NULL,
						  index, last_index + 1 - index);
		folio = __filemap_get_folio(inode->i_mapping, index,
					    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					    mask);
		if (IS_ERR(folio))
			return PTR_ERR(folio);
	}

	if (folio_test_readahead(folio) && !use_rst)
		page_cache_async_readahead(inode->i_mapping, ra, NULL,
					   folio, last_index + 1 - index);

	if (!folio_test_uptodate(folio)) {
		btrfs_read_folio(NULL, folio);
		folio_lock(folio);
		if (!folio_test_uptodate(folio)) {
			ret = -EIO;
			goto release_folio;
		}
		if (folio->mapping != inode->i_mapping) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
	}

	/*
	 * We could have lost folio private when we dropped the lock to read
	 * the folio above, so make sure we call set_folio_extent_mapped()
	 * here to have any of the subpage blocksize state we need in place.
2838 */ 2839 ret = set_folio_extent_mapped(folio); 2840 if (ret < 0) 2841 goto release_folio; 2842 2843 folio_start = folio_pos(folio); 2844 folio_end = folio_start + folio_size(folio) - 1; 2845 2846 /* 2847 * Start from the cluster, as for subpage case, the cluster can start 2848 * inside the folio. 2849 */ 2850 cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); 2851 while (cur <= folio_end) { 2852 struct extent_state *cached_state = NULL; 2853 u64 extent_start = cluster->boundary[*cluster_nr] - offset; 2854 u64 extent_end = get_cluster_boundary_end(cluster, 2855 *cluster_nr) - offset; 2856 u64 clamped_start = max(folio_start, extent_start); 2857 u64 clamped_end = min(folio_end, extent_end); 2858 u32 clamped_len = clamped_end + 1 - clamped_start; 2859 2860 /* Reserve metadata for this range */ 2861 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 2862 clamped_len, clamped_len, 2863 false); 2864 if (ret) 2865 goto release_folio; 2866 2867 /* Mark the range delalloc and dirty for later writeback */ 2868 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, 2869 clamped_end, &cached_state); 2870 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, 2871 clamped_end, 0, &cached_state); 2872 if (ret) { 2873 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 2874 clamped_start, clamped_end, 2875 EXTENT_LOCKED | EXTENT_BOUNDARY, 2876 &cached_state); 2877 btrfs_delalloc_release_metadata(BTRFS_I(inode), 2878 clamped_len, true); 2879 btrfs_delalloc_release_extents(BTRFS_I(inode), 2880 clamped_len); 2881 goto release_folio; 2882 } 2883 btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); 2884 2885 /* 2886 * Set the boundary if it's inside the folio. 2887 * Data relocation requires the destination extents to have the 2888 * same size as the source. 2889 * EXTENT_BOUNDARY bit prevents current extent from being merged 2890 * with previous extent. 2891 */ 2892 if (in_range(cluster->boundary[*cluster_nr] - offset, 2893 folio_start, folio_size(folio))) { 2894 u64 boundary_start = cluster->boundary[*cluster_nr] - 2895 offset; 2896 u64 boundary_end = boundary_start + 2897 fs_info->sectorsize - 1; 2898 2899 btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree, 2900 boundary_start, boundary_end, 2901 EXTENT_BOUNDARY, NULL); 2902 } 2903 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 2904 &cached_state); 2905 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 2906 cur += clamped_len; 2907 2908 /* Crossed extent end, go to next extent */ 2909 if (cur >= extent_end) { 2910 (*cluster_nr)++; 2911 /* Just finished the last extent of the cluster, exit. 
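			 * Whatever remains of the folio past this boundary is
			 * not part of this cluster and is left untouched.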
*/ 2912 if (*cluster_nr >= cluster->nr) 2913 break; 2914 } 2915 } 2916 folio_unlock(folio); 2917 folio_put(folio); 2918 2919 balance_dirty_pages_ratelimited(inode->i_mapping); 2920 btrfs_throttle(fs_info); 2921 if (btrfs_should_cancel_balance(fs_info)) 2922 ret = -ECANCELED; 2923 *file_offset_ret = folio_end + 1; 2924 return ret; 2925 2926 release_folio: 2927 folio_unlock(folio); 2928 folio_put(folio); 2929 return ret; 2930 } 2931 2932 static int relocate_file_extent_cluster(struct reloc_control *rc) 2933 { 2934 struct inode *inode = rc->data_inode; 2935 const struct file_extent_cluster *cluster = &rc->cluster; 2936 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 2937 u64 cur_file_offset = cluster->start - offset; 2938 struct file_ra_state *ra; 2939 int cluster_nr = 0; 2940 int ret = 0; 2941 2942 if (!cluster->nr) 2943 return 0; 2944 2945 ra = kzalloc(sizeof(*ra), GFP_NOFS); 2946 if (!ra) 2947 return -ENOMEM; 2948 2949 ret = prealloc_file_extent_cluster(rc); 2950 if (ret) 2951 goto out; 2952 2953 file_ra_state_init(ra, inode->i_mapping); 2954 2955 ret = setup_relocation_extent_mapping(rc); 2956 if (ret) 2957 goto out; 2958 2959 while (cur_file_offset < cluster->end - offset) { 2960 ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset); 2961 if (ret) 2962 break; 2963 } 2964 if (ret == 0) 2965 WARN_ON(cluster_nr != cluster->nr); 2966 out: 2967 kfree(ra); 2968 return ret; 2969 } 2970 2971 static noinline_for_stack int relocate_data_extent(struct reloc_control *rc, 2972 const struct btrfs_key *extent_key) 2973 { 2974 struct inode *inode = rc->data_inode; 2975 struct file_extent_cluster *cluster = &rc->cluster; 2976 int ret; 2977 struct btrfs_root *root = BTRFS_I(inode)->root; 2978 2979 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 2980 ret = relocate_file_extent_cluster(rc); 2981 if (ret) 2982 return ret; 2983 cluster->nr = 0; 2984 } 2985 2986 /* 2987 * Under simple quotas, we set root->relocation_src_root when we find 2988 * the extent. If adjacent extents have different owners, we can't merge 2989 * them while relocating. Handle this by storing the owning root that 2990 * started a cluster and if we see an extent from a different root break 2991 * cluster formation (just like the above case of non-adjacent extents). 2992 * 2993 * Without simple quotas, relocation_src_root is always 0, so we should 2994 * never see a mismatch, and it should have no effect on relocation 2995 * clusters. 2996 */ 2997 if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { 2998 u64 tmp = root->relocation_src_root; 2999 3000 /* 3001 * root->relocation_src_root is the state that actually affects 3002 * the preallocation we do here, so set it to the root owning 3003 * the cluster we need to relocate. 3004 */ 3005 root->relocation_src_root = cluster->owning_root; 3006 ret = relocate_file_extent_cluster(rc); 3007 if (ret) 3008 return ret; 3009 cluster->nr = 0; 3010 /* And reset it back for the current extent's owning root. 
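		 * The new cluster started below records it again via
		 * cluster->owning_root.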
		 */
		root->relocation_src_root = tmp;
	}

	if (!cluster->nr) {
		cluster->start = extent_key->objectid;
		cluster->owning_root = root->relocation_src_root;
	} else {
		BUG_ON(cluster->nr >= MAX_EXTENTS);
	}
	cluster->end = extent_key->objectid + extent_key->offset - 1;
	cluster->boundary[cluster->nr] = extent_key->objectid;
	cluster->nr++;

	if (cluster->nr >= MAX_EXTENTS) {
		ret = relocate_file_extent_cluster(rc);
		if (ret)
			return ret;
		cluster->nr = 0;
	}
	return 0;
}

/*
 * helper to add a tree block to the list.
 * the major work is getting the generation and level of the block
 */
static int add_tree_block(struct reloc_control *rc,
			  const struct btrfs_key *extent_key,
			  struct btrfs_path *path,
			  struct rb_root *blocks)
{
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct btrfs_tree_block_info *bi;
	struct tree_block *block;
	struct rb_node *rb_node;
	u32 item_size;
	int level = -1;
	u64 generation;
	u64 owner = 0;

	eb = path->nodes[0];
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
	    item_size >= sizeof(*ei) + sizeof(*bi)) {
		unsigned long ptr = 0, end;

		ei = btrfs_item_ptr(eb, path->slots[0],
				    struct btrfs_extent_item);
		end = (unsigned long)ei + item_size;
		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
			bi = (struct btrfs_tree_block_info *)(ei + 1);
			level = btrfs_tree_block_level(eb, bi);
			ptr = (unsigned long)(bi + 1);
		} else {
			level = (int)extent_key->offset;
			ptr = (unsigned long)(ei + 1);
		}
		generation = btrfs_extent_generation(eb, ei);

		/*
		 * We're reading random blocks without knowing their owner ahead
		 * of time. This is ok most of the time, as all reloc roots and
		 * fs roots have the same lock type. However normal trees do
		 * not, and the only way to know ahead of time is to read the
		 * inline ref offset. We know it's an fs root if
		 *
		 * 1. There's more than one ref.
		 * 2. There's a SHARED_DATA_REF_KEY set.
		 * 3. FULL_BACKREF is set on the flags.
		 *
		 * Otherwise it's safe to assume that the ref offset == the
		 * owner of this block, so we can use that when calling
		 * read_tree_block.
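		 *
		 * For example, a block owned by a single subvolume carries one
		 * inline TREE_BLOCK_REF whose offset is that subvolume's id.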
3086 */ 3087 if (btrfs_extent_refs(eb, ei) == 1 && 3088 !(btrfs_extent_flags(eb, ei) & 3089 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3090 ptr < end) { 3091 struct btrfs_extent_inline_ref *iref; 3092 int type; 3093 3094 iref = (struct btrfs_extent_inline_ref *)ptr; 3095 type = btrfs_get_extent_inline_ref_type(eb, iref, 3096 BTRFS_REF_TYPE_BLOCK); 3097 if (type == BTRFS_REF_TYPE_INVALID) 3098 return -EINVAL; 3099 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3100 owner = btrfs_extent_inline_ref_offset(eb, iref); 3101 } 3102 } else { 3103 btrfs_print_leaf(eb); 3104 btrfs_err(rc->block_group->fs_info, 3105 "unrecognized tree backref at tree block %llu slot %u", 3106 eb->start, path->slots[0]); 3107 btrfs_release_path(path); 3108 return -EUCLEAN; 3109 } 3110 3111 btrfs_release_path(path); 3112 3113 BUG_ON(level == -1); 3114 3115 block = kmalloc(sizeof(*block), GFP_NOFS); 3116 if (!block) 3117 return -ENOMEM; 3118 3119 block->bytenr = extent_key->objectid; 3120 block->key.objectid = rc->extent_root->fs_info->nodesize; 3121 block->key.offset = generation; 3122 block->level = level; 3123 block->key_ready = false; 3124 block->owner = owner; 3125 3126 rb_node = rb_simple_insert(blocks, &block->simple_node); 3127 if (rb_node) 3128 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3129 -EEXIST); 3130 3131 return 0; 3132 } 3133 3134 /* 3135 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3136 */ 3137 static int __add_tree_block(struct reloc_control *rc, 3138 u64 bytenr, u32 blocksize, 3139 struct rb_root *blocks) 3140 { 3141 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3142 struct btrfs_path *path; 3143 struct btrfs_key key; 3144 int ret; 3145 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3146 3147 if (tree_block_processed(bytenr, rc)) 3148 return 0; 3149 3150 if (rb_simple_search(blocks, bytenr)) 3151 return 0; 3152 3153 path = btrfs_alloc_path(); 3154 if (!path) 3155 return -ENOMEM; 3156 again: 3157 key.objectid = bytenr; 3158 if (skinny) { 3159 key.type = BTRFS_METADATA_ITEM_KEY; 3160 key.offset = (u64)-1; 3161 } else { 3162 key.type = BTRFS_EXTENT_ITEM_KEY; 3163 key.offset = blocksize; 3164 } 3165 3166 path->search_commit_root = 1; 3167 path->skip_locking = 1; 3168 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3169 if (ret < 0) 3170 goto out; 3171 3172 if (ret > 0 && skinny) { 3173 if (path->slots[0]) { 3174 path->slots[0]--; 3175 btrfs_item_key_to_cpu(path->nodes[0], &key, 3176 path->slots[0]); 3177 if (key.objectid == bytenr && 3178 (key.type == BTRFS_METADATA_ITEM_KEY || 3179 (key.type == BTRFS_EXTENT_ITEM_KEY && 3180 key.offset == blocksize))) 3181 ret = 0; 3182 } 3183 3184 if (ret) { 3185 skinny = false; 3186 btrfs_release_path(path); 3187 goto again; 3188 } 3189 } 3190 if (ret) { 3191 ASSERT(ret == 1); 3192 btrfs_print_leaf(path->nodes[0]); 3193 btrfs_err(fs_info, 3194 "tree block extent item (%llu) is not found in extent tree", 3195 bytenr); 3196 WARN_ON(1); 3197 ret = -EINVAL; 3198 goto out; 3199 } 3200 3201 ret = add_tree_block(rc, &key, path, blocks); 3202 out: 3203 btrfs_free_path(path); 3204 return ret; 3205 } 3206 3207 static int delete_block_group_cache(struct btrfs_block_group *block_group, 3208 struct inode *inode, 3209 u64 ino) 3210 { 3211 struct btrfs_fs_info *fs_info = block_group->fs_info; 3212 struct btrfs_root *root = fs_info->tree_root; 3213 struct btrfs_trans_handle *trans; 3214 struct btrfs_inode *btrfs_inode; 3215 int ret = 0; 3216 3217 if (inode) 3218 goto truncate; 3219 3220 btrfs_inode = btrfs_iget(ino, 
root); 3221 if (IS_ERR(btrfs_inode)) 3222 return -ENOENT; 3223 inode = &btrfs_inode->vfs_inode; 3224 3225 truncate: 3226 ret = btrfs_check_trunc_cache_free_space(fs_info, 3227 &fs_info->global_block_rsv); 3228 if (ret) 3229 goto out; 3230 3231 trans = btrfs_join_transaction(root); 3232 if (IS_ERR(trans)) { 3233 ret = PTR_ERR(trans); 3234 goto out; 3235 } 3236 3237 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3238 3239 btrfs_end_transaction(trans); 3240 btrfs_btree_balance_dirty(fs_info); 3241 out: 3242 iput(inode); 3243 return ret; 3244 } 3245 3246 /* 3247 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3248 * cache inode, to avoid free space cache data extent blocking data relocation. 3249 */ 3250 static int delete_v1_space_cache(struct extent_buffer *leaf, 3251 struct btrfs_block_group *block_group, 3252 u64 data_bytenr) 3253 { 3254 u64 space_cache_ino; 3255 struct btrfs_file_extent_item *ei; 3256 struct btrfs_key key; 3257 bool found = false; 3258 int i; 3259 int ret; 3260 3261 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3262 return 0; 3263 3264 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3265 u8 type; 3266 3267 btrfs_item_key_to_cpu(leaf, &key, i); 3268 if (key.type != BTRFS_EXTENT_DATA_KEY) 3269 continue; 3270 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3271 type = btrfs_file_extent_type(leaf, ei); 3272 3273 if ((type == BTRFS_FILE_EXTENT_REG || 3274 type == BTRFS_FILE_EXTENT_PREALLOC) && 3275 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3276 found = true; 3277 space_cache_ino = key.objectid; 3278 break; 3279 } 3280 } 3281 if (!found) 3282 return -ENOENT; 3283 ret = delete_block_group_cache(block_group, NULL, space_cache_ino); 3284 return ret; 3285 } 3286 3287 /* 3288 * helper to find all tree blocks that reference a given data extent 3289 */ 3290 static noinline_for_stack int add_data_references(struct reloc_control *rc, 3291 const struct btrfs_key *extent_key, 3292 struct btrfs_path *path, 3293 struct rb_root *blocks) 3294 { 3295 struct btrfs_backref_walk_ctx ctx = { 0 }; 3296 struct ulist_iterator leaf_uiter; 3297 struct ulist_node *ref_node = NULL; 3298 const u32 blocksize = rc->extent_root->fs_info->nodesize; 3299 int ret = 0; 3300 3301 btrfs_release_path(path); 3302 3303 ctx.bytenr = extent_key->objectid; 3304 ctx.skip_inode_ref_list = true; 3305 ctx.fs_info = rc->extent_root->fs_info; 3306 3307 ret = btrfs_find_all_leafs(&ctx); 3308 if (ret < 0) 3309 return ret; 3310 3311 ULIST_ITER_INIT(&leaf_uiter); 3312 while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { 3313 struct btrfs_tree_parent_check check = { 0 }; 3314 struct extent_buffer *eb; 3315 3316 eb = read_tree_block(ctx.fs_info, ref_node->val, &check); 3317 if (IS_ERR(eb)) { 3318 ret = PTR_ERR(eb); 3319 break; 3320 } 3321 ret = delete_v1_space_cache(eb, rc->block_group, 3322 extent_key->objectid); 3323 free_extent_buffer(eb); 3324 if (ret < 0) 3325 break; 3326 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3327 if (ret < 0) 3328 break; 3329 } 3330 if (ret < 0) 3331 free_block_list(blocks); 3332 ulist_free(ctx.refs); 3333 return ret; 3334 } 3335 3336 /* 3337 * helper to find next unprocessed extent 3338 */ 3339 static noinline_for_stack 3340 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3341 struct btrfs_key *extent_key) 3342 { 3343 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3344 struct btrfs_key key; 3345 struct extent_buffer *leaf; 3346 u64 start, end, last; 3347 int 
ret; 3348 3349 last = rc->block_group->start + rc->block_group->length; 3350 while (1) { 3351 bool block_found; 3352 3353 cond_resched(); 3354 if (rc->search_start >= last) { 3355 ret = 1; 3356 break; 3357 } 3358 3359 key.objectid = rc->search_start; 3360 key.type = BTRFS_EXTENT_ITEM_KEY; 3361 key.offset = 0; 3362 3363 path->search_commit_root = 1; 3364 path->skip_locking = 1; 3365 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3366 0, 0); 3367 if (ret < 0) 3368 break; 3369 next: 3370 leaf = path->nodes[0]; 3371 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3372 ret = btrfs_next_leaf(rc->extent_root, path); 3373 if (ret != 0) 3374 break; 3375 leaf = path->nodes[0]; 3376 } 3377 3378 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3379 if (key.objectid >= last) { 3380 ret = 1; 3381 break; 3382 } 3383 3384 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3385 key.type != BTRFS_METADATA_ITEM_KEY) { 3386 path->slots[0]++; 3387 goto next; 3388 } 3389 3390 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3391 key.objectid + key.offset <= rc->search_start) { 3392 path->slots[0]++; 3393 goto next; 3394 } 3395 3396 if (key.type == BTRFS_METADATA_ITEM_KEY && 3397 key.objectid + fs_info->nodesize <= 3398 rc->search_start) { 3399 path->slots[0]++; 3400 goto next; 3401 } 3402 3403 block_found = btrfs_find_first_extent_bit(&rc->processed_blocks, 3404 key.objectid, &start, &end, 3405 EXTENT_DIRTY, NULL); 3406 3407 if (block_found && start <= key.objectid) { 3408 btrfs_release_path(path); 3409 rc->search_start = end + 1; 3410 } else { 3411 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3412 rc->search_start = key.objectid + key.offset; 3413 else 3414 rc->search_start = key.objectid + 3415 fs_info->nodesize; 3416 memcpy(extent_key, &key, sizeof(key)); 3417 return 0; 3418 } 3419 } 3420 btrfs_release_path(path); 3421 return ret; 3422 } 3423 3424 static void set_reloc_control(struct reloc_control *rc) 3425 { 3426 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3427 3428 mutex_lock(&fs_info->reloc_mutex); 3429 fs_info->reloc_ctl = rc; 3430 mutex_unlock(&fs_info->reloc_mutex); 3431 } 3432 3433 static void unset_reloc_control(struct reloc_control *rc) 3434 { 3435 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3436 3437 mutex_lock(&fs_info->reloc_mutex); 3438 fs_info->reloc_ctl = NULL; 3439 mutex_unlock(&fs_info->reloc_mutex); 3440 } 3441 3442 static noinline_for_stack 3443 int prepare_to_relocate(struct reloc_control *rc) 3444 { 3445 struct btrfs_trans_handle *trans; 3446 int ret; 3447 3448 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3449 BTRFS_BLOCK_RSV_TEMP); 3450 if (!rc->block_rsv) 3451 return -ENOMEM; 3452 3453 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3454 rc->search_start = rc->block_group->start; 3455 rc->extents_found = 0; 3456 rc->nodes_relocated = 0; 3457 rc->merging_rsv_size = 0; 3458 rc->reserved_bytes = 0; 3459 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3460 RELOCATION_RESERVED_NODES; 3461 ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, 3462 rc->block_rsv, rc->block_rsv->size, 3463 BTRFS_RESERVE_FLUSH_ALL); 3464 if (ret) 3465 return ret; 3466 3467 rc->create_reloc_tree = true; 3468 set_reloc_control(rc); 3469 3470 trans = btrfs_join_transaction(rc->extent_root); 3471 if (IS_ERR(trans)) { 3472 unset_reloc_control(rc); 3473 /* 3474 * extent tree is not a ref_cow tree and has no reloc_root to 3475 * cleanup. And callers are responsible to free the above 3476 * block rsv. 
3477 */ 3478 return PTR_ERR(trans); 3479 } 3480 3481 ret = btrfs_commit_transaction(trans); 3482 if (ret) 3483 unset_reloc_control(rc); 3484 3485 return ret; 3486 } 3487 3488 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3489 { 3490 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3491 struct rb_root blocks = RB_ROOT; 3492 struct btrfs_key key; 3493 struct btrfs_trans_handle *trans = NULL; 3494 struct btrfs_path *path; 3495 struct btrfs_extent_item *ei; 3496 u64 flags; 3497 int ret; 3498 int err = 0; 3499 int progress = 0; 3500 3501 path = btrfs_alloc_path(); 3502 if (!path) 3503 return -ENOMEM; 3504 path->reada = READA_FORWARD; 3505 3506 ret = prepare_to_relocate(rc); 3507 if (ret) { 3508 err = ret; 3509 goto out_free; 3510 } 3511 3512 while (1) { 3513 rc->reserved_bytes = 0; 3514 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 3515 rc->block_rsv->size, 3516 BTRFS_RESERVE_FLUSH_ALL); 3517 if (ret) { 3518 err = ret; 3519 break; 3520 } 3521 progress++; 3522 trans = btrfs_start_transaction(rc->extent_root, 0); 3523 if (IS_ERR(trans)) { 3524 err = PTR_ERR(trans); 3525 trans = NULL; 3526 break; 3527 } 3528 restart: 3529 if (rc->backref_cache.last_trans != trans->transid) 3530 btrfs_backref_release_cache(&rc->backref_cache); 3531 rc->backref_cache.last_trans = trans->transid; 3532 3533 ret = find_next_extent(rc, path, &key); 3534 if (ret < 0) 3535 err = ret; 3536 if (ret != 0) 3537 break; 3538 3539 rc->extents_found++; 3540 3541 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3542 struct btrfs_extent_item); 3543 flags = btrfs_extent_flags(path->nodes[0], ei); 3544 3545 /* 3546 * If we are relocating a simple quota owned extent item, we 3547 * need to note the owner on the reloc data root so that when 3548 * we allocate the replacement item, we can attribute it to the 3549 * correct eventual owner (rather than the reloc data root). 
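		 * relocate_data_extent() also keys its clusters on this value,
		 * so extents with different owners are never merged into one
		 * cluster.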
3550 */ 3551 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { 3552 struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; 3553 u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, 3554 path->nodes[0], 3555 path->slots[0]); 3556 3557 root->relocation_src_root = owning_root_id; 3558 } 3559 3560 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3561 ret = add_tree_block(rc, &key, path, &blocks); 3562 } else if (rc->stage == UPDATE_DATA_PTRS && 3563 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3564 ret = add_data_references(rc, &key, path, &blocks); 3565 } else { 3566 btrfs_release_path(path); 3567 ret = 0; 3568 } 3569 if (ret < 0) { 3570 err = ret; 3571 break; 3572 } 3573 3574 if (!RB_EMPTY_ROOT(&blocks)) { 3575 ret = relocate_tree_blocks(trans, rc, &blocks); 3576 if (ret < 0) { 3577 if (ret != -EAGAIN) { 3578 err = ret; 3579 break; 3580 } 3581 rc->extents_found--; 3582 rc->search_start = key.objectid; 3583 } 3584 } 3585 3586 btrfs_end_transaction_throttle(trans); 3587 btrfs_btree_balance_dirty(fs_info); 3588 trans = NULL; 3589 3590 if (rc->stage == MOVE_DATA_EXTENTS && 3591 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3592 rc->found_file_extent = true; 3593 ret = relocate_data_extent(rc, &key); 3594 if (ret < 0) { 3595 err = ret; 3596 break; 3597 } 3598 } 3599 if (btrfs_should_cancel_balance(fs_info)) { 3600 err = -ECANCELED; 3601 break; 3602 } 3603 } 3604 if (trans && progress && err == -ENOSPC) { 3605 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3606 if (ret == 1) { 3607 err = 0; 3608 progress = 0; 3609 goto restart; 3610 } 3611 } 3612 3613 btrfs_release_path(path); 3614 btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL); 3615 3616 if (trans) { 3617 btrfs_end_transaction_throttle(trans); 3618 btrfs_btree_balance_dirty(fs_info); 3619 } 3620 3621 if (!err) { 3622 ret = relocate_file_extent_cluster(rc); 3623 if (ret < 0) 3624 err = ret; 3625 } 3626 3627 rc->create_reloc_tree = false; 3628 set_reloc_control(rc); 3629 3630 btrfs_backref_release_cache(&rc->backref_cache); 3631 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3632 3633 /* 3634 * Even in the case when the relocation is cancelled, we should all go 3635 * through prepare_to_merge() and merge_reloc_roots(). 
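	 * Reloc trees have already been created and queued on rc->reloc_roots
	 * by this point, and they must not simply be dropped.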
3636 * 3637 * For error (including cancelled balance), prepare_to_merge() will 3638 * mark all reloc trees orphan, then queue them for cleanup in 3639 * merge_reloc_roots() 3640 */ 3641 err = prepare_to_merge(rc, err); 3642 3643 merge_reloc_roots(rc); 3644 3645 rc->merge_reloc_tree = false; 3646 unset_reloc_control(rc); 3647 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3648 3649 /* get rid of pinned extents */ 3650 trans = btrfs_join_transaction(rc->extent_root); 3651 if (IS_ERR(trans)) { 3652 err = PTR_ERR(trans); 3653 goto out_free; 3654 } 3655 ret = btrfs_commit_transaction(trans); 3656 if (ret && !err) 3657 err = ret; 3658 out_free: 3659 ret = clean_dirty_subvols(rc); 3660 if (ret < 0 && !err) 3661 err = ret; 3662 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3663 btrfs_free_path(path); 3664 return err; 3665 } 3666 3667 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3668 struct btrfs_root *root, u64 objectid) 3669 { 3670 struct btrfs_path *path; 3671 struct btrfs_inode_item *item; 3672 struct extent_buffer *leaf; 3673 int ret; 3674 3675 path = btrfs_alloc_path(); 3676 if (!path) 3677 return -ENOMEM; 3678 3679 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3680 if (ret) 3681 goto out; 3682 3683 leaf = path->nodes[0]; 3684 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3685 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3686 btrfs_set_inode_generation(leaf, item, 1); 3687 btrfs_set_inode_size(leaf, item, 0); 3688 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3689 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3690 BTRFS_INODE_PREALLOC); 3691 out: 3692 btrfs_free_path(path); 3693 return ret; 3694 } 3695 3696 static void delete_orphan_inode(struct btrfs_trans_handle *trans, 3697 struct btrfs_root *root, u64 objectid) 3698 { 3699 struct btrfs_path *path; 3700 struct btrfs_key key; 3701 int ret = 0; 3702 3703 path = btrfs_alloc_path(); 3704 if (!path) { 3705 ret = -ENOMEM; 3706 goto out; 3707 } 3708 3709 key.objectid = objectid; 3710 key.type = BTRFS_INODE_ITEM_KEY; 3711 key.offset = 0; 3712 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3713 if (ret) { 3714 if (ret > 0) 3715 ret = -ENOENT; 3716 goto out; 3717 } 3718 ret = btrfs_del_item(trans, root, path); 3719 out: 3720 if (ret) 3721 btrfs_abort_transaction(trans, ret); 3722 btrfs_free_path(path); 3723 } 3724 3725 /* 3726 * helper to create inode for data relocation. 
3727 * the inode is in data relocation tree and its link count is 0 3728 */ 3729 static noinline_for_stack struct inode *create_reloc_inode( 3730 const struct btrfs_block_group *group) 3731 { 3732 struct btrfs_fs_info *fs_info = group->fs_info; 3733 struct btrfs_inode *inode = NULL; 3734 struct btrfs_trans_handle *trans; 3735 struct btrfs_root *root; 3736 u64 objectid; 3737 int ret = 0; 3738 3739 root = btrfs_grab_root(fs_info->data_reloc_root); 3740 trans = btrfs_start_transaction(root, 6); 3741 if (IS_ERR(trans)) { 3742 btrfs_put_root(root); 3743 return ERR_CAST(trans); 3744 } 3745 3746 ret = btrfs_get_free_objectid(root, &objectid); 3747 if (ret) 3748 goto out; 3749 3750 ret = __insert_orphan_inode(trans, root, objectid); 3751 if (ret) 3752 goto out; 3753 3754 inode = btrfs_iget(objectid, root); 3755 if (IS_ERR(inode)) { 3756 delete_orphan_inode(trans, root, objectid); 3757 ret = PTR_ERR(inode); 3758 inode = NULL; 3759 goto out; 3760 } 3761 inode->reloc_block_group_start = group->start; 3762 3763 ret = btrfs_orphan_add(trans, inode); 3764 out: 3765 btrfs_put_root(root); 3766 btrfs_end_transaction(trans); 3767 btrfs_btree_balance_dirty(fs_info); 3768 if (ret) { 3769 if (inode) 3770 iput(&inode->vfs_inode); 3771 return ERR_PTR(ret); 3772 } 3773 return &inode->vfs_inode; 3774 } 3775 3776 /* 3777 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3778 * has been requested meanwhile and don't start in that case. 3779 * 3780 * Return: 3781 * 0 success 3782 * -EINPROGRESS operation is already in progress, that's probably a bug 3783 * -ECANCELED cancellation request was set before the operation started 3784 */ 3785 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3786 { 3787 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3788 /* This should not happen */ 3789 btrfs_err(fs_info, "reloc already running, cannot start"); 3790 return -EINPROGRESS; 3791 } 3792 3793 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3794 btrfs_info(fs_info, "chunk relocation canceled on start"); 3795 /* 3796 * On cancel, clear all requests but let the caller mark 3797 * the end after cleanup operations. 3798 */ 3799 atomic_set(&fs_info->reloc_cancel_req, 0); 3800 return -ECANCELED; 3801 } 3802 return 0; 3803 } 3804 3805 /* 3806 * Mark end of chunk relocation that is cancellable and wake any waiters. 
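 * Pairs with reloc_chunk_start().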
3807 */ 3808 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3809 { 3810 /* Requested after start, clear bit first so any waiters can continue */ 3811 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3812 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3813 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3814 atomic_set(&fs_info->reloc_cancel_req, 0); 3815 } 3816 3817 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3818 { 3819 struct reloc_control *rc; 3820 3821 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3822 if (!rc) 3823 return NULL; 3824 3825 INIT_LIST_HEAD(&rc->reloc_roots); 3826 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3827 btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); 3828 rc->reloc_root_tree.rb_root = RB_ROOT; 3829 spin_lock_init(&rc->reloc_root_tree.lock); 3830 btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); 3831 return rc; 3832 } 3833 3834 static void free_reloc_control(struct reloc_control *rc) 3835 { 3836 struct mapping_node *node, *tmp; 3837 3838 free_reloc_roots(&rc->reloc_roots); 3839 rbtree_postorder_for_each_entry_safe(node, tmp, 3840 &rc->reloc_root_tree.rb_root, rb_node) 3841 kfree(node); 3842 3843 kfree(rc); 3844 } 3845 3846 /* 3847 * Print the block group being relocated 3848 */ 3849 static void describe_relocation(struct btrfs_block_group *block_group) 3850 { 3851 char buf[128] = "NONE"; 3852 3853 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3854 3855 btrfs_info(block_group->fs_info, "relocating block group %llu flags %s", 3856 block_group->start, buf); 3857 } 3858 3859 static const char *stage_to_string(enum reloc_stage stage) 3860 { 3861 if (stage == MOVE_DATA_EXTENTS) 3862 return "move data extents"; 3863 if (stage == UPDATE_DATA_PTRS) 3864 return "update data pointers"; 3865 return "unknown"; 3866 } 3867 3868 /* 3869 * function to relocate all extents in a block group. 3870 */ 3871 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, 3872 bool verbose) 3873 { 3874 struct btrfs_block_group *bg; 3875 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); 3876 struct reloc_control *rc; 3877 struct inode *inode; 3878 struct btrfs_path *path; 3879 int ret; 3880 int rw = 0; 3881 int err = 0; 3882 3883 /* 3884 * This only gets set if we had a half-deleted snapshot on mount. We 3885 * cannot allow relocation to start while we're still trying to clean up 3886 * these pending deletions. 3887 */ 3888 ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); 3889 if (ret) 3890 return ret; 3891 3892 /* We may have been woken up by close_ctree, so bail if we're closing. */ 3893 if (btrfs_fs_closing(fs_info)) 3894 return -EINTR; 3895 3896 bg = btrfs_lookup_block_group(fs_info, group_start); 3897 if (!bg) 3898 return -ENOENT; 3899 3900 /* 3901 * Relocation of a data block group creates ordered extents. Without 3902 * sb_start_write(), we can freeze the filesystem while unfinished 3903 * ordered extents are left. Such ordered extents can cause a deadlock 3904 * e.g. when syncfs() is waiting for their completion but they can't 3905 * finish because they block when joining a transaction, due to the 3906 * fact that the freeze locks are being held in write mode. 
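	 *
	 * The assertion below documents that requirement: callers relocating
	 * a data block group must have called sb_start_write() already.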
3907 */ 3908 if (bg->flags & BTRFS_BLOCK_GROUP_DATA) 3909 ASSERT(sb_write_started(fs_info->sb)); 3910 3911 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 3912 btrfs_put_block_group(bg); 3913 return -ETXTBSY; 3914 } 3915 3916 rc = alloc_reloc_control(fs_info); 3917 if (!rc) { 3918 btrfs_put_block_group(bg); 3919 return -ENOMEM; 3920 } 3921 3922 ret = reloc_chunk_start(fs_info); 3923 if (ret < 0) { 3924 err = ret; 3925 goto out_put_bg; 3926 } 3927 3928 rc->extent_root = extent_root; 3929 rc->block_group = bg; 3930 3931 ret = btrfs_inc_block_group_ro(rc->block_group, true); 3932 if (ret) { 3933 err = ret; 3934 goto out; 3935 } 3936 rw = 1; 3937 3938 path = btrfs_alloc_path(); 3939 if (!path) { 3940 err = -ENOMEM; 3941 goto out; 3942 } 3943 3944 inode = lookup_free_space_inode(rc->block_group, path); 3945 btrfs_free_path(path); 3946 3947 if (!IS_ERR(inode)) 3948 ret = delete_block_group_cache(rc->block_group, inode, 0); 3949 else 3950 ret = PTR_ERR(inode); 3951 3952 if (ret && ret != -ENOENT) { 3953 err = ret; 3954 goto out; 3955 } 3956 3957 rc->data_inode = create_reloc_inode(rc->block_group); 3958 if (IS_ERR(rc->data_inode)) { 3959 err = PTR_ERR(rc->data_inode); 3960 rc->data_inode = NULL; 3961 goto out; 3962 } 3963 3964 if (verbose) 3965 describe_relocation(rc->block_group); 3966 3967 btrfs_wait_block_group_reservations(rc->block_group); 3968 btrfs_wait_nocow_writers(rc->block_group); 3969 btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group); 3970 3971 ret = btrfs_zone_finish(rc->block_group); 3972 WARN_ON(ret && ret != -EAGAIN); 3973 3974 while (1) { 3975 enum reloc_stage finishes_stage; 3976 3977 mutex_lock(&fs_info->cleaner_mutex); 3978 ret = relocate_block_group(rc); 3979 mutex_unlock(&fs_info->cleaner_mutex); 3980 if (ret < 0) 3981 err = ret; 3982 3983 finishes_stage = rc->stage; 3984 /* 3985 * We may have gotten ENOSPC after we already dirtied some 3986 * extents. If writeout happens while we're relocating a 3987 * different block group we could end up hitting the 3988 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 3989 * btrfs_reloc_cow_block. Make sure we write everything out 3990 * properly so we don't trip over this problem, and then break 3991 * out of the loop if we hit an error. 
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	btrfs_set_root_drop_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}
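/*
 * Sketch (hypothetical helper): after mark_garbage_root() the root item
 * carries zero refs, which is how btrfs_recover_relocation() below tells a
 * dangling reloc root apart from one whose fs root is still alive.
 */
static bool __maybe_unused reloc_root_is_garbage_sketch(const struct btrfs_root *reloc_root)
{
	/* Zero refs: the owning subvolume is gone, the reloc root is garbage. */
	return btrfs_root_refs(&reloc_root->root_item) == 0;
}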
/*
 * Recover a relocation that was interrupted by a system crash.
 *
 * This function resumes merging reloc trees with their corresponding fs
 * trees, which is important to preserve the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret2;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		ret = 0;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
		if (IS_ERR(reloc_root)) {
			ret = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT)
					goto out;
				ret = mark_garbage_root(reloc_root);
				if (ret < 0)
					goto out;
				ret = 0;
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0)
		goto out_end;

	rc->extent_root = btrfs_extent_root(fs_info, 0);

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			ret = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		ret = __add_reloc_root(reloc_root);
		ASSERT(ret != -EEXIST);
		if (ret) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	ret = btrfs_commit_transaction(trans);
	if (ret)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_clean;
	}
	ret = btrfs_commit_transaction(trans);
out_clean:
	ret2 = clean_dirty_subvols(rc);
	if (ret2 < 0 && !ret)
		ret = ret2;
out_unset:
	unset_reloc_control(rc);
out_end:
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (ret == 0) {
		/* Clean up the orphan inode in the data relocation tree. */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		ret = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return ret;
}
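/*
 * A distilled sketch of the descending-key walk used above (hypothetical
 * helper): start at the largest possible key.offset and step backwards.
 * btrfs_search_slot() returns > 0 on a miss with the slot positioned just
 * after where the key would be, so stepping one slot back lands on the
 * previous item.
 */
static int __maybe_unused walk_reloc_roots_backwards_sketch(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_key key = {
		.objectid = BTRFS_TREE_RELOC_OBJECTID,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = (u64)-1,
	};
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0) {
				/* Nothing sorts before the searched key. */
				ret = 0;
				break;
			}
			path->slots[0]--;
		}
		ret = 0;
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		btrfs_release_path(path);

		/* Stop once we run out of reloc tree root items. */
		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		/* ... one reloc root item would be processed here ... */

		if (key.offset == 0)
			break;
		key.offset--;
	}
	btrfs_free_path(path);
	return ret;
}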
/*
 * Helper to add ordered checksums for data relocation.
 *
 * Cloning the existing checksums properly handles nodatasum extents, and it
 * also saves the CPU time that recalculating the checksums would cost.
 */
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
	LIST_HEAD(list);
	int ret;

	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
				      disk_bytenr + ordered->num_bytes - 1,
				      &list, false);
	if (ret < 0) {
		btrfs_mark_ordered_extent_error(ordered);
		return ret;
	}

	while (!list_empty(&list)) {
		struct btrfs_ordered_sum *sums =
			list_first_entry(&list, struct btrfs_ordered_sum, list);

		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum
		 * is.  We do this because we read in entire prealloc extents
		 * but may have written to, say, the middle of one, so we must
		 * make sure the csum goes with the right disk offset.
		 *
		 * This works because the data reloc inode refers strictly to
		 * the on-disk bytes, so unlike regular inodes there is no
		 * disk_len vs. real-len mismatch to worry about; everything
		 * is disk length.
		 */
		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
		btrfs_add_ordered_sum(ordered, sums);
	}

	return 0;
}
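/*
 * Worked example of the rebasing math above (hypothetical helper): a csum
 * found at 'old_logical' inside the source range starting at
 * 'old_disk_bytenr' keeps its relative offset when rebased onto the ordered
 * extent's new location starting at 'new_disk_bytenr'.
 */
static u64 __maybe_unused rebase_csum_logical_sketch(u64 old_logical,
						     u64 old_disk_bytenr,
						     u64 new_disk_bytenr)
{
	/*
	 * E.g. if the old range starts at 1 MiB and the csum sits at
	 * 1 MiB + 64 KiB, a new range starting at 8 MiB yields a rebased
	 * logical of 8 MiB + 64 KiB.
	 */
	return new_disk_bytenr + (old_logical - old_disk_bytenr);
}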
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  const struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];

		/*
		 * If node->bytenr != buf->start and node->new_bytenr !=
		 * buf->start then we've got the wrong backref node for what
		 * we expected to see here and the cache is incorrect.
		 */
		if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
			btrfs_err(fs_info,
"bytenr %llu was found but our backref cache was expecting %llu or %llu",
				  buf->start, node->bytenr, node->new_bytenr);
			return -EUCLEAN;
		}

		btrfs_backref_drop_node_buffer(node);
		refcount_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * Called before creating a snapshot.  It calculates the metadata
 * reservation required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * Relocation is in the stage of merging trees.  In the worst case,
	 * the space used by merging a reloc tree is twice the size of the
	 * relocated tree nodes: half for COWing the reloc tree, half for
	 * COWing the fs tree.  The space used by COWing the reloc tree will
	 * be freed after the tree is dropped, but if we create a snapshot,
	 * COWing the fs tree may use more space than it frees, so we need to
	 * reserve extra space.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}

/*
 * Called after a snapshot is created.  It migrates the block reservation
 * and creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(), we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	new_root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * Get the current bytenr for the block group which is being relocated.
 *
 * Return U64_MAX if no relocation is running.
 */
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
	u64 logical = U64_MAX;

	lockdep_assert_held(&fs_info->reloc_mutex);

	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
		logical = fs_info->reloc_ctl->block_group->start;
	return logical;
}
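/*
 * Usage sketch (hypothetical helper): btrfs_get_reloc_bg_bytenr() asserts
 * that reloc_mutex is held, and a return value of U64_MAX means no
 * relocation is currently running.
 */
static bool __maybe_unused relocation_running_sketch(struct btrfs_fs_info *fs_info)
{
	u64 bytenr;

	mutex_lock(&fs_info->reloc_mutex);
	bytenr = btrfs_get_reloc_bg_bytenr(fs_info);
	mutex_unlock(&fs_info->reloc_mutex);

	return bytenr != U64_MAX;
}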