// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is utilized by resize (shrink only), profile conversion, space
 * compaction, and the balance routine to spread chunks over devices.
 *
 *		Before		|		After
 * ------------------------------------------------------------------
 * BG A: 10 data extents	| BG A: deleted
 * BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 * BG C:  1 data extent		| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after the swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
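 *
 * A rough sketch of the call flow, for orientation only (the code below is
 * authoritative):
 *
 *   btrfs_relocate_block_group()
 *     relocate_block_group()	    <- run stage MOVE_DATA_EXTENTS, then
 *				       stage UPDATE_DATA_PTRS
 *       merge_reloc_roots()	    <- swap reloc trees with subvolume trees
 *       clean_dirty_subvols()	    <- drop the now stale reloc trees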
 */

#define RELOCATION_RESERVED_NODES	256

/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * present a tree block to process
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	u64 owner;
	struct btrfs_key key;
	u8 level;
	bool key_ready;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
	u64 owning_root;
};

/* Stages of data relocation. */
enum reloc_stage {
	MOVE_DATA_EXTENTS,
	UPDATE_DATA_PTRS
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	enum reloc_stage stage;
	bool create_reloc_tree;
	bool merge_reloc_tree;
	bool found_file_extent;
};

static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		set_extent_bit(&rc->processed_blocks, node->bytenr,
			       node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
	}
	node->processed = 1;
}

/*
 * walk up backref nodes until we reach the node that presents the tree root
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}
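
/*
 * A sketch of how the two walkers around here are typically used (see
 * select_one_root() and select_reloc_root() below for the real callers):
 *
 *	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
 *	int index = 0;
 *
 *	next = walk_up_backref(node, edges, &index);
 *	while (next) {
 *		(inspect the tree root this reference path ends in)
 *		next = walk_down_backref(edges, &index);
 *	}
 */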

/*
 * walk down backref nodes to find the start of the next reference path
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}

static bool reloc_root_is_dead(const struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after the swap is considered dead, thus not considered valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root. But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(const struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	/* This root has been merged with its reloc tree, we can ignore it. */
	if (reloc_root_is_dead(root))
		return true;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return false;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return false;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the backref
	 * node for the fs tree root is useless for relocation.
	 */
	return true;
}

/*
 * find reloc tree by address of tree root
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeps nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes. */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* The node is the lowest node. */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges. */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					  struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup. */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation. */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build backref tree for a given tree block. The root of the backref tree
 * corresponds to the tree block, and the leaves of the backref tree
 * correspond to the roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when a tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are
 * also cached.
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct btrfs_trans_handle *trans,
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		ret = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
						  node_key, cur);
		if (ret < 0)
			goto out;

		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0)
		goto out;

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_free_path(iter->path);
	kfree(iter);
	btrfs_free_path(path);
	if (ret) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(ret);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}

/*
 * helper to add backref node for the newly created snapshot.
 * the backref node is created by cloning backref node that
 * corresponds to root of source tree
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      const struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = btrfs_backref_alloc_edge(cache);
			if (!new_edge)
				goto fail;

			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
						new_node, LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		btrfs_backref_free_edge(cache, new_edge);
	}
	btrfs_backref_free_node(cache, new_node);
	return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			  "Duplicate root found for start=%llu while inserting into relocation tree",
			  node->bytenr);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}
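
/*
 * Note on the mapping key (a summary; the helpers around here are
 * authoritative): the rb tree is keyed by the bytenr of the reloc root's
 * commit root. __update_reloc_root() below re-keys an entry to the current
 * root node once the reloc root has been COWed, so lookups by tree root
 * address via find_reloc_root() keep working across transaction commits.
 */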

/*
 * helper to delete the 'address of tree root -> reloc tree' mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list. There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree' mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}
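
/*
 * Reloc roots created below live in the tree of tree roots, keyed by the
 * fixed objectid BTRFS_TREE_RELOC_OBJECTID with the source root's id as the
 * key offset. For example (illustrative numbers only), the reloc tree for
 * subvolume 257 would be keyed
 * (BTRFS_TREE_RELOC_OBJECTID, BTRFS_ROOT_ITEM_KEY, 257).
 */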
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (btrfs_root_id(root) == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) or when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, all tree blocks
		 * modified after it was created have RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (btrfs_root_id(root) == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	btrfs_set_root_last_trans(reloc_root, trans->transid);
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}

/*
 * create reloc tree for a given fs tree. reloc tree is just a
 * snapshot of the fs tree with special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree.
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important. We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root. This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees. Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root. We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
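
/*
 * A sketch of the lookup above: data from the target block group is copied
 * into rc->data_inode at file offset (bytenr - block group start). With an
 * illustrative block group starting at 1G, the old extent at bytenr 1G+4M
 * is found at file offset 4M, and that file extent's disk_bytenr is the
 * extent's new location.
 */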

/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * if we are modifying block in fs tree, wait for read_folio
		 * to complete and drop the extent cache
		 */
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = btrfs_find_first_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(inode) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = btrfs_find_first_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(inode) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				/* Take mmap lock to serialize with reflinks. */
				if (!down_read_trylock(&inode->i_mmap_lock))
					continue;
				ret = try_lock_extent(&inode->io_tree, key.offset,
						      end, &cached_state);
				if (!ret) {
					up_read(&inode->i_mmap_lock);
					continue;
				}

				btrfs_drop_extent_map_range(inode, key.offset, end, true);
				unlock_extent(&inode->io_tree, key.offset, end,
					      &cached_state);
				up_read(&inode->i_mmap_lock);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(trans, leaf);
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
					       int slot, const struct btrfs_path *path,
					       int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;

	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * if a block was replaced, level of the block + 1 is returned.
 * if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
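 *
 * For each replaced pointer, the function roughly performs the swap
 *
 *	fs tree parent slot:	old_bytenr -> new_bytenr
 *	reloc tree path slot:	new_bytenr -> old_bytenr
 *
 * followed by two btrfs_inc_extent_ref() and two btrfs_free_extent() calls
 * to move the backrefs to the new parents.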
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, 0, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							 path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Info qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(trans, parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(trans, path->nodes[level]);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = btrfs_root_id(src);
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(dest);
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * helper to find next relocated block in reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * walk down reloc tree to find relocated block of lowest level
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}
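
/*
 * Note on the two walkers above: a pointer whose generation is <=
 * last_snapshot points to a block shared with the source tree, one that was
 * never COWed into the reloc tree, so both walkers skip it and only newly
 * relocated blocks are visited.
 */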

/*
 * invalidate extent cache for file extents whose keys are in the range
 * [min_key, max_key)
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   const struct btrfs_key *min_key,
				   const struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		if (inode)
			iput(&inode->vfs_inode);

		if (objectid > max_key->objectid)
			break;

		inode = btrfs_find_first_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(inode);

		if (ino > max_key->objectid) {
			iput(&inode->vfs_inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->vfs_inode.i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for read_folio to complete */
		lock_extent(&inode->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(inode, start, end, true);
		unlock_extent(&inode->io_tree, start, end, &cached_state);
	}
	return 0;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert the current subvolume into reloc_control::dirty_subvol_roots
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
	       sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}

static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root. If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}

/*
 * merge the relocated tree blocks in reloc tree with corresponding
 * fs tree.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		atomic_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between reloc tree and subvolume tree. Thus for tree
	 * block COW, we COW at most from level 1 to root level for each tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * and * 2 since we have two trees to COW.
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
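	/*
	 * Example of the sizing above, with illustrative numbers: for a 16K
	 * nodesize and a level 3 reloc tree this refills 16K * 3 * 2 = 96K
	 * before each iteration, enough to COW one path in both trees.
	 */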
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list. At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * save the merging progress in the drop_progress.
		 * this is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * handle the case where only one block in the fs tree needs to be
	 * relocated and that block is the tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}
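
/*
 * Prepare for merging reloc roots: reserve metadata for the merge, set every
 * reloc root's root_item refs to 1 so btrfs_recover_relocation() knows it
 * should resume merging, and commit the transaction.
 */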
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_entry(rc->reloc_roots.next,
					struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (IS_ERR(root)) {
			/*
			 * Even if we have an error we need this reloc root
			 * back on our list so we can clean up properly.
			 */
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
			if (!err)
				err = PTR_ERR(root);
			break;
		}

		if (unlikely(root->reloc_root != reloc_root)) {
			if (root->reloc_root) {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(root->reloc_root),
					  root->reloc_root->root_key.type,
					  root->reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &root->reloc_root->root_item),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			} else {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			}
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(root);
			btrfs_abort_transaction(trans, -EUCLEAN);
			if (!err)
				err = -EUCLEAN;
			break;
		}

		/*
		 * Set the reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging.
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		ret = btrfs_update_reloc_root(trans, root);

		/*
		 * Even if we have an error we need this reloc root back on our
		 * list so we can clean up properly.
		 */
		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (!err)
				err = ret;
			break;
		}
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		err = btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}

static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
	struct btrfs_root *reloc_root, *tmp;

	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
		__del_reloc_root(reloc_root);
}

static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_root *root;
	struct btrfs_root *reloc_root;
	LIST_HEAD(reloc_roots);
	int found = 0;
	int ret = 0;
again:
	root = rc->extent_root;

	/*
	 * this serializes us with btrfs_record_root_in_transaction,
	 * we have to make sure nobody is in the middle of
	 * adding their roots to the list while we are
	 * doing this splice
	 */
	mutex_lock(&fs_info->reloc_mutex);
	list_splice_init(&rc->reloc_roots, &reloc_roots);
	mutex_unlock(&fs_info->reloc_mutex);

	while (!list_empty(&reloc_roots)) {
		found = 1;
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			if (WARN_ON(IS_ERR(root))) {
				/*
				 * For recovery we read the fs roots on mount,
				 * and if we didn't find the root then we marked
				 * the reloc root as a garbage root. For normal
				 * relocation obviously the root should exist in
				 * memory. However there's no reason we can't
				 * handle the error properly here just in case.
				 */
				ret = PTR_ERR(root);
				goto out;
			}
			if (WARN_ON(root->reloc_root != reloc_root)) {
				/*
				 * This can happen if on-disk metadata has some
				 * corruption, e.g. bad reloc tree key offset.
				 */
				ret = -EINVAL;
				goto out;
			}
			ret = merge_reloc_root(rc, root);
			btrfs_put_root(root);
			if (ret) {
				if (list_empty(&reloc_root->root_list))
					list_add_tail(&reloc_root->root_list,
						      &reloc_roots);
				goto out;
			}
		} else {
			if (!IS_ERR(root)) {
				if (root->reloc_root == reloc_root) {
					root->reloc_root = NULL;
					btrfs_put_root(reloc_root);
				}
				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
					  &root->state);
				btrfs_put_root(root);
			}

			list_del_init(&reloc_root->root_list);
			/* Don't forget to queue this reloc root for cleanup */
			list_add_tail(&reloc_root->reloc_dirty_list,
				      &rc->dirty_subvol_roots);
		}
	}

	if (found) {
		found = 0;
		goto again;
	}
out:
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret, NULL);
		free_reloc_roots(&reloc_roots);

		/* new reloc root may be added */
		mutex_lock(&fs_info->reloc_mutex);
		list_splice_init(&rc->reloc_roots, &reloc_roots);
		mutex_unlock(&fs_info->reloc_mutex);
		free_reloc_roots(&reloc_roots);
	}

	/*
	 * We used to have
	 *
	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
	 *
	 * here, but it's wrong.
If we fail to start the transaction in 1990 * prepare_to_merge() we will have only 0 ref reloc roots, none of which 1991 * have actually been removed from the reloc_root_tree rb tree. This is 1992 * fine because we're bailing here, and we hold a reference on the root 1993 * for the list that holds it, so these roots will be cleaned up when we 1994 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root 1995 * will be cleaned up on unmount. 1996 * 1997 * The remaining nodes will be cleaned up by free_reloc_control. 1998 */ 1999 } 2000 2001 static void free_block_list(struct rb_root *blocks) 2002 { 2003 struct tree_block *block; 2004 struct rb_node *rb_node; 2005 while ((rb_node = rb_first(blocks))) { 2006 block = rb_entry(rb_node, struct tree_block, rb_node); 2007 rb_erase(rb_node, blocks); 2008 kfree(block); 2009 } 2010 } 2011 2012 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2013 struct btrfs_root *reloc_root) 2014 { 2015 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2016 struct btrfs_root *root; 2017 int ret; 2018 2019 if (btrfs_get_root_last_trans(reloc_root) == trans->transid) 2020 return 0; 2021 2022 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); 2023 2024 /* 2025 * This should succeed, since we can't have a reloc root without having 2026 * already looked up the actual root and created the reloc root for this 2027 * root. 2028 * 2029 * However if there's some sort of corruption where we have a ref to a 2030 * reloc root without a corresponding root this could return ENOENT. 2031 */ 2032 if (IS_ERR(root)) { 2033 ASSERT(0); 2034 return PTR_ERR(root); 2035 } 2036 if (root->reloc_root != reloc_root) { 2037 ASSERT(0); 2038 btrfs_err(fs_info, 2039 "root %llu has two reloc roots associated with it", 2040 reloc_root->root_key.offset); 2041 btrfs_put_root(root); 2042 return -EUCLEAN; 2043 } 2044 ret = btrfs_record_root_in_trans(trans, root); 2045 btrfs_put_root(root); 2046 2047 return ret; 2048 } 2049 2050 static noinline_for_stack 2051 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2052 struct reloc_control *rc, 2053 struct btrfs_backref_node *node, 2054 struct btrfs_backref_edge *edges[]) 2055 { 2056 struct btrfs_backref_node *next; 2057 struct btrfs_root *root; 2058 int index = 0; 2059 int ret; 2060 2061 next = node; 2062 while (1) { 2063 cond_resched(); 2064 next = walk_up_backref(next, edges, &index); 2065 root = next->root; 2066 2067 /* 2068 * If there is no root, then our references for this block are 2069 * incomplete, as we should be able to walk all the way up to a 2070 * block that is owned by a root. 2071 * 2072 * This path is only for SHAREABLE roots, so if we come upon a 2073 * non-SHAREABLE root then we have backrefs that resolve 2074 * improperly. 2075 * 2076 * Both of these cases indicate file system corruption, or a bug 2077 * in the backref walking code. 
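*
* Either way we return -EUCLEAN below instead of crashing, so a corrupted
* filesystem fails the relocation rather than the kernel (the ASSERT(0)
* calls only trigger on debug builds).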
2078 */ 2079 if (!root) { 2080 ASSERT(0); 2081 btrfs_err(trans->fs_info, 2082 "bytenr %llu doesn't have a backref path ending in a root", 2083 node->bytenr); 2084 return ERR_PTR(-EUCLEAN); 2085 } 2086 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2087 ASSERT(0); 2088 btrfs_err(trans->fs_info, 2089 "bytenr %llu has multiple refs with one ending in a non-shareable root", 2090 node->bytenr); 2091 return ERR_PTR(-EUCLEAN); 2092 } 2093 2094 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { 2095 ret = record_reloc_root_in_trans(trans, root); 2096 if (ret) 2097 return ERR_PTR(ret); 2098 break; 2099 } 2100 2101 ret = btrfs_record_root_in_trans(trans, root); 2102 if (ret) 2103 return ERR_PTR(ret); 2104 root = root->reloc_root; 2105 2106 /* 2107 * We could have raced with another thread which failed, so 2108 * root->reloc_root may not be set, return ENOENT in this case. 2109 */ 2110 if (!root) 2111 return ERR_PTR(-ENOENT); 2112 2113 if (next->new_bytenr != root->node->start) { 2114 /* 2115 * We just created the reloc root, so we shouldn't have 2116 * ->new_bytenr set and this shouldn't be in the changed 2117 * list. If it is then we have multiple roots pointing 2118 * at the same bytenr which indicates corruption, or 2119 * we've made a mistake in the backref walking code. 2120 */ 2121 ASSERT(next->new_bytenr == 0); 2122 ASSERT(list_empty(&next->list)); 2123 if (next->new_bytenr || !list_empty(&next->list)) { 2124 btrfs_err(trans->fs_info, 2125 "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", 2126 node->bytenr, next->bytenr); 2127 return ERR_PTR(-EUCLEAN); 2128 } 2129 2130 next->new_bytenr = root->node->start; 2131 btrfs_put_root(next->root); 2132 next->root = btrfs_grab_root(root); 2133 ASSERT(next->root); 2134 list_add_tail(&next->list, 2135 &rc->backref_cache.changed); 2136 mark_block_processed(rc, next); 2137 break; 2138 } 2139 2140 WARN_ON(1); 2141 root = NULL; 2142 next = walk_down_backref(edges, &index); 2143 if (!next || next->level <= node->level) 2144 break; 2145 } 2146 if (!root) { 2147 /* 2148 * This can happen if there's fs corruption or if there's a bug 2149 * in the backref lookup code. 2150 */ 2151 ASSERT(0); 2152 return ERR_PTR(-ENOENT); 2153 } 2154 2155 next = node; 2156 /* setup backref node path for btrfs_reloc_cow_block */ 2157 while (1) { 2158 rc->backref_cache.path[next->level] = next; 2159 if (--index < 0) 2160 break; 2161 next = edges[index]->node[UPPER]; 2162 } 2163 return root; 2164 } 2165 2166 /* 2167 * Select a tree root for relocation. 2168 * 2169 * Return NULL if the block is not shareable. We should use do_relocation() in 2170 * this case. 2171 * 2172 * Return a tree root pointer if the block is shareable. 2173 * Return -ENOENT if the block is root of reloc tree. 2174 */ 2175 static noinline_for_stack 2176 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2177 { 2178 struct btrfs_backref_node *next; 2179 struct btrfs_root *root; 2180 struct btrfs_root *fs_root = NULL; 2181 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2182 int index = 0; 2183 2184 next = node; 2185 while (1) { 2186 cond_resched(); 2187 next = walk_up_backref(next, edges, &index); 2188 root = next->root; 2189 2190 /* 2191 * This can occur if we have incomplete extent refs leading all 2192 * the way up a particular path, in this case return -EUCLEAN. 
2193 */
2194 if (!root)
2195 return ERR_PTR(-EUCLEAN);
2196
2197 /* No other choice for non-shareable tree */
2198 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
2199 return root;
2200
2201 if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID)
2202 fs_root = root;
2203
2204 if (next != node)
2205 return NULL;
2206
2207 next = walk_down_backref(edges, &index);
2208 if (!next || next->level <= node->level)
2209 break;
2210 }
2211
2212 if (!fs_root)
2213 return ERR_PTR(-ENOENT);
2214 return fs_root;
2215 }
2216
2217 static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
2218 struct btrfs_backref_node *node)
2219 {
2220 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2221 struct btrfs_backref_node *next = node;
2222 struct btrfs_backref_edge *edge;
2223 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2224 u64 num_bytes = 0;
2225 int index = 0;
2226
2227 BUG_ON(node->processed);
2228
2229 while (next) {
2230 cond_resched();
2231 while (1) {
2232 if (next->processed)
2233 break;
2234
2235 num_bytes += fs_info->nodesize;
2236
2237 if (list_empty(&next->upper))
2238 break;
2239
2240 edge = list_entry(next->upper.next,
2241 struct btrfs_backref_edge, list[LOWER]);
2242 edges[index++] = edge;
2243 next = edge->node[UPPER];
2244 }
2245 next = walk_down_backref(edges, &index);
2246 }
2247 return num_bytes;
2248 }
2249
2250 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2251 struct reloc_control *rc,
2252 struct btrfs_backref_node *node)
2253 {
2254 struct btrfs_root *root = rc->extent_root;
2255 struct btrfs_fs_info *fs_info = root->fs_info;
2256 u64 num_bytes;
2257 int ret;
2258 u64 tmp;
2259
2260 num_bytes = calcu_metadata_size(rc, node) * 2;
2261
2262 trans->block_rsv = rc->block_rsv;
2263 rc->reserved_bytes += num_bytes;
2264
2265 /*
2266 * We are under a transaction here so we can only do limited flushing.
2267 * If we get an enospc just kick back -EAGAIN so we know to drop the
2268 * transaction and try to refill when we can flush all the things.
2269 */
2270 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
2271 BTRFS_RESERVE_FLUSH_LIMIT);
2272 if (ret) {
2273 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2274 while (tmp <= rc->reserved_bytes)
2275 tmp <<= 1;
2276 /*
2277 * Only one thread can access block_rsv at this point, so we
2278 * don't need to hold a lock to protect it.
2279 * We expand the reservation size here to allow enough space
2280 * for relocation, and we will return early in the ENOSPC
2281 * case.
2282 */
2283 rc->block_rsv->size = tmp + fs_info->nodesize *
2284 RELOCATION_RESERVED_NODES;
2285 return -EAGAIN;
2286 }
2287
2288 return 0;
2289 }
2290
2291 /*
2292 * relocate a tree block, and then update pointers in upper level
2293 * blocks that reference the block to point to the new location.
2294 *
2295 * if called by link_to_upper, the block has already been relocated.
2296 * in that case this function just updates pointers.
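*
* The lowest flag tells the two cases apart: nonzero means the block
* itself still has to be relocated (it is COWed into its new location
* below), zero means only the pointers in the upper level blocks get
* updated.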
2297 */ 2298 static int do_relocation(struct btrfs_trans_handle *trans, 2299 struct reloc_control *rc, 2300 struct btrfs_backref_node *node, 2301 struct btrfs_key *key, 2302 struct btrfs_path *path, int lowest) 2303 { 2304 struct btrfs_backref_node *upper; 2305 struct btrfs_backref_edge *edge; 2306 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2307 struct btrfs_root *root; 2308 struct extent_buffer *eb; 2309 u32 blocksize; 2310 u64 bytenr; 2311 int slot; 2312 int ret = 0; 2313 2314 /* 2315 * If we are lowest then this is the first time we're processing this 2316 * block, and thus shouldn't have an eb associated with it yet. 2317 */ 2318 ASSERT(!lowest || !node->eb); 2319 2320 path->lowest_level = node->level + 1; 2321 rc->backref_cache.path[node->level] = node; 2322 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2323 cond_resched(); 2324 2325 upper = edge->node[UPPER]; 2326 root = select_reloc_root(trans, rc, upper, edges); 2327 if (IS_ERR(root)) { 2328 ret = PTR_ERR(root); 2329 goto next; 2330 } 2331 2332 if (upper->eb && !upper->locked) { 2333 if (!lowest) { 2334 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2335 if (ret < 0) 2336 goto next; 2337 BUG_ON(ret); 2338 bytenr = btrfs_node_blockptr(upper->eb, slot); 2339 if (node->eb->start == bytenr) 2340 goto next; 2341 } 2342 btrfs_backref_drop_node_buffer(upper); 2343 } 2344 2345 if (!upper->eb) { 2346 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2347 if (ret) { 2348 if (ret > 0) 2349 ret = -ENOENT; 2350 2351 btrfs_release_path(path); 2352 break; 2353 } 2354 2355 if (!upper->eb) { 2356 upper->eb = path->nodes[upper->level]; 2357 path->nodes[upper->level] = NULL; 2358 } else { 2359 BUG_ON(upper->eb != path->nodes[upper->level]); 2360 } 2361 2362 upper->locked = 1; 2363 path->locks[upper->level] = 0; 2364 2365 slot = path->slots[upper->level]; 2366 btrfs_release_path(path); 2367 } else { 2368 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2369 if (ret < 0) 2370 goto next; 2371 BUG_ON(ret); 2372 } 2373 2374 bytenr = btrfs_node_blockptr(upper->eb, slot); 2375 if (lowest) { 2376 if (bytenr != node->bytenr) { 2377 btrfs_err(root->fs_info, 2378 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2379 bytenr, node->bytenr, slot, 2380 upper->eb->start); 2381 ret = -EIO; 2382 goto next; 2383 } 2384 } else { 2385 if (node->eb->start == bytenr) 2386 goto next; 2387 } 2388 2389 blocksize = root->fs_info->nodesize; 2390 eb = btrfs_read_node_slot(upper->eb, slot); 2391 if (IS_ERR(eb)) { 2392 ret = PTR_ERR(eb); 2393 goto next; 2394 } 2395 btrfs_tree_lock(eb); 2396 2397 if (!node->eb) { 2398 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2399 slot, &eb, BTRFS_NESTING_COW); 2400 btrfs_tree_unlock(eb); 2401 free_extent_buffer(eb); 2402 if (ret < 0) 2403 goto next; 2404 /* 2405 * We've just COWed this block, it should have updated 2406 * the correct backref node entry. 
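* That update is done by btrfs_reloc_cow_block(), using the
* backref_cache.path set up before the COW.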
2407 */
2408 ASSERT(node->eb == eb);
2409 } else {
2410 struct btrfs_ref ref = {
2411 .action = BTRFS_ADD_DELAYED_REF,
2412 .bytenr = node->eb->start,
2413 .num_bytes = blocksize,
2414 .parent = upper->eb->start,
2415 .owning_root = btrfs_header_owner(upper->eb),
2416 .ref_root = btrfs_header_owner(upper->eb),
2417 };
2418
2419 btrfs_set_node_blockptr(upper->eb, slot,
2420 node->eb->start);
2421 btrfs_set_node_ptr_generation(upper->eb, slot,
2422 trans->transid);
2423 btrfs_mark_buffer_dirty(trans, upper->eb);
2424
2425 btrfs_init_tree_ref(&ref, node->level,
2426 btrfs_root_id(root), false);
2427 ret = btrfs_inc_extent_ref(trans, &ref);
2428 if (!ret)
2429 ret = btrfs_drop_subtree(trans, root, eb,
2430 upper->eb);
2431 if (ret)
2432 btrfs_abort_transaction(trans, ret);
2433 }
2434 next:
2435 if (!upper->pending)
2436 btrfs_backref_drop_node_buffer(upper);
2437 else
2438 btrfs_backref_unlock_node_buffer(upper);
2439 if (ret)
2440 break;
2441 }
2442
2443 if (!ret && node->pending) {
2444 btrfs_backref_drop_node_buffer(node);
2445 list_move_tail(&node->list, &rc->backref_cache.changed);
2446 node->pending = 0;
2447 }
2448
2449 path->lowest_level = 0;
2450
2451 /*
2452 * We should have allocated all of our space in the block rsv and thus
2453 * shouldn't hit ENOSPC.
2454 */
2455 ASSERT(ret != -ENOSPC);
2456 return ret;
2457 }
2458
2459 static int link_to_upper(struct btrfs_trans_handle *trans,
2460 struct reloc_control *rc,
2461 struct btrfs_backref_node *node,
2462 struct btrfs_path *path)
2463 {
2464 struct btrfs_key key;
2465
2466 btrfs_node_key_to_cpu(node->eb, &key, 0);
2467 return do_relocation(trans, rc, node, &key, path, 0);
2468 }
2469
2470 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2471 struct reloc_control *rc,
2472 struct btrfs_path *path, int err)
2473 {
2474 LIST_HEAD(list);
2475 struct btrfs_backref_cache *cache = &rc->backref_cache;
2476 struct btrfs_backref_node *node;
2477 int level;
2478 int ret;
2479
2480 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2481 while (!list_empty(&cache->pending[level])) {
2482 node = list_entry(cache->pending[level].next,
2483 struct btrfs_backref_node, list);
2484 list_move_tail(&node->list, &list);
2485 BUG_ON(!node->pending);
2486
2487 if (!err) {
2488 ret = link_to_upper(trans, rc, node, path);
2489 if (ret < 0)
2490 err = ret;
2491 }
2492 }
2493 list_splice_init(&list, &cache->pending[level]);
2494 }
2495 return err;
2496 }
2497
2498 /*
2499 * mark a block and all blocks that directly/indirectly reference it
2500 * as processed.
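* The walk mirrors calcu_metadata_size(): follow the first upper edge of
* each node and backtrack with walk_down_backref().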
2501 */ 2502 static void update_processed_blocks(struct reloc_control *rc, 2503 struct btrfs_backref_node *node) 2504 { 2505 struct btrfs_backref_node *next = node; 2506 struct btrfs_backref_edge *edge; 2507 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2508 int index = 0; 2509 2510 while (next) { 2511 cond_resched(); 2512 while (1) { 2513 if (next->processed) 2514 break; 2515 2516 mark_block_processed(rc, next); 2517 2518 if (list_empty(&next->upper)) 2519 break; 2520 2521 edge = list_entry(next->upper.next, 2522 struct btrfs_backref_edge, list[LOWER]); 2523 edges[index++] = edge; 2524 next = edge->node[UPPER]; 2525 } 2526 next = walk_down_backref(edges, &index); 2527 } 2528 } 2529 2530 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2531 { 2532 u32 blocksize = rc->extent_root->fs_info->nodesize; 2533 2534 if (test_range_bit(&rc->processed_blocks, bytenr, 2535 bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) 2536 return 1; 2537 return 0; 2538 } 2539 2540 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2541 struct tree_block *block) 2542 { 2543 struct btrfs_tree_parent_check check = { 2544 .level = block->level, 2545 .owner_root = block->owner, 2546 .transid = block->key.offset 2547 }; 2548 struct extent_buffer *eb; 2549 2550 eb = read_tree_block(fs_info, block->bytenr, &check); 2551 if (IS_ERR(eb)) 2552 return PTR_ERR(eb); 2553 if (!extent_buffer_uptodate(eb)) { 2554 free_extent_buffer(eb); 2555 return -EIO; 2556 } 2557 if (block->level == 0) 2558 btrfs_item_key_to_cpu(eb, &block->key, 0); 2559 else 2560 btrfs_node_key_to_cpu(eb, &block->key, 0); 2561 free_extent_buffer(eb); 2562 block->key_ready = true; 2563 return 0; 2564 } 2565 2566 /* 2567 * helper function to relocate a tree block 2568 */ 2569 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2570 struct reloc_control *rc, 2571 struct btrfs_backref_node *node, 2572 struct btrfs_key *key, 2573 struct btrfs_path *path) 2574 { 2575 struct btrfs_root *root; 2576 int ret = 0; 2577 2578 if (!node) 2579 return 0; 2580 2581 /* 2582 * If we fail here we want to drop our backref_node because we are going 2583 * to start over and regenerate the tree for it. 2584 */ 2585 ret = reserve_metadata_space(trans, rc, node); 2586 if (ret) 2587 goto out; 2588 2589 BUG_ON(node->processed); 2590 root = select_one_root(node); 2591 if (IS_ERR(root)) { 2592 ret = PTR_ERR(root); 2593 2594 /* See explanation in select_one_root for the -EUCLEAN case. */ 2595 ASSERT(ret == -ENOENT); 2596 if (ret == -ENOENT) { 2597 ret = 0; 2598 update_processed_blocks(rc, node); 2599 } 2600 goto out; 2601 } 2602 2603 if (root) { 2604 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2605 /* 2606 * This block was the root block of a root, and this is 2607 * the first time we're processing the block and thus it 2608 * should not have had the ->new_bytenr modified and 2609 * should have not been included on the changed list. 2610 * 2611 * However in the case of corruption we could have 2612 * multiple refs pointing to the same block improperly, 2613 * and thus we would trip over these checks. ASSERT() 2614 * for the developer case, because it could indicate a 2615 * bug in the backref code, however error out for a 2616 * normal user in the case of corruption. 
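* The same ASSERT-then-error pattern is used for the upper level blocks
* in select_reloc_root().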
2617 */ 2618 ASSERT(node->new_bytenr == 0); 2619 ASSERT(list_empty(&node->list)); 2620 if (node->new_bytenr || !list_empty(&node->list)) { 2621 btrfs_err(root->fs_info, 2622 "bytenr %llu has improper references to it", 2623 node->bytenr); 2624 ret = -EUCLEAN; 2625 goto out; 2626 } 2627 ret = btrfs_record_root_in_trans(trans, root); 2628 if (ret) 2629 goto out; 2630 /* 2631 * Another thread could have failed, need to check if we 2632 * have reloc_root actually set. 2633 */ 2634 if (!root->reloc_root) { 2635 ret = -ENOENT; 2636 goto out; 2637 } 2638 root = root->reloc_root; 2639 node->new_bytenr = root->node->start; 2640 btrfs_put_root(node->root); 2641 node->root = btrfs_grab_root(root); 2642 ASSERT(node->root); 2643 list_add_tail(&node->list, &rc->backref_cache.changed); 2644 } else { 2645 path->lowest_level = node->level; 2646 if (root == root->fs_info->chunk_root) 2647 btrfs_reserve_chunk_metadata(trans, false); 2648 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2649 btrfs_release_path(path); 2650 if (root == root->fs_info->chunk_root) 2651 btrfs_trans_release_chunk_metadata(trans); 2652 if (ret > 0) 2653 ret = 0; 2654 } 2655 if (!ret) 2656 update_processed_blocks(rc, node); 2657 } else { 2658 ret = do_relocation(trans, rc, node, key, path, 1); 2659 } 2660 out: 2661 if (ret || node->level == 0 || node->cowonly) 2662 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2663 return ret; 2664 } 2665 2666 /* 2667 * relocate a list of blocks 2668 */ 2669 static noinline_for_stack 2670 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2671 struct reloc_control *rc, struct rb_root *blocks) 2672 { 2673 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2674 struct btrfs_backref_node *node; 2675 struct btrfs_path *path; 2676 struct tree_block *block; 2677 struct tree_block *next; 2678 int ret = 0; 2679 2680 path = btrfs_alloc_path(); 2681 if (!path) { 2682 ret = -ENOMEM; 2683 goto out_free_blocks; 2684 } 2685 2686 /* Kick in readahead for tree blocks with missing keys */ 2687 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2688 if (!block->key_ready) 2689 btrfs_readahead_tree_block(fs_info, block->bytenr, 2690 block->owner, 0, 2691 block->level); 2692 } 2693 2694 /* Get first keys */ 2695 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2696 if (!block->key_ready) { 2697 ret = get_tree_block_key(fs_info, block); 2698 if (ret) 2699 goto out_free_path; 2700 } 2701 } 2702 2703 /* Do tree relocation */ 2704 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2705 node = build_backref_tree(trans, rc, &block->key, 2706 block->level, block->bytenr); 2707 if (IS_ERR(node)) { 2708 ret = PTR_ERR(node); 2709 goto out; 2710 } 2711 2712 ret = relocate_tree_block(trans, rc, node, &block->key, 2713 path); 2714 if (ret < 0) 2715 break; 2716 } 2717 out: 2718 ret = finish_pending_nodes(trans, rc, path, ret); 2719 2720 out_free_path: 2721 btrfs_free_path(path); 2722 out_free_blocks: 2723 free_block_list(blocks); 2724 return ret; 2725 } 2726 2727 static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc) 2728 { 2729 const struct file_extent_cluster *cluster = &rc->cluster; 2730 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2731 u64 alloc_hint = 0; 2732 u64 start; 2733 u64 end; 2734 u64 offset = inode->reloc_block_group_start; 2735 u64 num_bytes; 2736 int nr; 2737 int ret = 0; 2738 u64 i_size = i_size_read(&inode->vfs_inode); 2739 u64 prealloc_start = cluster->start - offset; 2740 u64 prealloc_end = 
cluster->end - offset;
2741 u64 cur_offset = prealloc_start;
2742
2743 /*
2744 * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
2745 * This means the range [i_size, PAGE_END + 1) is filled with zeros by
2746 * the btrfs_do_readpage() call of the previously relocated file cluster.
2747 *
2748 * If the current cluster starts in the above range, btrfs_do_readpage()
2749 * will skip the read, and relocate_one_folio() will later write back
2750 * the padding zeros as new data, causing data corruption.
2751 *
2752 * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
2753 */
2754 if (!PAGE_ALIGNED(i_size)) {
2755 struct address_space *mapping = inode->vfs_inode.i_mapping;
2756 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2757 const u32 sectorsize = fs_info->sectorsize;
2758 struct folio *folio;
2759
2760 ASSERT(sectorsize < PAGE_SIZE);
2761 ASSERT(IS_ALIGNED(i_size, sectorsize));
2762
2763 /*
2764 * Subpage can't handle page with DIRTY but without UPTODATE
2765 * bit as it can lead to the following deadlock:
2766 *
2767 * btrfs_read_folio()
2768 * | Page already *locked*
2769 * |- btrfs_lock_and_flush_ordered_range()
2770 * |- btrfs_start_ordered_extent()
2771 * |- extent_write_cache_pages()
2772 * |- lock_page()
2773 * We try to lock the page we already hold.
2774 *
2775 * Here we just write back the whole data reloc inode, so that
2776 * we are ensured to have no dirty range in the page, and are
2777 * safe to clear the uptodate bits.
2778 *
2779 * This shouldn't cause too much overhead, as we need to write
2780 * the data back anyway.
2781 */
2782 ret = filemap_write_and_wait(mapping);
2783 if (ret < 0)
2784 return ret;
2785
2786 clear_extent_bits(&inode->io_tree, i_size,
2787 round_up(i_size, PAGE_SIZE) - 1,
2788 EXTENT_UPTODATE);
2789 folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
2790 /*
2791 * If the folio was freed we don't need to do anything, as we
2792 * will re-read the whole folio anyway.
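* (filemap_lock_folio() returns ERR_PTR(-ENOENT) then, which the
* IS_ERR() check below treats as nothing to do.)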
2793 */ 2794 if (!IS_ERR(folio)) { 2795 btrfs_subpage_clear_uptodate(fs_info, folio, i_size, 2796 round_up(i_size, PAGE_SIZE) - i_size); 2797 folio_unlock(folio); 2798 folio_put(folio); 2799 } 2800 } 2801 2802 BUG_ON(cluster->start != cluster->boundary[0]); 2803 ret = btrfs_alloc_data_chunk_ondemand(inode, 2804 prealloc_end + 1 - prealloc_start); 2805 if (ret) 2806 return ret; 2807 2808 btrfs_inode_lock(inode, 0); 2809 for (nr = 0; nr < cluster->nr; nr++) { 2810 struct extent_state *cached_state = NULL; 2811 2812 start = cluster->boundary[nr] - offset; 2813 if (nr + 1 < cluster->nr) 2814 end = cluster->boundary[nr + 1] - 1 - offset; 2815 else 2816 end = cluster->end - offset; 2817 2818 lock_extent(&inode->io_tree, start, end, &cached_state); 2819 num_bytes = end + 1 - start; 2820 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2821 num_bytes, num_bytes, 2822 end + 1, &alloc_hint); 2823 cur_offset = end + 1; 2824 unlock_extent(&inode->io_tree, start, end, &cached_state); 2825 if (ret) 2826 break; 2827 } 2828 btrfs_inode_unlock(inode, 0); 2829 2830 if (cur_offset < prealloc_end) 2831 btrfs_free_reserved_data_space_noquota(inode->root->fs_info, 2832 prealloc_end + 1 - cur_offset); 2833 return ret; 2834 } 2835 2836 static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc) 2837 { 2838 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2839 struct extent_map *em; 2840 struct extent_state *cached_state = NULL; 2841 u64 offset = inode->reloc_block_group_start; 2842 u64 start = rc->cluster.start - offset; 2843 u64 end = rc->cluster.end - offset; 2844 int ret = 0; 2845 2846 em = alloc_extent_map(); 2847 if (!em) 2848 return -ENOMEM; 2849 2850 em->start = start; 2851 em->len = end + 1 - start; 2852 em->disk_bytenr = rc->cluster.start; 2853 em->disk_num_bytes = em->len; 2854 em->ram_bytes = em->len; 2855 em->flags |= EXTENT_FLAG_PINNED; 2856 2857 lock_extent(&inode->io_tree, start, end, &cached_state); 2858 ret = btrfs_replace_extent_map_range(inode, em, false); 2859 unlock_extent(&inode->io_tree, start, end, &cached_state); 2860 free_extent_map(em); 2861 2862 return ret; 2863 } 2864 2865 /* 2866 * Allow error injection to test balance/relocation cancellation 2867 */ 2868 noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info) 2869 { 2870 return atomic_read(&fs_info->balance_cancel_req) || 2871 atomic_read(&fs_info->reloc_cancel_req) || 2872 fatal_signal_pending(current); 2873 } 2874 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 2875 2876 static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, 2877 int cluster_nr) 2878 { 2879 /* Last extent, use cluster end directly */ 2880 if (cluster_nr >= cluster->nr - 1) 2881 return cluster->end; 2882 2883 /* Use next boundary start*/ 2884 return cluster->boundary[cluster_nr + 1] - 1; 2885 } 2886 2887 static int relocate_one_folio(struct reloc_control *rc, 2888 struct file_ra_state *ra, 2889 int *cluster_nr, unsigned long index) 2890 { 2891 const struct file_extent_cluster *cluster = &rc->cluster; 2892 struct inode *inode = rc->data_inode; 2893 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2894 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 2895 const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; 2896 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 2897 struct folio *folio; 2898 u64 folio_start; 2899 u64 folio_end; 2900 u64 cur; 2901 int ret; 2902 const bool use_rst = btrfs_need_stripe_tree_update(fs_info, 
rc->block_group->flags);
2903
2904 ASSERT(index <= last_index);
2905 folio = filemap_lock_folio(inode->i_mapping, index);
2906 if (IS_ERR(folio)) {
2907
2908 /*
2909 * On relocation we're doing readahead on the relocation inode,
2910 * but if the filesystem is backed by a RAID stripe tree we can
2911 * get ENOENT (e.g. due to preallocated extents not being
2912 * mapped in the RST) from the lookup.
2913 *
2914 * But readahead doesn't handle the error and submits invalid
2915 * reads to the device, causing assertion failures.
2916 */
2917 if (!use_rst)
2918 page_cache_sync_readahead(inode->i_mapping, ra, NULL,
2919 index, last_index + 1 - index);
2920 folio = __filemap_get_folio(inode->i_mapping, index,
2921 FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
2922 mask);
2923 if (IS_ERR(folio))
2924 return PTR_ERR(folio);
2925 }
2926
2927 WARN_ON(folio_order(folio));
2928
2929 if (folio_test_readahead(folio) && !use_rst)
2930 page_cache_async_readahead(inode->i_mapping, ra, NULL,
2931 folio, last_index + 1 - index);
2932
2933 if (!folio_test_uptodate(folio)) {
2934 btrfs_read_folio(NULL, folio);
2935 folio_lock(folio);
2936 if (!folio_test_uptodate(folio)) {
2937 ret = -EIO;
2938 goto release_folio;
2939 }
2940 }
2941
2942 /*
2943 * We could have lost folio private when we dropped the lock to read the
2944 * folio above, so call set_folio_extent_mapped() here to make sure we
2945 * have all of the subpage blocksize state we need in place.
2946 */
2947 ret = set_folio_extent_mapped(folio);
2948 if (ret < 0)
2949 goto release_folio;
2950
2951 folio_start = folio_pos(folio);
2952 folio_end = folio_start + PAGE_SIZE - 1;
2953
2954 /*
2955 * Start from the cluster, as for subpage case, the cluster can start
2956 * inside the folio.
2957 */
2958 cur = max(folio_start, cluster->boundary[*cluster_nr] - offset);
2959 while (cur <= folio_end) {
2960 struct extent_state *cached_state = NULL;
2961 u64 extent_start = cluster->boundary[*cluster_nr] - offset;
2962 u64 extent_end = get_cluster_boundary_end(cluster,
2963 *cluster_nr) - offset;
2964 u64 clamped_start = max(folio_start, extent_start);
2965 u64 clamped_end = min(folio_end, extent_end);
2966 u32 clamped_len = clamped_end + 1 - clamped_start;
2967
2968 /* Reserve metadata for this range */
2969 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
2970 clamped_len, clamped_len,
2971 false);
2972 if (ret)
2973 goto release_folio;
2974
2975 /* Mark the range delalloc and dirty for later writeback */
2976 lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
2977 &cached_state);
2978 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
2979 clamped_end, 0, &cached_state);
2980 if (ret) {
2981 clear_extent_bit(&BTRFS_I(inode)->io_tree,
2982 clamped_start, clamped_end,
2983 EXTENT_LOCKED | EXTENT_BOUNDARY,
2984 &cached_state);
2985 btrfs_delalloc_release_metadata(BTRFS_I(inode),
2986 clamped_len, true);
2987 btrfs_delalloc_release_extents(BTRFS_I(inode),
2988 clamped_len);
2989 goto release_folio;
2990 }
2991 btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len);
2992
2993 /*
2994 * Set the boundary if it's inside the folio.
2995 * Data relocation requires the destination extents to have the
2996 * same size as the source.
2997 * The EXTENT_BOUNDARY bit prevents the current extent from
2998 * being merged with the previous extent.
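* Only the first sector of the boundary needs the bit, hence the
* single sectorsize range set below.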
2999 */ 3000 if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { 3001 u64 boundary_start = cluster->boundary[*cluster_nr] - 3002 offset; 3003 u64 boundary_end = boundary_start + 3004 fs_info->sectorsize - 1; 3005 3006 set_extent_bit(&BTRFS_I(inode)->io_tree, 3007 boundary_start, boundary_end, 3008 EXTENT_BOUNDARY, NULL); 3009 } 3010 unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 3011 &cached_state); 3012 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 3013 cur += clamped_len; 3014 3015 /* Crossed extent end, go to next extent */ 3016 if (cur >= extent_end) { 3017 (*cluster_nr)++; 3018 /* Just finished the last extent of the cluster, exit. */ 3019 if (*cluster_nr >= cluster->nr) 3020 break; 3021 } 3022 } 3023 folio_unlock(folio); 3024 folio_put(folio); 3025 3026 balance_dirty_pages_ratelimited(inode->i_mapping); 3027 btrfs_throttle(fs_info); 3028 if (btrfs_should_cancel_balance(fs_info)) 3029 ret = -ECANCELED; 3030 return ret; 3031 3032 release_folio: 3033 folio_unlock(folio); 3034 folio_put(folio); 3035 return ret; 3036 } 3037 3038 static int relocate_file_extent_cluster(struct reloc_control *rc) 3039 { 3040 struct inode *inode = rc->data_inode; 3041 const struct file_extent_cluster *cluster = &rc->cluster; 3042 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 3043 unsigned long index; 3044 unsigned long last_index; 3045 struct file_ra_state *ra; 3046 int cluster_nr = 0; 3047 int ret = 0; 3048 3049 if (!cluster->nr) 3050 return 0; 3051 3052 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3053 if (!ra) 3054 return -ENOMEM; 3055 3056 ret = prealloc_file_extent_cluster(rc); 3057 if (ret) 3058 goto out; 3059 3060 file_ra_state_init(ra, inode->i_mapping); 3061 3062 ret = setup_relocation_extent_mapping(rc); 3063 if (ret) 3064 goto out; 3065 3066 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3067 for (index = (cluster->start - offset) >> PAGE_SHIFT; 3068 index <= last_index && !ret; index++) 3069 ret = relocate_one_folio(rc, ra, &cluster_nr, index); 3070 if (ret == 0) 3071 WARN_ON(cluster_nr != cluster->nr); 3072 out: 3073 kfree(ra); 3074 return ret; 3075 } 3076 3077 static noinline_for_stack int relocate_data_extent(struct reloc_control *rc, 3078 const struct btrfs_key *extent_key) 3079 { 3080 struct inode *inode = rc->data_inode; 3081 struct file_extent_cluster *cluster = &rc->cluster; 3082 int ret; 3083 struct btrfs_root *root = BTRFS_I(inode)->root; 3084 3085 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3086 ret = relocate_file_extent_cluster(rc); 3087 if (ret) 3088 return ret; 3089 cluster->nr = 0; 3090 } 3091 3092 /* 3093 * Under simple quotas, we set root->relocation_src_root when we find 3094 * the extent. If adjacent extents have different owners, we can't merge 3095 * them while relocating. Handle this by storing the owning root that 3096 * started a cluster and if we see an extent from a different root break 3097 * cluster formation (just like the above case of non-adjacent extents). 3098 * 3099 * Without simple quotas, relocation_src_root is always 0, so we should 3100 * never see a mismatch, and it should have no effect on relocation 3101 * clusters. 3102 */ 3103 if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { 3104 u64 tmp = root->relocation_src_root; 3105 3106 /* 3107 * root->relocation_src_root is the state that actually affects 3108 * the preallocation we do here, so set it to the root owning 3109 * the cluster we need to relocate. 
3110 */
3111 root->relocation_src_root = cluster->owning_root;
3112 ret = relocate_file_extent_cluster(rc);
3113 if (ret)
3114 return ret;
3115 cluster->nr = 0;
3116 /* And reset it back for the current extent's owning root. */
3117 root->relocation_src_root = tmp;
3118 }
3119
3120 if (!cluster->nr) {
3121 cluster->start = extent_key->objectid;
3122 cluster->owning_root = root->relocation_src_root;
3123 } else {
3124 BUG_ON(cluster->nr >= MAX_EXTENTS);
3125 }
3126 cluster->end = extent_key->objectid + extent_key->offset - 1;
3127 cluster->boundary[cluster->nr] = extent_key->objectid;
3128 cluster->nr++;
3129
3130 if (cluster->nr >= MAX_EXTENTS) {
3131 ret = relocate_file_extent_cluster(rc);
3132 if (ret)
3133 return ret;
3134 cluster->nr = 0;
3135 }
3136 return 0;
3137 }
3138
3139 /*
3140 * helper to add a tree block to the list.
3141 * the major work is getting the generation and level of the block
3142 */
3143 static int add_tree_block(struct reloc_control *rc,
3144 const struct btrfs_key *extent_key,
3145 struct btrfs_path *path,
3146 struct rb_root *blocks)
3147 {
3148 struct extent_buffer *eb;
3149 struct btrfs_extent_item *ei;
3150 struct btrfs_tree_block_info *bi;
3151 struct tree_block *block;
3152 struct rb_node *rb_node;
3153 u32 item_size;
3154 int level = -1;
3155 u64 generation;
3156 u64 owner = 0;
3157
3158 eb = path->nodes[0];
3159 item_size = btrfs_item_size(eb, path->slots[0]);
3160
3161 if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3162 item_size >= sizeof(*ei) + sizeof(*bi)) {
3163 unsigned long ptr = 0, end;
3164
3165 ei = btrfs_item_ptr(eb, path->slots[0],
3166 struct btrfs_extent_item);
3167 end = (unsigned long)ei + item_size;
3168 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3169 bi = (struct btrfs_tree_block_info *)(ei + 1);
3170 level = btrfs_tree_block_level(eb, bi);
3171 ptr = (unsigned long)(bi + 1);
3172 } else {
3173 level = (int)extent_key->offset;
3174 ptr = (unsigned long)(ei + 1);
3175 }
3176 generation = btrfs_extent_generation(eb, ei);
3177
3178 /*
3179 * We're reading random blocks without knowing their owner ahead
3180 * of time. This is ok most of the time, as all reloc roots and
3181 * fs roots have the same lock type. However normal trees do
3182 * not, and the only way to know ahead of time is to read the
3183 * inline ref offset. We know it's an fs root if
3184 *
3185 * 1. There's more than one ref.
3186 * 2. There's a SHARED_DATA_REF_KEY set.
3187 * 3. FULL_BACKREF is set on the flags.
3188 *
3189 * Otherwise it's safe to assume that the ref offset == the
3190 * owner of this block, so we can use that when calling
3191 * read_tree_block.
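* If any of the three conditions above applies, owner stays 0, which
* makes read_tree_block() skip the owner check.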
3192 */ 3193 if (btrfs_extent_refs(eb, ei) == 1 && 3194 !(btrfs_extent_flags(eb, ei) & 3195 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3196 ptr < end) { 3197 struct btrfs_extent_inline_ref *iref; 3198 int type; 3199 3200 iref = (struct btrfs_extent_inline_ref *)ptr; 3201 type = btrfs_get_extent_inline_ref_type(eb, iref, 3202 BTRFS_REF_TYPE_BLOCK); 3203 if (type == BTRFS_REF_TYPE_INVALID) 3204 return -EINVAL; 3205 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3206 owner = btrfs_extent_inline_ref_offset(eb, iref); 3207 } 3208 } else { 3209 btrfs_print_leaf(eb); 3210 btrfs_err(rc->block_group->fs_info, 3211 "unrecognized tree backref at tree block %llu slot %u", 3212 eb->start, path->slots[0]); 3213 btrfs_release_path(path); 3214 return -EUCLEAN; 3215 } 3216 3217 btrfs_release_path(path); 3218 3219 BUG_ON(level == -1); 3220 3221 block = kmalloc(sizeof(*block), GFP_NOFS); 3222 if (!block) 3223 return -ENOMEM; 3224 3225 block->bytenr = extent_key->objectid; 3226 block->key.objectid = rc->extent_root->fs_info->nodesize; 3227 block->key.offset = generation; 3228 block->level = level; 3229 block->key_ready = false; 3230 block->owner = owner; 3231 3232 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); 3233 if (rb_node) 3234 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3235 -EEXIST); 3236 3237 return 0; 3238 } 3239 3240 /* 3241 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3242 */ 3243 static int __add_tree_block(struct reloc_control *rc, 3244 u64 bytenr, u32 blocksize, 3245 struct rb_root *blocks) 3246 { 3247 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3248 struct btrfs_path *path; 3249 struct btrfs_key key; 3250 int ret; 3251 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3252 3253 if (tree_block_processed(bytenr, rc)) 3254 return 0; 3255 3256 if (rb_simple_search(blocks, bytenr)) 3257 return 0; 3258 3259 path = btrfs_alloc_path(); 3260 if (!path) 3261 return -ENOMEM; 3262 again: 3263 key.objectid = bytenr; 3264 if (skinny) { 3265 key.type = BTRFS_METADATA_ITEM_KEY; 3266 key.offset = (u64)-1; 3267 } else { 3268 key.type = BTRFS_EXTENT_ITEM_KEY; 3269 key.offset = blocksize; 3270 } 3271 3272 path->search_commit_root = 1; 3273 path->skip_locking = 1; 3274 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3275 if (ret < 0) 3276 goto out; 3277 3278 if (ret > 0 && skinny) { 3279 if (path->slots[0]) { 3280 path->slots[0]--; 3281 btrfs_item_key_to_cpu(path->nodes[0], &key, 3282 path->slots[0]); 3283 if (key.objectid == bytenr && 3284 (key.type == BTRFS_METADATA_ITEM_KEY || 3285 (key.type == BTRFS_EXTENT_ITEM_KEY && 3286 key.offset == blocksize))) 3287 ret = 0; 3288 } 3289 3290 if (ret) { 3291 skinny = false; 3292 btrfs_release_path(path); 3293 goto again; 3294 } 3295 } 3296 if (ret) { 3297 ASSERT(ret == 1); 3298 btrfs_print_leaf(path->nodes[0]); 3299 btrfs_err(fs_info, 3300 "tree block extent item (%llu) is not found in extent tree", 3301 bytenr); 3302 WARN_ON(1); 3303 ret = -EINVAL; 3304 goto out; 3305 } 3306 3307 ret = add_tree_block(rc, &key, path, blocks); 3308 out: 3309 btrfs_free_path(path); 3310 return ret; 3311 } 3312 3313 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3314 struct btrfs_block_group *block_group, 3315 struct inode *inode, 3316 u64 ino) 3317 { 3318 struct btrfs_root *root = fs_info->tree_root; 3319 struct btrfs_trans_handle *trans; 3320 int ret = 0; 3321 3322 if (inode) 3323 goto truncate; 3324 3325 inode = btrfs_iget(ino, root); 3326 if (IS_ERR(inode)) 3327 return -ENOENT; 
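/*
* Truncate the free space cache inode, dropping its data extents so they
* no longer block relocation of the block group.
*/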
3328 3329 truncate: 3330 ret = btrfs_check_trunc_cache_free_space(fs_info, 3331 &fs_info->global_block_rsv); 3332 if (ret) 3333 goto out; 3334 3335 trans = btrfs_join_transaction(root); 3336 if (IS_ERR(trans)) { 3337 ret = PTR_ERR(trans); 3338 goto out; 3339 } 3340 3341 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3342 3343 btrfs_end_transaction(trans); 3344 btrfs_btree_balance_dirty(fs_info); 3345 out: 3346 iput(inode); 3347 return ret; 3348 } 3349 3350 /* 3351 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3352 * cache inode, to avoid free space cache data extent blocking data relocation. 3353 */ 3354 static int delete_v1_space_cache(struct extent_buffer *leaf, 3355 struct btrfs_block_group *block_group, 3356 u64 data_bytenr) 3357 { 3358 u64 space_cache_ino; 3359 struct btrfs_file_extent_item *ei; 3360 struct btrfs_key key; 3361 bool found = false; 3362 int i; 3363 int ret; 3364 3365 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3366 return 0; 3367 3368 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3369 u8 type; 3370 3371 btrfs_item_key_to_cpu(leaf, &key, i); 3372 if (key.type != BTRFS_EXTENT_DATA_KEY) 3373 continue; 3374 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3375 type = btrfs_file_extent_type(leaf, ei); 3376 3377 if ((type == BTRFS_FILE_EXTENT_REG || 3378 type == BTRFS_FILE_EXTENT_PREALLOC) && 3379 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3380 found = true; 3381 space_cache_ino = key.objectid; 3382 break; 3383 } 3384 } 3385 if (!found) 3386 return -ENOENT; 3387 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, 3388 space_cache_ino); 3389 return ret; 3390 } 3391 3392 /* 3393 * helper to find all tree blocks that reference a given data extent 3394 */ 3395 static noinline_for_stack int add_data_references(struct reloc_control *rc, 3396 const struct btrfs_key *extent_key, 3397 struct btrfs_path *path, 3398 struct rb_root *blocks) 3399 { 3400 struct btrfs_backref_walk_ctx ctx = { 0 }; 3401 struct ulist_iterator leaf_uiter; 3402 struct ulist_node *ref_node = NULL; 3403 const u32 blocksize = rc->extent_root->fs_info->nodesize; 3404 int ret = 0; 3405 3406 btrfs_release_path(path); 3407 3408 ctx.bytenr = extent_key->objectid; 3409 ctx.skip_inode_ref_list = true; 3410 ctx.fs_info = rc->extent_root->fs_info; 3411 3412 ret = btrfs_find_all_leafs(&ctx); 3413 if (ret < 0) 3414 return ret; 3415 3416 ULIST_ITER_INIT(&leaf_uiter); 3417 while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { 3418 struct btrfs_tree_parent_check check = { 0 }; 3419 struct extent_buffer *eb; 3420 3421 eb = read_tree_block(ctx.fs_info, ref_node->val, &check); 3422 if (IS_ERR(eb)) { 3423 ret = PTR_ERR(eb); 3424 break; 3425 } 3426 ret = delete_v1_space_cache(eb, rc->block_group, 3427 extent_key->objectid); 3428 free_extent_buffer(eb); 3429 if (ret < 0) 3430 break; 3431 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3432 if (ret < 0) 3433 break; 3434 } 3435 if (ret < 0) 3436 free_block_list(blocks); 3437 ulist_free(ctx.refs); 3438 return ret; 3439 } 3440 3441 /* 3442 * helper to find next unprocessed extent 3443 */ 3444 static noinline_for_stack 3445 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3446 struct btrfs_key *extent_key) 3447 { 3448 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3449 struct btrfs_key key; 3450 struct extent_buffer *leaf; 3451 u64 start, end, last; 3452 int ret; 3453 3454 last = rc->block_group->start + rc->block_group->length; 3455 
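/*
* Walk extent items in the commit root from rc->search_start to the end
* of the block group, skipping ranges already recorded in
* rc->processed_blocks.
*/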
while (1) { 3456 bool block_found; 3457 3458 cond_resched(); 3459 if (rc->search_start >= last) { 3460 ret = 1; 3461 break; 3462 } 3463 3464 key.objectid = rc->search_start; 3465 key.type = BTRFS_EXTENT_ITEM_KEY; 3466 key.offset = 0; 3467 3468 path->search_commit_root = 1; 3469 path->skip_locking = 1; 3470 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3471 0, 0); 3472 if (ret < 0) 3473 break; 3474 next: 3475 leaf = path->nodes[0]; 3476 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3477 ret = btrfs_next_leaf(rc->extent_root, path); 3478 if (ret != 0) 3479 break; 3480 leaf = path->nodes[0]; 3481 } 3482 3483 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3484 if (key.objectid >= last) { 3485 ret = 1; 3486 break; 3487 } 3488 3489 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3490 key.type != BTRFS_METADATA_ITEM_KEY) { 3491 path->slots[0]++; 3492 goto next; 3493 } 3494 3495 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3496 key.objectid + key.offset <= rc->search_start) { 3497 path->slots[0]++; 3498 goto next; 3499 } 3500 3501 if (key.type == BTRFS_METADATA_ITEM_KEY && 3502 key.objectid + fs_info->nodesize <= 3503 rc->search_start) { 3504 path->slots[0]++; 3505 goto next; 3506 } 3507 3508 block_found = find_first_extent_bit(&rc->processed_blocks, 3509 key.objectid, &start, &end, 3510 EXTENT_DIRTY, NULL); 3511 3512 if (block_found && start <= key.objectid) { 3513 btrfs_release_path(path); 3514 rc->search_start = end + 1; 3515 } else { 3516 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3517 rc->search_start = key.objectid + key.offset; 3518 else 3519 rc->search_start = key.objectid + 3520 fs_info->nodesize; 3521 memcpy(extent_key, &key, sizeof(key)); 3522 return 0; 3523 } 3524 } 3525 btrfs_release_path(path); 3526 return ret; 3527 } 3528 3529 static void set_reloc_control(struct reloc_control *rc) 3530 { 3531 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3532 3533 mutex_lock(&fs_info->reloc_mutex); 3534 fs_info->reloc_ctl = rc; 3535 mutex_unlock(&fs_info->reloc_mutex); 3536 } 3537 3538 static void unset_reloc_control(struct reloc_control *rc) 3539 { 3540 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3541 3542 mutex_lock(&fs_info->reloc_mutex); 3543 fs_info->reloc_ctl = NULL; 3544 mutex_unlock(&fs_info->reloc_mutex); 3545 } 3546 3547 static noinline_for_stack 3548 int prepare_to_relocate(struct reloc_control *rc) 3549 { 3550 struct btrfs_trans_handle *trans; 3551 int ret; 3552 3553 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3554 BTRFS_BLOCK_RSV_TEMP); 3555 if (!rc->block_rsv) 3556 return -ENOMEM; 3557 3558 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3559 rc->search_start = rc->block_group->start; 3560 rc->extents_found = 0; 3561 rc->nodes_relocated = 0; 3562 rc->merging_rsv_size = 0; 3563 rc->reserved_bytes = 0; 3564 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3565 RELOCATION_RESERVED_NODES; 3566 ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, 3567 rc->block_rsv, rc->block_rsv->size, 3568 BTRFS_RESERVE_FLUSH_ALL); 3569 if (ret) 3570 return ret; 3571 3572 rc->create_reloc_tree = true; 3573 set_reloc_control(rc); 3574 3575 trans = btrfs_join_transaction(rc->extent_root); 3576 if (IS_ERR(trans)) { 3577 unset_reloc_control(rc); 3578 /* 3579 * extent tree is not a ref_cow tree and has no reloc_root to 3580 * cleanup. And callers are responsible to free the above 3581 * block rsv. 
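*
* unset_reloc_control() above has already undone the only remaining
* global state we had set up to this point.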
3582 */ 3583 return PTR_ERR(trans); 3584 } 3585 3586 ret = btrfs_commit_transaction(trans); 3587 if (ret) 3588 unset_reloc_control(rc); 3589 3590 return ret; 3591 } 3592 3593 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3594 { 3595 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3596 struct rb_root blocks = RB_ROOT; 3597 struct btrfs_key key; 3598 struct btrfs_trans_handle *trans = NULL; 3599 struct btrfs_path *path; 3600 struct btrfs_extent_item *ei; 3601 u64 flags; 3602 int ret; 3603 int err = 0; 3604 int progress = 0; 3605 3606 path = btrfs_alloc_path(); 3607 if (!path) 3608 return -ENOMEM; 3609 path->reada = READA_FORWARD; 3610 3611 ret = prepare_to_relocate(rc); 3612 if (ret) { 3613 err = ret; 3614 goto out_free; 3615 } 3616 3617 while (1) { 3618 rc->reserved_bytes = 0; 3619 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 3620 rc->block_rsv->size, 3621 BTRFS_RESERVE_FLUSH_ALL); 3622 if (ret) { 3623 err = ret; 3624 break; 3625 } 3626 progress++; 3627 trans = btrfs_start_transaction(rc->extent_root, 0); 3628 if (IS_ERR(trans)) { 3629 err = PTR_ERR(trans); 3630 trans = NULL; 3631 break; 3632 } 3633 restart: 3634 if (rc->backref_cache.last_trans != trans->transid) 3635 btrfs_backref_release_cache(&rc->backref_cache); 3636 rc->backref_cache.last_trans = trans->transid; 3637 3638 ret = find_next_extent(rc, path, &key); 3639 if (ret < 0) 3640 err = ret; 3641 if (ret != 0) 3642 break; 3643 3644 rc->extents_found++; 3645 3646 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3647 struct btrfs_extent_item); 3648 flags = btrfs_extent_flags(path->nodes[0], ei); 3649 3650 /* 3651 * If we are relocating a simple quota owned extent item, we 3652 * need to note the owner on the reloc data root so that when 3653 * we allocate the replacement item, we can attribute it to the 3654 * correct eventual owner (rather than the reloc data root). 
3655 */
3656 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
3657 struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
3658 u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
3659 path->nodes[0],
3660 path->slots[0]);
3661
3662 root->relocation_src_root = owning_root_id;
3663 }
3664
3665 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3666 ret = add_tree_block(rc, &key, path, &blocks);
3667 } else if (rc->stage == UPDATE_DATA_PTRS &&
3668 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3669 ret = add_data_references(rc, &key, path, &blocks);
3670 } else {
3671 btrfs_release_path(path);
3672 ret = 0;
3673 }
3674 if (ret < 0) {
3675 err = ret;
3676 break;
3677 }
3678
3679 if (!RB_EMPTY_ROOT(&blocks)) {
3680 ret = relocate_tree_blocks(trans, rc, &blocks);
3681 if (ret < 0) {
3682 if (ret != -EAGAIN) {
3683 err = ret;
3684 break;
3685 }
3686 rc->extents_found--;
3687 rc->search_start = key.objectid;
3688 }
3689 }
3690
3691 btrfs_end_transaction_throttle(trans);
3692 btrfs_btree_balance_dirty(fs_info);
3693 trans = NULL;
3694
3695 if (rc->stage == MOVE_DATA_EXTENTS &&
3696 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3697 rc->found_file_extent = true;
3698 ret = relocate_data_extent(rc, &key);
3699 if (ret < 0) {
3700 err = ret;
3701 break;
3702 }
3703 }
3704 if (btrfs_should_cancel_balance(fs_info)) {
3705 err = -ECANCELED;
3706 break;
3707 }
3708 }
3709 if (trans && progress && err == -ENOSPC) {
3710 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
3711 if (ret == 1) {
3712 err = 0;
3713 progress = 0;
3714 goto restart;
3715 }
3716 }
3717
3718 btrfs_release_path(path);
3719 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
3720
3721 if (trans) {
3722 btrfs_end_transaction_throttle(trans);
3723 btrfs_btree_balance_dirty(fs_info);
3724 }
3725
3726 if (!err) {
3727 ret = relocate_file_extent_cluster(rc);
3728 if (ret < 0)
3729 err = ret;
3730 }
3731
3732 rc->create_reloc_tree = false;
3733 set_reloc_control(rc);
3734
3735 btrfs_backref_release_cache(&rc->backref_cache);
3736 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3737
3738 /*
3739 * Even in the case when the relocation is cancelled, we should still go
3740 * through prepare_to_merge() and merge_reloc_roots().
3741 * 3742 * For error (including cancelled balance), prepare_to_merge() will 3743 * mark all reloc trees orphan, then queue them for cleanup in 3744 * merge_reloc_roots() 3745 */ 3746 err = prepare_to_merge(rc, err); 3747 3748 merge_reloc_roots(rc); 3749 3750 rc->merge_reloc_tree = false; 3751 unset_reloc_control(rc); 3752 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3753 3754 /* get rid of pinned extents */ 3755 trans = btrfs_join_transaction(rc->extent_root); 3756 if (IS_ERR(trans)) { 3757 err = PTR_ERR(trans); 3758 goto out_free; 3759 } 3760 ret = btrfs_commit_transaction(trans); 3761 if (ret && !err) 3762 err = ret; 3763 out_free: 3764 ret = clean_dirty_subvols(rc); 3765 if (ret < 0 && !err) 3766 err = ret; 3767 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3768 btrfs_free_path(path); 3769 return err; 3770 } 3771 3772 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3773 struct btrfs_root *root, u64 objectid) 3774 { 3775 struct btrfs_path *path; 3776 struct btrfs_inode_item *item; 3777 struct extent_buffer *leaf; 3778 int ret; 3779 3780 path = btrfs_alloc_path(); 3781 if (!path) 3782 return -ENOMEM; 3783 3784 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3785 if (ret) 3786 goto out; 3787 3788 leaf = path->nodes[0]; 3789 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3790 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3791 btrfs_set_inode_generation(leaf, item, 1); 3792 btrfs_set_inode_size(leaf, item, 0); 3793 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3794 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3795 BTRFS_INODE_PREALLOC); 3796 btrfs_mark_buffer_dirty(trans, leaf); 3797 out: 3798 btrfs_free_path(path); 3799 return ret; 3800 } 3801 3802 static void delete_orphan_inode(struct btrfs_trans_handle *trans, 3803 struct btrfs_root *root, u64 objectid) 3804 { 3805 struct btrfs_path *path; 3806 struct btrfs_key key; 3807 int ret = 0; 3808 3809 path = btrfs_alloc_path(); 3810 if (!path) { 3811 ret = -ENOMEM; 3812 goto out; 3813 } 3814 3815 key.objectid = objectid; 3816 key.type = BTRFS_INODE_ITEM_KEY; 3817 key.offset = 0; 3818 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3819 if (ret) { 3820 if (ret > 0) 3821 ret = -ENOENT; 3822 goto out; 3823 } 3824 ret = btrfs_del_item(trans, root, path); 3825 out: 3826 if (ret) 3827 btrfs_abort_transaction(trans, ret); 3828 btrfs_free_path(path); 3829 } 3830 3831 /* 3832 * helper to create inode for data relocation. 
3833 * the inode is in data relocation tree and its link count is 0 3834 */ 3835 static noinline_for_stack struct inode *create_reloc_inode( 3836 struct btrfs_fs_info *fs_info, 3837 const struct btrfs_block_group *group) 3838 { 3839 struct inode *inode = NULL; 3840 struct btrfs_trans_handle *trans; 3841 struct btrfs_root *root; 3842 u64 objectid; 3843 int ret = 0; 3844 3845 root = btrfs_grab_root(fs_info->data_reloc_root); 3846 trans = btrfs_start_transaction(root, 6); 3847 if (IS_ERR(trans)) { 3848 btrfs_put_root(root); 3849 return ERR_CAST(trans); 3850 } 3851 3852 ret = btrfs_get_free_objectid(root, &objectid); 3853 if (ret) 3854 goto out; 3855 3856 ret = __insert_orphan_inode(trans, root, objectid); 3857 if (ret) 3858 goto out; 3859 3860 inode = btrfs_iget(objectid, root); 3861 if (IS_ERR(inode)) { 3862 delete_orphan_inode(trans, root, objectid); 3863 ret = PTR_ERR(inode); 3864 inode = NULL; 3865 goto out; 3866 } 3867 BTRFS_I(inode)->reloc_block_group_start = group->start; 3868 3869 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 3870 out: 3871 btrfs_put_root(root); 3872 btrfs_end_transaction(trans); 3873 btrfs_btree_balance_dirty(fs_info); 3874 if (ret) { 3875 iput(inode); 3876 inode = ERR_PTR(ret); 3877 } 3878 return inode; 3879 } 3880 3881 /* 3882 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3883 * has been requested meanwhile and don't start in that case. 3884 * 3885 * Return: 3886 * 0 success 3887 * -EINPROGRESS operation is already in progress, that's probably a bug 3888 * -ECANCELED cancellation request was set before the operation started 3889 */ 3890 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3891 { 3892 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3893 /* This should not happen */ 3894 btrfs_err(fs_info, "reloc already running, cannot start"); 3895 return -EINPROGRESS; 3896 } 3897 3898 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3899 btrfs_info(fs_info, "chunk relocation canceled on start"); 3900 /* 3901 * On cancel, clear all requests but let the caller mark 3902 * the end after cleanup operations. 3903 */ 3904 atomic_set(&fs_info->reloc_cancel_req, 0); 3905 return -ECANCELED; 3906 } 3907 return 0; 3908 } 3909 3910 /* 3911 * Mark end of chunk relocation that is cancellable and wake any waiters. 
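* Pairs with reloc_chunk_start(); a cancel request that raced in after
* start is reported and then cleared.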
3912 */ 3913 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3914 { 3915 /* Requested after start, clear bit first so any waiters can continue */ 3916 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3917 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3918 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3919 atomic_set(&fs_info->reloc_cancel_req, 0); 3920 } 3921 3922 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3923 { 3924 struct reloc_control *rc; 3925 3926 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3927 if (!rc) 3928 return NULL; 3929 3930 INIT_LIST_HEAD(&rc->reloc_roots); 3931 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3932 btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); 3933 rc->reloc_root_tree.rb_root = RB_ROOT; 3934 spin_lock_init(&rc->reloc_root_tree.lock); 3935 extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); 3936 return rc; 3937 } 3938 3939 static void free_reloc_control(struct reloc_control *rc) 3940 { 3941 struct mapping_node *node, *tmp; 3942 3943 free_reloc_roots(&rc->reloc_roots); 3944 rbtree_postorder_for_each_entry_safe(node, tmp, 3945 &rc->reloc_root_tree.rb_root, rb_node) 3946 kfree(node); 3947 3948 kfree(rc); 3949 } 3950 3951 /* 3952 * Print the block group being relocated 3953 */ 3954 static void describe_relocation(struct btrfs_block_group *block_group) 3955 { 3956 char buf[128] = {'\0'}; 3957 3958 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3959 3960 btrfs_info(block_group->fs_info, "relocating block group %llu flags %s", 3961 block_group->start, buf); 3962 } 3963 3964 static const char *stage_to_string(enum reloc_stage stage) 3965 { 3966 if (stage == MOVE_DATA_EXTENTS) 3967 return "move data extents"; 3968 if (stage == UPDATE_DATA_PTRS) 3969 return "update data pointers"; 3970 return "unknown"; 3971 } 3972 3973 /* 3974 * function to relocate all extents in a block group. 3975 */ 3976 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) 3977 { 3978 struct btrfs_block_group *bg; 3979 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); 3980 struct reloc_control *rc; 3981 struct inode *inode; 3982 struct btrfs_path *path; 3983 int ret; 3984 int rw = 0; 3985 int err = 0; 3986 3987 /* 3988 * This only gets set if we had a half-deleted snapshot on mount. We 3989 * cannot allow relocation to start while we're still trying to clean up 3990 * these pending deletions. 3991 */ 3992 ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); 3993 if (ret) 3994 return ret; 3995 3996 /* We may have been woken up by close_ctree, so bail if we're closing. */ 3997 if (btrfs_fs_closing(fs_info)) 3998 return -EINTR; 3999 4000 bg = btrfs_lookup_block_group(fs_info, group_start); 4001 if (!bg) 4002 return -ENOENT; 4003 4004 /* 4005 * Relocation of a data block group creates ordered extents. Without 4006 * sb_start_write(), we can freeze the filesystem while unfinished 4007 * ordered extents are left. Such ordered extents can cause a deadlock 4008 * e.g. when syncfs() is waiting for their completion but they can't 4009 * finish because they block when joining a transaction, due to the 4010 * fact that the freeze locks are being held in write mode. 

/*
 * Relocate all extents in a block group.
 */
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
	struct btrfs_block_group *bg;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
	struct reloc_control *rc;
	struct inode *inode;
	struct btrfs_path *path;
	int ret;
	int rw = 0;
	int err = 0;

	/*
	 * This only gets set if we had a half-deleted snapshot on mount. We
	 * cannot allow relocation to start while we're still trying to clean up
	 * these pending deletions.
	 */
	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
	if (ret)
		return ret;

	/* We may have been woken up by close_ctree, so bail if we're closing. */
	if (btrfs_fs_closing(fs_info))
		return -EINTR;

	bg = btrfs_lookup_block_group(fs_info, group_start);
	if (!bg)
		return -ENOENT;

	/*
	 * Relocation of a data block group creates ordered extents. Without
	 * sb_start_write(), we can freeze the filesystem while unfinished
	 * ordered extents are left. Such ordered extents can cause a deadlock
	 * e.g. when syncfs() is waiting for their completion but they can't
	 * finish because they block when joining a transaction, due to the
	 * fact that the freeze locks are being held in write mode.
	 */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
		ASSERT(sb_write_started(fs_info->sb));

	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
		btrfs_put_block_group(bg);
		return -ETXTBSY;
	}

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		btrfs_put_block_group(bg);
		return -ENOMEM;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0) {
		err = ret;
		goto out_put_bg;
	}

	rc->extent_root = extent_root;
	rc->block_group = bg;

	ret = btrfs_inc_block_group_ro(rc->block_group, true);
	if (ret) {
		err = ret;
		goto out;
	}
	rw = 1;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(rc->block_group, path);
	btrfs_free_path(path);

	if (!IS_ERR(inode))
		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
	else
		ret = PTR_ERR(inode);

	if (ret && ret != -ENOENT) {
		err = ret;
		goto out;
	}

	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
	if (IS_ERR(rc->data_inode)) {
		err = PTR_ERR(rc->data_inode);
		rc->data_inode = NULL;
		goto out;
	}

	describe_relocation(rc->block_group);

	btrfs_wait_block_group_reservations(rc->block_group);
	btrfs_wait_nocow_writers(rc->block_group);
	btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group);

	ret = btrfs_zone_finish(rc->block_group);
	WARN_ON(ret && ret != -EAGAIN);
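
	/*
	 * Each pass of the loop below runs one relocation stage. The first
	 * pass moves the data extents; if any file extent was found, the
	 * stage is bumped to UPDATE_DATA_PTRS so the next pass rewrites the
	 * file extent pointers. The loop ends once a pass finds no extents.
	 */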
	while (1) {
		enum reloc_stage finishes_stage;

		mutex_lock(&fs_info->cleaner_mutex);
		ret = relocate_block_group(rc);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0)
			err = ret;

		finishes_stage = rc->stage;
		/*
		 * We may have gotten ENOSPC after we already dirtied some
		 * extents. If writeout happens while we're relocating a
		 * different block group we could end up hitting the
		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
		 * btrfs_reloc_cow_block. Make sure we write everything out
		 * properly so we don't trip over this problem, and then break
		 * out of the loop if we hit an error.
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		if (rc->extents_found == 0)
			break;

		btrfs_info(fs_info, "found %llu extents, stage: %s",
			   rc->extents_found, stage_to_string(finishes_stage));
	}

	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(rc->block_group->used > 0);
out:
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
out_put_bg:
	btrfs_put_block_group(bg);
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
	return err;
}

static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	btrfs_set_root_drop_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}
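
/*
 * Note: mark_garbage_root() above zeroes the drop progress and sets the
 * root's refcount to 0. As btrfs_recover_relocation() below shows, reloc
 * roots with zero refs are not merged back into an fs tree; they are queued
 * on rc->reloc_roots so the merge phase cleans them up instead.
 */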

/*
 * Recover relocation interrupted by a system crash.
 *
 * This function resumes merging reloc trees with corresponding fs trees,
 * which is important for keeping the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret2;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		ret = 0;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
		if (IS_ERR(reloc_root)) {
			ret = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT)
					goto out;
				ret = mark_garbage_root(reloc_root);
				if (ret < 0)
					goto out;
				ret = 0;
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0)
		goto out_end;

	rc->extent_root = btrfs_extent_root(fs_info, 0);

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			ret = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		ret = __add_reloc_root(reloc_root);
		ASSERT(ret != -EEXIST);
		if (ret) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	ret = btrfs_commit_transaction(trans);
	if (ret)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_clean;
	}
	ret = btrfs_commit_transaction(trans);
out_clean:
	ret2 = clean_dirty_subvols(rc);
	if (ret2 < 0 && !ret)
		ret = ret2;
out_unset:
	unset_reloc_control(rc);
out_end:
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (ret == 0) {
		/* Cleanup the orphan inode in the data relocation tree. */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		ret = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return ret;
}
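
/*
 * Worked example of the checksum remapping done by btrfs_reloc_clone_csums()
 * below (made-up numbers): with reloc_block_group_start == 1M and
 * ordered->file_offset == 64K, the old location is disk_bytenr == 1M + 64K.
 * A csum item found at sums->logical == 1M + 80K sits 16K into the extent.
 * If the new location is ordered->disk_bytenr == 5M, then
 *
 *   sums->logical = 5M + (1M + 80K) - (1M + 64K) = 5M + 16K
 *
 * i.e. the csum keeps its relative offset inside the extent while moving
 * from the old disk location to the new one.
 */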

/*
 * Helper to add ordered checksums for data relocation.
 *
 * Cloning the existing checksums properly handles the nodatasum extents.
 * It also saves the CPU time that would be spent recalculating the checksums.
 */
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
	LIST_HEAD(list);
	int ret;

	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
				      disk_bytenr + ordered->num_bytes - 1,
				      &list, false);
	if (ret < 0) {
		btrfs_mark_ordered_extent_error(ordered);
		return ret;
	}

	while (!list_empty(&list)) {
		struct btrfs_ordered_sum *sums =
			list_entry(list.next, struct btrfs_ordered_sum, list);

		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to, say, the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
		btrfs_add_ordered_sum(ordered, sums);
	}

	return 0;
}

int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  const struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		btrfs_backref_drop_node_buffer(node);
		atomic_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * Called before creating a snapshot. It calculates the metadata reservation
 * required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * Relocation is in the stage of merging trees. The space used by
	 * merging a reloc tree is twice the size of the relocated tree nodes
	 * in the worst case: half for COWing the reloc tree, half for COWing
	 * the fs tree. The space used by COWing the reloc tree will be freed
	 * after the tree is dropped. If we create a snapshot, COWing the fs
	 * tree may use more space than it frees, so we need to reserve extra
	 * space.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}
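
/*
 * Worked example of the reservation above (made-up numbers): with
 * rc->nodes_relocated == 1MiB, merging a reloc tree may need up to 2MiB in
 * the worst case: 1MiB to COW the reloc tree (given back once the reloc tree
 * is dropped) plus 1MiB to COW the fs tree. Creating a snapshot during the
 * merge adds one more tree to COW, so the snapshot reservation is grown by
 * another rc->nodes_relocated, i.e. 1MiB.
 */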

/*
 * Called after the snapshot is created. It migrates the block reservation
 * and creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(), we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	new_root->reloc_root = btrfs_grab_root(reloc_root);

	if (rc->create_reloc_tree)
		ret = clone_backref_node(trans, rc, root, reloc_root);
	return ret;
}

/*
 * Get the current bytenr for the block group which is being relocated.
 *
 * Return U64_MAX if no relocation is running.
 */
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
	u64 logical = U64_MAX;

	lockdep_assert_held(&fs_info->reloc_mutex);

	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
		logical = fs_info->reloc_ctl->block_group->start;
	return logical;
}
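
/*
 * Example only (hypothetical caller, not part of this file): the intended
 * calling convention for btrfs_get_reloc_bg_bytenr(), which must be called
 * with fs_info->reloc_mutex held. Returns true if the block group starting
 * at @bg_start is the one currently being relocated.
 */
#if 0
static bool example_block_group_is_being_relocated(struct btrfs_fs_info *fs_info,
						   u64 bg_start)
{
	u64 reloc_start;

	mutex_lock(&fs_info->reloc_mutex);
	reloc_start = btrfs_get_reloc_bg_bytenr(fs_info);
	mutex_unlock(&fs_info->reloc_mutex);

	/* U64_MAX means no relocation is currently running. */
	return reloc_start != U64_MAX && bg_start == reloc_start;
}
#endif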