1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2009 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include <linux/error-injection.h> 13 #include "ctree.h" 14 #include "disk-io.h" 15 #include "transaction.h" 16 #include "volumes.h" 17 #include "locking.h" 18 #include "btrfs_inode.h" 19 #include "async-thread.h" 20 #include "free-space-cache.h" 21 #include "qgroup.h" 22 #include "print-tree.h" 23 #include "delalloc-space.h" 24 #include "block-group.h" 25 #include "backref.h" 26 #include "misc.h" 27 #include "subpage.h" 28 #include "zoned.h" 29 #include "inode-item.h" 30 #include "space-info.h" 31 #include "fs.h" 32 #include "accessors.h" 33 #include "extent-tree.h" 34 #include "root-tree.h" 35 #include "file-item.h" 36 #include "relocation.h" 37 #include "super.h" 38 #include "tree-checker.h" 39 #include "raid-stripe-tree.h" 40 41 /* 42 * Relocation overview 43 * 44 * [What does relocation do] 45 * 46 * The objective of relocation is to relocate all extents of the target block 47 * group to other block groups. 48 * This is utilized by resize (shrink only), profile converting, compacting 49 * space, or balance routine to spread chunks over devices. 50 * 51 * Before | After 52 * ------------------------------------------------------------------ 53 * BG A: 10 data extents | BG A: deleted 54 * BG B: 2 data extents | BG B: 10 data extents (2 old + 8 relocated) 55 * BG C: 1 extents | BG C: 3 data extents (1 old + 2 relocated) 56 * 57 * [How does relocation work] 58 * 59 * 1. Mark the target block group read-only 60 * New extents won't be allocated from the target block group. 61 * 62 * 2.1 Record each extent in the target block group 63 * To build a proper map of extents to be relocated. 64 * 65 * 2.2 Build data reloc tree and reloc trees 66 * Data reloc tree will contain an inode, recording all newly relocated 67 * data extents. 68 * There will be only one data reloc tree for one data block group. 69 * 70 * Reloc tree will be a special snapshot of its source tree, containing 71 * relocated tree blocks. 72 * Each tree referring to a tree block in target block group will get its 73 * reloc tree built. 74 * 75 * 2.3 Swap source tree with its corresponding reloc tree 76 * Each involved tree only refers to new extents after swap. 77 * 78 * 3. Cleanup reloc trees and data reloc tree. 79 * As old extents in the target block group are still referenced by reloc 80 * trees, we need to clean them up before really freeing the target block 81 * group. 82 * 83 * The main complexity is in steps 2.2 and 2.3. 84 * 85 * The entry point of relocation is relocate_block_group() function. 86 */ 87 88 #define RELOCATION_RESERVED_NODES 256 89 /* 90 * map address of tree root to tree 91 */ 92 struct mapping_node { 93 struct { 94 struct rb_node rb_node; 95 u64 bytenr; 96 }; /* Use rb_simle_node for search/insert */ 97 void *data; 98 }; 99 100 struct mapping_tree { 101 struct rb_root rb_root; 102 spinlock_t lock; 103 }; 104 105 /* 106 * present a tree block to process 107 */ 108 struct tree_block { 109 struct { 110 struct rb_node rb_node; 111 u64 bytenr; 112 }; /* Use rb_simple_node for search/insert */ 113 u64 owner; 114 struct btrfs_key key; 115 u8 level; 116 bool key_ready; 117 }; 118 119 #define MAX_EXTENTS 128 120 121 struct file_extent_cluster { 122 u64 start; 123 u64 end; 124 u64 boundary[MAX_EXTENTS]; 125 unsigned int nr; 126 u64 owning_root; 127 }; 128 129 /* Stages of data relocation. */ 130 enum reloc_stage { 131 MOVE_DATA_EXTENTS, 132 UPDATE_DATA_PTRS 133 }; 134 135 struct reloc_control { 136 /* block group to relocate */ 137 struct btrfs_block_group *block_group; 138 /* extent tree */ 139 struct btrfs_root *extent_root; 140 /* inode for moving data */ 141 struct inode *data_inode; 142 143 struct btrfs_block_rsv *block_rsv; 144 145 struct btrfs_backref_cache backref_cache; 146 147 struct file_extent_cluster cluster; 148 /* tree blocks have been processed */ 149 struct extent_io_tree processed_blocks; 150 /* map start of tree root to corresponding reloc tree */ 151 struct mapping_tree reloc_root_tree; 152 /* list of reloc trees */ 153 struct list_head reloc_roots; 154 /* list of subvolume trees that get relocated */ 155 struct list_head dirty_subvol_roots; 156 /* size of metadata reservation for merging reloc trees */ 157 u64 merging_rsv_size; 158 /* size of relocated tree nodes */ 159 u64 nodes_relocated; 160 /* reserved size for block group relocation*/ 161 u64 reserved_bytes; 162 163 u64 search_start; 164 u64 extents_found; 165 166 enum reloc_stage stage; 167 bool create_reloc_tree; 168 bool merge_reloc_tree; 169 bool found_file_extent; 170 }; 171 172 static void mark_block_processed(struct reloc_control *rc, 173 struct btrfs_backref_node *node) 174 { 175 u32 blocksize; 176 177 if (node->level == 0 || 178 in_range(node->bytenr, rc->block_group->start, 179 rc->block_group->length)) { 180 blocksize = rc->extent_root->fs_info->nodesize; 181 set_extent_bit(&rc->processed_blocks, node->bytenr, 182 node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); 183 } 184 node->processed = 1; 185 } 186 187 /* 188 * walk up backref nodes until reach node presents tree root 189 */ 190 static struct btrfs_backref_node *walk_up_backref( 191 struct btrfs_backref_node *node, 192 struct btrfs_backref_edge *edges[], int *index) 193 { 194 struct btrfs_backref_edge *edge; 195 int idx = *index; 196 197 while (!list_empty(&node->upper)) { 198 edge = list_entry(node->upper.next, 199 struct btrfs_backref_edge, list[LOWER]); 200 edges[idx++] = edge; 201 node = edge->node[UPPER]; 202 } 203 BUG_ON(node->detached); 204 *index = idx; 205 return node; 206 } 207 208 /* 209 * walk down backref nodes to find start of next reference path 210 */ 211 static struct btrfs_backref_node *walk_down_backref( 212 struct btrfs_backref_edge *edges[], int *index) 213 { 214 struct btrfs_backref_edge *edge; 215 struct btrfs_backref_node *lower; 216 int idx = *index; 217 218 while (idx > 0) { 219 edge = edges[idx - 1]; 220 lower = edge->node[LOWER]; 221 if (list_is_last(&edge->list[LOWER], &lower->upper)) { 222 idx--; 223 continue; 224 } 225 edge = list_entry(edge->list[LOWER].next, 226 struct btrfs_backref_edge, list[LOWER]); 227 edges[idx - 1] = edge; 228 *index = idx; 229 return edge->node[UPPER]; 230 } 231 *index = 0; 232 return NULL; 233 } 234 235 static bool reloc_root_is_dead(const struct btrfs_root *root) 236 { 237 /* 238 * Pair with set_bit/clear_bit in clean_dirty_subvols and 239 * btrfs_update_reloc_root. We need to see the updated bit before 240 * trying to access reloc_root 241 */ 242 smp_rmb(); 243 if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) 244 return true; 245 return false; 246 } 247 248 /* 249 * Check if this subvolume tree has valid reloc tree. 250 * 251 * Reloc tree after swap is considered dead, thus not considered as valid. 252 * This is enough for most callers, as they don't distinguish dead reloc root 253 * from no reloc root. But btrfs_should_ignore_reloc_root() below is a 254 * special case. 255 */ 256 static bool have_reloc_root(const struct btrfs_root *root) 257 { 258 if (reloc_root_is_dead(root)) 259 return false; 260 if (!root->reloc_root) 261 return false; 262 return true; 263 } 264 265 bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root) 266 { 267 struct btrfs_root *reloc_root; 268 269 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 270 return false; 271 272 /* This root has been merged with its reloc tree, we can ignore it */ 273 if (reloc_root_is_dead(root)) 274 return true; 275 276 reloc_root = root->reloc_root; 277 if (!reloc_root) 278 return false; 279 280 if (btrfs_header_generation(reloc_root->commit_root) == 281 root->fs_info->running_transaction->transid) 282 return false; 283 /* 284 * If there is reloc tree and it was created in previous transaction 285 * backref lookup can find the reloc tree, so backref node for the fs 286 * tree root is useless for relocation. 287 */ 288 return true; 289 } 290 291 /* 292 * find reloc tree by address of tree root 293 */ 294 struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) 295 { 296 struct reloc_control *rc = fs_info->reloc_ctl; 297 struct rb_node *rb_node; 298 struct mapping_node *node; 299 struct btrfs_root *root = NULL; 300 301 ASSERT(rc); 302 spin_lock(&rc->reloc_root_tree.lock); 303 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); 304 if (rb_node) { 305 node = rb_entry(rb_node, struct mapping_node, rb_node); 306 root = node->data; 307 } 308 spin_unlock(&rc->reloc_root_tree.lock); 309 return btrfs_grab_root(root); 310 } 311 312 /* 313 * For useless nodes, do two major clean ups: 314 * 315 * - Cleanup the children edges and nodes 316 * If child node is also orphan (no parent) during cleanup, then the child 317 * node will also be cleaned up. 318 * 319 * - Freeing up leaves (level 0), keeps nodes detached 320 * For nodes, the node is still cached as "detached" 321 * 322 * Return false if @node is not in the @useless_nodes list. 323 * Return true if @node is in the @useless_nodes list. 324 */ 325 static bool handle_useless_nodes(struct reloc_control *rc, 326 struct btrfs_backref_node *node) 327 { 328 struct btrfs_backref_cache *cache = &rc->backref_cache; 329 struct list_head *useless_node = &cache->useless_node; 330 bool ret = false; 331 332 while (!list_empty(useless_node)) { 333 struct btrfs_backref_node *cur; 334 335 cur = list_first_entry(useless_node, struct btrfs_backref_node, 336 list); 337 list_del_init(&cur->list); 338 339 /* Only tree root nodes can be added to @useless_nodes */ 340 ASSERT(list_empty(&cur->upper)); 341 342 if (cur == node) 343 ret = true; 344 345 /* The node is the lowest node */ 346 if (cur->lowest) { 347 list_del_init(&cur->lower); 348 cur->lowest = 0; 349 } 350 351 /* Cleanup the lower edges */ 352 while (!list_empty(&cur->lower)) { 353 struct btrfs_backref_edge *edge; 354 struct btrfs_backref_node *lower; 355 356 edge = list_entry(cur->lower.next, 357 struct btrfs_backref_edge, list[UPPER]); 358 list_del(&edge->list[UPPER]); 359 list_del(&edge->list[LOWER]); 360 lower = edge->node[LOWER]; 361 btrfs_backref_free_edge(cache, edge); 362 363 /* Child node is also orphan, queue for cleanup */ 364 if (list_empty(&lower->upper)) 365 list_add(&lower->list, useless_node); 366 } 367 /* Mark this block processed for relocation */ 368 mark_block_processed(rc, cur); 369 370 /* 371 * Backref nodes for tree leaves are deleted from the cache. 372 * Backref nodes for upper level tree blocks are left in the 373 * cache to avoid unnecessary backref lookup. 374 */ 375 if (cur->level > 0) { 376 list_add(&cur->list, &cache->detached); 377 cur->detached = 1; 378 } else { 379 rb_erase(&cur->rb_node, &cache->rb_root); 380 btrfs_backref_free_node(cache, cur); 381 } 382 } 383 return ret; 384 } 385 386 /* 387 * Build backref tree for a given tree block. Root of the backref tree 388 * corresponds the tree block, leaves of the backref tree correspond roots of 389 * b-trees that reference the tree block. 390 * 391 * The basic idea of this function is check backrefs of a given block to find 392 * upper level blocks that reference the block, and then check backrefs of 393 * these upper level blocks recursively. The recursion stops when tree root is 394 * reached or backrefs for the block is cached. 395 * 396 * NOTE: if we find that backrefs for a block are cached, we know backrefs for 397 * all upper level blocks that directly/indirectly reference the block are also 398 * cached. 399 */ 400 static noinline_for_stack struct btrfs_backref_node *build_backref_tree( 401 struct btrfs_trans_handle *trans, 402 struct reloc_control *rc, struct btrfs_key *node_key, 403 int level, u64 bytenr) 404 { 405 struct btrfs_backref_iter *iter; 406 struct btrfs_backref_cache *cache = &rc->backref_cache; 407 /* For searching parent of TREE_BLOCK_REF */ 408 struct btrfs_path *path; 409 struct btrfs_backref_node *cur; 410 struct btrfs_backref_node *node = NULL; 411 struct btrfs_backref_edge *edge; 412 int ret; 413 414 iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); 415 if (!iter) 416 return ERR_PTR(-ENOMEM); 417 path = btrfs_alloc_path(); 418 if (!path) { 419 ret = -ENOMEM; 420 goto out; 421 } 422 423 node = btrfs_backref_alloc_node(cache, bytenr, level); 424 if (!node) { 425 ret = -ENOMEM; 426 goto out; 427 } 428 429 node->lowest = 1; 430 cur = node; 431 432 /* Breadth-first search to build backref cache */ 433 do { 434 ret = btrfs_backref_add_tree_node(trans, cache, path, iter, 435 node_key, cur); 436 if (ret < 0) 437 goto out; 438 439 edge = list_first_entry_or_null(&cache->pending_edge, 440 struct btrfs_backref_edge, list[UPPER]); 441 /* 442 * The pending list isn't empty, take the first block to 443 * process 444 */ 445 if (edge) { 446 list_del_init(&edge->list[UPPER]); 447 cur = edge->node[UPPER]; 448 } 449 } while (edge); 450 451 /* Finish the upper linkage of newly added edges/nodes */ 452 ret = btrfs_backref_finish_upper_links(cache, node); 453 if (ret < 0) 454 goto out; 455 456 if (handle_useless_nodes(rc, node)) 457 node = NULL; 458 out: 459 btrfs_free_path(iter->path); 460 kfree(iter); 461 btrfs_free_path(path); 462 if (ret) { 463 btrfs_backref_error_cleanup(cache, node); 464 return ERR_PTR(ret); 465 } 466 ASSERT(!node || !node->detached); 467 ASSERT(list_empty(&cache->useless_node) && 468 list_empty(&cache->pending_edge)); 469 return node; 470 } 471 472 /* 473 * helper to add backref node for the newly created snapshot. 474 * the backref node is created by cloning backref node that 475 * corresponds to root of source tree 476 */ 477 static int clone_backref_node(struct btrfs_trans_handle *trans, 478 struct reloc_control *rc, 479 const struct btrfs_root *src, 480 struct btrfs_root *dest) 481 { 482 struct btrfs_root *reloc_root = src->reloc_root; 483 struct btrfs_backref_cache *cache = &rc->backref_cache; 484 struct btrfs_backref_node *node = NULL; 485 struct btrfs_backref_node *new_node; 486 struct btrfs_backref_edge *edge; 487 struct btrfs_backref_edge *new_edge; 488 struct rb_node *rb_node; 489 490 rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); 491 if (rb_node) { 492 node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); 493 if (node->detached) 494 node = NULL; 495 else 496 BUG_ON(node->new_bytenr != reloc_root->node->start); 497 } 498 499 if (!node) { 500 rb_node = rb_simple_search(&cache->rb_root, 501 reloc_root->commit_root->start); 502 if (rb_node) { 503 node = rb_entry(rb_node, struct btrfs_backref_node, 504 rb_node); 505 BUG_ON(node->detached); 506 } 507 } 508 509 if (!node) 510 return 0; 511 512 new_node = btrfs_backref_alloc_node(cache, dest->node->start, 513 node->level); 514 if (!new_node) 515 return -ENOMEM; 516 517 new_node->lowest = node->lowest; 518 new_node->checked = 1; 519 new_node->root = btrfs_grab_root(dest); 520 ASSERT(new_node->root); 521 522 if (!node->lowest) { 523 list_for_each_entry(edge, &node->lower, list[UPPER]) { 524 new_edge = btrfs_backref_alloc_edge(cache); 525 if (!new_edge) 526 goto fail; 527 528 btrfs_backref_link_edge(new_edge, edge->node[LOWER], 529 new_node, LINK_UPPER); 530 } 531 } else { 532 list_add_tail(&new_node->lower, &cache->leaves); 533 } 534 535 rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, 536 &new_node->rb_node); 537 if (rb_node) 538 btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); 539 540 if (!new_node->lowest) { 541 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { 542 list_add_tail(&new_edge->list[LOWER], 543 &new_edge->node[LOWER]->upper); 544 } 545 } 546 return 0; 547 fail: 548 while (!list_empty(&new_node->lower)) { 549 new_edge = list_entry(new_node->lower.next, 550 struct btrfs_backref_edge, list[UPPER]); 551 list_del(&new_edge->list[UPPER]); 552 btrfs_backref_free_edge(cache, new_edge); 553 } 554 btrfs_backref_free_node(cache, new_node); 555 return -ENOMEM; 556 } 557 558 /* 559 * helper to add 'address of tree root -> reloc tree' mapping 560 */ 561 static int __add_reloc_root(struct btrfs_root *root) 562 { 563 struct btrfs_fs_info *fs_info = root->fs_info; 564 struct rb_node *rb_node; 565 struct mapping_node *node; 566 struct reloc_control *rc = fs_info->reloc_ctl; 567 568 node = kmalloc(sizeof(*node), GFP_NOFS); 569 if (!node) 570 return -ENOMEM; 571 572 node->bytenr = root->commit_root->start; 573 node->data = root; 574 575 spin_lock(&rc->reloc_root_tree.lock); 576 rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, 577 node->bytenr, &node->rb_node); 578 spin_unlock(&rc->reloc_root_tree.lock); 579 if (rb_node) { 580 btrfs_err(fs_info, 581 "Duplicate root found for start=%llu while inserting into relocation tree", 582 node->bytenr); 583 return -EEXIST; 584 } 585 586 list_add_tail(&root->root_list, &rc->reloc_roots); 587 return 0; 588 } 589 590 /* 591 * helper to delete the 'address of tree root -> reloc tree' 592 * mapping 593 */ 594 static void __del_reloc_root(struct btrfs_root *root) 595 { 596 struct btrfs_fs_info *fs_info = root->fs_info; 597 struct rb_node *rb_node; 598 struct mapping_node *node = NULL; 599 struct reloc_control *rc = fs_info->reloc_ctl; 600 bool put_ref = false; 601 602 if (rc && root->node) { 603 spin_lock(&rc->reloc_root_tree.lock); 604 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, 605 root->commit_root->start); 606 if (rb_node) { 607 node = rb_entry(rb_node, struct mapping_node, rb_node); 608 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 609 RB_CLEAR_NODE(&node->rb_node); 610 } 611 spin_unlock(&rc->reloc_root_tree.lock); 612 ASSERT(!node || (struct btrfs_root *)node->data == root); 613 } 614 615 /* 616 * We only put the reloc root here if it's on the list. There's a lot 617 * of places where the pattern is to splice the rc->reloc_roots, process 618 * the reloc roots, and then add the reloc root back onto 619 * rc->reloc_roots. If we call __del_reloc_root while it's off of the 620 * list we don't want the reference being dropped, because the guy 621 * messing with the list is in charge of the reference. 622 */ 623 spin_lock(&fs_info->trans_lock); 624 if (!list_empty(&root->root_list)) { 625 put_ref = true; 626 list_del_init(&root->root_list); 627 } 628 spin_unlock(&fs_info->trans_lock); 629 if (put_ref) 630 btrfs_put_root(root); 631 kfree(node); 632 } 633 634 /* 635 * helper to update the 'address of tree root -> reloc tree' 636 * mapping 637 */ 638 static int __update_reloc_root(struct btrfs_root *root) 639 { 640 struct btrfs_fs_info *fs_info = root->fs_info; 641 struct rb_node *rb_node; 642 struct mapping_node *node = NULL; 643 struct reloc_control *rc = fs_info->reloc_ctl; 644 645 spin_lock(&rc->reloc_root_tree.lock); 646 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, 647 root->commit_root->start); 648 if (rb_node) { 649 node = rb_entry(rb_node, struct mapping_node, rb_node); 650 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 651 } 652 spin_unlock(&rc->reloc_root_tree.lock); 653 654 if (!node) 655 return 0; 656 BUG_ON((struct btrfs_root *)node->data != root); 657 658 spin_lock(&rc->reloc_root_tree.lock); 659 node->bytenr = root->node->start; 660 rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, 661 node->bytenr, &node->rb_node); 662 spin_unlock(&rc->reloc_root_tree.lock); 663 if (rb_node) 664 btrfs_backref_panic(fs_info, node->bytenr, -EEXIST); 665 return 0; 666 } 667 668 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, 669 struct btrfs_root *root, u64 objectid) 670 { 671 struct btrfs_fs_info *fs_info = root->fs_info; 672 struct btrfs_root *reloc_root; 673 struct extent_buffer *eb; 674 struct btrfs_root_item *root_item; 675 struct btrfs_key root_key; 676 int ret = 0; 677 bool must_abort = false; 678 679 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 680 if (!root_item) 681 return ERR_PTR(-ENOMEM); 682 683 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 684 root_key.type = BTRFS_ROOT_ITEM_KEY; 685 root_key.offset = objectid; 686 687 if (btrfs_root_id(root) == objectid) { 688 u64 commit_root_gen; 689 690 /* called by btrfs_init_reloc_root */ 691 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 692 BTRFS_TREE_RELOC_OBJECTID); 693 if (ret) 694 goto fail; 695 696 /* 697 * Set the last_snapshot field to the generation of the commit 698 * root - like this ctree.c:btrfs_block_can_be_shared() behaves 699 * correctly (returns true) when the relocation root is created 700 * either inside the critical section of a transaction commit 701 * (through transaction.c:qgroup_account_snapshot()) and when 702 * it's created before the transaction commit is started. 703 */ 704 commit_root_gen = btrfs_header_generation(root->commit_root); 705 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen); 706 } else { 707 /* 708 * called by btrfs_reloc_post_snapshot_hook. 709 * the source tree is a reloc tree, all tree blocks 710 * modified after it was created have RELOC flag 711 * set in their headers. so it's OK to not update 712 * the 'last_snapshot'. 713 */ 714 ret = btrfs_copy_root(trans, root, root->node, &eb, 715 BTRFS_TREE_RELOC_OBJECTID); 716 if (ret) 717 goto fail; 718 } 719 720 /* 721 * We have changed references at this point, we must abort the 722 * transaction if anything fails. 723 */ 724 must_abort = true; 725 726 memcpy(root_item, &root->root_item, sizeof(*root_item)); 727 btrfs_set_root_bytenr(root_item, eb->start); 728 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 729 btrfs_set_root_generation(root_item, trans->transid); 730 731 if (btrfs_root_id(root) == objectid) { 732 btrfs_set_root_refs(root_item, 0); 733 memset(&root_item->drop_progress, 0, 734 sizeof(struct btrfs_disk_key)); 735 btrfs_set_root_drop_level(root_item, 0); 736 } 737 738 btrfs_tree_unlock(eb); 739 free_extent_buffer(eb); 740 741 ret = btrfs_insert_root(trans, fs_info->tree_root, 742 &root_key, root_item); 743 if (ret) 744 goto fail; 745 746 kfree(root_item); 747 748 reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); 749 if (IS_ERR(reloc_root)) { 750 ret = PTR_ERR(reloc_root); 751 goto abort; 752 } 753 set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); 754 btrfs_set_root_last_trans(reloc_root, trans->transid); 755 return reloc_root; 756 fail: 757 kfree(root_item); 758 abort: 759 if (must_abort) 760 btrfs_abort_transaction(trans, ret); 761 return ERR_PTR(ret); 762 } 763 764 /* 765 * create reloc tree for a given fs tree. reloc tree is just a 766 * snapshot of the fs tree with special root objectid. 767 * 768 * The reloc_root comes out of here with two references, one for 769 * root->reloc_root, and another for being on the rc->reloc_roots list. 770 */ 771 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, 772 struct btrfs_root *root) 773 { 774 struct btrfs_fs_info *fs_info = root->fs_info; 775 struct btrfs_root *reloc_root; 776 struct reloc_control *rc = fs_info->reloc_ctl; 777 struct btrfs_block_rsv *rsv; 778 int clear_rsv = 0; 779 int ret; 780 781 if (!rc) 782 return 0; 783 784 /* 785 * The subvolume has reloc tree but the swap is finished, no need to 786 * create/update the dead reloc tree 787 */ 788 if (reloc_root_is_dead(root)) 789 return 0; 790 791 /* 792 * This is subtle but important. We do not do 793 * record_root_in_transaction for reloc roots, instead we record their 794 * corresponding fs root, and then here we update the last trans for the 795 * reloc root. This means that we have to do this for the entire life 796 * of the reloc root, regardless of which stage of the relocation we are 797 * in. 798 */ 799 if (root->reloc_root) { 800 reloc_root = root->reloc_root; 801 btrfs_set_root_last_trans(reloc_root, trans->transid); 802 return 0; 803 } 804 805 /* 806 * We are merging reloc roots, we do not need new reloc trees. Also 807 * reloc trees never need their own reloc tree. 808 */ 809 if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) 810 return 0; 811 812 if (!trans->reloc_reserved) { 813 rsv = trans->block_rsv; 814 trans->block_rsv = rc->block_rsv; 815 clear_rsv = 1; 816 } 817 reloc_root = create_reloc_root(trans, root, btrfs_root_id(root)); 818 if (clear_rsv) 819 trans->block_rsv = rsv; 820 if (IS_ERR(reloc_root)) 821 return PTR_ERR(reloc_root); 822 823 ret = __add_reloc_root(reloc_root); 824 ASSERT(ret != -EEXIST); 825 if (ret) { 826 /* Pairs with create_reloc_root */ 827 btrfs_put_root(reloc_root); 828 return ret; 829 } 830 root->reloc_root = btrfs_grab_root(reloc_root); 831 return 0; 832 } 833 834 /* 835 * update root item of reloc tree 836 */ 837 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, 838 struct btrfs_root *root) 839 { 840 struct btrfs_fs_info *fs_info = root->fs_info; 841 struct btrfs_root *reloc_root; 842 struct btrfs_root_item *root_item; 843 int ret; 844 845 if (!have_reloc_root(root)) 846 return 0; 847 848 reloc_root = root->reloc_root; 849 root_item = &reloc_root->root_item; 850 851 /* 852 * We are probably ok here, but __del_reloc_root() will drop its ref of 853 * the root. We have the ref for root->reloc_root, but just in case 854 * hold it while we update the reloc root. 855 */ 856 btrfs_grab_root(reloc_root); 857 858 /* root->reloc_root will stay until current relocation finished */ 859 if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree && 860 btrfs_root_refs(root_item) == 0) { 861 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 862 /* 863 * Mark the tree as dead before we change reloc_root so 864 * have_reloc_root will not touch it from now on. 865 */ 866 smp_wmb(); 867 __del_reloc_root(reloc_root); 868 } 869 870 if (reloc_root->commit_root != reloc_root->node) { 871 __update_reloc_root(reloc_root); 872 btrfs_set_root_node(root_item, reloc_root->node); 873 free_extent_buffer(reloc_root->commit_root); 874 reloc_root->commit_root = btrfs_root_node(reloc_root); 875 } 876 877 ret = btrfs_update_root(trans, fs_info->tree_root, 878 &reloc_root->root_key, root_item); 879 btrfs_put_root(reloc_root); 880 return ret; 881 } 882 883 /* 884 * get new location of data 885 */ 886 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, 887 u64 bytenr, u64 num_bytes) 888 { 889 struct btrfs_root *root = BTRFS_I(reloc_inode)->root; 890 struct btrfs_path *path; 891 struct btrfs_file_extent_item *fi; 892 struct extent_buffer *leaf; 893 int ret; 894 895 path = btrfs_alloc_path(); 896 if (!path) 897 return -ENOMEM; 898 899 bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start; 900 ret = btrfs_lookup_file_extent(NULL, root, path, 901 btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0); 902 if (ret < 0) 903 goto out; 904 if (ret > 0) { 905 ret = -ENOENT; 906 goto out; 907 } 908 909 leaf = path->nodes[0]; 910 fi = btrfs_item_ptr(leaf, path->slots[0], 911 struct btrfs_file_extent_item); 912 913 BUG_ON(btrfs_file_extent_offset(leaf, fi) || 914 btrfs_file_extent_compression(leaf, fi) || 915 btrfs_file_extent_encryption(leaf, fi) || 916 btrfs_file_extent_other_encoding(leaf, fi)); 917 918 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { 919 ret = -EINVAL; 920 goto out; 921 } 922 923 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 924 ret = 0; 925 out: 926 btrfs_free_path(path); 927 return ret; 928 } 929 930 /* 931 * update file extent items in the tree leaf to point to 932 * the new locations. 933 */ 934 static noinline_for_stack 935 int replace_file_extents(struct btrfs_trans_handle *trans, 936 struct reloc_control *rc, 937 struct btrfs_root *root, 938 struct extent_buffer *leaf) 939 { 940 struct btrfs_fs_info *fs_info = root->fs_info; 941 struct btrfs_key key; 942 struct btrfs_file_extent_item *fi; 943 struct btrfs_inode *inode = NULL; 944 u64 parent; 945 u64 bytenr; 946 u64 new_bytenr = 0; 947 u64 num_bytes; 948 u64 end; 949 u32 nritems; 950 u32 i; 951 int ret = 0; 952 int first = 1; 953 int dirty = 0; 954 955 if (rc->stage != UPDATE_DATA_PTRS) 956 return 0; 957 958 /* reloc trees always use full backref */ 959 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) 960 parent = leaf->start; 961 else 962 parent = 0; 963 964 nritems = btrfs_header_nritems(leaf); 965 for (i = 0; i < nritems; i++) { 966 struct btrfs_ref ref = { 0 }; 967 968 cond_resched(); 969 btrfs_item_key_to_cpu(leaf, &key, i); 970 if (key.type != BTRFS_EXTENT_DATA_KEY) 971 continue; 972 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 973 if (btrfs_file_extent_type(leaf, fi) == 974 BTRFS_FILE_EXTENT_INLINE) 975 continue; 976 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 977 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); 978 if (bytenr == 0) 979 continue; 980 if (!in_range(bytenr, rc->block_group->start, 981 rc->block_group->length)) 982 continue; 983 984 /* 985 * if we are modifying block in fs tree, wait for read_folio 986 * to complete and drop the extent cache 987 */ 988 if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) { 989 if (first) { 990 inode = btrfs_find_first_inode(root, key.objectid); 991 first = 0; 992 } else if (inode && btrfs_ino(inode) < key.objectid) { 993 btrfs_add_delayed_iput(inode); 994 inode = btrfs_find_first_inode(root, key.objectid); 995 } 996 if (inode && btrfs_ino(inode) == key.objectid) { 997 struct extent_state *cached_state = NULL; 998 999 end = key.offset + 1000 btrfs_file_extent_num_bytes(leaf, fi); 1001 WARN_ON(!IS_ALIGNED(key.offset, 1002 fs_info->sectorsize)); 1003 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 1004 end--; 1005 /* Take mmap lock to serialize with reflinks. */ 1006 if (!down_read_trylock(&inode->i_mmap_lock)) 1007 continue; 1008 ret = try_lock_extent(&inode->io_tree, key.offset, 1009 end, &cached_state); 1010 if (!ret) { 1011 up_read(&inode->i_mmap_lock); 1012 continue; 1013 } 1014 1015 btrfs_drop_extent_map_range(inode, key.offset, end, true); 1016 unlock_extent(&inode->io_tree, key.offset, end, 1017 &cached_state); 1018 up_read(&inode->i_mmap_lock); 1019 } 1020 } 1021 1022 ret = get_new_location(rc->data_inode, &new_bytenr, 1023 bytenr, num_bytes); 1024 if (ret) { 1025 /* 1026 * Don't have to abort since we've not changed anything 1027 * in the file extent yet. 1028 */ 1029 break; 1030 } 1031 1032 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1033 dirty = 1; 1034 1035 key.offset -= btrfs_file_extent_offset(leaf, fi); 1036 ref.action = BTRFS_ADD_DELAYED_REF; 1037 ref.bytenr = new_bytenr; 1038 ref.num_bytes = num_bytes; 1039 ref.parent = parent; 1040 ref.owning_root = btrfs_root_id(root); 1041 ref.ref_root = btrfs_header_owner(leaf); 1042 btrfs_init_data_ref(&ref, key.objectid, key.offset, 1043 btrfs_root_id(root), false); 1044 ret = btrfs_inc_extent_ref(trans, &ref); 1045 if (ret) { 1046 btrfs_abort_transaction(trans, ret); 1047 break; 1048 } 1049 1050 ref.action = BTRFS_DROP_DELAYED_REF; 1051 ref.bytenr = bytenr; 1052 ref.num_bytes = num_bytes; 1053 ref.parent = parent; 1054 ref.owning_root = btrfs_root_id(root); 1055 ref.ref_root = btrfs_header_owner(leaf); 1056 btrfs_init_data_ref(&ref, key.objectid, key.offset, 1057 btrfs_root_id(root), false); 1058 ret = btrfs_free_extent(trans, &ref); 1059 if (ret) { 1060 btrfs_abort_transaction(trans, ret); 1061 break; 1062 } 1063 } 1064 if (dirty) 1065 btrfs_mark_buffer_dirty(trans, leaf); 1066 if (inode) 1067 btrfs_add_delayed_iput(inode); 1068 return ret; 1069 } 1070 1071 static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb, 1072 int slot, const struct btrfs_path *path, 1073 int level) 1074 { 1075 struct btrfs_disk_key key1; 1076 struct btrfs_disk_key key2; 1077 btrfs_node_key(eb, &key1, slot); 1078 btrfs_node_key(path->nodes[level], &key2, path->slots[level]); 1079 return memcmp(&key1, &key2, sizeof(key1)); 1080 } 1081 1082 /* 1083 * try to replace tree blocks in fs tree with the new blocks 1084 * in reloc tree. tree blocks haven't been modified since the 1085 * reloc tree was create can be replaced. 1086 * 1087 * if a block was replaced, level of the block + 1 is returned. 1088 * if no block got replaced, 0 is returned. if there are other 1089 * errors, a negative error number is returned. 1090 */ 1091 static noinline_for_stack 1092 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, 1093 struct btrfs_root *dest, struct btrfs_root *src, 1094 struct btrfs_path *path, struct btrfs_key *next_key, 1095 int lowest_level, int max_level) 1096 { 1097 struct btrfs_fs_info *fs_info = dest->fs_info; 1098 struct extent_buffer *eb; 1099 struct extent_buffer *parent; 1100 struct btrfs_ref ref = { 0 }; 1101 struct btrfs_key key; 1102 u64 old_bytenr; 1103 u64 new_bytenr; 1104 u64 old_ptr_gen; 1105 u64 new_ptr_gen; 1106 u64 last_snapshot; 1107 u32 blocksize; 1108 int cow = 0; 1109 int level; 1110 int ret; 1111 int slot; 1112 1113 ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID); 1114 ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID); 1115 1116 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1117 again: 1118 slot = path->slots[lowest_level]; 1119 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1120 1121 eb = btrfs_lock_root_node(dest); 1122 level = btrfs_header_level(eb); 1123 1124 if (level < lowest_level) { 1125 btrfs_tree_unlock(eb); 1126 free_extent_buffer(eb); 1127 return 0; 1128 } 1129 1130 if (cow) { 1131 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, 1132 BTRFS_NESTING_COW); 1133 if (ret) { 1134 btrfs_tree_unlock(eb); 1135 free_extent_buffer(eb); 1136 return ret; 1137 } 1138 } 1139 1140 if (next_key) { 1141 next_key->objectid = (u64)-1; 1142 next_key->type = (u8)-1; 1143 next_key->offset = (u64)-1; 1144 } 1145 1146 parent = eb; 1147 while (1) { 1148 level = btrfs_header_level(parent); 1149 ASSERT(level >= lowest_level); 1150 1151 ret = btrfs_bin_search(parent, 0, &key, &slot); 1152 if (ret < 0) 1153 break; 1154 if (ret && slot > 0) 1155 slot--; 1156 1157 if (next_key && slot + 1 < btrfs_header_nritems(parent)) 1158 btrfs_node_key_to_cpu(parent, next_key, slot + 1); 1159 1160 old_bytenr = btrfs_node_blockptr(parent, slot); 1161 blocksize = fs_info->nodesize; 1162 old_ptr_gen = btrfs_node_ptr_generation(parent, slot); 1163 1164 if (level <= max_level) { 1165 eb = path->nodes[level]; 1166 new_bytenr = btrfs_node_blockptr(eb, 1167 path->slots[level]); 1168 new_ptr_gen = btrfs_node_ptr_generation(eb, 1169 path->slots[level]); 1170 } else { 1171 new_bytenr = 0; 1172 new_ptr_gen = 0; 1173 } 1174 1175 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) { 1176 ret = level; 1177 break; 1178 } 1179 1180 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1181 memcmp_node_keys(parent, slot, path, level)) { 1182 if (level <= lowest_level) { 1183 ret = 0; 1184 break; 1185 } 1186 1187 eb = btrfs_read_node_slot(parent, slot); 1188 if (IS_ERR(eb)) { 1189 ret = PTR_ERR(eb); 1190 break; 1191 } 1192 btrfs_tree_lock(eb); 1193 if (cow) { 1194 ret = btrfs_cow_block(trans, dest, eb, parent, 1195 slot, &eb, 1196 BTRFS_NESTING_COW); 1197 if (ret) { 1198 btrfs_tree_unlock(eb); 1199 free_extent_buffer(eb); 1200 break; 1201 } 1202 } 1203 1204 btrfs_tree_unlock(parent); 1205 free_extent_buffer(parent); 1206 1207 parent = eb; 1208 continue; 1209 } 1210 1211 if (!cow) { 1212 btrfs_tree_unlock(parent); 1213 free_extent_buffer(parent); 1214 cow = 1; 1215 goto again; 1216 } 1217 1218 btrfs_node_key_to_cpu(path->nodes[level], &key, 1219 path->slots[level]); 1220 btrfs_release_path(path); 1221 1222 path->lowest_level = level; 1223 set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state); 1224 ret = btrfs_search_slot(trans, src, &key, path, 0, 1); 1225 clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state); 1226 path->lowest_level = 0; 1227 if (ret) { 1228 if (ret > 0) 1229 ret = -ENOENT; 1230 break; 1231 } 1232 1233 /* 1234 * Info qgroup to trace both subtrees. 1235 * 1236 * We must trace both trees. 1237 * 1) Tree reloc subtree 1238 * If not traced, we will leak data numbers 1239 * 2) Fs subtree 1240 * If not traced, we will double count old data 1241 * 1242 * We don't scan the subtree right now, but only record 1243 * the swapped tree blocks. 1244 * The real subtree rescan is delayed until we have new 1245 * CoW on the subtree root node before transaction commit. 1246 */ 1247 ret = btrfs_qgroup_add_swapped_blocks(dest, 1248 rc->block_group, parent, slot, 1249 path->nodes[level], path->slots[level], 1250 last_snapshot); 1251 if (ret < 0) 1252 break; 1253 /* 1254 * swap blocks in fs tree and reloc tree. 1255 */ 1256 btrfs_set_node_blockptr(parent, slot, new_bytenr); 1257 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); 1258 btrfs_mark_buffer_dirty(trans, parent); 1259 1260 btrfs_set_node_blockptr(path->nodes[level], 1261 path->slots[level], old_bytenr); 1262 btrfs_set_node_ptr_generation(path->nodes[level], 1263 path->slots[level], old_ptr_gen); 1264 btrfs_mark_buffer_dirty(trans, path->nodes[level]); 1265 1266 ref.action = BTRFS_ADD_DELAYED_REF; 1267 ref.bytenr = old_bytenr; 1268 ref.num_bytes = blocksize; 1269 ref.parent = path->nodes[level]->start; 1270 ref.owning_root = btrfs_root_id(src); 1271 ref.ref_root = btrfs_root_id(src); 1272 btrfs_init_tree_ref(&ref, level - 1, 0, true); 1273 ret = btrfs_inc_extent_ref(trans, &ref); 1274 if (ret) { 1275 btrfs_abort_transaction(trans, ret); 1276 break; 1277 } 1278 1279 ref.action = BTRFS_ADD_DELAYED_REF; 1280 ref.bytenr = new_bytenr; 1281 ref.num_bytes = blocksize; 1282 ref.parent = 0; 1283 ref.owning_root = btrfs_root_id(dest); 1284 ref.ref_root = btrfs_root_id(dest); 1285 btrfs_init_tree_ref(&ref, level - 1, 0, true); 1286 ret = btrfs_inc_extent_ref(trans, &ref); 1287 if (ret) { 1288 btrfs_abort_transaction(trans, ret); 1289 break; 1290 } 1291 1292 /* We don't know the real owning_root, use 0. */ 1293 ref.action = BTRFS_DROP_DELAYED_REF; 1294 ref.bytenr = new_bytenr; 1295 ref.num_bytes = blocksize; 1296 ref.parent = path->nodes[level]->start; 1297 ref.owning_root = 0; 1298 ref.ref_root = btrfs_root_id(src); 1299 btrfs_init_tree_ref(&ref, level - 1, 0, true); 1300 ret = btrfs_free_extent(trans, &ref); 1301 if (ret) { 1302 btrfs_abort_transaction(trans, ret); 1303 break; 1304 } 1305 1306 /* We don't know the real owning_root, use 0. */ 1307 ref.action = BTRFS_DROP_DELAYED_REF; 1308 ref.bytenr = old_bytenr; 1309 ref.num_bytes = blocksize; 1310 ref.parent = 0; 1311 ref.owning_root = 0; 1312 ref.ref_root = btrfs_root_id(dest); 1313 btrfs_init_tree_ref(&ref, level - 1, 0, true); 1314 ret = btrfs_free_extent(trans, &ref); 1315 if (ret) { 1316 btrfs_abort_transaction(trans, ret); 1317 break; 1318 } 1319 1320 btrfs_unlock_up_safe(path, 0); 1321 1322 ret = level; 1323 break; 1324 } 1325 btrfs_tree_unlock(parent); 1326 free_extent_buffer(parent); 1327 return ret; 1328 } 1329 1330 /* 1331 * helper to find next relocated block in reloc tree 1332 */ 1333 static noinline_for_stack 1334 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 1335 int *level) 1336 { 1337 struct extent_buffer *eb; 1338 int i; 1339 u64 last_snapshot; 1340 u32 nritems; 1341 1342 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1343 1344 for (i = 0; i < *level; i++) { 1345 free_extent_buffer(path->nodes[i]); 1346 path->nodes[i] = NULL; 1347 } 1348 1349 for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 1350 eb = path->nodes[i]; 1351 nritems = btrfs_header_nritems(eb); 1352 while (path->slots[i] + 1 < nritems) { 1353 path->slots[i]++; 1354 if (btrfs_node_ptr_generation(eb, path->slots[i]) <= 1355 last_snapshot) 1356 continue; 1357 1358 *level = i; 1359 return 0; 1360 } 1361 free_extent_buffer(path->nodes[i]); 1362 path->nodes[i] = NULL; 1363 } 1364 return 1; 1365 } 1366 1367 /* 1368 * walk down reloc tree to find relocated block of lowest level 1369 */ 1370 static noinline_for_stack 1371 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, 1372 int *level) 1373 { 1374 struct extent_buffer *eb = NULL; 1375 int i; 1376 u64 ptr_gen = 0; 1377 u64 last_snapshot; 1378 u32 nritems; 1379 1380 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1381 1382 for (i = *level; i > 0; i--) { 1383 eb = path->nodes[i]; 1384 nritems = btrfs_header_nritems(eb); 1385 while (path->slots[i] < nritems) { 1386 ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); 1387 if (ptr_gen > last_snapshot) 1388 break; 1389 path->slots[i]++; 1390 } 1391 if (path->slots[i] >= nritems) { 1392 if (i == *level) 1393 break; 1394 *level = i + 1; 1395 return 0; 1396 } 1397 if (i == 1) { 1398 *level = i; 1399 return 0; 1400 } 1401 1402 eb = btrfs_read_node_slot(eb, path->slots[i]); 1403 if (IS_ERR(eb)) 1404 return PTR_ERR(eb); 1405 BUG_ON(btrfs_header_level(eb) != i - 1); 1406 path->nodes[i - 1] = eb; 1407 path->slots[i - 1] = 0; 1408 } 1409 return 1; 1410 } 1411 1412 /* 1413 * invalidate extent cache for file extents whose key in range of 1414 * [min_key, max_key) 1415 */ 1416 static int invalidate_extent_cache(struct btrfs_root *root, 1417 const struct btrfs_key *min_key, 1418 const struct btrfs_key *max_key) 1419 { 1420 struct btrfs_fs_info *fs_info = root->fs_info; 1421 struct btrfs_inode *inode = NULL; 1422 u64 objectid; 1423 u64 start, end; 1424 u64 ino; 1425 1426 objectid = min_key->objectid; 1427 while (1) { 1428 struct extent_state *cached_state = NULL; 1429 1430 cond_resched(); 1431 if (inode) 1432 iput(&inode->vfs_inode); 1433 1434 if (objectid > max_key->objectid) 1435 break; 1436 1437 inode = btrfs_find_first_inode(root, objectid); 1438 if (!inode) 1439 break; 1440 ino = btrfs_ino(inode); 1441 1442 if (ino > max_key->objectid) { 1443 iput(&inode->vfs_inode); 1444 break; 1445 } 1446 1447 objectid = ino + 1; 1448 if (!S_ISREG(inode->vfs_inode.i_mode)) 1449 continue; 1450 1451 if (unlikely(min_key->objectid == ino)) { 1452 if (min_key->type > BTRFS_EXTENT_DATA_KEY) 1453 continue; 1454 if (min_key->type < BTRFS_EXTENT_DATA_KEY) 1455 start = 0; 1456 else { 1457 start = min_key->offset; 1458 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize)); 1459 } 1460 } else { 1461 start = 0; 1462 } 1463 1464 if (unlikely(max_key->objectid == ino)) { 1465 if (max_key->type < BTRFS_EXTENT_DATA_KEY) 1466 continue; 1467 if (max_key->type > BTRFS_EXTENT_DATA_KEY) { 1468 end = (u64)-1; 1469 } else { 1470 if (max_key->offset == 0) 1471 continue; 1472 end = max_key->offset; 1473 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); 1474 end--; 1475 } 1476 } else { 1477 end = (u64)-1; 1478 } 1479 1480 /* the lock_extent waits for read_folio to complete */ 1481 lock_extent(&inode->io_tree, start, end, &cached_state); 1482 btrfs_drop_extent_map_range(inode, start, end, true); 1483 unlock_extent(&inode->io_tree, start, end, &cached_state); 1484 } 1485 return 0; 1486 } 1487 1488 static int find_next_key(struct btrfs_path *path, int level, 1489 struct btrfs_key *key) 1490 1491 { 1492 while (level < BTRFS_MAX_LEVEL) { 1493 if (!path->nodes[level]) 1494 break; 1495 if (path->slots[level] + 1 < 1496 btrfs_header_nritems(path->nodes[level])) { 1497 btrfs_node_key_to_cpu(path->nodes[level], key, 1498 path->slots[level] + 1); 1499 return 0; 1500 } 1501 level++; 1502 } 1503 return 1; 1504 } 1505 1506 /* 1507 * Insert current subvolume into reloc_control::dirty_subvol_roots 1508 */ 1509 static int insert_dirty_subvol(struct btrfs_trans_handle *trans, 1510 struct reloc_control *rc, 1511 struct btrfs_root *root) 1512 { 1513 struct btrfs_root *reloc_root = root->reloc_root; 1514 struct btrfs_root_item *reloc_root_item; 1515 int ret; 1516 1517 /* @root must be a subvolume tree root with a valid reloc tree */ 1518 ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); 1519 ASSERT(reloc_root); 1520 1521 reloc_root_item = &reloc_root->root_item; 1522 memset(&reloc_root_item->drop_progress, 0, 1523 sizeof(reloc_root_item->drop_progress)); 1524 btrfs_set_root_drop_level(reloc_root_item, 0); 1525 btrfs_set_root_refs(reloc_root_item, 0); 1526 ret = btrfs_update_reloc_root(trans, root); 1527 if (ret) 1528 return ret; 1529 1530 if (list_empty(&root->reloc_dirty_list)) { 1531 btrfs_grab_root(root); 1532 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); 1533 } 1534 1535 return 0; 1536 } 1537 1538 static int clean_dirty_subvols(struct reloc_control *rc) 1539 { 1540 struct btrfs_root *root; 1541 struct btrfs_root *next; 1542 int ret = 0; 1543 int ret2; 1544 1545 list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, 1546 reloc_dirty_list) { 1547 if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) { 1548 /* Merged subvolume, cleanup its reloc root */ 1549 struct btrfs_root *reloc_root = root->reloc_root; 1550 1551 list_del_init(&root->reloc_dirty_list); 1552 root->reloc_root = NULL; 1553 /* 1554 * Need barrier to ensure clear_bit() only happens after 1555 * root->reloc_root = NULL. Pairs with have_reloc_root. 1556 */ 1557 smp_wmb(); 1558 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); 1559 if (reloc_root) { 1560 /* 1561 * btrfs_drop_snapshot drops our ref we hold for 1562 * ->reloc_root. If it fails however we must 1563 * drop the ref ourselves. 1564 */ 1565 ret2 = btrfs_drop_snapshot(reloc_root, 0, 1); 1566 if (ret2 < 0) { 1567 btrfs_put_root(reloc_root); 1568 if (!ret) 1569 ret = ret2; 1570 } 1571 } 1572 btrfs_put_root(root); 1573 } else { 1574 /* Orphan reloc tree, just clean it up */ 1575 ret2 = btrfs_drop_snapshot(root, 0, 1); 1576 if (ret2 < 0) { 1577 btrfs_put_root(root); 1578 if (!ret) 1579 ret = ret2; 1580 } 1581 } 1582 } 1583 return ret; 1584 } 1585 1586 /* 1587 * merge the relocated tree blocks in reloc tree with corresponding 1588 * fs tree. 1589 */ 1590 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, 1591 struct btrfs_root *root) 1592 { 1593 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 1594 struct btrfs_key key; 1595 struct btrfs_key next_key; 1596 struct btrfs_trans_handle *trans = NULL; 1597 struct btrfs_root *reloc_root; 1598 struct btrfs_root_item *root_item; 1599 struct btrfs_path *path; 1600 struct extent_buffer *leaf; 1601 int reserve_level; 1602 int level; 1603 int max_level; 1604 int replaced = 0; 1605 int ret = 0; 1606 u32 min_reserved; 1607 1608 path = btrfs_alloc_path(); 1609 if (!path) 1610 return -ENOMEM; 1611 path->reada = READA_FORWARD; 1612 1613 reloc_root = root->reloc_root; 1614 root_item = &reloc_root->root_item; 1615 1616 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 1617 level = btrfs_root_level(root_item); 1618 atomic_inc(&reloc_root->node->refs); 1619 path->nodes[level] = reloc_root->node; 1620 path->slots[level] = 0; 1621 } else { 1622 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); 1623 1624 level = btrfs_root_drop_level(root_item); 1625 BUG_ON(level == 0); 1626 path->lowest_level = level; 1627 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); 1628 path->lowest_level = 0; 1629 if (ret < 0) { 1630 btrfs_free_path(path); 1631 return ret; 1632 } 1633 1634 btrfs_node_key_to_cpu(path->nodes[level], &next_key, 1635 path->slots[level]); 1636 WARN_ON(memcmp(&key, &next_key, sizeof(key))); 1637 1638 btrfs_unlock_up_safe(path, 0); 1639 } 1640 1641 /* 1642 * In merge_reloc_root(), we modify the upper level pointer to swap the 1643 * tree blocks between reloc tree and subvolume tree. Thus for tree 1644 * block COW, we COW at most from level 1 to root level for each tree. 1645 * 1646 * Thus the needed metadata size is at most root_level * nodesize, 1647 * and * 2 since we have two trees to COW. 1648 */ 1649 reserve_level = max_t(int, 1, btrfs_root_level(root_item)); 1650 min_reserved = fs_info->nodesize * reserve_level * 2; 1651 memset(&next_key, 0, sizeof(next_key)); 1652 1653 while (1) { 1654 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 1655 min_reserved, 1656 BTRFS_RESERVE_FLUSH_LIMIT); 1657 if (ret) 1658 goto out; 1659 trans = btrfs_start_transaction(root, 0); 1660 if (IS_ERR(trans)) { 1661 ret = PTR_ERR(trans); 1662 trans = NULL; 1663 goto out; 1664 } 1665 1666 /* 1667 * At this point we no longer have a reloc_control, so we can't 1668 * depend on btrfs_init_reloc_root to update our last_trans. 1669 * 1670 * But that's ok, we started the trans handle on our 1671 * corresponding fs_root, which means it's been added to the 1672 * dirty list. At commit time we'll still call 1673 * btrfs_update_reloc_root() and update our root item 1674 * appropriately. 1675 */ 1676 btrfs_set_root_last_trans(reloc_root, trans->transid); 1677 trans->block_rsv = rc->block_rsv; 1678 1679 replaced = 0; 1680 max_level = level; 1681 1682 ret = walk_down_reloc_tree(reloc_root, path, &level); 1683 if (ret < 0) 1684 goto out; 1685 if (ret > 0) 1686 break; 1687 1688 if (!find_next_key(path, level, &key) && 1689 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 1690 ret = 0; 1691 } else { 1692 ret = replace_path(trans, rc, root, reloc_root, path, 1693 &next_key, level, max_level); 1694 } 1695 if (ret < 0) 1696 goto out; 1697 if (ret > 0) { 1698 level = ret; 1699 btrfs_node_key_to_cpu(path->nodes[level], &key, 1700 path->slots[level]); 1701 replaced = 1; 1702 } 1703 1704 ret = walk_up_reloc_tree(reloc_root, path, &level); 1705 if (ret > 0) 1706 break; 1707 1708 BUG_ON(level == 0); 1709 /* 1710 * save the merging progress in the drop_progress. 1711 * this is OK since root refs == 1 in this case. 1712 */ 1713 btrfs_node_key(path->nodes[level], &root_item->drop_progress, 1714 path->slots[level]); 1715 btrfs_set_root_drop_level(root_item, level); 1716 1717 btrfs_end_transaction_throttle(trans); 1718 trans = NULL; 1719 1720 btrfs_btree_balance_dirty(fs_info); 1721 1722 if (replaced && rc->stage == UPDATE_DATA_PTRS) 1723 invalidate_extent_cache(root, &key, &next_key); 1724 } 1725 1726 /* 1727 * handle the case only one block in the fs tree need to be 1728 * relocated and the block is tree root. 1729 */ 1730 leaf = btrfs_lock_root_node(root); 1731 ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, 1732 BTRFS_NESTING_COW); 1733 btrfs_tree_unlock(leaf); 1734 free_extent_buffer(leaf); 1735 out: 1736 btrfs_free_path(path); 1737 1738 if (ret == 0) { 1739 ret = insert_dirty_subvol(trans, rc, root); 1740 if (ret) 1741 btrfs_abort_transaction(trans, ret); 1742 } 1743 1744 if (trans) 1745 btrfs_end_transaction_throttle(trans); 1746 1747 btrfs_btree_balance_dirty(fs_info); 1748 1749 if (replaced && rc->stage == UPDATE_DATA_PTRS) 1750 invalidate_extent_cache(root, &key, &next_key); 1751 1752 return ret; 1753 } 1754 1755 static noinline_for_stack 1756 int prepare_to_merge(struct reloc_control *rc, int err) 1757 { 1758 struct btrfs_root *root = rc->extent_root; 1759 struct btrfs_fs_info *fs_info = root->fs_info; 1760 struct btrfs_root *reloc_root; 1761 struct btrfs_trans_handle *trans; 1762 LIST_HEAD(reloc_roots); 1763 u64 num_bytes = 0; 1764 int ret; 1765 1766 mutex_lock(&fs_info->reloc_mutex); 1767 rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 1768 rc->merging_rsv_size += rc->nodes_relocated * 2; 1769 mutex_unlock(&fs_info->reloc_mutex); 1770 1771 again: 1772 if (!err) { 1773 num_bytes = rc->merging_rsv_size; 1774 ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes, 1775 BTRFS_RESERVE_FLUSH_ALL); 1776 if (ret) 1777 err = ret; 1778 } 1779 1780 trans = btrfs_join_transaction(rc->extent_root); 1781 if (IS_ERR(trans)) { 1782 if (!err) 1783 btrfs_block_rsv_release(fs_info, rc->block_rsv, 1784 num_bytes, NULL); 1785 return PTR_ERR(trans); 1786 } 1787 1788 if (!err) { 1789 if (num_bytes != rc->merging_rsv_size) { 1790 btrfs_end_transaction(trans); 1791 btrfs_block_rsv_release(fs_info, rc->block_rsv, 1792 num_bytes, NULL); 1793 goto again; 1794 } 1795 } 1796 1797 rc->merge_reloc_tree = true; 1798 1799 while (!list_empty(&rc->reloc_roots)) { 1800 reloc_root = list_entry(rc->reloc_roots.next, 1801 struct btrfs_root, root_list); 1802 list_del_init(&reloc_root->root_list); 1803 1804 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, 1805 false); 1806 if (IS_ERR(root)) { 1807 /* 1808 * Even if we have an error we need this reloc root 1809 * back on our list so we can clean up properly. 1810 */ 1811 list_add(&reloc_root->root_list, &reloc_roots); 1812 btrfs_abort_transaction(trans, (int)PTR_ERR(root)); 1813 if (!err) 1814 err = PTR_ERR(root); 1815 break; 1816 } 1817 1818 if (unlikely(root->reloc_root != reloc_root)) { 1819 if (root->reloc_root) { 1820 btrfs_err(fs_info, 1821 "reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu", 1822 btrfs_root_id(root), 1823 btrfs_root_id(root->reloc_root), 1824 root->reloc_root->root_key.type, 1825 root->reloc_root->root_key.offset, 1826 btrfs_root_generation( 1827 &root->reloc_root->root_item), 1828 btrfs_root_id(reloc_root), 1829 reloc_root->root_key.type, 1830 reloc_root->root_key.offset, 1831 btrfs_root_generation( 1832 &reloc_root->root_item)); 1833 } else { 1834 btrfs_err(fs_info, 1835 "reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu", 1836 btrfs_root_id(root), 1837 btrfs_root_id(reloc_root), 1838 reloc_root->root_key.type, 1839 reloc_root->root_key.offset, 1840 btrfs_root_generation( 1841 &reloc_root->root_item)); 1842 } 1843 list_add(&reloc_root->root_list, &reloc_roots); 1844 btrfs_put_root(root); 1845 btrfs_abort_transaction(trans, -EUCLEAN); 1846 if (!err) 1847 err = -EUCLEAN; 1848 break; 1849 } 1850 1851 /* 1852 * set reference count to 1, so btrfs_recover_relocation 1853 * knows it should resumes merging 1854 */ 1855 if (!err) 1856 btrfs_set_root_refs(&reloc_root->root_item, 1); 1857 ret = btrfs_update_reloc_root(trans, root); 1858 1859 /* 1860 * Even if we have an error we need this reloc root back on our 1861 * list so we can clean up properly. 1862 */ 1863 list_add(&reloc_root->root_list, &reloc_roots); 1864 btrfs_put_root(root); 1865 1866 if (ret) { 1867 btrfs_abort_transaction(trans, ret); 1868 if (!err) 1869 err = ret; 1870 break; 1871 } 1872 } 1873 1874 list_splice(&reloc_roots, &rc->reloc_roots); 1875 1876 if (!err) 1877 err = btrfs_commit_transaction(trans); 1878 else 1879 btrfs_end_transaction(trans); 1880 return err; 1881 } 1882 1883 static noinline_for_stack 1884 void free_reloc_roots(struct list_head *list) 1885 { 1886 struct btrfs_root *reloc_root, *tmp; 1887 1888 list_for_each_entry_safe(reloc_root, tmp, list, root_list) 1889 __del_reloc_root(reloc_root); 1890 } 1891 1892 static noinline_for_stack 1893 void merge_reloc_roots(struct reloc_control *rc) 1894 { 1895 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 1896 struct btrfs_root *root; 1897 struct btrfs_root *reloc_root; 1898 LIST_HEAD(reloc_roots); 1899 int found = 0; 1900 int ret = 0; 1901 again: 1902 root = rc->extent_root; 1903 1904 /* 1905 * this serializes us with btrfs_record_root_in_transaction, 1906 * we have to make sure nobody is in the middle of 1907 * adding their roots to the list while we are 1908 * doing this splice 1909 */ 1910 mutex_lock(&fs_info->reloc_mutex); 1911 list_splice_init(&rc->reloc_roots, &reloc_roots); 1912 mutex_unlock(&fs_info->reloc_mutex); 1913 1914 while (!list_empty(&reloc_roots)) { 1915 found = 1; 1916 reloc_root = list_entry(reloc_roots.next, 1917 struct btrfs_root, root_list); 1918 1919 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, 1920 false); 1921 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 1922 if (WARN_ON(IS_ERR(root))) { 1923 /* 1924 * For recovery we read the fs roots on mount, 1925 * and if we didn't find the root then we marked 1926 * the reloc root as a garbage root. For normal 1927 * relocation obviously the root should exist in 1928 * memory. However there's no reason we can't 1929 * handle the error properly here just in case. 1930 */ 1931 ret = PTR_ERR(root); 1932 goto out; 1933 } 1934 if (WARN_ON(root->reloc_root != reloc_root)) { 1935 /* 1936 * This can happen if on-disk metadata has some 1937 * corruption, e.g. bad reloc tree key offset. 1938 */ 1939 ret = -EINVAL; 1940 goto out; 1941 } 1942 ret = merge_reloc_root(rc, root); 1943 btrfs_put_root(root); 1944 if (ret) { 1945 if (list_empty(&reloc_root->root_list)) 1946 list_add_tail(&reloc_root->root_list, 1947 &reloc_roots); 1948 goto out; 1949 } 1950 } else { 1951 if (!IS_ERR(root)) { 1952 if (root->reloc_root == reloc_root) { 1953 root->reloc_root = NULL; 1954 btrfs_put_root(reloc_root); 1955 } 1956 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, 1957 &root->state); 1958 btrfs_put_root(root); 1959 } 1960 1961 list_del_init(&reloc_root->root_list); 1962 /* Don't forget to queue this reloc root for cleanup */ 1963 list_add_tail(&reloc_root->reloc_dirty_list, 1964 &rc->dirty_subvol_roots); 1965 } 1966 } 1967 1968 if (found) { 1969 found = 0; 1970 goto again; 1971 } 1972 out: 1973 if (ret) { 1974 btrfs_handle_fs_error(fs_info, ret, NULL); 1975 free_reloc_roots(&reloc_roots); 1976 1977 /* new reloc root may be added */ 1978 mutex_lock(&fs_info->reloc_mutex); 1979 list_splice_init(&rc->reloc_roots, &reloc_roots); 1980 mutex_unlock(&fs_info->reloc_mutex); 1981 free_reloc_roots(&reloc_roots); 1982 } 1983 1984 /* 1985 * We used to have 1986 * 1987 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 1988 * 1989 * here, but it's wrong. If we fail to start the transaction in 1990 * prepare_to_merge() we will have only 0 ref reloc roots, none of which 1991 * have actually been removed from the reloc_root_tree rb tree. This is 1992 * fine because we're bailing here, and we hold a reference on the root 1993 * for the list that holds it, so these roots will be cleaned up when we 1994 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root 1995 * will be cleaned up on unmount. 1996 * 1997 * The remaining nodes will be cleaned up by free_reloc_control. 1998 */ 1999 } 2000 2001 static void free_block_list(struct rb_root *blocks) 2002 { 2003 struct tree_block *block; 2004 struct rb_node *rb_node; 2005 while ((rb_node = rb_first(blocks))) { 2006 block = rb_entry(rb_node, struct tree_block, rb_node); 2007 rb_erase(rb_node, blocks); 2008 kfree(block); 2009 } 2010 } 2011 2012 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2013 struct btrfs_root *reloc_root) 2014 { 2015 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2016 struct btrfs_root *root; 2017 int ret; 2018 2019 if (btrfs_get_root_last_trans(reloc_root) == trans->transid) 2020 return 0; 2021 2022 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); 2023 2024 /* 2025 * This should succeed, since we can't have a reloc root without having 2026 * already looked up the actual root and created the reloc root for this 2027 * root. 2028 * 2029 * However if there's some sort of corruption where we have a ref to a 2030 * reloc root without a corresponding root this could return ENOENT. 2031 */ 2032 if (IS_ERR(root)) { 2033 ASSERT(0); 2034 return PTR_ERR(root); 2035 } 2036 if (root->reloc_root != reloc_root) { 2037 ASSERT(0); 2038 btrfs_err(fs_info, 2039 "root %llu has two reloc roots associated with it", 2040 reloc_root->root_key.offset); 2041 btrfs_put_root(root); 2042 return -EUCLEAN; 2043 } 2044 ret = btrfs_record_root_in_trans(trans, root); 2045 btrfs_put_root(root); 2046 2047 return ret; 2048 } 2049 2050 static noinline_for_stack 2051 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2052 struct reloc_control *rc, 2053 struct btrfs_backref_node *node, 2054 struct btrfs_backref_edge *edges[]) 2055 { 2056 struct btrfs_backref_node *next; 2057 struct btrfs_root *root; 2058 int index = 0; 2059 int ret; 2060 2061 next = node; 2062 while (1) { 2063 cond_resched(); 2064 next = walk_up_backref(next, edges, &index); 2065 root = next->root; 2066 2067 /* 2068 * If there is no root, then our references for this block are 2069 * incomplete, as we should be able to walk all the way up to a 2070 * block that is owned by a root. 2071 * 2072 * This path is only for SHAREABLE roots, so if we come upon a 2073 * non-SHAREABLE root then we have backrefs that resolve 2074 * improperly. 2075 * 2076 * Both of these cases indicate file system corruption, or a bug 2077 * in the backref walking code. 2078 */ 2079 if (!root) { 2080 ASSERT(0); 2081 btrfs_err(trans->fs_info, 2082 "bytenr %llu doesn't have a backref path ending in a root", 2083 node->bytenr); 2084 return ERR_PTR(-EUCLEAN); 2085 } 2086 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2087 ASSERT(0); 2088 btrfs_err(trans->fs_info, 2089 "bytenr %llu has multiple refs with one ending in a non-shareable root", 2090 node->bytenr); 2091 return ERR_PTR(-EUCLEAN); 2092 } 2093 2094 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { 2095 ret = record_reloc_root_in_trans(trans, root); 2096 if (ret) 2097 return ERR_PTR(ret); 2098 break; 2099 } 2100 2101 ret = btrfs_record_root_in_trans(trans, root); 2102 if (ret) 2103 return ERR_PTR(ret); 2104 root = root->reloc_root; 2105 2106 /* 2107 * We could have raced with another thread which failed, so 2108 * root->reloc_root may not be set, return ENOENT in this case. 2109 */ 2110 if (!root) 2111 return ERR_PTR(-ENOENT); 2112 2113 if (next->new_bytenr != root->node->start) { 2114 /* 2115 * We just created the reloc root, so we shouldn't have 2116 * ->new_bytenr set and this shouldn't be in the changed 2117 * list. If it is then we have multiple roots pointing 2118 * at the same bytenr which indicates corruption, or 2119 * we've made a mistake in the backref walking code. 2120 */ 2121 ASSERT(next->new_bytenr == 0); 2122 ASSERT(list_empty(&next->list)); 2123 if (next->new_bytenr || !list_empty(&next->list)) { 2124 btrfs_err(trans->fs_info, 2125 "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", 2126 node->bytenr, next->bytenr); 2127 return ERR_PTR(-EUCLEAN); 2128 } 2129 2130 next->new_bytenr = root->node->start; 2131 btrfs_put_root(next->root); 2132 next->root = btrfs_grab_root(root); 2133 ASSERT(next->root); 2134 list_add_tail(&next->list, 2135 &rc->backref_cache.changed); 2136 mark_block_processed(rc, next); 2137 break; 2138 } 2139 2140 WARN_ON(1); 2141 root = NULL; 2142 next = walk_down_backref(edges, &index); 2143 if (!next || next->level <= node->level) 2144 break; 2145 } 2146 if (!root) { 2147 /* 2148 * This can happen if there's fs corruption or if there's a bug 2149 * in the backref lookup code. 2150 */ 2151 ASSERT(0); 2152 return ERR_PTR(-ENOENT); 2153 } 2154 2155 next = node; 2156 /* setup backref node path for btrfs_reloc_cow_block */ 2157 while (1) { 2158 rc->backref_cache.path[next->level] = next; 2159 if (--index < 0) 2160 break; 2161 next = edges[index]->node[UPPER]; 2162 } 2163 return root; 2164 } 2165 2166 /* 2167 * Select a tree root for relocation. 2168 * 2169 * Return NULL if the block is not shareable. We should use do_relocation() in 2170 * this case. 2171 * 2172 * Return a tree root pointer if the block is shareable. 2173 * Return -ENOENT if the block is root of reloc tree. 2174 */ 2175 static noinline_for_stack 2176 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2177 { 2178 struct btrfs_backref_node *next; 2179 struct btrfs_root *root; 2180 struct btrfs_root *fs_root = NULL; 2181 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2182 int index = 0; 2183 2184 next = node; 2185 while (1) { 2186 cond_resched(); 2187 next = walk_up_backref(next, edges, &index); 2188 root = next->root; 2189 2190 /* 2191 * This can occur if we have incomplete extent refs leading all 2192 * the way up a particular path, in this case return -EUCLEAN. 2193 */ 2194 if (!root) 2195 return ERR_PTR(-EUCLEAN); 2196 2197 /* No other choice for non-shareable tree */ 2198 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 2199 return root; 2200 2201 if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) 2202 fs_root = root; 2203 2204 if (next != node) 2205 return NULL; 2206 2207 next = walk_down_backref(edges, &index); 2208 if (!next || next->level <= node->level) 2209 break; 2210 } 2211 2212 if (!fs_root) 2213 return ERR_PTR(-ENOENT); 2214 return fs_root; 2215 } 2216 2217 static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, 2218 struct btrfs_backref_node *node) 2219 { 2220 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2221 struct btrfs_backref_node *next = node; 2222 struct btrfs_backref_edge *edge; 2223 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2224 u64 num_bytes = 0; 2225 int index = 0; 2226 2227 BUG_ON(node->processed); 2228 2229 while (next) { 2230 cond_resched(); 2231 while (1) { 2232 if (next->processed) 2233 break; 2234 2235 num_bytes += fs_info->nodesize; 2236 2237 if (list_empty(&next->upper)) 2238 break; 2239 2240 edge = list_entry(next->upper.next, 2241 struct btrfs_backref_edge, list[LOWER]); 2242 edges[index++] = edge; 2243 next = edge->node[UPPER]; 2244 } 2245 next = walk_down_backref(edges, &index); 2246 } 2247 return num_bytes; 2248 } 2249 2250 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2251 struct reloc_control *rc, 2252 struct btrfs_backref_node *node) 2253 { 2254 struct btrfs_root *root = rc->extent_root; 2255 struct btrfs_fs_info *fs_info = root->fs_info; 2256 u64 num_bytes; 2257 int ret; 2258 u64 tmp; 2259 2260 num_bytes = calcu_metadata_size(rc, node) * 2; 2261 2262 trans->block_rsv = rc->block_rsv; 2263 rc->reserved_bytes += num_bytes; 2264 2265 /* 2266 * We are under a transaction here so we can only do limited flushing. 2267 * If we get an enospc just kick back -EAGAIN so we know to drop the 2268 * transaction and try to refill when we can flush all the things. 2269 */ 2270 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, 2271 BTRFS_RESERVE_FLUSH_LIMIT); 2272 if (ret) { 2273 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2274 while (tmp <= rc->reserved_bytes) 2275 tmp <<= 1; 2276 /* 2277 * only one thread can access block_rsv at this point, 2278 * so we don't need hold lock to protect block_rsv. 2279 * we expand more reservation size here to allow enough 2280 * space for relocation and we will return earlier in 2281 * enospc case. 2282 */ 2283 rc->block_rsv->size = tmp + fs_info->nodesize * 2284 RELOCATION_RESERVED_NODES; 2285 return -EAGAIN; 2286 } 2287 2288 return 0; 2289 } 2290 2291 /* 2292 * relocate a block tree, and then update pointers in upper level 2293 * blocks that reference the block to point to the new location. 2294 * 2295 * if called by link_to_upper, the block has already been relocated. 2296 * in that case this function just updates pointers. 2297 */ 2298 static int do_relocation(struct btrfs_trans_handle *trans, 2299 struct reloc_control *rc, 2300 struct btrfs_backref_node *node, 2301 struct btrfs_key *key, 2302 struct btrfs_path *path, int lowest) 2303 { 2304 struct btrfs_backref_node *upper; 2305 struct btrfs_backref_edge *edge; 2306 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2307 struct btrfs_root *root; 2308 struct extent_buffer *eb; 2309 u32 blocksize; 2310 u64 bytenr; 2311 int slot; 2312 int ret = 0; 2313 2314 /* 2315 * If we are lowest then this is the first time we're processing this 2316 * block, and thus shouldn't have an eb associated with it yet. 2317 */ 2318 ASSERT(!lowest || !node->eb); 2319 2320 path->lowest_level = node->level + 1; 2321 rc->backref_cache.path[node->level] = node; 2322 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2323 cond_resched(); 2324 2325 upper = edge->node[UPPER]; 2326 root = select_reloc_root(trans, rc, upper, edges); 2327 if (IS_ERR(root)) { 2328 ret = PTR_ERR(root); 2329 goto next; 2330 } 2331 2332 if (upper->eb && !upper->locked) { 2333 if (!lowest) { 2334 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2335 if (ret < 0) 2336 goto next; 2337 BUG_ON(ret); 2338 bytenr = btrfs_node_blockptr(upper->eb, slot); 2339 if (node->eb->start == bytenr) 2340 goto next; 2341 } 2342 btrfs_backref_drop_node_buffer(upper); 2343 } 2344 2345 if (!upper->eb) { 2346 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2347 if (ret) { 2348 if (ret > 0) 2349 ret = -ENOENT; 2350 2351 btrfs_release_path(path); 2352 break; 2353 } 2354 2355 if (!upper->eb) { 2356 upper->eb = path->nodes[upper->level]; 2357 path->nodes[upper->level] = NULL; 2358 } else { 2359 BUG_ON(upper->eb != path->nodes[upper->level]); 2360 } 2361 2362 upper->locked = 1; 2363 path->locks[upper->level] = 0; 2364 2365 slot = path->slots[upper->level]; 2366 btrfs_release_path(path); 2367 } else { 2368 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2369 if (ret < 0) 2370 goto next; 2371 BUG_ON(ret); 2372 } 2373 2374 bytenr = btrfs_node_blockptr(upper->eb, slot); 2375 if (lowest) { 2376 if (bytenr != node->bytenr) { 2377 btrfs_err(root->fs_info, 2378 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2379 bytenr, node->bytenr, slot, 2380 upper->eb->start); 2381 ret = -EIO; 2382 goto next; 2383 } 2384 } else { 2385 if (node->eb->start == bytenr) 2386 goto next; 2387 } 2388 2389 blocksize = root->fs_info->nodesize; 2390 eb = btrfs_read_node_slot(upper->eb, slot); 2391 if (IS_ERR(eb)) { 2392 ret = PTR_ERR(eb); 2393 goto next; 2394 } 2395 btrfs_tree_lock(eb); 2396 2397 if (!node->eb) { 2398 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2399 slot, &eb, BTRFS_NESTING_COW); 2400 btrfs_tree_unlock(eb); 2401 free_extent_buffer(eb); 2402 if (ret < 0) 2403 goto next; 2404 /* 2405 * We've just COWed this block, it should have updated 2406 * the correct backref node entry. 2407 */ 2408 ASSERT(node->eb == eb); 2409 } else { 2410 struct btrfs_ref ref = { 2411 .action = BTRFS_ADD_DELAYED_REF, 2412 .bytenr = node->eb->start, 2413 .num_bytes = blocksize, 2414 .parent = upper->eb->start, 2415 .owning_root = btrfs_header_owner(upper->eb), 2416 .ref_root = btrfs_header_owner(upper->eb), 2417 }; 2418 2419 btrfs_set_node_blockptr(upper->eb, slot, 2420 node->eb->start); 2421 btrfs_set_node_ptr_generation(upper->eb, slot, 2422 trans->transid); 2423 btrfs_mark_buffer_dirty(trans, upper->eb); 2424 2425 btrfs_init_tree_ref(&ref, node->level, 2426 btrfs_root_id(root), false); 2427 ret = btrfs_inc_extent_ref(trans, &ref); 2428 if (!ret) 2429 ret = btrfs_drop_subtree(trans, root, eb, 2430 upper->eb); 2431 if (ret) 2432 btrfs_abort_transaction(trans, ret); 2433 } 2434 next: 2435 if (!upper->pending) 2436 btrfs_backref_drop_node_buffer(upper); 2437 else 2438 btrfs_backref_unlock_node_buffer(upper); 2439 if (ret) 2440 break; 2441 } 2442 2443 if (!ret && node->pending) { 2444 btrfs_backref_drop_node_buffer(node); 2445 list_move_tail(&node->list, &rc->backref_cache.changed); 2446 node->pending = 0; 2447 } 2448 2449 path->lowest_level = 0; 2450 2451 /* 2452 * We should have allocated all of our space in the block rsv and thus 2453 * shouldn't ENOSPC. 2454 */ 2455 ASSERT(ret != -ENOSPC); 2456 return ret; 2457 } 2458 2459 static int link_to_upper(struct btrfs_trans_handle *trans, 2460 struct reloc_control *rc, 2461 struct btrfs_backref_node *node, 2462 struct btrfs_path *path) 2463 { 2464 struct btrfs_key key; 2465 2466 btrfs_node_key_to_cpu(node->eb, &key, 0); 2467 return do_relocation(trans, rc, node, &key, path, 0); 2468 } 2469 2470 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2471 struct reloc_control *rc, 2472 struct btrfs_path *path, int err) 2473 { 2474 LIST_HEAD(list); 2475 struct btrfs_backref_cache *cache = &rc->backref_cache; 2476 struct btrfs_backref_node *node; 2477 int level; 2478 int ret; 2479 2480 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2481 while (!list_empty(&cache->pending[level])) { 2482 node = list_entry(cache->pending[level].next, 2483 struct btrfs_backref_node, list); 2484 list_move_tail(&node->list, &list); 2485 BUG_ON(!node->pending); 2486 2487 if (!err) { 2488 ret = link_to_upper(trans, rc, node, path); 2489 if (ret < 0) 2490 err = ret; 2491 } 2492 } 2493 list_splice_init(&list, &cache->pending[level]); 2494 } 2495 return err; 2496 } 2497 2498 /* 2499 * mark a block and all blocks directly/indirectly reference the block 2500 * as processed. 2501 */ 2502 static void update_processed_blocks(struct reloc_control *rc, 2503 struct btrfs_backref_node *node) 2504 { 2505 struct btrfs_backref_node *next = node; 2506 struct btrfs_backref_edge *edge; 2507 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2508 int index = 0; 2509 2510 while (next) { 2511 cond_resched(); 2512 while (1) { 2513 if (next->processed) 2514 break; 2515 2516 mark_block_processed(rc, next); 2517 2518 if (list_empty(&next->upper)) 2519 break; 2520 2521 edge = list_entry(next->upper.next, 2522 struct btrfs_backref_edge, list[LOWER]); 2523 edges[index++] = edge; 2524 next = edge->node[UPPER]; 2525 } 2526 next = walk_down_backref(edges, &index); 2527 } 2528 } 2529 2530 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2531 { 2532 u32 blocksize = rc->extent_root->fs_info->nodesize; 2533 2534 if (test_range_bit(&rc->processed_blocks, bytenr, 2535 bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) 2536 return 1; 2537 return 0; 2538 } 2539 2540 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2541 struct tree_block *block) 2542 { 2543 struct btrfs_tree_parent_check check = { 2544 .level = block->level, 2545 .owner_root = block->owner, 2546 .transid = block->key.offset 2547 }; 2548 struct extent_buffer *eb; 2549 2550 eb = read_tree_block(fs_info, block->bytenr, &check); 2551 if (IS_ERR(eb)) 2552 return PTR_ERR(eb); 2553 if (!extent_buffer_uptodate(eb)) { 2554 free_extent_buffer(eb); 2555 return -EIO; 2556 } 2557 if (block->level == 0) 2558 btrfs_item_key_to_cpu(eb, &block->key, 0); 2559 else 2560 btrfs_node_key_to_cpu(eb, &block->key, 0); 2561 free_extent_buffer(eb); 2562 block->key_ready = true; 2563 return 0; 2564 } 2565 2566 /* 2567 * helper function to relocate a tree block 2568 */ 2569 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2570 struct reloc_control *rc, 2571 struct btrfs_backref_node *node, 2572 struct btrfs_key *key, 2573 struct btrfs_path *path) 2574 { 2575 struct btrfs_root *root; 2576 int ret = 0; 2577 2578 if (!node) 2579 return 0; 2580 2581 /* 2582 * If we fail here we want to drop our backref_node because we are going 2583 * to start over and regenerate the tree for it. 2584 */ 2585 ret = reserve_metadata_space(trans, rc, node); 2586 if (ret) 2587 goto out; 2588 2589 BUG_ON(node->processed); 2590 root = select_one_root(node); 2591 if (IS_ERR(root)) { 2592 ret = PTR_ERR(root); 2593 2594 /* See explanation in select_one_root for the -EUCLEAN case. */ 2595 ASSERT(ret == -ENOENT); 2596 if (ret == -ENOENT) { 2597 ret = 0; 2598 update_processed_blocks(rc, node); 2599 } 2600 goto out; 2601 } 2602 2603 if (root) { 2604 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2605 /* 2606 * This block was the root block of a root, and this is 2607 * the first time we're processing the block and thus it 2608 * should not have had the ->new_bytenr modified and 2609 * should have not been included on the changed list. 2610 * 2611 * However in the case of corruption we could have 2612 * multiple refs pointing to the same block improperly, 2613 * and thus we would trip over these checks. ASSERT() 2614 * for the developer case, because it could indicate a 2615 * bug in the backref code, however error out for a 2616 * normal user in the case of corruption. 2617 */ 2618 ASSERT(node->new_bytenr == 0); 2619 ASSERT(list_empty(&node->list)); 2620 if (node->new_bytenr || !list_empty(&node->list)) { 2621 btrfs_err(root->fs_info, 2622 "bytenr %llu has improper references to it", 2623 node->bytenr); 2624 ret = -EUCLEAN; 2625 goto out; 2626 } 2627 ret = btrfs_record_root_in_trans(trans, root); 2628 if (ret) 2629 goto out; 2630 /* 2631 * Another thread could have failed, need to check if we 2632 * have reloc_root actually set. 2633 */ 2634 if (!root->reloc_root) { 2635 ret = -ENOENT; 2636 goto out; 2637 } 2638 root = root->reloc_root; 2639 node->new_bytenr = root->node->start; 2640 btrfs_put_root(node->root); 2641 node->root = btrfs_grab_root(root); 2642 ASSERT(node->root); 2643 list_add_tail(&node->list, &rc->backref_cache.changed); 2644 } else { 2645 path->lowest_level = node->level; 2646 if (root == root->fs_info->chunk_root) 2647 btrfs_reserve_chunk_metadata(trans, false); 2648 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2649 btrfs_release_path(path); 2650 if (root == root->fs_info->chunk_root) 2651 btrfs_trans_release_chunk_metadata(trans); 2652 if (ret > 0) 2653 ret = 0; 2654 } 2655 if (!ret) 2656 update_processed_blocks(rc, node); 2657 } else { 2658 ret = do_relocation(trans, rc, node, key, path, 1); 2659 } 2660 out: 2661 if (ret || node->level == 0 || node->cowonly) 2662 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2663 return ret; 2664 } 2665 2666 /* 2667 * relocate a list of blocks 2668 */ 2669 static noinline_for_stack 2670 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2671 struct reloc_control *rc, struct rb_root *blocks) 2672 { 2673 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2674 struct btrfs_backref_node *node; 2675 struct btrfs_path *path; 2676 struct tree_block *block; 2677 struct tree_block *next; 2678 int ret = 0; 2679 2680 path = btrfs_alloc_path(); 2681 if (!path) { 2682 ret = -ENOMEM; 2683 goto out_free_blocks; 2684 } 2685 2686 /* Kick in readahead for tree blocks with missing keys */ 2687 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2688 if (!block->key_ready) 2689 btrfs_readahead_tree_block(fs_info, block->bytenr, 2690 block->owner, 0, 2691 block->level); 2692 } 2693 2694 /* Get first keys */ 2695 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2696 if (!block->key_ready) { 2697 ret = get_tree_block_key(fs_info, block); 2698 if (ret) 2699 goto out_free_path; 2700 } 2701 } 2702 2703 /* Do tree relocation */ 2704 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2705 node = build_backref_tree(trans, rc, &block->key, 2706 block->level, block->bytenr); 2707 if (IS_ERR(node)) { 2708 ret = PTR_ERR(node); 2709 goto out; 2710 } 2711 2712 ret = relocate_tree_block(trans, rc, node, &block->key, 2713 path); 2714 if (ret < 0) 2715 break; 2716 } 2717 out: 2718 ret = finish_pending_nodes(trans, rc, path, ret); 2719 2720 out_free_path: 2721 btrfs_free_path(path); 2722 out_free_blocks: 2723 free_block_list(blocks); 2724 return ret; 2725 } 2726 2727 static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc) 2728 { 2729 const struct file_extent_cluster *cluster = &rc->cluster; 2730 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2731 u64 alloc_hint = 0; 2732 u64 start; 2733 u64 end; 2734 u64 offset = inode->reloc_block_group_start; 2735 u64 num_bytes; 2736 int nr; 2737 int ret = 0; 2738 u64 i_size = i_size_read(&inode->vfs_inode); 2739 u64 prealloc_start = cluster->start - offset; 2740 u64 prealloc_end = cluster->end - offset; 2741 u64 cur_offset = prealloc_start; 2742 2743 /* 2744 * For subpage case, previous i_size may not be aligned to PAGE_SIZE. 2745 * This means the range [i_size, PAGE_END + 1) is filled with zeros by 2746 * btrfs_do_readpage() call of previously relocated file cluster. 2747 * 2748 * If the current cluster starts in the above range, btrfs_do_readpage() 2749 * will skip the read, and relocate_one_folio() will later writeback 2750 * the padding zeros as new data, causing data corruption. 2751 * 2752 * Here we have to manually invalidate the range (i_size, PAGE_END + 1). 2753 */ 2754 if (!PAGE_ALIGNED(i_size)) { 2755 struct address_space *mapping = inode->vfs_inode.i_mapping; 2756 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2757 const u32 sectorsize = fs_info->sectorsize; 2758 struct folio *folio; 2759 2760 ASSERT(sectorsize < PAGE_SIZE); 2761 ASSERT(IS_ALIGNED(i_size, sectorsize)); 2762 2763 /* 2764 * Subpage can't handle page with DIRTY but without UPTODATE 2765 * bit as it can lead to the following deadlock: 2766 * 2767 * btrfs_read_folio() 2768 * | Page already *locked* 2769 * |- btrfs_lock_and_flush_ordered_range() 2770 * |- btrfs_start_ordered_extent() 2771 * |- extent_write_cache_pages() 2772 * |- lock_page() 2773 * We try to lock the page we already hold. 2774 * 2775 * Here we just writeback the whole data reloc inode, so that 2776 * we will be ensured to have no dirty range in the page, and 2777 * are safe to clear the uptodate bits. 2778 * 2779 * This shouldn't cause too much overhead, as we need to write 2780 * the data back anyway. 2781 */ 2782 ret = filemap_write_and_wait(mapping); 2783 if (ret < 0) 2784 return ret; 2785 2786 clear_extent_bits(&inode->io_tree, i_size, 2787 round_up(i_size, PAGE_SIZE) - 1, 2788 EXTENT_UPTODATE); 2789 folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); 2790 /* 2791 * If page is freed we don't need to do anything then, as we 2792 * will re-read the whole page anyway. 2793 */ 2794 if (!IS_ERR(folio)) { 2795 btrfs_subpage_clear_uptodate(fs_info, folio, i_size, 2796 round_up(i_size, PAGE_SIZE) - i_size); 2797 folio_unlock(folio); 2798 folio_put(folio); 2799 } 2800 } 2801 2802 BUG_ON(cluster->start != cluster->boundary[0]); 2803 ret = btrfs_alloc_data_chunk_ondemand(inode, 2804 prealloc_end + 1 - prealloc_start); 2805 if (ret) 2806 return ret; 2807 2808 btrfs_inode_lock(inode, 0); 2809 for (nr = 0; nr < cluster->nr; nr++) { 2810 struct extent_state *cached_state = NULL; 2811 2812 start = cluster->boundary[nr] - offset; 2813 if (nr + 1 < cluster->nr) 2814 end = cluster->boundary[nr + 1] - 1 - offset; 2815 else 2816 end = cluster->end - offset; 2817 2818 lock_extent(&inode->io_tree, start, end, &cached_state); 2819 num_bytes = end + 1 - start; 2820 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2821 num_bytes, num_bytes, 2822 end + 1, &alloc_hint); 2823 cur_offset = end + 1; 2824 unlock_extent(&inode->io_tree, start, end, &cached_state); 2825 if (ret) 2826 break; 2827 } 2828 btrfs_inode_unlock(inode, 0); 2829 2830 if (cur_offset < prealloc_end) 2831 btrfs_free_reserved_data_space_noquota(inode->root->fs_info, 2832 prealloc_end + 1 - cur_offset); 2833 return ret; 2834 } 2835 2836 static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc) 2837 { 2838 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2839 struct extent_map *em; 2840 struct extent_state *cached_state = NULL; 2841 u64 offset = inode->reloc_block_group_start; 2842 u64 start = rc->cluster.start - offset; 2843 u64 end = rc->cluster.end - offset; 2844 int ret = 0; 2845 2846 em = alloc_extent_map(); 2847 if (!em) 2848 return -ENOMEM; 2849 2850 em->start = start; 2851 em->len = end + 1 - start; 2852 em->disk_bytenr = rc->cluster.start; 2853 em->disk_num_bytes = em->len; 2854 em->ram_bytes = em->len; 2855 em->flags |= EXTENT_FLAG_PINNED; 2856 2857 lock_extent(&inode->io_tree, start, end, &cached_state); 2858 ret = btrfs_replace_extent_map_range(inode, em, false); 2859 unlock_extent(&inode->io_tree, start, end, &cached_state); 2860 free_extent_map(em); 2861 2862 return ret; 2863 } 2864 2865 /* 2866 * Allow error injection to test balance/relocation cancellation 2867 */ 2868 noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info) 2869 { 2870 return atomic_read(&fs_info->balance_cancel_req) || 2871 atomic_read(&fs_info->reloc_cancel_req) || 2872 fatal_signal_pending(current); 2873 } 2874 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 2875 2876 static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, 2877 int cluster_nr) 2878 { 2879 /* Last extent, use cluster end directly */ 2880 if (cluster_nr >= cluster->nr - 1) 2881 return cluster->end; 2882 2883 /* Use next boundary start*/ 2884 return cluster->boundary[cluster_nr + 1] - 1; 2885 } 2886 2887 static int relocate_one_folio(struct reloc_control *rc, 2888 struct file_ra_state *ra, 2889 int *cluster_nr, unsigned long index) 2890 { 2891 const struct file_extent_cluster *cluster = &rc->cluster; 2892 struct inode *inode = rc->data_inode; 2893 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2894 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 2895 const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; 2896 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 2897 struct folio *folio; 2898 u64 folio_start; 2899 u64 folio_end; 2900 u64 cur; 2901 int ret; 2902 const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); 2903 2904 ASSERT(index <= last_index); 2905 again: 2906 folio = filemap_lock_folio(inode->i_mapping, index); 2907 if (IS_ERR(folio)) { 2908 2909 /* 2910 * On relocation we're doing readahead on the relocation inode, 2911 * but if the filesystem is backed by a RAID stripe tree we can 2912 * get ENOENT (e.g. due to preallocated extents not being 2913 * mapped in the RST) from the lookup. 2914 * 2915 * But readahead doesn't handle the error and submits invalid 2916 * reads to the device, causing a assertion failures. 2917 */ 2918 if (!use_rst) 2919 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 2920 index, last_index + 1 - index); 2921 folio = __filemap_get_folio(inode->i_mapping, index, 2922 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, 2923 mask); 2924 if (IS_ERR(folio)) 2925 return PTR_ERR(folio); 2926 } 2927 2928 WARN_ON(folio_order(folio)); 2929 2930 if (folio_test_readahead(folio) && !use_rst) 2931 page_cache_async_readahead(inode->i_mapping, ra, NULL, 2932 folio, last_index + 1 - index); 2933 2934 if (!folio_test_uptodate(folio)) { 2935 btrfs_read_folio(NULL, folio); 2936 folio_lock(folio); 2937 if (!folio_test_uptodate(folio)) { 2938 ret = -EIO; 2939 goto release_folio; 2940 } 2941 if (folio->mapping != inode->i_mapping) { 2942 folio_unlock(folio); 2943 folio_put(folio); 2944 goto again; 2945 } 2946 } 2947 2948 /* 2949 * We could have lost folio private when we dropped the lock to read the 2950 * folio above, make sure we set_page_extent_mapped here so we have any 2951 * of the subpage blocksize stuff we need in place. 2952 */ 2953 ret = set_folio_extent_mapped(folio); 2954 if (ret < 0) 2955 goto release_folio; 2956 2957 folio_start = folio_pos(folio); 2958 folio_end = folio_start + PAGE_SIZE - 1; 2959 2960 /* 2961 * Start from the cluster, as for subpage case, the cluster can start 2962 * inside the folio. 2963 */ 2964 cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); 2965 while (cur <= folio_end) { 2966 struct extent_state *cached_state = NULL; 2967 u64 extent_start = cluster->boundary[*cluster_nr] - offset; 2968 u64 extent_end = get_cluster_boundary_end(cluster, 2969 *cluster_nr) - offset; 2970 u64 clamped_start = max(folio_start, extent_start); 2971 u64 clamped_end = min(folio_end, extent_end); 2972 u32 clamped_len = clamped_end + 1 - clamped_start; 2973 2974 /* Reserve metadata for this range */ 2975 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 2976 clamped_len, clamped_len, 2977 false); 2978 if (ret) 2979 goto release_folio; 2980 2981 /* Mark the range delalloc and dirty for later writeback */ 2982 lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 2983 &cached_state); 2984 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, 2985 clamped_end, 0, &cached_state); 2986 if (ret) { 2987 clear_extent_bit(&BTRFS_I(inode)->io_tree, 2988 clamped_start, clamped_end, 2989 EXTENT_LOCKED | EXTENT_BOUNDARY, 2990 &cached_state); 2991 btrfs_delalloc_release_metadata(BTRFS_I(inode), 2992 clamped_len, true); 2993 btrfs_delalloc_release_extents(BTRFS_I(inode), 2994 clamped_len); 2995 goto release_folio; 2996 } 2997 btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); 2998 2999 /* 3000 * Set the boundary if it's inside the folio. 3001 * Data relocation requires the destination extents to have the 3002 * same size as the source. 3003 * EXTENT_BOUNDARY bit prevents current extent from being merged 3004 * with previous extent. 3005 */ 3006 if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { 3007 u64 boundary_start = cluster->boundary[*cluster_nr] - 3008 offset; 3009 u64 boundary_end = boundary_start + 3010 fs_info->sectorsize - 1; 3011 3012 set_extent_bit(&BTRFS_I(inode)->io_tree, 3013 boundary_start, boundary_end, 3014 EXTENT_BOUNDARY, NULL); 3015 } 3016 unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 3017 &cached_state); 3018 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 3019 cur += clamped_len; 3020 3021 /* Crossed extent end, go to next extent */ 3022 if (cur >= extent_end) { 3023 (*cluster_nr)++; 3024 /* Just finished the last extent of the cluster, exit. */ 3025 if (*cluster_nr >= cluster->nr) 3026 break; 3027 } 3028 } 3029 folio_unlock(folio); 3030 folio_put(folio); 3031 3032 balance_dirty_pages_ratelimited(inode->i_mapping); 3033 btrfs_throttle(fs_info); 3034 if (btrfs_should_cancel_balance(fs_info)) 3035 ret = -ECANCELED; 3036 return ret; 3037 3038 release_folio: 3039 folio_unlock(folio); 3040 folio_put(folio); 3041 return ret; 3042 } 3043 3044 static int relocate_file_extent_cluster(struct reloc_control *rc) 3045 { 3046 struct inode *inode = rc->data_inode; 3047 const struct file_extent_cluster *cluster = &rc->cluster; 3048 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 3049 unsigned long index; 3050 unsigned long last_index; 3051 struct file_ra_state *ra; 3052 int cluster_nr = 0; 3053 int ret = 0; 3054 3055 if (!cluster->nr) 3056 return 0; 3057 3058 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3059 if (!ra) 3060 return -ENOMEM; 3061 3062 ret = prealloc_file_extent_cluster(rc); 3063 if (ret) 3064 goto out; 3065 3066 file_ra_state_init(ra, inode->i_mapping); 3067 3068 ret = setup_relocation_extent_mapping(rc); 3069 if (ret) 3070 goto out; 3071 3072 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3073 for (index = (cluster->start - offset) >> PAGE_SHIFT; 3074 index <= last_index && !ret; index++) 3075 ret = relocate_one_folio(rc, ra, &cluster_nr, index); 3076 if (ret == 0) 3077 WARN_ON(cluster_nr != cluster->nr); 3078 out: 3079 kfree(ra); 3080 return ret; 3081 } 3082 3083 static noinline_for_stack int relocate_data_extent(struct reloc_control *rc, 3084 const struct btrfs_key *extent_key) 3085 { 3086 struct inode *inode = rc->data_inode; 3087 struct file_extent_cluster *cluster = &rc->cluster; 3088 int ret; 3089 struct btrfs_root *root = BTRFS_I(inode)->root; 3090 3091 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3092 ret = relocate_file_extent_cluster(rc); 3093 if (ret) 3094 return ret; 3095 cluster->nr = 0; 3096 } 3097 3098 /* 3099 * Under simple quotas, we set root->relocation_src_root when we find 3100 * the extent. If adjacent extents have different owners, we can't merge 3101 * them while relocating. Handle this by storing the owning root that 3102 * started a cluster and if we see an extent from a different root break 3103 * cluster formation (just like the above case of non-adjacent extents). 3104 * 3105 * Without simple quotas, relocation_src_root is always 0, so we should 3106 * never see a mismatch, and it should have no effect on relocation 3107 * clusters. 3108 */ 3109 if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { 3110 u64 tmp = root->relocation_src_root; 3111 3112 /* 3113 * root->relocation_src_root is the state that actually affects 3114 * the preallocation we do here, so set it to the root owning 3115 * the cluster we need to relocate. 3116 */ 3117 root->relocation_src_root = cluster->owning_root; 3118 ret = relocate_file_extent_cluster(rc); 3119 if (ret) 3120 return ret; 3121 cluster->nr = 0; 3122 /* And reset it back for the current extent's owning root. */ 3123 root->relocation_src_root = tmp; 3124 } 3125 3126 if (!cluster->nr) { 3127 cluster->start = extent_key->objectid; 3128 cluster->owning_root = root->relocation_src_root; 3129 } 3130 else 3131 BUG_ON(cluster->nr >= MAX_EXTENTS); 3132 cluster->end = extent_key->objectid + extent_key->offset - 1; 3133 cluster->boundary[cluster->nr] = extent_key->objectid; 3134 cluster->nr++; 3135 3136 if (cluster->nr >= MAX_EXTENTS) { 3137 ret = relocate_file_extent_cluster(rc); 3138 if (ret) 3139 return ret; 3140 cluster->nr = 0; 3141 } 3142 return 0; 3143 } 3144 3145 /* 3146 * helper to add a tree block to the list. 3147 * the major work is getting the generation and level of the block 3148 */ 3149 static int add_tree_block(struct reloc_control *rc, 3150 const struct btrfs_key *extent_key, 3151 struct btrfs_path *path, 3152 struct rb_root *blocks) 3153 { 3154 struct extent_buffer *eb; 3155 struct btrfs_extent_item *ei; 3156 struct btrfs_tree_block_info *bi; 3157 struct tree_block *block; 3158 struct rb_node *rb_node; 3159 u32 item_size; 3160 int level = -1; 3161 u64 generation; 3162 u64 owner = 0; 3163 3164 eb = path->nodes[0]; 3165 item_size = btrfs_item_size(eb, path->slots[0]); 3166 3167 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3168 item_size >= sizeof(*ei) + sizeof(*bi)) { 3169 unsigned long ptr = 0, end; 3170 3171 ei = btrfs_item_ptr(eb, path->slots[0], 3172 struct btrfs_extent_item); 3173 end = (unsigned long)ei + item_size; 3174 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3175 bi = (struct btrfs_tree_block_info *)(ei + 1); 3176 level = btrfs_tree_block_level(eb, bi); 3177 ptr = (unsigned long)(bi + 1); 3178 } else { 3179 level = (int)extent_key->offset; 3180 ptr = (unsigned long)(ei + 1); 3181 } 3182 generation = btrfs_extent_generation(eb, ei); 3183 3184 /* 3185 * We're reading random blocks without knowing their owner ahead 3186 * of time. This is ok most of the time, as all reloc roots and 3187 * fs roots have the same lock type. However normal trees do 3188 * not, and the only way to know ahead of time is to read the 3189 * inline ref offset. We know it's an fs root if 3190 * 3191 * 1. There's more than one ref. 3192 * 2. There's a SHARED_DATA_REF_KEY set. 3193 * 3. FULL_BACKREF is set on the flags. 3194 * 3195 * Otherwise it's safe to assume that the ref offset == the 3196 * owner of this block, so we can use that when calling 3197 * read_tree_block. 3198 */ 3199 if (btrfs_extent_refs(eb, ei) == 1 && 3200 !(btrfs_extent_flags(eb, ei) & 3201 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3202 ptr < end) { 3203 struct btrfs_extent_inline_ref *iref; 3204 int type; 3205 3206 iref = (struct btrfs_extent_inline_ref *)ptr; 3207 type = btrfs_get_extent_inline_ref_type(eb, iref, 3208 BTRFS_REF_TYPE_BLOCK); 3209 if (type == BTRFS_REF_TYPE_INVALID) 3210 return -EINVAL; 3211 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3212 owner = btrfs_extent_inline_ref_offset(eb, iref); 3213 } 3214 } else { 3215 btrfs_print_leaf(eb); 3216 btrfs_err(rc->block_group->fs_info, 3217 "unrecognized tree backref at tree block %llu slot %u", 3218 eb->start, path->slots[0]); 3219 btrfs_release_path(path); 3220 return -EUCLEAN; 3221 } 3222 3223 btrfs_release_path(path); 3224 3225 BUG_ON(level == -1); 3226 3227 block = kmalloc(sizeof(*block), GFP_NOFS); 3228 if (!block) 3229 return -ENOMEM; 3230 3231 block->bytenr = extent_key->objectid; 3232 block->key.objectid = rc->extent_root->fs_info->nodesize; 3233 block->key.offset = generation; 3234 block->level = level; 3235 block->key_ready = false; 3236 block->owner = owner; 3237 3238 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); 3239 if (rb_node) 3240 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3241 -EEXIST); 3242 3243 return 0; 3244 } 3245 3246 /* 3247 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3248 */ 3249 static int __add_tree_block(struct reloc_control *rc, 3250 u64 bytenr, u32 blocksize, 3251 struct rb_root *blocks) 3252 { 3253 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3254 struct btrfs_path *path; 3255 struct btrfs_key key; 3256 int ret; 3257 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3258 3259 if (tree_block_processed(bytenr, rc)) 3260 return 0; 3261 3262 if (rb_simple_search(blocks, bytenr)) 3263 return 0; 3264 3265 path = btrfs_alloc_path(); 3266 if (!path) 3267 return -ENOMEM; 3268 again: 3269 key.objectid = bytenr; 3270 if (skinny) { 3271 key.type = BTRFS_METADATA_ITEM_KEY; 3272 key.offset = (u64)-1; 3273 } else { 3274 key.type = BTRFS_EXTENT_ITEM_KEY; 3275 key.offset = blocksize; 3276 } 3277 3278 path->search_commit_root = 1; 3279 path->skip_locking = 1; 3280 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3281 if (ret < 0) 3282 goto out; 3283 3284 if (ret > 0 && skinny) { 3285 if (path->slots[0]) { 3286 path->slots[0]--; 3287 btrfs_item_key_to_cpu(path->nodes[0], &key, 3288 path->slots[0]); 3289 if (key.objectid == bytenr && 3290 (key.type == BTRFS_METADATA_ITEM_KEY || 3291 (key.type == BTRFS_EXTENT_ITEM_KEY && 3292 key.offset == blocksize))) 3293 ret = 0; 3294 } 3295 3296 if (ret) { 3297 skinny = false; 3298 btrfs_release_path(path); 3299 goto again; 3300 } 3301 } 3302 if (ret) { 3303 ASSERT(ret == 1); 3304 btrfs_print_leaf(path->nodes[0]); 3305 btrfs_err(fs_info, 3306 "tree block extent item (%llu) is not found in extent tree", 3307 bytenr); 3308 WARN_ON(1); 3309 ret = -EINVAL; 3310 goto out; 3311 } 3312 3313 ret = add_tree_block(rc, &key, path, blocks); 3314 out: 3315 btrfs_free_path(path); 3316 return ret; 3317 } 3318 3319 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3320 struct btrfs_block_group *block_group, 3321 struct inode *inode, 3322 u64 ino) 3323 { 3324 struct btrfs_root *root = fs_info->tree_root; 3325 struct btrfs_trans_handle *trans; 3326 int ret = 0; 3327 3328 if (inode) 3329 goto truncate; 3330 3331 inode = btrfs_iget(ino, root); 3332 if (IS_ERR(inode)) 3333 return -ENOENT; 3334 3335 truncate: 3336 ret = btrfs_check_trunc_cache_free_space(fs_info, 3337 &fs_info->global_block_rsv); 3338 if (ret) 3339 goto out; 3340 3341 trans = btrfs_join_transaction(root); 3342 if (IS_ERR(trans)) { 3343 ret = PTR_ERR(trans); 3344 goto out; 3345 } 3346 3347 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3348 3349 btrfs_end_transaction(trans); 3350 btrfs_btree_balance_dirty(fs_info); 3351 out: 3352 iput(inode); 3353 return ret; 3354 } 3355 3356 /* 3357 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3358 * cache inode, to avoid free space cache data extent blocking data relocation. 3359 */ 3360 static int delete_v1_space_cache(struct extent_buffer *leaf, 3361 struct btrfs_block_group *block_group, 3362 u64 data_bytenr) 3363 { 3364 u64 space_cache_ino; 3365 struct btrfs_file_extent_item *ei; 3366 struct btrfs_key key; 3367 bool found = false; 3368 int i; 3369 int ret; 3370 3371 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3372 return 0; 3373 3374 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3375 u8 type; 3376 3377 btrfs_item_key_to_cpu(leaf, &key, i); 3378 if (key.type != BTRFS_EXTENT_DATA_KEY) 3379 continue; 3380 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3381 type = btrfs_file_extent_type(leaf, ei); 3382 3383 if ((type == BTRFS_FILE_EXTENT_REG || 3384 type == BTRFS_FILE_EXTENT_PREALLOC) && 3385 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3386 found = true; 3387 space_cache_ino = key.objectid; 3388 break; 3389 } 3390 } 3391 if (!found) 3392 return -ENOENT; 3393 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, 3394 space_cache_ino); 3395 return ret; 3396 } 3397 3398 /* 3399 * helper to find all tree blocks that reference a given data extent 3400 */ 3401 static noinline_for_stack int add_data_references(struct reloc_control *rc, 3402 const struct btrfs_key *extent_key, 3403 struct btrfs_path *path, 3404 struct rb_root *blocks) 3405 { 3406 struct btrfs_backref_walk_ctx ctx = { 0 }; 3407 struct ulist_iterator leaf_uiter; 3408 struct ulist_node *ref_node = NULL; 3409 const u32 blocksize = rc->extent_root->fs_info->nodesize; 3410 int ret = 0; 3411 3412 btrfs_release_path(path); 3413 3414 ctx.bytenr = extent_key->objectid; 3415 ctx.skip_inode_ref_list = true; 3416 ctx.fs_info = rc->extent_root->fs_info; 3417 3418 ret = btrfs_find_all_leafs(&ctx); 3419 if (ret < 0) 3420 return ret; 3421 3422 ULIST_ITER_INIT(&leaf_uiter); 3423 while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { 3424 struct btrfs_tree_parent_check check = { 0 }; 3425 struct extent_buffer *eb; 3426 3427 eb = read_tree_block(ctx.fs_info, ref_node->val, &check); 3428 if (IS_ERR(eb)) { 3429 ret = PTR_ERR(eb); 3430 break; 3431 } 3432 ret = delete_v1_space_cache(eb, rc->block_group, 3433 extent_key->objectid); 3434 free_extent_buffer(eb); 3435 if (ret < 0) 3436 break; 3437 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3438 if (ret < 0) 3439 break; 3440 } 3441 if (ret < 0) 3442 free_block_list(blocks); 3443 ulist_free(ctx.refs); 3444 return ret; 3445 } 3446 3447 /* 3448 * helper to find next unprocessed extent 3449 */ 3450 static noinline_for_stack 3451 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3452 struct btrfs_key *extent_key) 3453 { 3454 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3455 struct btrfs_key key; 3456 struct extent_buffer *leaf; 3457 u64 start, end, last; 3458 int ret; 3459 3460 last = rc->block_group->start + rc->block_group->length; 3461 while (1) { 3462 bool block_found; 3463 3464 cond_resched(); 3465 if (rc->search_start >= last) { 3466 ret = 1; 3467 break; 3468 } 3469 3470 key.objectid = rc->search_start; 3471 key.type = BTRFS_EXTENT_ITEM_KEY; 3472 key.offset = 0; 3473 3474 path->search_commit_root = 1; 3475 path->skip_locking = 1; 3476 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3477 0, 0); 3478 if (ret < 0) 3479 break; 3480 next: 3481 leaf = path->nodes[0]; 3482 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3483 ret = btrfs_next_leaf(rc->extent_root, path); 3484 if (ret != 0) 3485 break; 3486 leaf = path->nodes[0]; 3487 } 3488 3489 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3490 if (key.objectid >= last) { 3491 ret = 1; 3492 break; 3493 } 3494 3495 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3496 key.type != BTRFS_METADATA_ITEM_KEY) { 3497 path->slots[0]++; 3498 goto next; 3499 } 3500 3501 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3502 key.objectid + key.offset <= rc->search_start) { 3503 path->slots[0]++; 3504 goto next; 3505 } 3506 3507 if (key.type == BTRFS_METADATA_ITEM_KEY && 3508 key.objectid + fs_info->nodesize <= 3509 rc->search_start) { 3510 path->slots[0]++; 3511 goto next; 3512 } 3513 3514 block_found = find_first_extent_bit(&rc->processed_blocks, 3515 key.objectid, &start, &end, 3516 EXTENT_DIRTY, NULL); 3517 3518 if (block_found && start <= key.objectid) { 3519 btrfs_release_path(path); 3520 rc->search_start = end + 1; 3521 } else { 3522 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3523 rc->search_start = key.objectid + key.offset; 3524 else 3525 rc->search_start = key.objectid + 3526 fs_info->nodesize; 3527 memcpy(extent_key, &key, sizeof(key)); 3528 return 0; 3529 } 3530 } 3531 btrfs_release_path(path); 3532 return ret; 3533 } 3534 3535 static void set_reloc_control(struct reloc_control *rc) 3536 { 3537 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3538 3539 mutex_lock(&fs_info->reloc_mutex); 3540 fs_info->reloc_ctl = rc; 3541 mutex_unlock(&fs_info->reloc_mutex); 3542 } 3543 3544 static void unset_reloc_control(struct reloc_control *rc) 3545 { 3546 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3547 3548 mutex_lock(&fs_info->reloc_mutex); 3549 fs_info->reloc_ctl = NULL; 3550 mutex_unlock(&fs_info->reloc_mutex); 3551 } 3552 3553 static noinline_for_stack 3554 int prepare_to_relocate(struct reloc_control *rc) 3555 { 3556 struct btrfs_trans_handle *trans; 3557 int ret; 3558 3559 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3560 BTRFS_BLOCK_RSV_TEMP); 3561 if (!rc->block_rsv) 3562 return -ENOMEM; 3563 3564 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3565 rc->search_start = rc->block_group->start; 3566 rc->extents_found = 0; 3567 rc->nodes_relocated = 0; 3568 rc->merging_rsv_size = 0; 3569 rc->reserved_bytes = 0; 3570 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3571 RELOCATION_RESERVED_NODES; 3572 ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, 3573 rc->block_rsv, rc->block_rsv->size, 3574 BTRFS_RESERVE_FLUSH_ALL); 3575 if (ret) 3576 return ret; 3577 3578 rc->create_reloc_tree = true; 3579 set_reloc_control(rc); 3580 3581 trans = btrfs_join_transaction(rc->extent_root); 3582 if (IS_ERR(trans)) { 3583 unset_reloc_control(rc); 3584 /* 3585 * extent tree is not a ref_cow tree and has no reloc_root to 3586 * cleanup. And callers are responsible to free the above 3587 * block rsv. 3588 */ 3589 return PTR_ERR(trans); 3590 } 3591 3592 ret = btrfs_commit_transaction(trans); 3593 if (ret) 3594 unset_reloc_control(rc); 3595 3596 return ret; 3597 } 3598 3599 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3600 { 3601 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3602 struct rb_root blocks = RB_ROOT; 3603 struct btrfs_key key; 3604 struct btrfs_trans_handle *trans = NULL; 3605 struct btrfs_path *path; 3606 struct btrfs_extent_item *ei; 3607 u64 flags; 3608 int ret; 3609 int err = 0; 3610 int progress = 0; 3611 3612 path = btrfs_alloc_path(); 3613 if (!path) 3614 return -ENOMEM; 3615 path->reada = READA_FORWARD; 3616 3617 ret = prepare_to_relocate(rc); 3618 if (ret) { 3619 err = ret; 3620 goto out_free; 3621 } 3622 3623 while (1) { 3624 rc->reserved_bytes = 0; 3625 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 3626 rc->block_rsv->size, 3627 BTRFS_RESERVE_FLUSH_ALL); 3628 if (ret) { 3629 err = ret; 3630 break; 3631 } 3632 progress++; 3633 trans = btrfs_start_transaction(rc->extent_root, 0); 3634 if (IS_ERR(trans)) { 3635 err = PTR_ERR(trans); 3636 trans = NULL; 3637 break; 3638 } 3639 restart: 3640 if (rc->backref_cache.last_trans != trans->transid) 3641 btrfs_backref_release_cache(&rc->backref_cache); 3642 rc->backref_cache.last_trans = trans->transid; 3643 3644 ret = find_next_extent(rc, path, &key); 3645 if (ret < 0) 3646 err = ret; 3647 if (ret != 0) 3648 break; 3649 3650 rc->extents_found++; 3651 3652 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3653 struct btrfs_extent_item); 3654 flags = btrfs_extent_flags(path->nodes[0], ei); 3655 3656 /* 3657 * If we are relocating a simple quota owned extent item, we 3658 * need to note the owner on the reloc data root so that when 3659 * we allocate the replacement item, we can attribute it to the 3660 * correct eventual owner (rather than the reloc data root). 3661 */ 3662 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { 3663 struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; 3664 u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, 3665 path->nodes[0], 3666 path->slots[0]); 3667 3668 root->relocation_src_root = owning_root_id; 3669 } 3670 3671 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3672 ret = add_tree_block(rc, &key, path, &blocks); 3673 } else if (rc->stage == UPDATE_DATA_PTRS && 3674 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3675 ret = add_data_references(rc, &key, path, &blocks); 3676 } else { 3677 btrfs_release_path(path); 3678 ret = 0; 3679 } 3680 if (ret < 0) { 3681 err = ret; 3682 break; 3683 } 3684 3685 if (!RB_EMPTY_ROOT(&blocks)) { 3686 ret = relocate_tree_blocks(trans, rc, &blocks); 3687 if (ret < 0) { 3688 if (ret != -EAGAIN) { 3689 err = ret; 3690 break; 3691 } 3692 rc->extents_found--; 3693 rc->search_start = key.objectid; 3694 } 3695 } 3696 3697 btrfs_end_transaction_throttle(trans); 3698 btrfs_btree_balance_dirty(fs_info); 3699 trans = NULL; 3700 3701 if (rc->stage == MOVE_DATA_EXTENTS && 3702 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3703 rc->found_file_extent = true; 3704 ret = relocate_data_extent(rc, &key); 3705 if (ret < 0) { 3706 err = ret; 3707 break; 3708 } 3709 } 3710 if (btrfs_should_cancel_balance(fs_info)) { 3711 err = -ECANCELED; 3712 break; 3713 } 3714 } 3715 if (trans && progress && err == -ENOSPC) { 3716 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3717 if (ret == 1) { 3718 err = 0; 3719 progress = 0; 3720 goto restart; 3721 } 3722 } 3723 3724 btrfs_release_path(path); 3725 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 3726 3727 if (trans) { 3728 btrfs_end_transaction_throttle(trans); 3729 btrfs_btree_balance_dirty(fs_info); 3730 } 3731 3732 if (!err) { 3733 ret = relocate_file_extent_cluster(rc); 3734 if (ret < 0) 3735 err = ret; 3736 } 3737 3738 rc->create_reloc_tree = false; 3739 set_reloc_control(rc); 3740 3741 btrfs_backref_release_cache(&rc->backref_cache); 3742 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3743 3744 /* 3745 * Even in the case when the relocation is cancelled, we should all go 3746 * through prepare_to_merge() and merge_reloc_roots(). 3747 * 3748 * For error (including cancelled balance), prepare_to_merge() will 3749 * mark all reloc trees orphan, then queue them for cleanup in 3750 * merge_reloc_roots() 3751 */ 3752 err = prepare_to_merge(rc, err); 3753 3754 merge_reloc_roots(rc); 3755 3756 rc->merge_reloc_tree = false; 3757 unset_reloc_control(rc); 3758 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3759 3760 /* get rid of pinned extents */ 3761 trans = btrfs_join_transaction(rc->extent_root); 3762 if (IS_ERR(trans)) { 3763 err = PTR_ERR(trans); 3764 goto out_free; 3765 } 3766 ret = btrfs_commit_transaction(trans); 3767 if (ret && !err) 3768 err = ret; 3769 out_free: 3770 ret = clean_dirty_subvols(rc); 3771 if (ret < 0 && !err) 3772 err = ret; 3773 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3774 btrfs_free_path(path); 3775 return err; 3776 } 3777 3778 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3779 struct btrfs_root *root, u64 objectid) 3780 { 3781 struct btrfs_path *path; 3782 struct btrfs_inode_item *item; 3783 struct extent_buffer *leaf; 3784 int ret; 3785 3786 path = btrfs_alloc_path(); 3787 if (!path) 3788 return -ENOMEM; 3789 3790 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3791 if (ret) 3792 goto out; 3793 3794 leaf = path->nodes[0]; 3795 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3796 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3797 btrfs_set_inode_generation(leaf, item, 1); 3798 btrfs_set_inode_size(leaf, item, 0); 3799 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3800 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3801 BTRFS_INODE_PREALLOC); 3802 btrfs_mark_buffer_dirty(trans, leaf); 3803 out: 3804 btrfs_free_path(path); 3805 return ret; 3806 } 3807 3808 static void delete_orphan_inode(struct btrfs_trans_handle *trans, 3809 struct btrfs_root *root, u64 objectid) 3810 { 3811 struct btrfs_path *path; 3812 struct btrfs_key key; 3813 int ret = 0; 3814 3815 path = btrfs_alloc_path(); 3816 if (!path) { 3817 ret = -ENOMEM; 3818 goto out; 3819 } 3820 3821 key.objectid = objectid; 3822 key.type = BTRFS_INODE_ITEM_KEY; 3823 key.offset = 0; 3824 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3825 if (ret) { 3826 if (ret > 0) 3827 ret = -ENOENT; 3828 goto out; 3829 } 3830 ret = btrfs_del_item(trans, root, path); 3831 out: 3832 if (ret) 3833 btrfs_abort_transaction(trans, ret); 3834 btrfs_free_path(path); 3835 } 3836 3837 /* 3838 * helper to create inode for data relocation. 3839 * the inode is in data relocation tree and its link count is 0 3840 */ 3841 static noinline_for_stack struct inode *create_reloc_inode( 3842 struct btrfs_fs_info *fs_info, 3843 const struct btrfs_block_group *group) 3844 { 3845 struct inode *inode = NULL; 3846 struct btrfs_trans_handle *trans; 3847 struct btrfs_root *root; 3848 u64 objectid; 3849 int ret = 0; 3850 3851 root = btrfs_grab_root(fs_info->data_reloc_root); 3852 trans = btrfs_start_transaction(root, 6); 3853 if (IS_ERR(trans)) { 3854 btrfs_put_root(root); 3855 return ERR_CAST(trans); 3856 } 3857 3858 ret = btrfs_get_free_objectid(root, &objectid); 3859 if (ret) 3860 goto out; 3861 3862 ret = __insert_orphan_inode(trans, root, objectid); 3863 if (ret) 3864 goto out; 3865 3866 inode = btrfs_iget(objectid, root); 3867 if (IS_ERR(inode)) { 3868 delete_orphan_inode(trans, root, objectid); 3869 ret = PTR_ERR(inode); 3870 inode = NULL; 3871 goto out; 3872 } 3873 BTRFS_I(inode)->reloc_block_group_start = group->start; 3874 3875 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 3876 out: 3877 btrfs_put_root(root); 3878 btrfs_end_transaction(trans); 3879 btrfs_btree_balance_dirty(fs_info); 3880 if (ret) { 3881 iput(inode); 3882 inode = ERR_PTR(ret); 3883 } 3884 return inode; 3885 } 3886 3887 /* 3888 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3889 * has been requested meanwhile and don't start in that case. 3890 * 3891 * Return: 3892 * 0 success 3893 * -EINPROGRESS operation is already in progress, that's probably a bug 3894 * -ECANCELED cancellation request was set before the operation started 3895 */ 3896 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3897 { 3898 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3899 /* This should not happen */ 3900 btrfs_err(fs_info, "reloc already running, cannot start"); 3901 return -EINPROGRESS; 3902 } 3903 3904 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3905 btrfs_info(fs_info, "chunk relocation canceled on start"); 3906 /* 3907 * On cancel, clear all requests but let the caller mark 3908 * the end after cleanup operations. 3909 */ 3910 atomic_set(&fs_info->reloc_cancel_req, 0); 3911 return -ECANCELED; 3912 } 3913 return 0; 3914 } 3915 3916 /* 3917 * Mark end of chunk relocation that is cancellable and wake any waiters. 3918 */ 3919 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3920 { 3921 /* Requested after start, clear bit first so any waiters can continue */ 3922 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3923 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3924 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3925 atomic_set(&fs_info->reloc_cancel_req, 0); 3926 } 3927 3928 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3929 { 3930 struct reloc_control *rc; 3931 3932 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3933 if (!rc) 3934 return NULL; 3935 3936 INIT_LIST_HEAD(&rc->reloc_roots); 3937 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3938 btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); 3939 rc->reloc_root_tree.rb_root = RB_ROOT; 3940 spin_lock_init(&rc->reloc_root_tree.lock); 3941 extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); 3942 return rc; 3943 } 3944 3945 static void free_reloc_control(struct reloc_control *rc) 3946 { 3947 struct mapping_node *node, *tmp; 3948 3949 free_reloc_roots(&rc->reloc_roots); 3950 rbtree_postorder_for_each_entry_safe(node, tmp, 3951 &rc->reloc_root_tree.rb_root, rb_node) 3952 kfree(node); 3953 3954 kfree(rc); 3955 } 3956 3957 /* 3958 * Print the block group being relocated 3959 */ 3960 static void describe_relocation(struct btrfs_block_group *block_group) 3961 { 3962 char buf[128] = {'\0'}; 3963 3964 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3965 3966 btrfs_info(block_group->fs_info, "relocating block group %llu flags %s", 3967 block_group->start, buf); 3968 } 3969 3970 static const char *stage_to_string(enum reloc_stage stage) 3971 { 3972 if (stage == MOVE_DATA_EXTENTS) 3973 return "move data extents"; 3974 if (stage == UPDATE_DATA_PTRS) 3975 return "update data pointers"; 3976 return "unknown"; 3977 } 3978 3979 /* 3980 * function to relocate all extents in a block group. 3981 */ 3982 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) 3983 { 3984 struct btrfs_block_group *bg; 3985 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); 3986 struct reloc_control *rc; 3987 struct inode *inode; 3988 struct btrfs_path *path; 3989 int ret; 3990 int rw = 0; 3991 int err = 0; 3992 3993 /* 3994 * This only gets set if we had a half-deleted snapshot on mount. We 3995 * cannot allow relocation to start while we're still trying to clean up 3996 * these pending deletions. 3997 */ 3998 ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); 3999 if (ret) 4000 return ret; 4001 4002 /* We may have been woken up by close_ctree, so bail if we're closing. */ 4003 if (btrfs_fs_closing(fs_info)) 4004 return -EINTR; 4005 4006 bg = btrfs_lookup_block_group(fs_info, group_start); 4007 if (!bg) 4008 return -ENOENT; 4009 4010 /* 4011 * Relocation of a data block group creates ordered extents. Without 4012 * sb_start_write(), we can freeze the filesystem while unfinished 4013 * ordered extents are left. Such ordered extents can cause a deadlock 4014 * e.g. when syncfs() is waiting for their completion but they can't 4015 * finish because they block when joining a transaction, due to the 4016 * fact that the freeze locks are being held in write mode. 4017 */ 4018 if (bg->flags & BTRFS_BLOCK_GROUP_DATA) 4019 ASSERT(sb_write_started(fs_info->sb)); 4020 4021 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 4022 btrfs_put_block_group(bg); 4023 return -ETXTBSY; 4024 } 4025 4026 rc = alloc_reloc_control(fs_info); 4027 if (!rc) { 4028 btrfs_put_block_group(bg); 4029 return -ENOMEM; 4030 } 4031 4032 ret = reloc_chunk_start(fs_info); 4033 if (ret < 0) { 4034 err = ret; 4035 goto out_put_bg; 4036 } 4037 4038 rc->extent_root = extent_root; 4039 rc->block_group = bg; 4040 4041 ret = btrfs_inc_block_group_ro(rc->block_group, true); 4042 if (ret) { 4043 err = ret; 4044 goto out; 4045 } 4046 rw = 1; 4047 4048 path = btrfs_alloc_path(); 4049 if (!path) { 4050 err = -ENOMEM; 4051 goto out; 4052 } 4053 4054 inode = lookup_free_space_inode(rc->block_group, path); 4055 btrfs_free_path(path); 4056 4057 if (!IS_ERR(inode)) 4058 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); 4059 else 4060 ret = PTR_ERR(inode); 4061 4062 if (ret && ret != -ENOENT) { 4063 err = ret; 4064 goto out; 4065 } 4066 4067 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 4068 if (IS_ERR(rc->data_inode)) { 4069 err = PTR_ERR(rc->data_inode); 4070 rc->data_inode = NULL; 4071 goto out; 4072 } 4073 4074 describe_relocation(rc->block_group); 4075 4076 btrfs_wait_block_group_reservations(rc->block_group); 4077 btrfs_wait_nocow_writers(rc->block_group); 4078 btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group); 4079 4080 ret = btrfs_zone_finish(rc->block_group); 4081 WARN_ON(ret && ret != -EAGAIN); 4082 4083 while (1) { 4084 enum reloc_stage finishes_stage; 4085 4086 mutex_lock(&fs_info->cleaner_mutex); 4087 ret = relocate_block_group(rc); 4088 mutex_unlock(&fs_info->cleaner_mutex); 4089 if (ret < 0) 4090 err = ret; 4091 4092 finishes_stage = rc->stage; 4093 /* 4094 * We may have gotten ENOSPC after we already dirtied some 4095 * extents. If writeout happens while we're relocating a 4096 * different block group we could end up hitting the 4097 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 4098 * btrfs_reloc_cow_block. Make sure we write everything out 4099 * properly so we don't trip over this problem, and then break 4100 * out of the loop if we hit an error. 4101 */ 4102 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { 4103 ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, 4104 (u64)-1); 4105 if (ret) 4106 err = ret; 4107 invalidate_mapping_pages(rc->data_inode->i_mapping, 4108 0, -1); 4109 rc->stage = UPDATE_DATA_PTRS; 4110 } 4111 4112 if (err < 0) 4113 goto out; 4114 4115 if (rc->extents_found == 0) 4116 break; 4117 4118 btrfs_info(fs_info, "found %llu extents, stage: %s", 4119 rc->extents_found, stage_to_string(finishes_stage)); 4120 } 4121 4122 WARN_ON(rc->block_group->pinned > 0); 4123 WARN_ON(rc->block_group->reserved > 0); 4124 WARN_ON(rc->block_group->used > 0); 4125 out: 4126 if (err && rw) 4127 btrfs_dec_block_group_ro(rc->block_group); 4128 iput(rc->data_inode); 4129 out_put_bg: 4130 btrfs_put_block_group(bg); 4131 reloc_chunk_end(fs_info); 4132 free_reloc_control(rc); 4133 return err; 4134 } 4135 4136 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) 4137 { 4138 struct btrfs_fs_info *fs_info = root->fs_info; 4139 struct btrfs_trans_handle *trans; 4140 int ret, err; 4141 4142 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4143 if (IS_ERR(trans)) 4144 return PTR_ERR(trans); 4145 4146 memset(&root->root_item.drop_progress, 0, 4147 sizeof(root->root_item.drop_progress)); 4148 btrfs_set_root_drop_level(&root->root_item, 0); 4149 btrfs_set_root_refs(&root->root_item, 0); 4150 ret = btrfs_update_root(trans, fs_info->tree_root, 4151 &root->root_key, &root->root_item); 4152 4153 err = btrfs_end_transaction(trans); 4154 if (err) 4155 return err; 4156 return ret; 4157 } 4158 4159 /* 4160 * recover relocation interrupted by system crash. 4161 * 4162 * this function resumes merging reloc trees with corresponding fs trees. 4163 * this is important for keeping the sharing of tree blocks 4164 */ 4165 int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) 4166 { 4167 LIST_HEAD(reloc_roots); 4168 struct btrfs_key key; 4169 struct btrfs_root *fs_root; 4170 struct btrfs_root *reloc_root; 4171 struct btrfs_path *path; 4172 struct extent_buffer *leaf; 4173 struct reloc_control *rc = NULL; 4174 struct btrfs_trans_handle *trans; 4175 int ret2; 4176 int ret = 0; 4177 4178 path = btrfs_alloc_path(); 4179 if (!path) 4180 return -ENOMEM; 4181 path->reada = READA_BACK; 4182 4183 key.objectid = BTRFS_TREE_RELOC_OBJECTID; 4184 key.type = BTRFS_ROOT_ITEM_KEY; 4185 key.offset = (u64)-1; 4186 4187 while (1) { 4188 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, 4189 path, 0, 0); 4190 if (ret < 0) 4191 goto out; 4192 if (ret > 0) { 4193 if (path->slots[0] == 0) 4194 break; 4195 path->slots[0]--; 4196 } 4197 ret = 0; 4198 leaf = path->nodes[0]; 4199 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4200 btrfs_release_path(path); 4201 4202 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || 4203 key.type != BTRFS_ROOT_ITEM_KEY) 4204 break; 4205 4206 reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key); 4207 if (IS_ERR(reloc_root)) { 4208 ret = PTR_ERR(reloc_root); 4209 goto out; 4210 } 4211 4212 set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); 4213 list_add(&reloc_root->root_list, &reloc_roots); 4214 4215 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 4216 fs_root = btrfs_get_fs_root(fs_info, 4217 reloc_root->root_key.offset, false); 4218 if (IS_ERR(fs_root)) { 4219 ret = PTR_ERR(fs_root); 4220 if (ret != -ENOENT) 4221 goto out; 4222 ret = mark_garbage_root(reloc_root); 4223 if (ret < 0) 4224 goto out; 4225 ret = 0; 4226 } else { 4227 btrfs_put_root(fs_root); 4228 } 4229 } 4230 4231 if (key.offset == 0) 4232 break; 4233 4234 key.offset--; 4235 } 4236 btrfs_release_path(path); 4237 4238 if (list_empty(&reloc_roots)) 4239 goto out; 4240 4241 rc = alloc_reloc_control(fs_info); 4242 if (!rc) { 4243 ret = -ENOMEM; 4244 goto out; 4245 } 4246 4247 ret = reloc_chunk_start(fs_info); 4248 if (ret < 0) 4249 goto out_end; 4250 4251 rc->extent_root = btrfs_extent_root(fs_info, 0); 4252 4253 set_reloc_control(rc); 4254 4255 trans = btrfs_join_transaction(rc->extent_root); 4256 if (IS_ERR(trans)) { 4257 ret = PTR_ERR(trans); 4258 goto out_unset; 4259 } 4260 4261 rc->merge_reloc_tree = true; 4262 4263 while (!list_empty(&reloc_roots)) { 4264 reloc_root = list_entry(reloc_roots.next, 4265 struct btrfs_root, root_list); 4266 list_del(&reloc_root->root_list); 4267 4268 if (btrfs_root_refs(&reloc_root->root_item) == 0) { 4269 list_add_tail(&reloc_root->root_list, 4270 &rc->reloc_roots); 4271 continue; 4272 } 4273 4274 fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, 4275 false); 4276 if (IS_ERR(fs_root)) { 4277 ret = PTR_ERR(fs_root); 4278 list_add_tail(&reloc_root->root_list, &reloc_roots); 4279 btrfs_end_transaction(trans); 4280 goto out_unset; 4281 } 4282 4283 ret = __add_reloc_root(reloc_root); 4284 ASSERT(ret != -EEXIST); 4285 if (ret) { 4286 list_add_tail(&reloc_root->root_list, &reloc_roots); 4287 btrfs_put_root(fs_root); 4288 btrfs_end_transaction(trans); 4289 goto out_unset; 4290 } 4291 fs_root->reloc_root = btrfs_grab_root(reloc_root); 4292 btrfs_put_root(fs_root); 4293 } 4294 4295 ret = btrfs_commit_transaction(trans); 4296 if (ret) 4297 goto out_unset; 4298 4299 merge_reloc_roots(rc); 4300 4301 unset_reloc_control(rc); 4302 4303 trans = btrfs_join_transaction(rc->extent_root); 4304 if (IS_ERR(trans)) { 4305 ret = PTR_ERR(trans); 4306 goto out_clean; 4307 } 4308 ret = btrfs_commit_transaction(trans); 4309 out_clean: 4310 ret2 = clean_dirty_subvols(rc); 4311 if (ret2 < 0 && !ret) 4312 ret = ret2; 4313 out_unset: 4314 unset_reloc_control(rc); 4315 out_end: 4316 reloc_chunk_end(fs_info); 4317 free_reloc_control(rc); 4318 out: 4319 free_reloc_roots(&reloc_roots); 4320 4321 btrfs_free_path(path); 4322 4323 if (ret == 0) { 4324 /* cleanup orphan inode in data relocation tree */ 4325 fs_root = btrfs_grab_root(fs_info->data_reloc_root); 4326 ASSERT(fs_root); 4327 ret = btrfs_orphan_cleanup(fs_root); 4328 btrfs_put_root(fs_root); 4329 } 4330 return ret; 4331 } 4332 4333 /* 4334 * helper to add ordered checksum for data relocation. 4335 * 4336 * cloning checksum properly handles the nodatasum extents. 4337 * it also saves CPU time to re-calculate the checksum. 4338 */ 4339 int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) 4340 { 4341 struct btrfs_inode *inode = ordered->inode; 4342 struct btrfs_fs_info *fs_info = inode->root->fs_info; 4343 u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start; 4344 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr); 4345 LIST_HEAD(list); 4346 int ret; 4347 4348 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, 4349 disk_bytenr + ordered->num_bytes - 1, 4350 &list, false); 4351 if (ret < 0) { 4352 btrfs_mark_ordered_extent_error(ordered); 4353 return ret; 4354 } 4355 4356 while (!list_empty(&list)) { 4357 struct btrfs_ordered_sum *sums = 4358 list_entry(list.next, struct btrfs_ordered_sum, list); 4359 4360 list_del_init(&sums->list); 4361 4362 /* 4363 * We need to offset the new_bytenr based on where the csum is. 4364 * We need to do this because we will read in entire prealloc 4365 * extents but we may have written to say the middle of the 4366 * prealloc extent, so we need to make sure the csum goes with 4367 * the right disk offset. 4368 * 4369 * We can do this because the data reloc inode refers strictly 4370 * to the on disk bytes, so we don't have to worry about 4371 * disk_len vs real len like with real inodes since it's all 4372 * disk length. 4373 */ 4374 sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr; 4375 btrfs_add_ordered_sum(ordered, sums); 4376 } 4377 4378 return 0; 4379 } 4380 4381 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, 4382 struct btrfs_root *root, 4383 const struct extent_buffer *buf, 4384 struct extent_buffer *cow) 4385 { 4386 struct btrfs_fs_info *fs_info = root->fs_info; 4387 struct reloc_control *rc; 4388 struct btrfs_backref_node *node; 4389 int first_cow = 0; 4390 int level; 4391 int ret = 0; 4392 4393 rc = fs_info->reloc_ctl; 4394 if (!rc) 4395 return 0; 4396 4397 BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root)); 4398 4399 level = btrfs_header_level(buf); 4400 if (btrfs_header_generation(buf) <= 4401 btrfs_root_last_snapshot(&root->root_item)) 4402 first_cow = 1; 4403 4404 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) { 4405 WARN_ON(!first_cow && level == 0); 4406 4407 node = rc->backref_cache.path[level]; 4408 BUG_ON(node->bytenr != buf->start && 4409 node->new_bytenr != buf->start); 4410 4411 btrfs_backref_drop_node_buffer(node); 4412 atomic_inc(&cow->refs); 4413 node->eb = cow; 4414 node->new_bytenr = cow->start; 4415 4416 if (!node->pending) { 4417 list_move_tail(&node->list, 4418 &rc->backref_cache.pending[level]); 4419 node->pending = 1; 4420 } 4421 4422 if (first_cow) 4423 mark_block_processed(rc, node); 4424 4425 if (first_cow && level > 0) 4426 rc->nodes_relocated += buf->len; 4427 } 4428 4429 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) 4430 ret = replace_file_extents(trans, rc, root, cow); 4431 return ret; 4432 } 4433 4434 /* 4435 * called before creating snapshot. it calculates metadata reservation 4436 * required for relocating tree blocks in the snapshot 4437 */ 4438 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, 4439 u64 *bytes_to_reserve) 4440 { 4441 struct btrfs_root *root = pending->root; 4442 struct reloc_control *rc = root->fs_info->reloc_ctl; 4443 4444 if (!rc || !have_reloc_root(root)) 4445 return; 4446 4447 if (!rc->merge_reloc_tree) 4448 return; 4449 4450 root = root->reloc_root; 4451 BUG_ON(btrfs_root_refs(&root->root_item) == 0); 4452 /* 4453 * relocation is in the stage of merging trees. the space 4454 * used by merging a reloc tree is twice the size of 4455 * relocated tree nodes in the worst case. half for cowing 4456 * the reloc tree, half for cowing the fs tree. the space 4457 * used by cowing the reloc tree will be freed after the 4458 * tree is dropped. if we create snapshot, cowing the fs 4459 * tree may use more space than it frees. so we need 4460 * reserve extra space. 4461 */ 4462 *bytes_to_reserve += rc->nodes_relocated; 4463 } 4464 4465 /* 4466 * called after snapshot is created. migrate block reservation 4467 * and create reloc root for the newly created snapshot 4468 * 4469 * This is similar to btrfs_init_reloc_root(), we come out of here with two 4470 * references held on the reloc_root, one for root->reloc_root and one for 4471 * rc->reloc_roots. 4472 */ 4473 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, 4474 struct btrfs_pending_snapshot *pending) 4475 { 4476 struct btrfs_root *root = pending->root; 4477 struct btrfs_root *reloc_root; 4478 struct btrfs_root *new_root; 4479 struct reloc_control *rc = root->fs_info->reloc_ctl; 4480 int ret; 4481 4482 if (!rc || !have_reloc_root(root)) 4483 return 0; 4484 4485 rc = root->fs_info->reloc_ctl; 4486 rc->merging_rsv_size += rc->nodes_relocated; 4487 4488 if (rc->merge_reloc_tree) { 4489 ret = btrfs_block_rsv_migrate(&pending->block_rsv, 4490 rc->block_rsv, 4491 rc->nodes_relocated, true); 4492 if (ret) 4493 return ret; 4494 } 4495 4496 new_root = pending->snap; 4497 reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root)); 4498 if (IS_ERR(reloc_root)) 4499 return PTR_ERR(reloc_root); 4500 4501 ret = __add_reloc_root(reloc_root); 4502 ASSERT(ret != -EEXIST); 4503 if (ret) { 4504 /* Pairs with create_reloc_root */ 4505 btrfs_put_root(reloc_root); 4506 return ret; 4507 } 4508 new_root->reloc_root = btrfs_grab_root(reloc_root); 4509 4510 if (rc->create_reloc_tree) 4511 ret = clone_backref_node(trans, rc, root, reloc_root); 4512 return ret; 4513 } 4514 4515 /* 4516 * Get the current bytenr for the block group which is being relocated. 4517 * 4518 * Return U64_MAX if no running relocation. 4519 */ 4520 u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info) 4521 { 4522 u64 logical = U64_MAX; 4523 4524 lockdep_assert_held(&fs_info->reloc_mutex); 4525 4526 if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group) 4527 logical = fs_info->reloc_ctl->block_group->start; 4528 return logical; 4529 } 4530