// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space compaction,
 * and the balance routine to spread chunks over devices.
 *
 *		Before		|		After
 * ------------------------------------------------------------------
 * BG A: 10 data extents	| BG A: deleted
 * BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 * BG C:  1 data extent		| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
 */
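/*
 * Roughly, a data block group is relocated in two passes (see enum
 * reloc_stage below): MOVE_DATA_EXTENTS copies the data into the data reloc
 * inode at its new location, then UPDATE_DATA_PTRS rewrites the file extent
 * items in the affected trees to point at the new bytenr (see
 * replace_file_extents()).
 */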
#define RELOCATION_RESERVED_NODES	256
/*
 * Map the address of a tree root to the corresponding tree.
 */
struct mapping_node {
	union {
		/* Use rb_simple_node for search/insert */
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};

		struct rb_simple_node simple_node;
	};
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * Represent a tree block to process.
 */
struct tree_block {
	union {
		/* Use rb_simple_node for search/insert */
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};

		struct rb_simple_node simple_node;
	};
	u64 owner;
	struct btrfs_key key;
	u8 level;
	bool key_ready;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
	u64 owning_root;
};

/* Stages of data relocation. */
enum reloc_stage {
	MOVE_DATA_EXTENTS,
	UPDATE_DATA_PTRS
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks that have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	enum reloc_stage stage;
	bool create_reloc_tree;
	bool merge_reloc_tree;
	bool found_file_extent;
};
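/*
 * Note: there is at most one reloc_control per filesystem.  It is installed
 * in fs_info->reloc_ctl for the duration of a relocation run (see the
 * ASSERT in find_reloc_root() below), which is how the hooks in this file
 * reach it without it being passed in explicitly.
 */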
static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr,
				     node->bytenr + blocksize - 1, EXTENT_DIRTY,
				     NULL);
	}
	node->processed = 1;
}

/*
 * Walk up the backref nodes until we reach the node that represents the
 * tree root.
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
					list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * Walk down the backref nodes to find the start of the next reference path.
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge,
					list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}
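/*
 * walk_up_backref() and walk_down_backref() together enumerate every
 * reference path from a node to the tree roots above it: walking up always
 * follows the first upper edge until a root is reached, and walking down
 * backtracks to the deepest node that still has an unvisited sibling edge.
 * The @edges array plus @index form the explicit DFS stack.
 */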
static bool reloc_root_is_dead(const struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after swap is considered dead, thus not considered valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(const struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return true;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return false;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return false;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the backref
	 * node for the fs tree root is useless for relocation.
	 */
	return true;
}

/*
 * Find reloc tree by address of tree root.
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeping nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_first_entry(&cur->lower, struct btrfs_backref_edge,
						list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}
/*
 * Build backref tree for a given tree block. The root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond to
 * roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when a tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct btrfs_trans_handle *trans,
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		ret = -ENOMEM;
		goto out;
	}

	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
						  node_key, cur);
		if (ret < 0)
			goto out;

		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0)
		goto out;

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_free_path(iter->path);
	kfree(iter);
	btrfs_free_path(path);
	if (ret) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(ret);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}

/*
 * Helper to add the 'address of tree root -> reloc tree' mapping.
 */
static int __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			  "Duplicate root found for start=%llu while inserting into relocation tree",
			  node->bytenr);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}
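/*
 * Note that the mapping inserted above is keyed by the bytenr of the reloc
 * root's *commit* root.  Once the reloc root gets a new root node,
 * __update_reloc_root() below re-keys the entry to the new node's bytenr.
 */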
/*
 * Helper to delete the 'address of tree root -> reloc tree' mapping.
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list. There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * Helper to update the 'address of tree root -> reloc tree' mapping.
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (btrfs_root_id(root) == objectid) {
		u64 commit_root_gen;

		/*
		 * Relocation will wait for cleaner thread, and any half-dropped
		 * subvolume will be fully cleaned up at mount time.
		 * So here we shouldn't hit a subvolume with non-zero drop_progress.
		 *
		 * If this isn't the case, error out since it can make us attempt to
		 * drop references for extents that were already dropped before.
		 */
		if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
			struct btrfs_key cpu_key;

			btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
			btrfs_err(fs_info,
	"cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)",
				  objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset);
			ret = -EUCLEAN;
			goto fail;
		}

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) or when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * Called by btrfs_reloc_post_snapshot_hook.
		 * The source tree is a reloc tree, all tree blocks
		 * modified after it was created have RELOC flag
		 * set in their headers. So it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (btrfs_root_id(root) == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	btrfs_set_root_last_trans(reloc_root, trans->transid);
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}
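/*
 * All reloc roots share the objectid BTRFS_TREE_RELOC_OBJECTID; the source
 * root is identified by root_key.offset instead (set to the source root's
 * id above).  This is why prepare_to_merge() and merge_reloc_roots() below
 * resolve the fs root via reloc_root->root_key.offset.
 */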
/*
 * Create a reloc tree for a given fs tree. The reloc tree is just a
 * snapshot of the fs tree with a special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important. We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root. This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees. Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}
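/*
 * As the comment above about record_root_in_transaction suggests, this hook
 * is expected to be reached when the fs root is recorded in a transaction,
 * so while relocation is active every dirtied shareable root should get its
 * reloc tree created here automatically.
 */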
/*
 * Update the root item of a reloc tree.
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root. We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * Get the new location of data.
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
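/*
 * Roughly speaking, the data reloc inode mirrors the target block group,
 * with file offset 0 mapped to the start of the block group (hence the
 * subtraction of reloc_block_group_start above).  For example, if the block
 * group starts at bytenr 1G and an extent lived at bytenr 1G + 4M, the
 * relocated copy sits at file offset 4M in the data reloc inode, and the
 * file extent item there yields the new disk_bytenr.
 */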
/*
 * Update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * If we are modifying a block in the fs tree, wait for
		 * read_folio to complete and drop the extent cache.
		 */
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = btrfs_find_first_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(inode) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = btrfs_find_first_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(inode) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				/* Take mmap lock to serialize with reflinks. */
				if (!down_read_trylock(&inode->i_mmap_lock))
					continue;
				ret = btrfs_try_lock_extent(&inode->io_tree, key.offset,
							    end, &cached_state);
				if (!ret) {
					up_read(&inode->i_mmap_lock);
					continue;
				}

				btrfs_drop_extent_map_range(inode, key.offset, end, true);
				btrfs_unlock_extent(&inode->io_tree, key.offset, end,
						    &cached_state);
				up_read(&inode->i_mmap_lock);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
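		/*
		 * The extent ref updates below mirror the disk_bytenr rewrite
		 * above: add a ref at the new location and drop the ref at the
		 * old one, with identical owner and offset information.
		 */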
		key.offset -= btrfs_file_extent_offset(leaf, fi);
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
					       int slot, const struct btrfs_path *path,
					       int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;
	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * Try to replace tree blocks in fs tree with the new blocks
 * in reloc tree. Tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * If a block was replaced, level of the block + 1 is returned.
 * If no block got replaced, 0 is returned. If there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, 0, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							 path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * Swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
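		/*
		 * Mirror the pointer swap in the extent refs: the fs tree
		 * gains a ref on the new block and drops its ref on the old
		 * one, while the reloc tree does the opposite (it now points
		 * at the old block and no longer at the new one).
		 */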
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = btrfs_root_id(src);
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(dest);
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * Helper to find the next relocated block in the reloc tree.
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * Walk down the reloc tree to find the relocated block of the lowest level.
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}
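/*
 * Both walkers prune on node pointer generation: a slot whose generation is
 * <= last_snapshot points to a block that predates the reloc tree and thus
 * was never relocated, so the walk only descends into branches that contain
 * relocated blocks.
 */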
/*
 * Invalidate the extent cache for file extents whose keys are in the range
 * [min_key, max_key).
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   const struct btrfs_key *min_key,
				   const struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		if (inode)
			iput(&inode->vfs_inode);

		if (objectid > max_key->objectid)
			break;

		inode = btrfs_find_first_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(inode);

		if (ino > max_key->objectid) {
			iput(&inode->vfs_inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->vfs_inode.i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for read_folio to complete */
		btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(inode, start, end, true);
		btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
	}
	return 0;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert current subvolume into reloc_control::dirty_subvol_roots
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
	       sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}

static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root. If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}
/*
 * Merge the relocated tree blocks in the reloc tree with the corresponding
 * fs tree.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		refcount_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between reloc tree and subvolume tree. Thus for tree
	 * block COW, we COW at most from level 1 to root level for each tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * multiplied by 2 since we have two trees to COW.
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
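	/*
	 * For example, with the default 16K nodesize and a level 3 reloc root
	 * this reserves 16K * 3 * 2 = 96K before each iteration of the loop
	 * below.
	 */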
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list. At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * Save the merging progress in the drop_progress.
		 * This is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * Handle the case where only one block in the fs tree needs to be
	 * relocated and the block is the tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}
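/*
 * Note that merging is restartable: progress is persisted in the reloc
 * root's drop_progress/drop_level fields above, so if the merge is
 * interrupted, the next invocation (including one resumed through recovery)
 * picks up from the saved key instead of starting over.
 */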
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_first_entry(&rc->reloc_roots,
					      struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (IS_ERR(root)) {
			/*
			 * Even if we have an error we need this reloc root
			 * back on our list so we can clean up properly.
			 */
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
			if (!err)
				err = PTR_ERR(root);
			break;
		}

		if (unlikely(root->reloc_root != reloc_root)) {
			if (root->reloc_root) {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(root->reloc_root),
					  root->reloc_root->root_key.type,
					  root->reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &root->reloc_root->root_item),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			} else {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			}
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(root);
			btrfs_abort_transaction(trans, -EUCLEAN);
			if (!err)
				err = -EUCLEAN;
			break;
		}

		/*
		 * Set reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging.
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		ret = btrfs_update_reloc_root(trans, root);

		/*
		 * Even if we have an error we need this reloc root back on our
		 * list so we can clean up properly.
		 */
		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (!err)
				err = ret;
			break;
		}
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		err = btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}
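/*
 * The commit above persists every reloc root item with refs == 1, which,
 * per the comment in the loop, is the on-disk marker that lets
 * btrfs_recover_relocation() resume an interrupted merge; roots with
 * refs == 0 are instead treated as orphans and cleaned up (see
 * merge_reloc_roots() below).
 */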
static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
	struct btrfs_root *reloc_root, *tmp;

	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
		__del_reloc_root(reloc_root);
}

static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_root *root;
	struct btrfs_root *reloc_root;
	LIST_HEAD(reloc_roots);
	int found = 0;
	int ret = 0;
again:
	root = rc->extent_root;

	/*
	 * This serializes us with btrfs_record_root_in_transaction,
	 * we have to make sure nobody is in the middle of
	 * adding their roots to the list while we are
	 * doing this splice.
	 */
	mutex_lock(&fs_info->reloc_mutex);
	list_splice_init(&rc->reloc_roots, &reloc_roots);
	mutex_unlock(&fs_info->reloc_mutex);

	while (!list_empty(&reloc_roots)) {
		found = 1;
		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			if (WARN_ON(IS_ERR(root))) {
				/*
				 * For recovery we read the fs roots on mount,
				 * and if we didn't find the root then we marked
				 * the reloc root as a garbage root. For normal
				 * relocation obviously the root should exist in
				 * memory. However there's no reason we can't
				 * handle the error properly here just in case.
				 */
				ret = PTR_ERR(root);
				goto out;
			}
			if (WARN_ON(root->reloc_root != reloc_root)) {
				/*
				 * This can happen if on-disk metadata has some
				 * corruption, e.g. bad reloc tree key offset.
				 */
				ret = -EINVAL;
				goto out;
			}
			ret = merge_reloc_root(rc, root);
			btrfs_put_root(root);
			if (ret) {
				if (list_empty(&reloc_root->root_list))
					list_add_tail(&reloc_root->root_list,
						      &reloc_roots);
				goto out;
			}
		} else {
			if (!IS_ERR(root)) {
				if (root->reloc_root == reloc_root) {
					root->reloc_root = NULL;
					btrfs_put_root(reloc_root);
				}
				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
					  &root->state);
				btrfs_put_root(root);
			}

			list_del_init(&reloc_root->root_list);
			/* Don't forget to queue this reloc root for cleanup */
			list_add_tail(&reloc_root->reloc_dirty_list,
				      &rc->dirty_subvol_roots);
		}
	}

	if (found) {
		found = 0;
		goto again;
	}
out:
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret, NULL);
		free_reloc_roots(&reloc_roots);

		/* new reloc root may be added */
		mutex_lock(&fs_info->reloc_mutex);
		list_splice_init(&rc->reloc_roots, &reloc_roots);
		mutex_unlock(&fs_info->reloc_mutex);
		free_reloc_roots(&reloc_roots);
	}

	/*
	 * We used to have
	 *
	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
	 *
	 * here, but it's wrong. If we fail to start the transaction in
	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
	 * have actually been removed from the reloc_root_tree rb tree. This is
	 * fine because we're bailing here, and we hold a reference on the root
	 * for the list that holds it, so these roots will be cleaned up when we
	 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
	 * will be cleaned up on unmount.
	 *
	 * The remaining nodes will be cleaned up by free_reloc_control.
	 */
}

static void free_block_list(struct rb_root *blocks)
{
	struct tree_block *block;
	struct rb_node *rb_node;
	while ((rb_node = rb_first(blocks))) {
		block = rb_entry(rb_node, struct tree_block, rb_node);
		rb_erase(rb_node, blocks);
		kfree(block);
	}
}

static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
				      struct btrfs_root *reloc_root)
{
	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
	struct btrfs_root *root;
	int ret;

	if (btrfs_get_root_last_trans(reloc_root) == trans->transid)
		return 0;

	root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);

	/*
	 * This should succeed, since we can't have a reloc root without having
	 * already looked up the actual root and created the reloc root for this
	 * root.
	 *
	 * However if there's some sort of corruption where we have a ref to a
	 * reloc root without a corresponding root this could return ENOENT.
	 */
	if (IS_ERR(root)) {
		DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root));
		return PTR_ERR(root);
	}
	if (root->reloc_root != reloc_root) {
		DEBUG_WARN("unexpected reloc root found");
		btrfs_err(fs_info,
			  "root %llu has two reloc roots associated with it",
			  reloc_root->root_key.offset);
		btrfs_put_root(root);
		return -EUCLEAN;
	}
	ret = btrfs_record_root_in_trans(trans, root);
	btrfs_put_root(root);

	return ret;
}

static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
				     struct reloc_control *rc,
				     struct btrfs_backref_node *node,
				     struct btrfs_backref_edge *edges[])
{
	struct btrfs_backref_node *next;
	struct btrfs_root *root;
	int index = 0;
	int ret;

	next = walk_up_backref(node, edges, &index);
	root = next->root;

	/*
	 * If there is no root, then our references for this block are
	 * incomplete, as we should be able to walk all the way up to a block
	 * that is owned by a root.
	 *
	 * This path is only for SHAREABLE roots, so if we come upon a
	 * non-SHAREABLE root then we have backrefs that resolve improperly.
	 *
	 * Both of these cases indicate file system corruption, or a bug in the
	 * backref walking code.
	 */
	if (unlikely(!root)) {
		btrfs_err(trans->fs_info,
			  "bytenr %llu doesn't have a backref path ending in a root",
			  node->bytenr);
		return ERR_PTR(-EUCLEAN);
	}
	if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
		btrfs_err(trans->fs_info,
			  "bytenr %llu has multiple refs with one ending in a non-shareable root",
			  node->bytenr);
		return ERR_PTR(-EUCLEAN);
	}

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
		ret = record_reloc_root_in_trans(trans, root);
		if (ret)
			return ERR_PTR(ret);
		goto found;
	}

	ret = btrfs_record_root_in_trans(trans, root);
	if (ret)
		return ERR_PTR(ret);
	root = root->reloc_root;

	/*
	 * We could have raced with another thread which failed, so
	 * root->reloc_root may not be set, return ENOENT in this case.
	 */
	if (!root)
		return ERR_PTR(-ENOENT);

	if (next->new_bytenr) {
		/*
		 * We just created the reloc root, so we shouldn't have
		 * ->new_bytenr set yet. If it is then we have multiple roots
		 * pointing at the same bytenr which indicates corruption, or
		 * we've made a mistake in the backref walking code.
		 */
		ASSERT(next->new_bytenr == 0);
		btrfs_err(trans->fs_info,
			  "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
			  node->bytenr, next->bytenr);
		return ERR_PTR(-EUCLEAN);
	}

	next->new_bytenr = root->node->start;
	btrfs_put_root(next->root);
	next->root = btrfs_grab_root(root);
	ASSERT(next->root);
	mark_block_processed(rc, next);
found:
	next = node;
	/* setup backref node path for btrfs_reloc_cow_block */
	while (1) {
		rc->backref_cache.path[next->level] = next;
		if (--index < 0)
			break;
		next = edges[index]->node[UPPER];
	}
	return root;
}

/*
 * Select a tree root for relocation.
 *
 * Return NULL if the block is shareable but not the root block of its tree;
 * do_relocation() must be used in that case.
 *
 * Return a tree root pointer if the block is non-shareable, or if it is the
 * root block of a shareable tree.
 * Return -ENOENT if the block is only referenced by reloc trees.
 */
static noinline_for_stack
struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *next;
	struct btrfs_root *root;
	struct btrfs_root *fs_root = NULL;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	int index = 0;

	next = node;
	while (1) {
		cond_resched();
		next = walk_up_backref(next, edges, &index);
		root = next->root;

		/*
		 * This can occur if we have incomplete extent refs leading all
		 * the way up a particular path; in this case return -EUCLEAN.
		 */
		if (!root)
			return ERR_PTR(-EUCLEAN);

		/* No other choice for a non-shareable tree. */
		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
			return root;

		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID)
			fs_root = root;

		if (next != node)
			return NULL;

		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}

	if (!fs_root)
		return ERR_PTR(-ENOENT);
	return fs_root;
}

static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
						  struct btrfs_backref_node *node)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_backref_node *next = node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	u64 num_bytes = 0;
	int index = 0;

	BUG_ON(node->processed);

	while (next) {
		cond_resched();
		while (1) {
			if (next->processed)
				break;

			num_bytes += fs_info->nodesize;

			if (list_empty(&next->upper))
				break;

			edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
						list[LOWER]);
			edges[index++] = edge;
			next = edge->node[UPPER];
		}
		next = walk_down_backref(edges, &index);
	}
	return num_bytes;
}

static int refill_metadata_space(struct btrfs_trans_handle *trans,
				 struct reloc_control *rc, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	trans->block_rsv = rc->block_rsv;
	rc->reserved_bytes += num_bytes;

	/*
	 * We are under a transaction here so we can only do limited flushing.
	 * If we get an enospc just kick back -EAGAIN so we know to drop the
	 * transaction and try to refill when we can flush all the things.
	 */
	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
				     BTRFS_RESERVE_FLUSH_LIMIT);
	if (ret) {
		u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;

		while (tmp <= rc->reserved_bytes)
			tmp <<= 1;
		/*
		 * Only one thread can access block_rsv at this point, so we
		 * don't need to hold a lock to protect it.
		 *
		 * We expand the reservation size here to allow enough space
		 * for relocation, and we will return early in the enospc case.
		 */
		rc->block_rsv->size = tmp + fs_info->nodesize *
				      RELOCATION_RESERVED_NODES;
		return -EAGAIN;
	}

	return 0;
}

static int reserve_metadata_space(struct btrfs_trans_handle *trans,
				  struct reloc_control *rc,
				  struct btrfs_backref_node *node)
{
	u64 num_bytes;

	num_bytes = calcu_metadata_size(rc, node) * 2;
	return refill_metadata_space(trans, rc, num_bytes);
}

/*
 * Relocate a tree block, and then update pointers in upper level blocks that
 * reference the block to point to the new location.
 *
 * If called by link_to_upper, the block has already been relocated; in that
 * case this function just updates pointers.
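 *
 * Rough shape of the per-upper-edge work done below (a sketch, not verbatim):
 *
 *	list_for_each_entry(edge, &node->upper, list[LOWER]) {
 *		upper = edge->node[UPPER];
 *		root = select_reloc_root(trans, rc, upper, edges);
 *		...find and validate the slot of node in upper->eb...
 *		if (!node->eb)
 *			btrfs_cow_block(...);	(first relocation of the block)
 *		else
 *			...rewrite blockptr + add new ref, drop old subtree...
 *	}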
2201 */ 2202 static int do_relocation(struct btrfs_trans_handle *trans, 2203 struct reloc_control *rc, 2204 struct btrfs_backref_node *node, 2205 struct btrfs_key *key, 2206 struct btrfs_path *path, int lowest) 2207 { 2208 struct btrfs_backref_node *upper; 2209 struct btrfs_backref_edge *edge; 2210 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2211 struct btrfs_root *root; 2212 struct extent_buffer *eb; 2213 u32 blocksize; 2214 u64 bytenr; 2215 int slot; 2216 int ret = 0; 2217 2218 /* 2219 * If we are lowest then this is the first time we're processing this 2220 * block, and thus shouldn't have an eb associated with it yet. 2221 */ 2222 ASSERT(!lowest || !node->eb); 2223 2224 path->lowest_level = node->level + 1; 2225 rc->backref_cache.path[node->level] = node; 2226 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2227 cond_resched(); 2228 2229 upper = edge->node[UPPER]; 2230 root = select_reloc_root(trans, rc, upper, edges); 2231 if (IS_ERR(root)) { 2232 ret = PTR_ERR(root); 2233 goto next; 2234 } 2235 2236 if (upper->eb && !upper->locked) { 2237 if (!lowest) { 2238 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2239 if (ret < 0) 2240 goto next; 2241 BUG_ON(ret); 2242 bytenr = btrfs_node_blockptr(upper->eb, slot); 2243 if (node->eb->start == bytenr) 2244 goto next; 2245 } 2246 btrfs_backref_drop_node_buffer(upper); 2247 } 2248 2249 if (!upper->eb) { 2250 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2251 if (ret) { 2252 if (ret > 0) 2253 ret = -ENOENT; 2254 2255 btrfs_release_path(path); 2256 break; 2257 } 2258 2259 if (!upper->eb) { 2260 upper->eb = path->nodes[upper->level]; 2261 path->nodes[upper->level] = NULL; 2262 } else { 2263 BUG_ON(upper->eb != path->nodes[upper->level]); 2264 } 2265 2266 upper->locked = 1; 2267 path->locks[upper->level] = 0; 2268 2269 slot = path->slots[upper->level]; 2270 btrfs_release_path(path); 2271 } else { 2272 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2273 if (ret < 0) 2274 goto next; 2275 BUG_ON(ret); 2276 } 2277 2278 bytenr = btrfs_node_blockptr(upper->eb, slot); 2279 if (lowest) { 2280 if (bytenr != node->bytenr) { 2281 btrfs_err(root->fs_info, 2282 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2283 bytenr, node->bytenr, slot, 2284 upper->eb->start); 2285 ret = -EIO; 2286 goto next; 2287 } 2288 } else { 2289 if (node->eb->start == bytenr) 2290 goto next; 2291 } 2292 2293 blocksize = root->fs_info->nodesize; 2294 eb = btrfs_read_node_slot(upper->eb, slot); 2295 if (IS_ERR(eb)) { 2296 ret = PTR_ERR(eb); 2297 goto next; 2298 } 2299 btrfs_tree_lock(eb); 2300 2301 if (!node->eb) { 2302 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2303 slot, &eb, BTRFS_NESTING_COW); 2304 btrfs_tree_unlock(eb); 2305 free_extent_buffer(eb); 2306 if (ret < 0) 2307 goto next; 2308 /* 2309 * We've just COWed this block, it should have updated 2310 * the correct backref node entry. 
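			 * (btrfs_reloc_cow_block() locates this node through
			 * the rc->backref_cache.path[] entries that were
			 * filled in by select_reloc_root() above and re-points
			 * node->eb at the new buffer.)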
			 */
			ASSERT(node->eb == eb);
		} else {
			struct btrfs_ref ref = {
				.action = BTRFS_ADD_DELAYED_REF,
				.bytenr = node->eb->start,
				.num_bytes = blocksize,
				.parent = upper->eb->start,
				.owning_root = btrfs_header_owner(upper->eb),
				.ref_root = btrfs_header_owner(upper->eb),
			};

			btrfs_set_node_blockptr(upper->eb, slot,
						node->eb->start);
			btrfs_set_node_ptr_generation(upper->eb, slot,
						      trans->transid);
			btrfs_mark_buffer_dirty(trans, upper->eb);

			btrfs_init_tree_ref(&ref, node->level,
					    btrfs_root_id(root), false);
			ret = btrfs_inc_extent_ref(trans, &ref);
			if (!ret)
				ret = btrfs_drop_subtree(trans, root, eb,
							 upper->eb);
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}
next:
		if (!upper->pending)
			btrfs_backref_drop_node_buffer(upper);
		else
			btrfs_backref_unlock_node_buffer(upper);
		if (ret)
			break;
	}

	if (!ret && node->pending) {
		btrfs_backref_drop_node_buffer(node);
		list_del_init(&node->list);
		node->pending = 0;
	}

	path->lowest_level = 0;

	/*
	 * We should have allocated all of our space in the block rsv and thus
	 * shouldn't ENOSPC.
	 */
	ASSERT(ret != -ENOSPC);
	return ret;
}

static int link_to_upper(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_backref_node *node,
			 struct btrfs_path *path)
{
	struct btrfs_key key;

	btrfs_node_key_to_cpu(node->eb, &key, 0);
	return do_relocation(trans, rc, node, &key, path, 0);
}

static int finish_pending_nodes(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct btrfs_path *path, int err)
{
	LIST_HEAD(list);
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node;
	int level;
	int ret;

	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		while (!list_empty(&cache->pending[level])) {
			node = list_first_entry(&cache->pending[level],
						struct btrfs_backref_node, list);
			list_move_tail(&node->list, &list);
			BUG_ON(!node->pending);

			if (!err) {
				ret = link_to_upper(trans, rc, node, path);
				if (ret < 0)
					err = ret;
			}
		}
		list_splice_init(&list, &cache->pending[level]);
	}
	return err;
}

/*
 * Mark a block and all blocks that directly/indirectly reference the block as
 * processed.
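 *
 * The state lives in the rc->processed_blocks io tree. Whether a given bytenr
 * has been handled is later queried the way tree_block_processed() below does
 * (sketch):
 *
 *	btrfs_test_range_bit(&rc->processed_blocks, bytenr,
 *			     bytenr + blocksize - 1, EXTENT_DIRTY, NULL);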
2405 */ 2406 static void update_processed_blocks(struct reloc_control *rc, 2407 struct btrfs_backref_node *node) 2408 { 2409 struct btrfs_backref_node *next = node; 2410 struct btrfs_backref_edge *edge; 2411 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2412 int index = 0; 2413 2414 while (next) { 2415 cond_resched(); 2416 while (1) { 2417 if (next->processed) 2418 break; 2419 2420 mark_block_processed(rc, next); 2421 2422 if (list_empty(&next->upper)) 2423 break; 2424 2425 edge = list_first_entry(&next->upper, struct btrfs_backref_edge, 2426 list[LOWER]); 2427 edges[index++] = edge; 2428 next = edge->node[UPPER]; 2429 } 2430 next = walk_down_backref(edges, &index); 2431 } 2432 } 2433 2434 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2435 { 2436 u32 blocksize = rc->extent_root->fs_info->nodesize; 2437 2438 if (btrfs_test_range_bit(&rc->processed_blocks, bytenr, 2439 bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) 2440 return 1; 2441 return 0; 2442 } 2443 2444 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2445 struct tree_block *block) 2446 { 2447 struct btrfs_tree_parent_check check = { 2448 .level = block->level, 2449 .owner_root = block->owner, 2450 .transid = block->key.offset 2451 }; 2452 struct extent_buffer *eb; 2453 2454 eb = read_tree_block(fs_info, block->bytenr, &check); 2455 if (IS_ERR(eb)) 2456 return PTR_ERR(eb); 2457 if (!extent_buffer_uptodate(eb)) { 2458 free_extent_buffer(eb); 2459 return -EIO; 2460 } 2461 if (block->level == 0) 2462 btrfs_item_key_to_cpu(eb, &block->key, 0); 2463 else 2464 btrfs_node_key_to_cpu(eb, &block->key, 0); 2465 free_extent_buffer(eb); 2466 block->key_ready = true; 2467 return 0; 2468 } 2469 2470 /* 2471 * helper function to relocate a tree block 2472 */ 2473 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2474 struct reloc_control *rc, 2475 struct btrfs_backref_node *node, 2476 struct btrfs_key *key, 2477 struct btrfs_path *path) 2478 { 2479 struct btrfs_root *root; 2480 int ret = 0; 2481 2482 if (!node) 2483 return 0; 2484 2485 /* 2486 * If we fail here we want to drop our backref_node because we are going 2487 * to start over and regenerate the tree for it. 2488 */ 2489 ret = reserve_metadata_space(trans, rc, node); 2490 if (ret) 2491 goto out; 2492 2493 BUG_ON(node->processed); 2494 root = select_one_root(node); 2495 if (IS_ERR(root)) { 2496 ret = PTR_ERR(root); 2497 2498 /* See explanation in select_one_root for the -EUCLEAN case. */ 2499 ASSERT(ret == -ENOENT); 2500 if (ret == -ENOENT) { 2501 ret = 0; 2502 update_processed_blocks(rc, node); 2503 } 2504 goto out; 2505 } 2506 2507 if (root) { 2508 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2509 /* 2510 * This block was the root block of a root, and this is 2511 * the first time we're processing the block and thus it 2512 * should not have had the ->new_bytenr modified. 2513 * 2514 * However in the case of corruption we could have 2515 * multiple refs pointing to the same block improperly, 2516 * and thus we would trip over these checks. ASSERT() 2517 * for the developer case, because it could indicate a 2518 * bug in the backref code, however error out for a 2519 * normal user in the case of corruption. 
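			 *
			 * (Note: ASSERT() compiles away when
			 * CONFIG_BTRFS_ASSERT is not set, which is why the
			 * explicit check below is still needed for release
			 * builds.)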
2520 */ 2521 ASSERT(node->new_bytenr == 0); 2522 if (node->new_bytenr) { 2523 btrfs_err(root->fs_info, 2524 "bytenr %llu has improper references to it", 2525 node->bytenr); 2526 ret = -EUCLEAN; 2527 goto out; 2528 } 2529 ret = btrfs_record_root_in_trans(trans, root); 2530 if (ret) 2531 goto out; 2532 /* 2533 * Another thread could have failed, need to check if we 2534 * have reloc_root actually set. 2535 */ 2536 if (!root->reloc_root) { 2537 ret = -ENOENT; 2538 goto out; 2539 } 2540 root = root->reloc_root; 2541 node->new_bytenr = root->node->start; 2542 btrfs_put_root(node->root); 2543 node->root = btrfs_grab_root(root); 2544 ASSERT(node->root); 2545 } else { 2546 btrfs_err(root->fs_info, 2547 "bytenr %llu resolved to a non-shareable root", 2548 node->bytenr); 2549 ret = -EUCLEAN; 2550 goto out; 2551 } 2552 if (!ret) 2553 update_processed_blocks(rc, node); 2554 } else { 2555 ret = do_relocation(trans, rc, node, key, path, 1); 2556 } 2557 out: 2558 if (ret || node->level == 0) 2559 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2560 return ret; 2561 } 2562 2563 static int relocate_cowonly_block(struct btrfs_trans_handle *trans, 2564 struct reloc_control *rc, struct tree_block *block, 2565 struct btrfs_path *path) 2566 { 2567 struct btrfs_fs_info *fs_info = trans->fs_info; 2568 struct btrfs_root *root; 2569 u64 num_bytes; 2570 int nr_levels; 2571 int ret; 2572 2573 root = btrfs_get_fs_root(fs_info, block->owner, true); 2574 if (IS_ERR(root)) 2575 return PTR_ERR(root); 2576 2577 nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; 2578 2579 num_bytes = fs_info->nodesize * nr_levels; 2580 ret = refill_metadata_space(trans, rc, num_bytes); 2581 if (ret) { 2582 btrfs_put_root(root); 2583 return ret; 2584 } 2585 path->lowest_level = block->level; 2586 if (root == root->fs_info->chunk_root) 2587 btrfs_reserve_chunk_metadata(trans, false); 2588 2589 ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); 2590 path->lowest_level = 0; 2591 btrfs_release_path(path); 2592 2593 if (root == root->fs_info->chunk_root) 2594 btrfs_trans_release_chunk_metadata(trans); 2595 if (ret > 0) 2596 ret = 0; 2597 btrfs_put_root(root); 2598 2599 return ret; 2600 } 2601 2602 /* 2603 * relocate a list of blocks 2604 */ 2605 static noinline_for_stack 2606 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2607 struct reloc_control *rc, struct rb_root *blocks) 2608 { 2609 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2610 struct btrfs_backref_node *node; 2611 struct btrfs_path *path; 2612 struct tree_block *block; 2613 struct tree_block *next; 2614 int ret = 0; 2615 2616 path = btrfs_alloc_path(); 2617 if (!path) { 2618 ret = -ENOMEM; 2619 goto out_free_blocks; 2620 } 2621 2622 /* Kick in readahead for tree blocks with missing keys */ 2623 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2624 if (!block->key_ready) 2625 btrfs_readahead_tree_block(fs_info, block->bytenr, 2626 block->owner, 0, 2627 block->level); 2628 } 2629 2630 /* Get first keys */ 2631 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2632 if (!block->key_ready) { 2633 ret = get_tree_block_key(fs_info, block); 2634 if (ret) 2635 goto out_free_path; 2636 } 2637 } 2638 2639 /* Do tree relocation */ 2640 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2641 /* 2642 * For COWonly blocks, or the data reloc tree, we only need to 2643 * COW down to the block, there's no need to generate a backref 2644 * tree. 
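		 *
		 * "COW down" covers trees whose owner is not an fs tree (e.g.
		 * the extent, chunk or csum trees) plus the data reloc tree;
		 * for those the branch below calls relocate_cowonly_block()
		 * instead of building a backref tree.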
2645 */ 2646 if (block->owner && 2647 (!btrfs_is_fstree(block->owner) || 2648 block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { 2649 ret = relocate_cowonly_block(trans, rc, block, path); 2650 if (ret) 2651 break; 2652 continue; 2653 } 2654 2655 node = build_backref_tree(trans, rc, &block->key, 2656 block->level, block->bytenr); 2657 if (IS_ERR(node)) { 2658 ret = PTR_ERR(node); 2659 goto out; 2660 } 2661 2662 ret = relocate_tree_block(trans, rc, node, &block->key, 2663 path); 2664 if (ret < 0) 2665 break; 2666 } 2667 out: 2668 ret = finish_pending_nodes(trans, rc, path, ret); 2669 2670 out_free_path: 2671 btrfs_free_path(path); 2672 out_free_blocks: 2673 free_block_list(blocks); 2674 return ret; 2675 } 2676 2677 static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc) 2678 { 2679 const struct file_extent_cluster *cluster = &rc->cluster; 2680 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2681 u64 alloc_hint = 0; 2682 u64 start; 2683 u64 end; 2684 u64 offset = inode->reloc_block_group_start; 2685 u64 num_bytes; 2686 int nr; 2687 int ret = 0; 2688 u64 prealloc_start = cluster->start - offset; 2689 u64 prealloc_end = cluster->end - offset; 2690 u64 cur_offset = prealloc_start; 2691 2692 /* 2693 * For blocksize < folio size case (either bs < page size or large folios), 2694 * beyond i_size, all blocks are filled with zero. 2695 * 2696 * If the current cluster covers the above range, btrfs_do_readpage() 2697 * will skip the read, and relocate_one_folio() will later writeback 2698 * the padding zeros as new data, causing data corruption. 2699 * 2700 * Here we have to invalidate the cache covering our cluster. 2701 */ 2702 ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start, 2703 prealloc_end); 2704 if (ret < 0) 2705 return ret; 2706 2707 BUG_ON(cluster->start != cluster->boundary[0]); 2708 ret = btrfs_alloc_data_chunk_ondemand(inode, 2709 prealloc_end + 1 - prealloc_start); 2710 if (ret) 2711 return ret; 2712 2713 btrfs_inode_lock(inode, 0); 2714 for (nr = 0; nr < cluster->nr; nr++) { 2715 struct extent_state *cached_state = NULL; 2716 2717 start = cluster->boundary[nr] - offset; 2718 if (nr + 1 < cluster->nr) 2719 end = cluster->boundary[nr + 1] - 1 - offset; 2720 else 2721 end = cluster->end - offset; 2722 2723 btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); 2724 num_bytes = end + 1 - start; 2725 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2726 num_bytes, num_bytes, 2727 end + 1, &alloc_hint); 2728 cur_offset = end + 1; 2729 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 2730 if (ret) 2731 break; 2732 } 2733 btrfs_inode_unlock(inode, 0); 2734 2735 if (cur_offset < prealloc_end) 2736 btrfs_free_reserved_data_space_noquota(inode, 2737 prealloc_end + 1 - cur_offset); 2738 return ret; 2739 } 2740 2741 static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc) 2742 { 2743 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2744 struct extent_map *em; 2745 struct extent_state *cached_state = NULL; 2746 u64 offset = inode->reloc_block_group_start; 2747 u64 start = rc->cluster.start - offset; 2748 u64 end = rc->cluster.end - offset; 2749 int ret = 0; 2750 2751 em = btrfs_alloc_extent_map(); 2752 if (!em) 2753 return -ENOMEM; 2754 2755 em->start = start; 2756 em->len = end + 1 - start; 2757 em->disk_bytenr = rc->cluster.start; 2758 em->disk_num_bytes = em->len; 2759 em->ram_bytes = em->len; 2760 em->flags |= EXTENT_FLAG_PINNED; 2761 2762 
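	/*
	 * Note: reads of this file range must return the data at its current
	 * disk location (rc->cluster.start), while writeback goes to the
	 * space preallocated by prealloc_file_extent_cluster(); the
	 * EXTENT_FLAG_PINNED flag set above prevents this mapping from being
	 * merged away in the meantime.
	 */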
btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
	ret = btrfs_replace_extent_map_range(inode, em, false);
	btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
	btrfs_free_extent_map(em);

	return ret;
}

/*
 * Allow error injection to test balance/relocation cancellation
 */
noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
{
	return atomic_read(&fs_info->balance_cancel_req) ||
		atomic_read(&fs_info->reloc_cancel_req) ||
		fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);

static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
				    int cluster_nr)
{
	/* Last extent, use cluster end directly */
	if (cluster_nr >= cluster->nr - 1)
		return cluster->end;

	/* Use next boundary start */
	return cluster->boundary[cluster_nr + 1] - 1;
}

static int relocate_one_folio(struct reloc_control *rc,
			      struct file_ra_state *ra,
			      int *cluster_nr, u64 *file_offset_ret)
{
	const struct file_extent_cluster *cluster = &rc->cluster;
	struct inode *inode = rc->data_inode;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	const u64 orig_file_offset = *file_offset_ret;
	u64 offset = BTRFS_I(inode)->reloc_block_group_start;
	const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT;
	const pgoff_t index = orig_file_offset >> PAGE_SHIFT;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	struct folio *folio;
	u64 folio_start;
	u64 folio_end;
	u64 cur;
	int ret;
	const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);

	ASSERT(index <= last_index);
again:
	folio = filemap_lock_folio(inode->i_mapping, index);
	if (IS_ERR(folio)) {
		/*
		 * On relocation we're doing readahead on the relocation inode,
		 * but if the filesystem is backed by a RAID stripe tree we can
		 * get ENOENT (e.g. due to preallocated extents not being
		 * mapped in the RST) from the lookup.
		 *
		 * But readahead doesn't handle the error and submits invalid
		 * reads to the device, causing assertion failures.
		 */
		if (!use_rst)
			page_cache_sync_readahead(inode->i_mapping, ra, NULL,
						  index, last_index + 1 - index);
		folio = __filemap_get_folio(inode->i_mapping, index,
					    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					    mask);
		if (IS_ERR(folio))
			return PTR_ERR(folio);
	}

	if (folio_test_readahead(folio) && !use_rst)
		page_cache_async_readahead(inode->i_mapping, ra, NULL,
					   folio, last_index + 1 - index);

	if (!folio_test_uptodate(folio)) {
		btrfs_read_folio(NULL, folio);
		folio_lock(folio);
		if (!folio_test_uptodate(folio)) {
			ret = -EIO;
			goto release_folio;
		}
		if (folio->mapping != inode->i_mapping) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
	}

	/*
	 * We could have lost folio private when we dropped the lock to read
	 * the folio above, make sure we set_folio_extent_mapped() here so we
	 * have any of the subpage blocksize stuff we need in place.
2857 */ 2858 ret = set_folio_extent_mapped(folio); 2859 if (ret < 0) 2860 goto release_folio; 2861 2862 folio_start = folio_pos(folio); 2863 folio_end = folio_start + folio_size(folio) - 1; 2864 2865 /* 2866 * Start from the cluster, as for subpage case, the cluster can start 2867 * inside the folio. 2868 */ 2869 cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); 2870 while (cur <= folio_end) { 2871 struct extent_state *cached_state = NULL; 2872 u64 extent_start = cluster->boundary[*cluster_nr] - offset; 2873 u64 extent_end = get_cluster_boundary_end(cluster, 2874 *cluster_nr) - offset; 2875 u64 clamped_start = max(folio_start, extent_start); 2876 u64 clamped_end = min(folio_end, extent_end); 2877 u32 clamped_len = clamped_end + 1 - clamped_start; 2878 2879 /* Reserve metadata for this range */ 2880 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 2881 clamped_len, clamped_len, 2882 false); 2883 if (ret) 2884 goto release_folio; 2885 2886 /* Mark the range delalloc and dirty for later writeback */ 2887 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, 2888 clamped_end, &cached_state); 2889 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, 2890 clamped_end, 0, &cached_state); 2891 if (ret) { 2892 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 2893 clamped_start, clamped_end, 2894 EXTENT_LOCKED | EXTENT_BOUNDARY, 2895 &cached_state); 2896 btrfs_delalloc_release_metadata(BTRFS_I(inode), 2897 clamped_len, true); 2898 btrfs_delalloc_release_extents(BTRFS_I(inode), 2899 clamped_len); 2900 goto release_folio; 2901 } 2902 btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); 2903 2904 /* 2905 * Set the boundary if it's inside the folio. 2906 * Data relocation requires the destination extents to have the 2907 * same size as the source. 2908 * EXTENT_BOUNDARY bit prevents current extent from being merged 2909 * with previous extent. 2910 */ 2911 if (in_range(cluster->boundary[*cluster_nr] - offset, 2912 folio_start, folio_size(folio))) { 2913 u64 boundary_start = cluster->boundary[*cluster_nr] - 2914 offset; 2915 u64 boundary_end = boundary_start + 2916 fs_info->sectorsize - 1; 2917 2918 btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree, 2919 boundary_start, boundary_end, 2920 EXTENT_BOUNDARY, NULL); 2921 } 2922 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 2923 &cached_state); 2924 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 2925 cur += clamped_len; 2926 2927 /* Crossed extent end, go to next extent */ 2928 if (cur >= extent_end) { 2929 (*cluster_nr)++; 2930 /* Just finished the last extent of the cluster, exit. 
*/ 2931 if (*cluster_nr >= cluster->nr) 2932 break; 2933 } 2934 } 2935 folio_unlock(folio); 2936 folio_put(folio); 2937 2938 balance_dirty_pages_ratelimited(inode->i_mapping); 2939 btrfs_throttle(fs_info); 2940 if (btrfs_should_cancel_balance(fs_info)) 2941 ret = -ECANCELED; 2942 *file_offset_ret = folio_end + 1; 2943 return ret; 2944 2945 release_folio: 2946 folio_unlock(folio); 2947 folio_put(folio); 2948 return ret; 2949 } 2950 2951 static int relocate_file_extent_cluster(struct reloc_control *rc) 2952 { 2953 struct inode *inode = rc->data_inode; 2954 const struct file_extent_cluster *cluster = &rc->cluster; 2955 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 2956 u64 cur_file_offset = cluster->start - offset; 2957 struct file_ra_state *ra; 2958 int cluster_nr = 0; 2959 int ret = 0; 2960 2961 if (!cluster->nr) 2962 return 0; 2963 2964 ra = kzalloc(sizeof(*ra), GFP_NOFS); 2965 if (!ra) 2966 return -ENOMEM; 2967 2968 ret = prealloc_file_extent_cluster(rc); 2969 if (ret) 2970 goto out; 2971 2972 file_ra_state_init(ra, inode->i_mapping); 2973 2974 ret = setup_relocation_extent_mapping(rc); 2975 if (ret) 2976 goto out; 2977 2978 while (cur_file_offset < cluster->end - offset) { 2979 ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset); 2980 if (ret) 2981 break; 2982 } 2983 if (ret == 0) 2984 WARN_ON(cluster_nr != cluster->nr); 2985 out: 2986 kfree(ra); 2987 return ret; 2988 } 2989 2990 static noinline_for_stack int relocate_data_extent(struct reloc_control *rc, 2991 const struct btrfs_key *extent_key) 2992 { 2993 struct inode *inode = rc->data_inode; 2994 struct file_extent_cluster *cluster = &rc->cluster; 2995 int ret; 2996 struct btrfs_root *root = BTRFS_I(inode)->root; 2997 2998 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 2999 ret = relocate_file_extent_cluster(rc); 3000 if (ret) 3001 return ret; 3002 cluster->nr = 0; 3003 } 3004 3005 /* 3006 * Under simple quotas, we set root->relocation_src_root when we find 3007 * the extent. If adjacent extents have different owners, we can't merge 3008 * them while relocating. Handle this by storing the owning root that 3009 * started a cluster and if we see an extent from a different root break 3010 * cluster formation (just like the above case of non-adjacent extents). 3011 * 3012 * Without simple quotas, relocation_src_root is always 0, so we should 3013 * never see a mismatch, and it should have no effect on relocation 3014 * clusters. 3015 */ 3016 if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { 3017 u64 tmp = root->relocation_src_root; 3018 3019 /* 3020 * root->relocation_src_root is the state that actually affects 3021 * the preallocation we do here, so set it to the root owning 3022 * the cluster we need to relocate. 3023 */ 3024 root->relocation_src_root = cluster->owning_root; 3025 ret = relocate_file_extent_cluster(rc); 3026 if (ret) 3027 return ret; 3028 cluster->nr = 0; 3029 /* And reset it back for the current extent's owning root. 
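		 *
		 * Bigger picture (sketch): up to MAX_EXTENTS adjacent extents
		 * sharing one owning root are batched into a cluster, with
		 * each extent's start recorded in boundary[]; a gap in the
		 * extents, an owner change, or a full cluster flushes it via
		 * relocate_file_extent_cluster().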
		 */
		root->relocation_src_root = tmp;
	}

	if (!cluster->nr) {
		cluster->start = extent_key->objectid;
		cluster->owning_root = root->relocation_src_root;
	} else {
		BUG_ON(cluster->nr >= MAX_EXTENTS);
	}
	cluster->end = extent_key->objectid + extent_key->offset - 1;
	cluster->boundary[cluster->nr] = extent_key->objectid;
	cluster->nr++;

	if (cluster->nr >= MAX_EXTENTS) {
		ret = relocate_file_extent_cluster(rc);
		if (ret)
			return ret;
		cluster->nr = 0;
	}
	return 0;
}

/*
 * Helper to add a tree block to the list.
 * The major work is getting the generation and level of the block.
 */
static int add_tree_block(struct reloc_control *rc,
			  const struct btrfs_key *extent_key,
			  struct btrfs_path *path,
			  struct rb_root *blocks)
{
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct btrfs_tree_block_info *bi;
	struct tree_block *block;
	struct rb_node *rb_node;
	u32 item_size;
	int level = -1;
	u64 generation;
	u64 owner = 0;

	eb = path->nodes[0];
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
	    item_size >= sizeof(*ei) + sizeof(*bi)) {
		unsigned long ptr = 0, end;

		ei = btrfs_item_ptr(eb, path->slots[0],
				    struct btrfs_extent_item);
		end = (unsigned long)ei + item_size;
		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
			bi = (struct btrfs_tree_block_info *)(ei + 1);
			level = btrfs_tree_block_level(eb, bi);
			ptr = (unsigned long)(bi + 1);
		} else {
			level = (int)extent_key->offset;
			ptr = (unsigned long)(ei + 1);
		}
		generation = btrfs_extent_generation(eb, ei);

		/*
		 * We're reading random blocks without knowing their owner ahead
		 * of time. This is ok most of the time, as all reloc roots and
		 * fs roots have the same lock type. However normal trees do
		 * not, and the only way to know ahead of time is to read the
		 * inline ref offset. We know it's an fs root if
		 *
		 * 1. There's more than one ref.
		 * 2. There's a SHARED_BLOCK_REF_KEY set.
		 * 3. FULL_BACKREF is set on the flags.
		 *
		 * Otherwise it's safe to assume that the ref offset == the
		 * owner of this block, so we can use that when calling
		 * read_tree_block.
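		 *
		 * (For reference: a TREE_BLOCK_REF_KEY inline ref stores the
		 * owning root's objectid in its offset field, while a
		 * SHARED_BLOCK_REF_KEY stores the parent block's bytenr, which
		 * is why only the former is usable as an owner hint below.)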
3105 */ 3106 if (btrfs_extent_refs(eb, ei) == 1 && 3107 !(btrfs_extent_flags(eb, ei) & 3108 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3109 ptr < end) { 3110 struct btrfs_extent_inline_ref *iref; 3111 int type; 3112 3113 iref = (struct btrfs_extent_inline_ref *)ptr; 3114 type = btrfs_get_extent_inline_ref_type(eb, iref, 3115 BTRFS_REF_TYPE_BLOCK); 3116 if (type == BTRFS_REF_TYPE_INVALID) 3117 return -EINVAL; 3118 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3119 owner = btrfs_extent_inline_ref_offset(eb, iref); 3120 } 3121 } else { 3122 btrfs_print_leaf(eb); 3123 btrfs_err(rc->block_group->fs_info, 3124 "unrecognized tree backref at tree block %llu slot %u", 3125 eb->start, path->slots[0]); 3126 btrfs_release_path(path); 3127 return -EUCLEAN; 3128 } 3129 3130 btrfs_release_path(path); 3131 3132 BUG_ON(level == -1); 3133 3134 block = kmalloc(sizeof(*block), GFP_NOFS); 3135 if (!block) 3136 return -ENOMEM; 3137 3138 block->bytenr = extent_key->objectid; 3139 block->key.objectid = rc->extent_root->fs_info->nodesize; 3140 block->key.offset = generation; 3141 block->level = level; 3142 block->key_ready = false; 3143 block->owner = owner; 3144 3145 rb_node = rb_simple_insert(blocks, &block->simple_node); 3146 if (rb_node) 3147 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3148 -EEXIST); 3149 3150 return 0; 3151 } 3152 3153 /* 3154 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3155 */ 3156 static int __add_tree_block(struct reloc_control *rc, 3157 u64 bytenr, u32 blocksize, 3158 struct rb_root *blocks) 3159 { 3160 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3161 struct btrfs_path *path; 3162 struct btrfs_key key; 3163 int ret; 3164 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3165 3166 if (tree_block_processed(bytenr, rc)) 3167 return 0; 3168 3169 if (rb_simple_search(blocks, bytenr)) 3170 return 0; 3171 3172 path = btrfs_alloc_path(); 3173 if (!path) 3174 return -ENOMEM; 3175 again: 3176 key.objectid = bytenr; 3177 if (skinny) { 3178 key.type = BTRFS_METADATA_ITEM_KEY; 3179 key.offset = (u64)-1; 3180 } else { 3181 key.type = BTRFS_EXTENT_ITEM_KEY; 3182 key.offset = blocksize; 3183 } 3184 3185 path->search_commit_root = 1; 3186 path->skip_locking = 1; 3187 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3188 if (ret < 0) 3189 goto out; 3190 3191 if (ret > 0 && skinny) { 3192 if (path->slots[0]) { 3193 path->slots[0]--; 3194 btrfs_item_key_to_cpu(path->nodes[0], &key, 3195 path->slots[0]); 3196 if (key.objectid == bytenr && 3197 (key.type == BTRFS_METADATA_ITEM_KEY || 3198 (key.type == BTRFS_EXTENT_ITEM_KEY && 3199 key.offset == blocksize))) 3200 ret = 0; 3201 } 3202 3203 if (ret) { 3204 skinny = false; 3205 btrfs_release_path(path); 3206 goto again; 3207 } 3208 } 3209 if (ret) { 3210 ASSERT(ret == 1); 3211 btrfs_print_leaf(path->nodes[0]); 3212 btrfs_err(fs_info, 3213 "tree block extent item (%llu) is not found in extent tree", 3214 bytenr); 3215 WARN_ON(1); 3216 ret = -EINVAL; 3217 goto out; 3218 } 3219 3220 ret = add_tree_block(rc, &key, path, blocks); 3221 out: 3222 btrfs_free_path(path); 3223 return ret; 3224 } 3225 3226 static int delete_block_group_cache(struct btrfs_block_group *block_group, 3227 struct inode *inode, 3228 u64 ino) 3229 { 3230 struct btrfs_fs_info *fs_info = block_group->fs_info; 3231 struct btrfs_root *root = fs_info->tree_root; 3232 struct btrfs_trans_handle *trans; 3233 struct btrfs_inode *btrfs_inode; 3234 int ret = 0; 3235 3236 if (inode) 3237 goto truncate; 3238 3239 btrfs_inode = btrfs_iget(ino, 
root); 3240 if (IS_ERR(btrfs_inode)) 3241 return -ENOENT; 3242 inode = &btrfs_inode->vfs_inode; 3243 3244 truncate: 3245 ret = btrfs_check_trunc_cache_free_space(fs_info, 3246 &fs_info->global_block_rsv); 3247 if (ret) 3248 goto out; 3249 3250 trans = btrfs_join_transaction(root); 3251 if (IS_ERR(trans)) { 3252 ret = PTR_ERR(trans); 3253 goto out; 3254 } 3255 3256 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3257 3258 btrfs_end_transaction(trans); 3259 btrfs_btree_balance_dirty(fs_info); 3260 out: 3261 iput(inode); 3262 return ret; 3263 } 3264 3265 /* 3266 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3267 * cache inode, to avoid free space cache data extent blocking data relocation. 3268 */ 3269 static int delete_v1_space_cache(struct extent_buffer *leaf, 3270 struct btrfs_block_group *block_group, 3271 u64 data_bytenr) 3272 { 3273 u64 space_cache_ino; 3274 struct btrfs_file_extent_item *ei; 3275 struct btrfs_key key; 3276 bool found = false; 3277 int i; 3278 int ret; 3279 3280 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3281 return 0; 3282 3283 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3284 u8 type; 3285 3286 btrfs_item_key_to_cpu(leaf, &key, i); 3287 if (key.type != BTRFS_EXTENT_DATA_KEY) 3288 continue; 3289 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3290 type = btrfs_file_extent_type(leaf, ei); 3291 3292 if ((type == BTRFS_FILE_EXTENT_REG || 3293 type == BTRFS_FILE_EXTENT_PREALLOC) && 3294 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3295 found = true; 3296 space_cache_ino = key.objectid; 3297 break; 3298 } 3299 } 3300 if (!found) 3301 return -ENOENT; 3302 ret = delete_block_group_cache(block_group, NULL, space_cache_ino); 3303 return ret; 3304 } 3305 3306 /* 3307 * helper to find all tree blocks that reference a given data extent 3308 */ 3309 static noinline_for_stack int add_data_references(struct reloc_control *rc, 3310 const struct btrfs_key *extent_key, 3311 struct btrfs_path *path, 3312 struct rb_root *blocks) 3313 { 3314 struct btrfs_backref_walk_ctx ctx = { 0 }; 3315 struct ulist_iterator leaf_uiter; 3316 struct ulist_node *ref_node = NULL; 3317 const u32 blocksize = rc->extent_root->fs_info->nodesize; 3318 int ret = 0; 3319 3320 btrfs_release_path(path); 3321 3322 ctx.bytenr = extent_key->objectid; 3323 ctx.skip_inode_ref_list = true; 3324 ctx.fs_info = rc->extent_root->fs_info; 3325 3326 ret = btrfs_find_all_leafs(&ctx); 3327 if (ret < 0) 3328 return ret; 3329 3330 ULIST_ITER_INIT(&leaf_uiter); 3331 while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { 3332 struct btrfs_tree_parent_check check = { 0 }; 3333 struct extent_buffer *eb; 3334 3335 eb = read_tree_block(ctx.fs_info, ref_node->val, &check); 3336 if (IS_ERR(eb)) { 3337 ret = PTR_ERR(eb); 3338 break; 3339 } 3340 ret = delete_v1_space_cache(eb, rc->block_group, 3341 extent_key->objectid); 3342 free_extent_buffer(eb); 3343 if (ret < 0) 3344 break; 3345 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3346 if (ret < 0) 3347 break; 3348 } 3349 if (ret < 0) 3350 free_block_list(blocks); 3351 ulist_free(ctx.refs); 3352 return ret; 3353 } 3354 3355 /* 3356 * helper to find next unprocessed extent 3357 */ 3358 static noinline_for_stack 3359 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3360 struct btrfs_key *extent_key) 3361 { 3362 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3363 struct btrfs_key key; 3364 struct extent_buffer *leaf; 3365 u64 start, end, last; 3366 int 
ret; 3367 3368 last = rc->block_group->start + rc->block_group->length; 3369 while (1) { 3370 bool block_found; 3371 3372 cond_resched(); 3373 if (rc->search_start >= last) { 3374 ret = 1; 3375 break; 3376 } 3377 3378 key.objectid = rc->search_start; 3379 key.type = BTRFS_EXTENT_ITEM_KEY; 3380 key.offset = 0; 3381 3382 path->search_commit_root = 1; 3383 path->skip_locking = 1; 3384 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3385 0, 0); 3386 if (ret < 0) 3387 break; 3388 next: 3389 leaf = path->nodes[0]; 3390 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3391 ret = btrfs_next_leaf(rc->extent_root, path); 3392 if (ret != 0) 3393 break; 3394 leaf = path->nodes[0]; 3395 } 3396 3397 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3398 if (key.objectid >= last) { 3399 ret = 1; 3400 break; 3401 } 3402 3403 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3404 key.type != BTRFS_METADATA_ITEM_KEY) { 3405 path->slots[0]++; 3406 goto next; 3407 } 3408 3409 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3410 key.objectid + key.offset <= rc->search_start) { 3411 path->slots[0]++; 3412 goto next; 3413 } 3414 3415 if (key.type == BTRFS_METADATA_ITEM_KEY && 3416 key.objectid + fs_info->nodesize <= 3417 rc->search_start) { 3418 path->slots[0]++; 3419 goto next; 3420 } 3421 3422 block_found = btrfs_find_first_extent_bit(&rc->processed_blocks, 3423 key.objectid, &start, &end, 3424 EXTENT_DIRTY, NULL); 3425 3426 if (block_found && start <= key.objectid) { 3427 btrfs_release_path(path); 3428 rc->search_start = end + 1; 3429 } else { 3430 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3431 rc->search_start = key.objectid + key.offset; 3432 else 3433 rc->search_start = key.objectid + 3434 fs_info->nodesize; 3435 memcpy(extent_key, &key, sizeof(key)); 3436 return 0; 3437 } 3438 } 3439 btrfs_release_path(path); 3440 return ret; 3441 } 3442 3443 static void set_reloc_control(struct reloc_control *rc) 3444 { 3445 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3446 3447 mutex_lock(&fs_info->reloc_mutex); 3448 fs_info->reloc_ctl = rc; 3449 mutex_unlock(&fs_info->reloc_mutex); 3450 } 3451 3452 static void unset_reloc_control(struct reloc_control *rc) 3453 { 3454 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3455 3456 mutex_lock(&fs_info->reloc_mutex); 3457 fs_info->reloc_ctl = NULL; 3458 mutex_unlock(&fs_info->reloc_mutex); 3459 } 3460 3461 static noinline_for_stack 3462 int prepare_to_relocate(struct reloc_control *rc) 3463 { 3464 struct btrfs_trans_handle *trans; 3465 int ret; 3466 3467 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3468 BTRFS_BLOCK_RSV_TEMP); 3469 if (!rc->block_rsv) 3470 return -ENOMEM; 3471 3472 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3473 rc->search_start = rc->block_group->start; 3474 rc->extents_found = 0; 3475 rc->nodes_relocated = 0; 3476 rc->merging_rsv_size = 0; 3477 rc->reserved_bytes = 0; 3478 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3479 RELOCATION_RESERVED_NODES; 3480 ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, 3481 rc->block_rsv, rc->block_rsv->size, 3482 BTRFS_RESERVE_FLUSH_ALL); 3483 if (ret) 3484 return ret; 3485 3486 rc->create_reloc_tree = true; 3487 set_reloc_control(rc); 3488 3489 trans = btrfs_join_transaction(rc->extent_root); 3490 if (IS_ERR(trans)) { 3491 unset_reloc_control(rc); 3492 /* 3493 * extent tree is not a ref_cow tree and has no reloc_root to 3494 * cleanup. And callers are responsible to free the above 3495 * block rsv. 
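		 * (relocate_block_group(), the only caller, releases the block
		 * rsv via btrfs_free_block_rsv() on its out_free path.)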
3496 */ 3497 return PTR_ERR(trans); 3498 } 3499 3500 ret = btrfs_commit_transaction(trans); 3501 if (ret) 3502 unset_reloc_control(rc); 3503 3504 return ret; 3505 } 3506 3507 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3508 { 3509 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3510 struct rb_root blocks = RB_ROOT; 3511 struct btrfs_key key; 3512 struct btrfs_trans_handle *trans = NULL; 3513 struct btrfs_path *path; 3514 struct btrfs_extent_item *ei; 3515 u64 flags; 3516 int ret; 3517 int err = 0; 3518 int progress = 0; 3519 3520 path = btrfs_alloc_path(); 3521 if (!path) 3522 return -ENOMEM; 3523 path->reada = READA_FORWARD; 3524 3525 ret = prepare_to_relocate(rc); 3526 if (ret) { 3527 err = ret; 3528 goto out_free; 3529 } 3530 3531 while (1) { 3532 rc->reserved_bytes = 0; 3533 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 3534 rc->block_rsv->size, 3535 BTRFS_RESERVE_FLUSH_ALL); 3536 if (ret) { 3537 err = ret; 3538 break; 3539 } 3540 progress++; 3541 trans = btrfs_start_transaction(rc->extent_root, 0); 3542 if (IS_ERR(trans)) { 3543 err = PTR_ERR(trans); 3544 trans = NULL; 3545 break; 3546 } 3547 restart: 3548 if (rc->backref_cache.last_trans != trans->transid) 3549 btrfs_backref_release_cache(&rc->backref_cache); 3550 rc->backref_cache.last_trans = trans->transid; 3551 3552 ret = find_next_extent(rc, path, &key); 3553 if (ret < 0) 3554 err = ret; 3555 if (ret != 0) 3556 break; 3557 3558 rc->extents_found++; 3559 3560 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3561 struct btrfs_extent_item); 3562 flags = btrfs_extent_flags(path->nodes[0], ei); 3563 3564 /* 3565 * If we are relocating a simple quota owned extent item, we 3566 * need to note the owner on the reloc data root so that when 3567 * we allocate the replacement item, we can attribute it to the 3568 * correct eventual owner (rather than the reloc data root). 
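		 *
		 * E.g. (sketch): if the extent's owner item says subvolume
		 * 257, squota accounting for the replacement extent should be
		 * charged to 257 and not to the data reloc tree, so we stash
		 * that id in root->relocation_src_root before any new
		 * allocation happens.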
3569 */ 3570 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { 3571 struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; 3572 u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, 3573 path->nodes[0], 3574 path->slots[0]); 3575 3576 root->relocation_src_root = owning_root_id; 3577 } 3578 3579 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3580 ret = add_tree_block(rc, &key, path, &blocks); 3581 } else if (rc->stage == UPDATE_DATA_PTRS && 3582 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3583 ret = add_data_references(rc, &key, path, &blocks); 3584 } else { 3585 btrfs_release_path(path); 3586 ret = 0; 3587 } 3588 if (ret < 0) { 3589 err = ret; 3590 break; 3591 } 3592 3593 if (!RB_EMPTY_ROOT(&blocks)) { 3594 ret = relocate_tree_blocks(trans, rc, &blocks); 3595 if (ret < 0) { 3596 if (ret != -EAGAIN) { 3597 err = ret; 3598 break; 3599 } 3600 rc->extents_found--; 3601 rc->search_start = key.objectid; 3602 } 3603 } 3604 3605 btrfs_end_transaction_throttle(trans); 3606 btrfs_btree_balance_dirty(fs_info); 3607 trans = NULL; 3608 3609 if (rc->stage == MOVE_DATA_EXTENTS && 3610 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3611 rc->found_file_extent = true; 3612 ret = relocate_data_extent(rc, &key); 3613 if (ret < 0) { 3614 err = ret; 3615 break; 3616 } 3617 } 3618 if (btrfs_should_cancel_balance(fs_info)) { 3619 err = -ECANCELED; 3620 break; 3621 } 3622 } 3623 if (trans && progress && err == -ENOSPC) { 3624 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3625 if (ret == 1) { 3626 err = 0; 3627 progress = 0; 3628 goto restart; 3629 } 3630 } 3631 3632 btrfs_release_path(path); 3633 btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL); 3634 3635 if (trans) { 3636 btrfs_end_transaction_throttle(trans); 3637 btrfs_btree_balance_dirty(fs_info); 3638 } 3639 3640 if (!err) { 3641 ret = relocate_file_extent_cluster(rc); 3642 if (ret < 0) 3643 err = ret; 3644 } 3645 3646 rc->create_reloc_tree = false; 3647 set_reloc_control(rc); 3648 3649 btrfs_backref_release_cache(&rc->backref_cache); 3650 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3651 3652 /* 3653 * Even in the case when the relocation is cancelled, we should all go 3654 * through prepare_to_merge() and merge_reloc_roots(). 
3655 * 3656 * For error (including cancelled balance), prepare_to_merge() will 3657 * mark all reloc trees orphan, then queue them for cleanup in 3658 * merge_reloc_roots() 3659 */ 3660 err = prepare_to_merge(rc, err); 3661 3662 merge_reloc_roots(rc); 3663 3664 rc->merge_reloc_tree = false; 3665 unset_reloc_control(rc); 3666 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3667 3668 /* get rid of pinned extents */ 3669 trans = btrfs_join_transaction(rc->extent_root); 3670 if (IS_ERR(trans)) { 3671 err = PTR_ERR(trans); 3672 goto out_free; 3673 } 3674 ret = btrfs_commit_transaction(trans); 3675 if (ret && !err) 3676 err = ret; 3677 out_free: 3678 ret = clean_dirty_subvols(rc); 3679 if (ret < 0 && !err) 3680 err = ret; 3681 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3682 btrfs_free_path(path); 3683 return err; 3684 } 3685 3686 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3687 struct btrfs_root *root, u64 objectid) 3688 { 3689 struct btrfs_path *path; 3690 struct btrfs_inode_item *item; 3691 struct extent_buffer *leaf; 3692 int ret; 3693 3694 path = btrfs_alloc_path(); 3695 if (!path) 3696 return -ENOMEM; 3697 3698 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3699 if (ret) 3700 goto out; 3701 3702 leaf = path->nodes[0]; 3703 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3704 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3705 btrfs_set_inode_generation(leaf, item, 1); 3706 btrfs_set_inode_size(leaf, item, 0); 3707 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3708 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3709 BTRFS_INODE_PREALLOC); 3710 out: 3711 btrfs_free_path(path); 3712 return ret; 3713 } 3714 3715 static void delete_orphan_inode(struct btrfs_trans_handle *trans, 3716 struct btrfs_root *root, u64 objectid) 3717 { 3718 struct btrfs_path *path; 3719 struct btrfs_key key; 3720 int ret = 0; 3721 3722 path = btrfs_alloc_path(); 3723 if (!path) { 3724 ret = -ENOMEM; 3725 goto out; 3726 } 3727 3728 key.objectid = objectid; 3729 key.type = BTRFS_INODE_ITEM_KEY; 3730 key.offset = 0; 3731 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3732 if (ret) { 3733 if (ret > 0) 3734 ret = -ENOENT; 3735 goto out; 3736 } 3737 ret = btrfs_del_item(trans, root, path); 3738 out: 3739 if (ret) 3740 btrfs_abort_transaction(trans, ret); 3741 btrfs_free_path(path); 3742 } 3743 3744 /* 3745 * helper to create inode for data relocation. 
3746 * the inode is in data relocation tree and its link count is 0 3747 */ 3748 static noinline_for_stack struct inode *create_reloc_inode( 3749 const struct btrfs_block_group *group) 3750 { 3751 struct btrfs_fs_info *fs_info = group->fs_info; 3752 struct btrfs_inode *inode = NULL; 3753 struct btrfs_trans_handle *trans; 3754 struct btrfs_root *root; 3755 u64 objectid; 3756 int ret = 0; 3757 3758 root = btrfs_grab_root(fs_info->data_reloc_root); 3759 trans = btrfs_start_transaction(root, 6); 3760 if (IS_ERR(trans)) { 3761 btrfs_put_root(root); 3762 return ERR_CAST(trans); 3763 } 3764 3765 ret = btrfs_get_free_objectid(root, &objectid); 3766 if (ret) 3767 goto out; 3768 3769 ret = __insert_orphan_inode(trans, root, objectid); 3770 if (ret) 3771 goto out; 3772 3773 inode = btrfs_iget(objectid, root); 3774 if (IS_ERR(inode)) { 3775 delete_orphan_inode(trans, root, objectid); 3776 ret = PTR_ERR(inode); 3777 inode = NULL; 3778 goto out; 3779 } 3780 inode->reloc_block_group_start = group->start; 3781 3782 ret = btrfs_orphan_add(trans, inode); 3783 out: 3784 btrfs_put_root(root); 3785 btrfs_end_transaction(trans); 3786 btrfs_btree_balance_dirty(fs_info); 3787 if (ret) { 3788 if (inode) 3789 iput(&inode->vfs_inode); 3790 return ERR_PTR(ret); 3791 } 3792 return &inode->vfs_inode; 3793 } 3794 3795 /* 3796 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3797 * has been requested meanwhile and don't start in that case. 3798 * 3799 * Return: 3800 * 0 success 3801 * -EINPROGRESS operation is already in progress, that's probably a bug 3802 * -ECANCELED cancellation request was set before the operation started 3803 */ 3804 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3805 { 3806 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3807 /* This should not happen */ 3808 btrfs_err(fs_info, "reloc already running, cannot start"); 3809 return -EINPROGRESS; 3810 } 3811 3812 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3813 btrfs_info(fs_info, "chunk relocation canceled on start"); 3814 /* 3815 * On cancel, clear all requests but let the caller mark 3816 * the end after cleanup operations. 3817 */ 3818 atomic_set(&fs_info->reloc_cancel_req, 0); 3819 return -ECANCELED; 3820 } 3821 return 0; 3822 } 3823 3824 /* 3825 * Mark end of chunk relocation that is cancellable and wake any waiters. 
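 *
 * Usage mirrors btrfs_relocate_block_group() and btrfs_recover_relocation()
 * (sketch):
 *
 *	ret = reloc_chunk_start(fs_info);
 *	if (ret < 0)
 *		goto out;
 *	...do the relocation work...
 *	reloc_chunk_end(fs_info);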
3826 */ 3827 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3828 { 3829 /* Requested after start, clear bit first so any waiters can continue */ 3830 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3831 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3832 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3833 atomic_set(&fs_info->reloc_cancel_req, 0); 3834 } 3835 3836 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3837 { 3838 struct reloc_control *rc; 3839 3840 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3841 if (!rc) 3842 return NULL; 3843 3844 INIT_LIST_HEAD(&rc->reloc_roots); 3845 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3846 btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); 3847 rc->reloc_root_tree.rb_root = RB_ROOT; 3848 spin_lock_init(&rc->reloc_root_tree.lock); 3849 btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); 3850 return rc; 3851 } 3852 3853 static void free_reloc_control(struct reloc_control *rc) 3854 { 3855 struct mapping_node *node, *tmp; 3856 3857 free_reloc_roots(&rc->reloc_roots); 3858 rbtree_postorder_for_each_entry_safe(node, tmp, 3859 &rc->reloc_root_tree.rb_root, rb_node) 3860 kfree(node); 3861 3862 kfree(rc); 3863 } 3864 3865 /* 3866 * Print the block group being relocated 3867 */ 3868 static void describe_relocation(struct btrfs_block_group *block_group) 3869 { 3870 char buf[128] = "NONE"; 3871 3872 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3873 3874 btrfs_info(block_group->fs_info, "relocating block group %llu flags %s", 3875 block_group->start, buf); 3876 } 3877 3878 static const char *stage_to_string(enum reloc_stage stage) 3879 { 3880 if (stage == MOVE_DATA_EXTENTS) 3881 return "move data extents"; 3882 if (stage == UPDATE_DATA_PTRS) 3883 return "update data pointers"; 3884 return "unknown"; 3885 } 3886 3887 /* 3888 * function to relocate all extents in a block group. 3889 */ 3890 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, 3891 bool verbose) 3892 { 3893 struct btrfs_block_group *bg; 3894 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); 3895 struct reloc_control *rc; 3896 struct inode *inode; 3897 struct btrfs_path *path; 3898 int ret; 3899 int rw = 0; 3900 int err = 0; 3901 3902 /* 3903 * This only gets set if we had a half-deleted snapshot on mount. We 3904 * cannot allow relocation to start while we're still trying to clean up 3905 * these pending deletions. 3906 */ 3907 ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); 3908 if (ret) 3909 return ret; 3910 3911 /* We may have been woken up by close_ctree, so bail if we're closing. */ 3912 if (btrfs_fs_closing(fs_info)) 3913 return -EINTR; 3914 3915 bg = btrfs_lookup_block_group(fs_info, group_start); 3916 if (!bg) 3917 return -ENOENT; 3918 3919 /* 3920 * Relocation of a data block group creates ordered extents. Without 3921 * sb_start_write(), we can freeze the filesystem while unfinished 3922 * ordered extents are left. Such ordered extents can cause a deadlock 3923 * e.g. when syncfs() is waiting for their completion but they can't 3924 * finish because they block when joining a transaction, due to the 3925 * fact that the freeze locks are being held in write mode. 
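	 *
	 * I.e. callers reaching data relocation are expected to already hold
	 * freeze protection, roughly (a sketch; the balance ioctl gets this
	 * via mnt_want_write_file()):
	 *
	 *	sb_start_write(fs_info->sb);
	 *	ret = btrfs_relocate_block_group(fs_info, group_start, true);
	 *	sb_end_write(fs_info->sb);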
3926 */ 3927 if (bg->flags & BTRFS_BLOCK_GROUP_DATA) 3928 ASSERT(sb_write_started(fs_info->sb)); 3929 3930 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 3931 btrfs_put_block_group(bg); 3932 return -ETXTBSY; 3933 } 3934 3935 rc = alloc_reloc_control(fs_info); 3936 if (!rc) { 3937 btrfs_put_block_group(bg); 3938 return -ENOMEM; 3939 } 3940 3941 ret = reloc_chunk_start(fs_info); 3942 if (ret < 0) { 3943 err = ret; 3944 goto out_put_bg; 3945 } 3946 3947 rc->extent_root = extent_root; 3948 rc->block_group = bg; 3949 3950 ret = btrfs_inc_block_group_ro(rc->block_group, true); 3951 if (ret) { 3952 err = ret; 3953 goto out; 3954 } 3955 rw = 1; 3956 3957 path = btrfs_alloc_path(); 3958 if (!path) { 3959 err = -ENOMEM; 3960 goto out; 3961 } 3962 3963 inode = lookup_free_space_inode(rc->block_group, path); 3964 btrfs_free_path(path); 3965 3966 if (!IS_ERR(inode)) 3967 ret = delete_block_group_cache(rc->block_group, inode, 0); 3968 else 3969 ret = PTR_ERR(inode); 3970 3971 if (ret && ret != -ENOENT) { 3972 err = ret; 3973 goto out; 3974 } 3975 3976 rc->data_inode = create_reloc_inode(rc->block_group); 3977 if (IS_ERR(rc->data_inode)) { 3978 err = PTR_ERR(rc->data_inode); 3979 rc->data_inode = NULL; 3980 goto out; 3981 } 3982 3983 if (verbose) 3984 describe_relocation(rc->block_group); 3985 3986 btrfs_wait_block_group_reservations(rc->block_group); 3987 btrfs_wait_nocow_writers(rc->block_group); 3988 btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group); 3989 3990 ret = btrfs_zone_finish(rc->block_group); 3991 WARN_ON(ret && ret != -EAGAIN); 3992 3993 while (1) { 3994 enum reloc_stage finishes_stage; 3995 3996 mutex_lock(&fs_info->cleaner_mutex); 3997 ret = relocate_block_group(rc); 3998 mutex_unlock(&fs_info->cleaner_mutex); 3999 if (ret < 0) 4000 err = ret; 4001 4002 finishes_stage = rc->stage; 4003 /* 4004 * We may have gotten ENOSPC after we already dirtied some 4005 * extents. If writeout happens while we're relocating a 4006 * different block group we could end up hitting the 4007 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 4008 * btrfs_reloc_cow_block. Make sure we write everything out 4009 * properly so we don't trip over this problem, and then break 4010 * out of the loop if we hit an error. 
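	 *
	 * Hence the btrfs_wait_ordered_range() plus invalidate_mapping_pages()
	 * pair right below, before rc->stage is flipped to UPDATE_DATA_PTRS.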
	while (1) {
		enum reloc_stage finishes_stage;

		/* Exclude the cleaner thread while we relocate. */
		mutex_lock(&fs_info->cleaner_mutex);
		ret = relocate_block_group(rc);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0)
			err = ret;

		finishes_stage = rc->stage;
		/*
		 * We may have gotten ENOSPC after we already dirtied some
		 * extents.  If writeout happens while we're relocating a
		 * different block group we could end up hitting the
		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
		 * btrfs_reloc_cow_block.  Make sure we write everything out
		 * properly so we don't trip over this problem, and then break
		 * out of the loop if we hit an error.
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		if (rc->extents_found == 0)
			break;

		if (verbose)
			btrfs_info(fs_info, "found %llu extents, stage: %s",
				   rc->extents_found,
				   stage_to_string(finishes_stage));
	}

	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(rc->block_group->used > 0);
out:
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
out_put_bg:
	btrfs_put_block_group(bg);
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
	return err;
}
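/*
 * Note for callers: relocating a data block group requires an active
 * write section on the superblock (see the sb_write_started() assertion
 * above).  A minimal caller sketch, where bg_start is the start bytenr of
 * the chosen block group (illustrative, not a verbatim caller):
 *
 *	sb_start_write(fs_info->sb);
 *	ret = btrfs_relocate_block_group(fs_info, bg_start, true);
 *	sb_end_write(fs_info->sb);
 */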
/*
 * Zero out a reloc root's refs and reset its drop progress, so the root
 * item is picked up and deleted as a garbage root later on.
 */
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	btrfs_set_root_drop_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}

/*
 * Recover relocation interrupted by a system crash.
 *
 * This function resumes merging reloc trees with corresponding fs trees,
 * which is important for keeping the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret2;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		ret = 0;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
		if (IS_ERR(reloc_root)) {
			ret = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT)
					goto out;
				/* The fs root is gone; delete the reloc root as garbage. */
				ret = mark_garbage_root(reloc_root);
				if (ret < 0)
					goto out;
				ret = 0;
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0)
		goto out_end;

	rc->extent_root = btrfs_extent_root(fs_info, 0);

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			ret = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		ret = __add_reloc_root(reloc_root);
		ASSERT(ret != -EEXIST);
		if (ret) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	ret = btrfs_commit_transaction(trans);
	if (ret)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_clean;
	}
	ret = btrfs_commit_transaction(trans);
out_clean:
	ret2 = clean_dirty_subvols(rc);
	if (ret2 < 0 && !ret)
		ret = ret2;
out_unset:
	unset_reloc_control(rc);
out_end:
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (ret == 0) {
		/* Clean up the orphan inode in the data relocation tree. */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		ret = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return ret;
}

/*
 * Helper to add ordered checksums for data relocation.
 *
 * Cloning the checksums properly handles the nodatasum extents, and it
 * also saves the CPU time it would take to re-calculate them.
 */
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
	LIST_HEAD(list);
	int ret;

	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
				      disk_bytenr + ordered->num_bytes - 1,
				      &list, false);
	if (ret < 0) {
		btrfs_mark_ordered_extent_error(ordered);
		return ret;
	}

	while (!list_empty(&list)) {
		struct btrfs_ordered_sum *sums =
			list_first_entry(&list, struct btrfs_ordered_sum, list);

		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
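		/*
		 * Worked example (illustrative numbers only): if the block
		 * group being relocated starts at 1G and this ordered extent
		 * covers file offset 64K, the old location of the data is
		 * disk_bytenr = 1G + 64K.  A csum entry found at logical
		 * 1G + 68K sits 4K into that range, so if the new extent was
		 * written at ordered->disk_bytenr = 5G, the entry is re-homed
		 * to 5G + 4K by the adjustment below.
		 */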
		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
		btrfs_add_ordered_sum(ordered, sums);
	}

	return 0;
}

int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  const struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];

		/*
		 * If node->bytenr != buf->start and node->new_bytenr !=
		 * buf->start then we've got the wrong backref node for what we
		 * expected to see here and the cache is incorrect.
		 */
		if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
			btrfs_err(fs_info,
"bytenr %llu was found but our backref cache was expecting %llu or %llu",
				  buf->start, node->bytenr, node->new_bytenr);
			return -EUCLEAN;
		}

		btrfs_backref_drop_node_buffer(node);
		refcount_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * Called before creating a snapshot. It calculates the metadata
 * reservation required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * Relocation is in the stage of merging trees.  The space used by
	 * merging a reloc tree is twice the size of the relocated tree nodes
	 * in the worst case: half for cowing the reloc tree, half for cowing
	 * the fs tree.  The space used by cowing the reloc tree will be freed
	 * after the tree is dropped.  If we create a snapshot, cowing the fs
	 * tree may use more space than it frees, so we need to reserve extra
	 * space.
	 */
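	/*
	 * For example (illustrative sizes only): with nodes_relocated at
	 * 16M, merging may cow up to 16M in the reloc tree plus 16M in the
	 * fs tree.  The reloc tree half comes back when that tree is
	 * dropped, but a snapshot taken mid-merge adds one more fs tree to
	 * cow, hence the extra nodes_relocated bytes reserved below.
	 */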
	*bytes_to_reserve += rc->nodes_relocated;
}

/*
 * Called after the snapshot is created. It migrates the block reservation
 * and creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(): we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	new_root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * Get the current bytenr for the block group which is being relocated.
 *
 * Return U64_MAX if no relocation is running.
 */
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
	u64 logical = U64_MAX;

	lockdep_assert_held(&fs_info->reloc_mutex);

	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
		logical = fs_info->reloc_ctl->block_group->start;
	return logical;
}
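/*
 * Caller sketch (illustrative): the value must be sampled under
 * fs_info->reloc_mutex, per the lockdep assertion above.
 *
 *	mutex_lock(&fs_info->reloc_mutex);
 *	bytenr = btrfs_get_reloc_bg_bytenr(fs_info);
 *	mutex_unlock(&fs_info->reloc_mutex);
 *
 * A result other than U64_MAX means a block group starting at that
 * bytenr is currently being relocated.
 */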