// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile converting, compacting
 * space, or the balance routine to spread chunks over devices.
 *
 *		Before		|		After
 * ------------------------------------------------------------------
 * BG A: 10 data extents	| BG A: deleted
 * BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 * BG C:  1 data extent		| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
 */
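
/*
 * Illustrative sketch, not the exact call chain: roughly how the steps above
 * map onto functions in this file, with error handling and several helpers
 * omitted.
 *
 *	btrfs_relocate_block_group()		// step 1: mark block group RO
 *	  relocate_block_group()
 *	    stage = MOVE_DATA_EXTENTS;		// steps 2.1/2.2: record extents,
 *						// copy data, build reloc trees
 *	    stage = UPDATE_DATA_PTRS;		// rewrite file extent items
 *	    prepare_to_merge();
 *	    merge_reloc_roots();		// step 2.3: swap tree blocks
 *	    clean_dirty_subvols();		// step 3: drop reloc trees
 */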

#define RELOCATION_RESERVED_NODES	256
/*
 * map address of tree root to tree
 */
struct mapping_node {
	union {
		/* Use rb_simple_node for search/insert */
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};

		struct rb_simple_node simple_node;
	};
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * represents a tree block to process
 */
struct tree_block {
	union {
		/* Use rb_simple_node for search/insert */
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};

		struct rb_simple_node simple_node;
	};
	u64 owner;
	struct btrfs_key key;
	u8 level;
	bool key_ready;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
	u64 owning_root;
};

/* Stages of data relocation. */
enum reloc_stage {
	MOVE_DATA_EXTENTS,
	UPDATE_DATA_PTRS
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	enum reloc_stage stage;
	bool create_reloc_tree;
	bool merge_reloc_tree;
	bool found_file_extent;
};

static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr,
				     node->bytenr + blocksize - 1, EXTENT_DIRTY,
				     NULL);
	}
	node->processed = 1;
}

/*
 * walk up backref nodes until we reach the node that represents the tree root
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
					list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find the start of the next reference path
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge,
					list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}
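
/*
 * Illustrative note: walk_up_backref() and walk_down_backref() together
 * enumerate every reference path from a block up to the tree roots above it,
 * using @edges[]/@index as an explicit stack of the current path.  A typical
 * caller (see select_one_root() below) loops roughly like:
 *
 *	int index = 0;
 *
 *	next = walk_up_backref(node, edges, &index);	// climb first path
 *	...inspect next->root...
 *	next = walk_down_backref(edges, &index);	// backtrack to the
 *							// next sibling edge
 *	...climb again from next, until walk_down_backref() returns NULL...
 */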

static bool reloc_root_is_dead(const struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root.  We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after swap is considered dead, thus not considered as valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(const struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return true;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return false;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return false;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the backref
	 * node for the fs tree root is useless for relocation.
	 */
	return true;
}

/*
 * find reloc tree by address of tree root
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}
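
/*
 * Usage note: find_reloc_root() returns its result with a reference taken
 * via btrfs_grab_root(), so a caller that gets a non-NULL root must drop it:
 *
 *	struct btrfs_root *reloc_root = find_reloc_root(fs_info, bytenr);
 *
 *	if (reloc_root) {
 *		// ...use reloc_root...
 *		btrfs_put_root(reloc_root);
 *	}
 */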

/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeps nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_first_entry(&cur->lower, struct btrfs_backref_edge,
						list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build backref tree for a given tree block.  Root of the backref tree
 * corresponds to the tree block, leaves of the backref tree correspond to
 * roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively.  The recursion stops when a tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct btrfs_trans_handle *trans,
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		ret = -ENOMEM;
		goto out;
	}

	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
						  node_key, cur);
		if (ret < 0)
			goto out;

		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0)
		goto out;

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_free_path(iter->path);
	kfree(iter);
	btrfs_free_path(path);
	if (ret) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(ret);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}
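
/*
 * Caller-side note (based on how relocate_tree_block() consumes this): a
 * NULL return from build_backref_tree() is not an error.  It means
 * handle_useless_nodes() determined the block needs no relocation and has
 * already marked it processed, so callers treat NULL as "nothing to do":
 *
 *	node = build_backref_tree(trans, rc, &key, level, bytenr);
 *	if (IS_ERR(node))
 *		return PTR_ERR(node);
 *	if (!node)
 *		return 0;	// already handled, skip
 */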

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			  "Duplicate root found for start=%llu while inserting into relocation tree",
			  node->bytenr);
		kfree(node);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list.  There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}
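
/*
 * Lifecycle note for the mapping maintained above and below: the
 * reloc_root_tree is keyed by the bytenr of the reloc tree's root block as
 * backref walking will see it.
 *
 *	__add_reloc_root()    - insert, keyed by commit_root->start
 *	__update_reloc_root() - re-key to root->node->start once the reloc
 *				root's node has moved (see below)
 *	__del_reloc_root()    - remove the mapping, and drop the list
 *				reference if the root was still on
 *				rc->reloc_roots
 */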

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (btrfs_root_id(root) == objectid) {
		u64 commit_root_gen;

		/*
		 * Relocation will wait for cleaner thread, and any half-dropped
		 * subvolume will be fully cleaned up at mount time.
		 * So here we shouldn't hit a subvolume with non-zero drop_progress.
		 *
		 * If this isn't the case, error out since it can make us attempt to
		 * drop references for extents that were already dropped before.
		 */
		if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
			struct btrfs_key cpu_key;

			btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
			btrfs_err(fs_info,
	"cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)",
				  objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset);
			ret = -EUCLEAN;
			goto fail;
		}

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) or when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * called by btrfs_reloc_post_snapshot_hook.
		 * the source tree is a reloc tree, all tree blocks
		 * modified after it was created have RELOC flag
		 * set in their headers. so it's OK to not update
		 * the 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (btrfs_root_id(root) == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	btrfs_set_root_last_trans(reloc_root, trans->transid);
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}
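
/*
 * Key layout note: all reloc roots share the objectid
 * BTRFS_TREE_RELOC_OBJECTID and are distinguished by the key offset, which
 * holds the id of the source root.  E.g. the reloc tree for subvolume 257
 * lives in the tree of tree roots at:
 *
 *	(BTRFS_TREE_RELOC_OBJECTID, BTRFS_ROOT_ITEM_KEY, 257)
 *
 * This is why lookups elsewhere in this file use
 * reloc_root->root_key.offset to find the corresponding fs root.
 */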

/*
 * create reloc tree for a given fs tree. reloc tree is just a
 * snapshot of the fs tree with special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree.
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important.  We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root.  This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees.  Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}
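
/*
 * Reference note: as the comment above says, a new reloc root comes out of
 * btrfs_init_reloc_root() with two references.  The matching drops live
 * elsewhere in this file:
 *
 *	- the rc->reloc_roots list reference is dropped by __del_reloc_root()
 *	  (or handed around when the list is spliced and reprocessed), and
 *	- the root->reloc_root reference is dropped when the merged reloc
 *	  tree is cleaned up in clean_dirty_subvols().
 */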

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root.  We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
	ret = btrfs_lookup_file_extent(NULL, root, path,
				       btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		return ret;
	if (ret > 0)
		return -ENOENT;

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi))
		return -EINVAL;

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	return 0;
}
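
/*
 * Worked example with illustrative numbers: the data reloc inode mirrors
 * the target block group byte for byte, so the lookup above is pure offset
 * arithmetic.  Assume the block group starts at 1G and an extent used to
 * live at bytenr 1G + 8M:
 *
 *	file offset = (1G + 8M) - reloc_block_group_start = 8M
 *
 * The file extent item at offset 8M in the data reloc inode then yields the
 * disk_bytenr of the freshly copied extent, which becomes *new_bytenr.
 */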

/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * if we are modifying a block in the fs tree, wait for
		 * read_folio to complete and drop the extent cache
		 */
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = btrfs_find_first_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(inode) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = btrfs_find_first_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(inode) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				/* Take mmap lock to serialize with reflinks. */
				if (!down_read_trylock(&inode->i_mmap_lock))
					continue;
				ret = btrfs_try_lock_extent(&inode->io_tree, key.offset,
							    end, &cached_state);
				if (!ret) {
					up_read(&inode->i_mmap_lock);
					continue;
				}

				btrfs_drop_extent_map_range(inode, key.offset, end, true);
				btrfs_unlock_extent(&inode->io_tree, key.offset, end,
						    &cached_state);
				up_read(&inode->i_mmap_lock);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}
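
/*
 * Bookkeeping note: every file extent item rewritten above is paired with
 * two delayed refs: BTRFS_ADD_DELAYED_REF on new_bytenr and
 * BTRFS_DROP_DELAYED_REF on the old bytenr, both with identical owner and
 * offset, so extent reference counts stay balanced no matter where the
 * loop stops on error.
 */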

static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
					       int slot, const struct btrfs_path *path,
					       int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;

	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in fs tree with the new blocks
 * in reloc tree.  Tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * if a block was replaced, level of the block + 1 is returned.
 * if no block got replaced, 0 is returned.  if there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, 0, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							 path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
								path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = btrfs_root_id(src);
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(dest);
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
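
		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * Note on the two-pass structure of replace_path() above: the first walk
 * down runs with cow == 0 purely to find a replaceable block; once one is
 * found, the function restarts with cow == 1 ("goto again") and actually
 * COWs the path in the fs tree.  The swap itself is symmetric: two
 * BTRFS_ADD_DELAYED_REF entries record the new ownership (old bytenr under
 * the reloc tree parent, new bytenr in the fs tree) and two
 * BTRFS_DROP_DELAYED_REF entries remove the old one.
 */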

/*
 * helper to find next relocated block in reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * walk down reloc tree to find relocated block of lowest level
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}
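
/*
 * Pruning note: both walkers above rely on the reloc tree starting life as
 * a snapshot of its source tree.  A pointer whose generation is <=
 * last_snapshot still points at a shared, unmodified block, so the walkers
 * skip it and only visit slots with ptr generation > last_snapshot, i.e.
 * blocks that were actually relocated into the reloc tree.
 */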

/*
 * invalidate extent cache for file extents whose keys are in the range
 * [min_key, max_key)
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   const struct btrfs_key *min_key,
				   const struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		if (inode)
			iput(&inode->vfs_inode);

		if (objectid > max_key->objectid)
			break;

		inode = btrfs_find_first_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(inode);

		if (ino > max_key->objectid) {
			iput(&inode->vfs_inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->vfs_inode.i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for read_folio to complete */
		btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(inode, start, end, true);
		btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
	}
	return 0;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert current subvolume into reloc_control::dirty_subvol_roots
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
	       sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}

static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL.  Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root.  If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, false, true);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, false, true);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}

/*
 * merge the relocated tree blocks in reloc tree with corresponding
 * fs tree.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		refcount_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between reloc tree and subvolume tree.  Thus for tree
	 * block COW, we COW at most from level 1 to root level for each tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * times 2 since we have two trees to COW.
	 */
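	/*
	 * Worked example with illustrative numbers: with the default 16K
	 * nodesize and a reloc tree of root level 3, each iteration below
	 * refills 16K * 3 * 2 = 96K.  A single-level tree still reserves
	 * 16K * 1 * 2 = 32K, as reserve_level is clamped to at least 1.
	 */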
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list.  At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * save the merging progress in the drop_progress.
		 * this is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * handle the case where only one block in the fs tree needs to be
	 * relocated and the block is the tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}
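
/*
 * Resumability note: merge_reloc_root() persists its position in the reloc
 * root's drop_progress/drop_level (see the "save the merging progress"
 * comment above), and the drop_progress branch at the top of the function
 * is what picks that position back up, e.g. when recovery resumes merging.
 * This reuse of the drop_progress fields is safe because a reloc tree being
 * merged has root refs == 1.
 */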

static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_first_entry(&rc->reloc_roots,
					      struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (IS_ERR(root)) {
			/*
			 * Even if we have an error we need this reloc root
			 * back on our list so we can clean up properly.
			 */
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
			if (!err)
				err = PTR_ERR(root);
			break;
		}

		if (unlikely(root->reloc_root != reloc_root)) {
			if (root->reloc_root) {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(root->reloc_root),
					  root->reloc_root->root_key.type,
					  root->reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &root->reloc_root->root_item),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			} else {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			}
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(root);
			btrfs_abort_transaction(trans, -EUCLEAN);
			if (!err)
				err = -EUCLEAN;
			break;
		}

		/*
		 * set reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		ret = btrfs_update_reloc_root(trans, root);

		/*
		 * Even if we have an error we need this reloc root back on our
		 * list so we can clean up properly.
		 */
		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);

		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			if (!err)
				err = ret;
			break;
		}
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		err = btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}

static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
	struct btrfs_root *reloc_root, *tmp;

	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
		__del_reloc_root(reloc_root);
}

static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_root *root;
	struct btrfs_root *reloc_root;
	LIST_HEAD(reloc_roots);
	int found = 0;
	int ret = 0;
again:
	root = rc->extent_root;

	/*
	 * this serializes us with btrfs_record_root_in_transaction,
	 * we have to make sure nobody is in the middle of
	 * adding their roots to the list while we are
	 * doing this splice
	 */
	mutex_lock(&fs_info->reloc_mutex);
	list_splice_init(&rc->reloc_roots, &reloc_roots);
	mutex_unlock(&fs_info->reloc_mutex);

	while (!list_empty(&reloc_roots)) {
		found = 1;
		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			if (WARN_ON(IS_ERR(root))) {
				/*
				 * For recovery we read the fs roots on mount,
				 * and if we didn't find the root then we marked
				 * the reloc root as a garbage root.  For normal
				 * relocation obviously the root should exist in
				 * memory.  However there's no reason we can't
				 * handle the error properly here just in case.
				 */
				ret = PTR_ERR(root);
				goto out;
			}
			if (WARN_ON(root->reloc_root != reloc_root)) {
				/*
				 * This can happen if on-disk metadata has some
				 * corruption, e.g. bad reloc tree key offset.
				 */
				ret = -EINVAL;
				goto out;
			}
			ret = merge_reloc_root(rc, root);
			btrfs_put_root(root);
			if (ret) {
				if (list_empty(&reloc_root->root_list))
					list_add_tail(&reloc_root->root_list,
						      &reloc_roots);
				goto out;
			}
		} else {
			if (!IS_ERR(root)) {
				if (root->reloc_root == reloc_root) {
					root->reloc_root = NULL;
					btrfs_put_root(reloc_root);
				}
				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
					  &root->state);
				btrfs_put_root(root);
			}

			list_del_init(&reloc_root->root_list);
			/* Don't forget to queue this reloc root for cleanup */
			list_add_tail(&reloc_root->reloc_dirty_list,
				      &rc->dirty_subvol_roots);
		}
	}

	if (found) {
		found = 0;
		goto again;
	}
out:
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret, NULL);
		free_reloc_roots(&reloc_roots);

		/* new reloc root may be added */
		mutex_lock(&fs_info->reloc_mutex);
		list_splice_init(&rc->reloc_roots, &reloc_roots);
		mutex_unlock(&fs_info->reloc_mutex);
		free_reloc_roots(&reloc_roots);
	}

	/*
	 * We used to have
	 *
	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
	 *
	 * here, but it's wrong.  If we fail to start the transaction in
	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
	 * have actually been removed from the reloc_root_tree rb tree.  This is
	 * fine because we're bailing here, and we hold a reference on the root
	 * for the list that holds it, so these roots will be cleaned up when we
	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
	 * will be cleaned up on unmount.
	 *
	 * The remaining nodes will be cleaned up by free_reloc_control.
	 */
}

static void free_block_list(struct rb_root *blocks)
{
	struct tree_block *block;
	struct rb_node *rb_node;

	while ((rb_node = rb_first(blocks))) {
		block = rb_entry(rb_node, struct tree_block, rb_node);
		rb_erase(rb_node, blocks);
		kfree(block);
	}
}

static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
				      struct btrfs_root *reloc_root)
{
	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
	struct btrfs_root *root;
	int ret;

	if (btrfs_get_root_last_trans(reloc_root) == trans->transid)
		return 0;

	root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);

	/*
	 * This should succeed, since we can't have a reloc root without having
	 * already looked up the actual root and created the reloc root for this
	 * root.
	 *
	 * However if there's some sort of corruption where we have a ref to a
	 * reloc root without a corresponding root this could return ENOENT.
	 */
	if (IS_ERR(root)) {
		DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root));
		return PTR_ERR(root);
	}
	if (unlikely(root->reloc_root != reloc_root)) {
		DEBUG_WARN("unexpected reloc root found");
		btrfs_err(fs_info,
			  "root %llu has two reloc roots associated with it",
			  reloc_root->root_key.offset);
		btrfs_put_root(root);
		return -EUCLEAN;
	}
	ret = btrfs_record_root_in_trans(trans, root);
	btrfs_put_root(root);

	return ret;
}
1994 */ 1995 if (unlikely(!root)) { 1996 btrfs_err(trans->fs_info, 1997 "bytenr %llu doesn't have a backref path ending in a root", 1998 node->bytenr); 1999 return ERR_PTR(-EUCLEAN); 2000 } 2001 if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { 2002 btrfs_err(trans->fs_info, 2003 "bytenr %llu has multiple refs with one ending in a non-shareable root", 2004 node->bytenr); 2005 return ERR_PTR(-EUCLEAN); 2006 } 2007 2008 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { 2009 ret = record_reloc_root_in_trans(trans, root); 2010 if (ret) 2011 return ERR_PTR(ret); 2012 goto found; 2013 } 2014 2015 ret = btrfs_record_root_in_trans(trans, root); 2016 if (ret) 2017 return ERR_PTR(ret); 2018 root = root->reloc_root; 2019 2020 /* 2021 * We could have raced with another thread which failed, so 2022 * root->reloc_root may not be set, return ENOENT in this case. 2023 */ 2024 if (!root) 2025 return ERR_PTR(-ENOENT); 2026 2027 if (unlikely(next->new_bytenr)) { 2028 /* 2029 * We just created the reloc root, so we shouldn't have 2030 * ->new_bytenr set yet. If it is then we have multiple roots 2031 * pointing at the same bytenr which indicates corruption, or 2032 * we've made a mistake in the backref walking code. 2033 */ 2034 ASSERT(next->new_bytenr == 0); 2035 btrfs_err(trans->fs_info, 2036 "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", 2037 node->bytenr, next->bytenr); 2038 return ERR_PTR(-EUCLEAN); 2039 } 2040 2041 next->new_bytenr = root->node->start; 2042 btrfs_put_root(next->root); 2043 next->root = btrfs_grab_root(root); 2044 ASSERT(next->root); 2045 mark_block_processed(rc, next); 2046 found: 2047 next = node; 2048 /* setup backref node path for btrfs_reloc_cow_block */ 2049 while (1) { 2050 rc->backref_cache.path[next->level] = next; 2051 if (--index < 0) 2052 break; 2053 next = edges[index]->node[UPPER]; 2054 } 2055 return root; 2056 } 2057 2058 /* 2059 * Select a tree root for relocation. 2060 * 2061 * Return NULL if the block is not shareable. We should use do_relocation() in 2062 * this case. 2063 * 2064 * Return a tree root pointer if the block is shareable. 2065 * Return -ENOENT if the block is root of reloc tree. 2066 */ 2067 static noinline_for_stack 2068 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2069 { 2070 struct btrfs_backref_node *next; 2071 struct btrfs_root *root; 2072 struct btrfs_root *fs_root = NULL; 2073 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2074 int index = 0; 2075 2076 next = node; 2077 while (1) { 2078 cond_resched(); 2079 next = walk_up_backref(next, edges, &index); 2080 root = next->root; 2081 2082 /* 2083 * This can occur if we have incomplete extent refs leading all 2084 * the way up a particular path, in this case return -EUCLEAN. 
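		 *
		 * For illustration, an "incomplete" path is one shaped like:
		 *
		 *	node(level 0) -edge-> node(level 1, ->root == NULL)
		 *
		 * where the top of the path never got resolved to an owning
		 * tree, so there is no meaningful root to hand back.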
		 */
		if (unlikely(!root))
			return ERR_PTR(-EUCLEAN);

		/* No other choice for non-shareable tree */
		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
			return root;

		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID)
			fs_root = root;

		if (next != node)
			return NULL;

		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}

	if (!fs_root)
		return ERR_PTR(-ENOENT);
	return fs_root;
}

static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
						  struct btrfs_backref_node *node)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_backref_node *next = node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	u64 num_bytes = 0;
	int index = 0;

	BUG_ON(node->processed);

	while (next) {
		cond_resched();
		while (1) {
			if (next->processed)
				break;

			num_bytes += fs_info->nodesize;

			if (list_empty(&next->upper))
				break;

			edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
						list[LOWER]);
			edges[index++] = edge;
			next = edge->node[UPPER];
		}
		next = walk_down_backref(edges, &index);
	}
	return num_bytes;
}

static int refill_metadata_space(struct btrfs_trans_handle *trans,
				 struct reloc_control *rc, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	trans->block_rsv = rc->block_rsv;
	rc->reserved_bytes += num_bytes;

	/*
	 * We are under a transaction here so we can only do limited flushing.
	 * If we get an enospc just kick back -EAGAIN so we know to drop the
	 * transaction and try to refill when we can flush all the things.
	 */
	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
				     BTRFS_RESERVE_FLUSH_LIMIT);
	if (ret) {
		u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;

		while (tmp <= rc->reserved_bytes)
			tmp <<= 1;
		/*
		 * Only one thread can access block_rsv at this point, so we
		 * don't need to hold a lock to protect it. Expand the
		 * reservation size here so there is enough space for
		 * relocation, and bail out earlier in the ENOSPC case.
		 */
		rc->block_rsv->size = tmp + fs_info->nodesize *
				      RELOCATION_RESERVED_NODES;
		return -EAGAIN;
	}

	return 0;
}

static int reserve_metadata_space(struct btrfs_trans_handle *trans,
				  struct reloc_control *rc,
				  struct btrfs_backref_node *node)
{
	u64 num_bytes;

	num_bytes = calcu_metadata_size(rc, node) * 2;
	return refill_metadata_space(trans, rc, num_bytes);
}

/*
 * relocate a block tree, and then update pointers in upper level
 * blocks that reference the block to point to the new location.
 *
 * if called by link_to_upper, the block has already been relocated.
 * in that case this function just updates pointers.
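 *
 * An informal sketch of the two call sites below (relocate_tree_block()
 * passes lowest == 1, link_to_upper() passes lowest == 0):
 *
 *	do_relocation(trans, rc, node, key, path, 1);	// COW the block itself
 *	do_relocation(trans, rc, node, key, path, 0);	// only update parent pointers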
2194 */ 2195 static int do_relocation(struct btrfs_trans_handle *trans, 2196 struct reloc_control *rc, 2197 struct btrfs_backref_node *node, 2198 struct btrfs_key *key, 2199 struct btrfs_path *path, int lowest) 2200 { 2201 struct btrfs_backref_node *upper; 2202 struct btrfs_backref_edge *edge; 2203 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2204 struct btrfs_root *root; 2205 struct extent_buffer *eb; 2206 u32 blocksize; 2207 u64 bytenr; 2208 int slot; 2209 int ret = 0; 2210 2211 /* 2212 * If we are lowest then this is the first time we're processing this 2213 * block, and thus shouldn't have an eb associated with it yet. 2214 */ 2215 ASSERT(!lowest || !node->eb); 2216 2217 path->lowest_level = node->level + 1; 2218 rc->backref_cache.path[node->level] = node; 2219 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2220 cond_resched(); 2221 2222 upper = edge->node[UPPER]; 2223 root = select_reloc_root(trans, rc, upper, edges); 2224 if (IS_ERR(root)) { 2225 ret = PTR_ERR(root); 2226 goto next; 2227 } 2228 2229 if (upper->eb && !upper->locked) { 2230 if (!lowest) { 2231 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2232 if (ret < 0) 2233 goto next; 2234 BUG_ON(ret); 2235 bytenr = btrfs_node_blockptr(upper->eb, slot); 2236 if (node->eb->start == bytenr) 2237 goto next; 2238 } 2239 btrfs_backref_drop_node_buffer(upper); 2240 } 2241 2242 if (!upper->eb) { 2243 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2244 if (ret) { 2245 if (ret > 0) 2246 ret = -ENOENT; 2247 2248 btrfs_release_path(path); 2249 break; 2250 } 2251 2252 if (!upper->eb) { 2253 upper->eb = path->nodes[upper->level]; 2254 path->nodes[upper->level] = NULL; 2255 } else { 2256 BUG_ON(upper->eb != path->nodes[upper->level]); 2257 } 2258 2259 upper->locked = 1; 2260 path->locks[upper->level] = 0; 2261 2262 slot = path->slots[upper->level]; 2263 btrfs_release_path(path); 2264 } else { 2265 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2266 if (ret < 0) 2267 goto next; 2268 BUG_ON(ret); 2269 } 2270 2271 bytenr = btrfs_node_blockptr(upper->eb, slot); 2272 if (lowest) { 2273 if (unlikely(bytenr != node->bytenr)) { 2274 btrfs_err(root->fs_info, 2275 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2276 bytenr, node->bytenr, slot, 2277 upper->eb->start); 2278 ret = -EIO; 2279 goto next; 2280 } 2281 } else { 2282 if (node->eb->start == bytenr) 2283 goto next; 2284 } 2285 2286 blocksize = root->fs_info->nodesize; 2287 eb = btrfs_read_node_slot(upper->eb, slot); 2288 if (IS_ERR(eb)) { 2289 ret = PTR_ERR(eb); 2290 goto next; 2291 } 2292 btrfs_tree_lock(eb); 2293 2294 if (!node->eb) { 2295 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2296 slot, &eb, BTRFS_NESTING_COW); 2297 btrfs_tree_unlock(eb); 2298 free_extent_buffer(eb); 2299 if (ret < 0) 2300 goto next; 2301 /* 2302 * We've just COWed this block, it should have updated 2303 * the correct backref node entry. 
			 */
			ASSERT(node->eb == eb);
		} else {
			struct btrfs_ref ref = {
				.action = BTRFS_ADD_DELAYED_REF,
				.bytenr = node->eb->start,
				.num_bytes = blocksize,
				.parent = upper->eb->start,
				.owning_root = btrfs_header_owner(upper->eb),
				.ref_root = btrfs_header_owner(upper->eb),
			};

			btrfs_set_node_blockptr(upper->eb, slot,
						node->eb->start);
			btrfs_set_node_ptr_generation(upper->eb, slot,
						      trans->transid);
			btrfs_mark_buffer_dirty(trans, upper->eb);

			btrfs_init_tree_ref(&ref, node->level,
					    btrfs_root_id(root), false);
			ret = btrfs_inc_extent_ref(trans, &ref);
			if (!ret)
				ret = btrfs_drop_subtree(trans, root, eb,
							 upper->eb);
			if (unlikely(ret))
				btrfs_abort_transaction(trans, ret);
		}
next:
		if (!upper->pending)
			btrfs_backref_drop_node_buffer(upper);
		else
			btrfs_backref_unlock_node_buffer(upper);
		if (ret)
			break;
	}

	if (!ret && node->pending) {
		btrfs_backref_drop_node_buffer(node);
		list_del_init(&node->list);
		node->pending = 0;
	}

	path->lowest_level = 0;

	/*
	 * We should have allocated all of our space in the block rsv and thus
	 * shouldn't ENOSPC.
	 */
	ASSERT(ret != -ENOSPC);
	return ret;
}

static int link_to_upper(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_backref_node *node,
			 struct btrfs_path *path)
{
	struct btrfs_key key;

	btrfs_node_key_to_cpu(node->eb, &key, 0);
	return do_relocation(trans, rc, node, &key, path, 0);
}

static int finish_pending_nodes(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct btrfs_path *path, int err)
{
	LIST_HEAD(list);
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node;
	int level;
	int ret;

	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		while (!list_empty(&cache->pending[level])) {
			node = list_first_entry(&cache->pending[level],
						struct btrfs_backref_node, list);
			list_move_tail(&node->list, &list);
			BUG_ON(!node->pending);

			if (!err) {
				ret = link_to_upper(trans, rc, node, path);
				if (ret < 0)
					err = ret;
			}
		}
		list_splice_init(&list, &cache->pending[level]);
	}
	return err;
}

/*
 * mark a block and all blocks that directly/indirectly reference the
 * block as processed.
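 *
 * the walk has the same shape as calcu_metadata_size() above: the inner
 * loop climbs first-upper edges marking nodes along the way, and
 * walk_down_backref() steps over to the next unvisited path.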
2398 */ 2399 static void update_processed_blocks(struct reloc_control *rc, 2400 struct btrfs_backref_node *node) 2401 { 2402 struct btrfs_backref_node *next = node; 2403 struct btrfs_backref_edge *edge; 2404 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2405 int index = 0; 2406 2407 while (next) { 2408 cond_resched(); 2409 while (1) { 2410 if (next->processed) 2411 break; 2412 2413 mark_block_processed(rc, next); 2414 2415 if (list_empty(&next->upper)) 2416 break; 2417 2418 edge = list_first_entry(&next->upper, struct btrfs_backref_edge, 2419 list[LOWER]); 2420 edges[index++] = edge; 2421 next = edge->node[UPPER]; 2422 } 2423 next = walk_down_backref(edges, &index); 2424 } 2425 } 2426 2427 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2428 { 2429 u32 blocksize = rc->extent_root->fs_info->nodesize; 2430 2431 if (btrfs_test_range_bit(&rc->processed_blocks, bytenr, 2432 bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) 2433 return 1; 2434 return 0; 2435 } 2436 2437 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2438 struct tree_block *block) 2439 { 2440 struct btrfs_tree_parent_check check = { 2441 .level = block->level, 2442 .owner_root = block->owner, 2443 .transid = block->key.offset 2444 }; 2445 struct extent_buffer *eb; 2446 2447 eb = read_tree_block(fs_info, block->bytenr, &check); 2448 if (IS_ERR(eb)) 2449 return PTR_ERR(eb); 2450 if (unlikely(!extent_buffer_uptodate(eb))) { 2451 free_extent_buffer(eb); 2452 return -EIO; 2453 } 2454 if (block->level == 0) 2455 btrfs_item_key_to_cpu(eb, &block->key, 0); 2456 else 2457 btrfs_node_key_to_cpu(eb, &block->key, 0); 2458 free_extent_buffer(eb); 2459 block->key_ready = true; 2460 return 0; 2461 } 2462 2463 /* 2464 * helper function to relocate a tree block 2465 */ 2466 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2467 struct reloc_control *rc, 2468 struct btrfs_backref_node *node, 2469 struct btrfs_key *key, 2470 struct btrfs_path *path) 2471 { 2472 struct btrfs_root *root; 2473 int ret = 0; 2474 2475 if (!node) 2476 return 0; 2477 2478 /* 2479 * If we fail here we want to drop our backref_node because we are going 2480 * to start over and regenerate the tree for it. 2481 */ 2482 ret = reserve_metadata_space(trans, rc, node); 2483 if (ret) 2484 goto out; 2485 2486 BUG_ON(node->processed); 2487 root = select_one_root(node); 2488 if (IS_ERR(root)) { 2489 ret = PTR_ERR(root); 2490 2491 /* See explanation in select_one_root for the -EUCLEAN case. */ 2492 ASSERT(ret == -ENOENT); 2493 if (ret == -ENOENT) { 2494 ret = 0; 2495 update_processed_blocks(rc, node); 2496 } 2497 goto out; 2498 } 2499 2500 if (root) { 2501 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2502 /* 2503 * This block was the root block of a root, and this is 2504 * the first time we're processing the block and thus it 2505 * should not have had the ->new_bytenr modified. 2506 * 2507 * However in the case of corruption we could have 2508 * multiple refs pointing to the same block improperly, 2509 * and thus we would trip over these checks. ASSERT() 2510 * for the developer case, because it could indicate a 2511 * bug in the backref code, however error out for a 2512 * normal user in the case of corruption. 
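			 *
			 * Informally, the pattern is:
			 *
			 *	ASSERT(node->new_bytenr == 0);	// loud in debug builds
			 *	if (unlikely(node->new_bytenr))	// graceful at runtime
			 *		ret = -EUCLEAN;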
2513 */ 2514 ASSERT(node->new_bytenr == 0); 2515 if (unlikely(node->new_bytenr)) { 2516 btrfs_err(root->fs_info, 2517 "bytenr %llu has improper references to it", 2518 node->bytenr); 2519 ret = -EUCLEAN; 2520 goto out; 2521 } 2522 ret = btrfs_record_root_in_trans(trans, root); 2523 if (ret) 2524 goto out; 2525 /* 2526 * Another thread could have failed, need to check if we 2527 * have reloc_root actually set. 2528 */ 2529 if (!root->reloc_root) { 2530 ret = -ENOENT; 2531 goto out; 2532 } 2533 root = root->reloc_root; 2534 node->new_bytenr = root->node->start; 2535 btrfs_put_root(node->root); 2536 node->root = btrfs_grab_root(root); 2537 ASSERT(node->root); 2538 } else { 2539 btrfs_err(root->fs_info, 2540 "bytenr %llu resolved to a non-shareable root", 2541 node->bytenr); 2542 ret = -EUCLEAN; 2543 goto out; 2544 } 2545 if (!ret) 2546 update_processed_blocks(rc, node); 2547 } else { 2548 ret = do_relocation(trans, rc, node, key, path, 1); 2549 } 2550 out: 2551 if (ret || node->level == 0) 2552 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2553 return ret; 2554 } 2555 2556 static int relocate_cowonly_block(struct btrfs_trans_handle *trans, 2557 struct reloc_control *rc, struct tree_block *block, 2558 struct btrfs_path *path) 2559 { 2560 struct btrfs_fs_info *fs_info = trans->fs_info; 2561 struct btrfs_root *root; 2562 u64 num_bytes; 2563 int nr_levels; 2564 int ret; 2565 2566 root = btrfs_get_fs_root(fs_info, block->owner, true); 2567 if (IS_ERR(root)) 2568 return PTR_ERR(root); 2569 2570 nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; 2571 2572 num_bytes = fs_info->nodesize * nr_levels; 2573 ret = refill_metadata_space(trans, rc, num_bytes); 2574 if (ret) { 2575 btrfs_put_root(root); 2576 return ret; 2577 } 2578 path->lowest_level = block->level; 2579 if (root == root->fs_info->chunk_root) 2580 btrfs_reserve_chunk_metadata(trans, false); 2581 2582 ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); 2583 path->lowest_level = 0; 2584 btrfs_release_path(path); 2585 2586 if (root == root->fs_info->chunk_root) 2587 btrfs_trans_release_chunk_metadata(trans); 2588 if (ret > 0) 2589 ret = 0; 2590 btrfs_put_root(root); 2591 2592 return ret; 2593 } 2594 2595 /* 2596 * relocate a list of blocks 2597 */ 2598 static noinline_for_stack 2599 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2600 struct reloc_control *rc, struct rb_root *blocks) 2601 { 2602 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2603 struct btrfs_backref_node *node; 2604 struct btrfs_path *path; 2605 struct tree_block *block; 2606 struct tree_block *next; 2607 int ret = 0; 2608 2609 path = btrfs_alloc_path(); 2610 if (!path) { 2611 ret = -ENOMEM; 2612 goto out_free_blocks; 2613 } 2614 2615 /* Kick in readahead for tree blocks with missing keys */ 2616 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2617 if (!block->key_ready) 2618 btrfs_readahead_tree_block(fs_info, block->bytenr, 2619 block->owner, 0, 2620 block->level); 2621 } 2622 2623 /* Get first keys */ 2624 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2625 if (!block->key_ready) { 2626 ret = get_tree_block_key(fs_info, block); 2627 if (ret) 2628 goto out_free_path; 2629 } 2630 } 2631 2632 /* Do tree relocation */ 2633 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2634 /* 2635 * For COWonly blocks, or the data reloc tree, we only need to 2636 * COW down to the block, there's no need to generate a backref 2637 * tree. 
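		 *
		 * Informally (names abbreviated), the test below routes a
		 * block to the simple COW path when its owner is known and is
		 * either not a subvolume tree at all or is the data reloc
		 * tree itself:
		 *
		 *	owner && (!is_fstree(owner) || owner == DATA_RELOC_TREE)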
2638 */ 2639 if (block->owner && 2640 (!btrfs_is_fstree(block->owner) || 2641 block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { 2642 ret = relocate_cowonly_block(trans, rc, block, path); 2643 if (ret) 2644 break; 2645 continue; 2646 } 2647 2648 node = build_backref_tree(trans, rc, &block->key, 2649 block->level, block->bytenr); 2650 if (IS_ERR(node)) { 2651 ret = PTR_ERR(node); 2652 goto out; 2653 } 2654 2655 ret = relocate_tree_block(trans, rc, node, &block->key, 2656 path); 2657 if (ret < 0) 2658 break; 2659 } 2660 out: 2661 ret = finish_pending_nodes(trans, rc, path, ret); 2662 2663 out_free_path: 2664 btrfs_free_path(path); 2665 out_free_blocks: 2666 free_block_list(blocks); 2667 return ret; 2668 } 2669 2670 static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc) 2671 { 2672 const struct file_extent_cluster *cluster = &rc->cluster; 2673 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2674 u64 alloc_hint = 0; 2675 u64 start; 2676 u64 end; 2677 u64 offset = inode->reloc_block_group_start; 2678 u64 num_bytes; 2679 int nr; 2680 int ret = 0; 2681 u64 prealloc_start = cluster->start - offset; 2682 u64 prealloc_end = cluster->end - offset; 2683 u64 cur_offset = prealloc_start; 2684 2685 /* 2686 * For blocksize < folio size case (either bs < page size or large folios), 2687 * beyond i_size, all blocks are filled with zero. 2688 * 2689 * If the current cluster covers the above range, btrfs_do_readpage() 2690 * will skip the read, and relocate_one_folio() will later writeback 2691 * the padding zeros as new data, causing data corruption. 2692 * 2693 * Here we have to invalidate the cache covering our cluster. 2694 */ 2695 ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start, 2696 prealloc_end); 2697 if (ret < 0) 2698 return ret; 2699 2700 BUG_ON(cluster->start != cluster->boundary[0]); 2701 ret = btrfs_alloc_data_chunk_ondemand(inode, 2702 prealloc_end + 1 - prealloc_start); 2703 if (ret) 2704 return ret; 2705 2706 btrfs_inode_lock(inode, 0); 2707 for (nr = 0; nr < cluster->nr; nr++) { 2708 struct extent_state *cached_state = NULL; 2709 2710 start = cluster->boundary[nr] - offset; 2711 if (nr + 1 < cluster->nr) 2712 end = cluster->boundary[nr + 1] - 1 - offset; 2713 else 2714 end = cluster->end - offset; 2715 2716 btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); 2717 num_bytes = end + 1 - start; 2718 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2719 num_bytes, num_bytes, 2720 end + 1, &alloc_hint); 2721 cur_offset = end + 1; 2722 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 2723 if (ret) 2724 break; 2725 } 2726 btrfs_inode_unlock(inode, 0); 2727 2728 if (cur_offset < prealloc_end) 2729 btrfs_free_reserved_data_space_noquota(inode, 2730 prealloc_end + 1 - cur_offset); 2731 return ret; 2732 } 2733 2734 static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc) 2735 { 2736 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2737 struct extent_map *em; 2738 struct extent_state *cached_state = NULL; 2739 u64 offset = inode->reloc_block_group_start; 2740 u64 start = rc->cluster.start - offset; 2741 u64 end = rc->cluster.end - offset; 2742 int ret = 0; 2743 2744 em = btrfs_alloc_extent_map(); 2745 if (!em) 2746 return -ENOMEM; 2747 2748 em->start = start; 2749 em->len = end + 1 - start; 2750 em->disk_bytenr = rc->cluster.start; 2751 em->disk_num_bytes = em->len; 2752 em->ram_bytes = em->len; 2753 em->flags |= EXTENT_FLAG_PINNED; 2754 2755 
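	/*
	 * Note: the new mapping is swapped in under the extent lock so reads
	 * of this file range are served from the old data at
	 * rc->cluster.start, and EXTENT_FLAG_PINNED keeps it from being
	 * merged or dropped underneath us while the cluster is copied.
	 */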
	btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
	ret = btrfs_replace_extent_map_range(inode, em, false);
	btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
	btrfs_free_extent_map(em);

	return ret;
}

/*
 * Allow error injection to test balance/relocation cancellation
 */
noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
{
	return atomic_read(&fs_info->balance_cancel_req) ||
		atomic_read(&fs_info->reloc_cancel_req) ||
		fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);

static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
				    int cluster_nr)
{
	/* Last extent, use cluster end directly */
	if (cluster_nr >= cluster->nr - 1)
		return cluster->end;

	/* Use next boundary start */
	return cluster->boundary[cluster_nr + 1] - 1;
}

static int relocate_one_folio(struct reloc_control *rc,
			      struct file_ra_state *ra,
			      int *cluster_nr, u64 *file_offset_ret)
{
	const struct file_extent_cluster *cluster = &rc->cluster;
	struct inode *inode = rc->data_inode;
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	const u64 orig_file_offset = *file_offset_ret;
	u64 offset = BTRFS_I(inode)->reloc_block_group_start;
	const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT;
	const pgoff_t index = orig_file_offset >> PAGE_SHIFT;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	struct folio *folio;
	u64 folio_start;
	u64 folio_end;
	u64 cur;
	int ret;
	const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);

	ASSERT(index <= last_index);
again:
	folio = filemap_lock_folio(inode->i_mapping, index);
	if (IS_ERR(folio)) {
		/*
		 * On relocation we're doing readahead on the relocation inode,
		 * but if the filesystem is backed by a RAID stripe tree we can
		 * get ENOENT (e.g. due to preallocated extents not being
		 * mapped in the RST) from the lookup.
		 *
		 * But readahead doesn't handle the error and submits invalid
		 * reads to the device, causing assertion failures.
		 */
		if (!use_rst)
			page_cache_sync_readahead(inode->i_mapping, ra, NULL,
						  index, last_index + 1 - index);
		folio = __filemap_get_folio(inode->i_mapping, index,
					    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					    mask);
		if (IS_ERR(folio))
			return PTR_ERR(folio);
	}

	if (folio_test_readahead(folio) && !use_rst)
		page_cache_async_readahead(inode->i_mapping, ra, NULL,
					   folio, last_index + 1 - index);

	if (!folio_test_uptodate(folio)) {
		btrfs_read_folio(NULL, folio);
		folio_lock(folio);
		if (unlikely(!folio_test_uptodate(folio))) {
			ret = -EIO;
			goto release_folio;
		}
		if (folio->mapping != inode->i_mapping) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
	}

	/*
	 * We could have lost folio private when we dropped the lock to read
	 * the folio above, make sure we set_folio_extent_mapped() here so we
	 * have all of the subpage blocksize stuff we need in place.
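	 *
	 * (set_folio_extent_mapped() returns early if folio private is
	 * already attached, so the unconditional call here is cheap.)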
2850 */ 2851 ret = set_folio_extent_mapped(folio); 2852 if (ret < 0) 2853 goto release_folio; 2854 2855 folio_start = folio_pos(folio); 2856 folio_end = folio_start + folio_size(folio) - 1; 2857 2858 /* 2859 * Start from the cluster, as for subpage case, the cluster can start 2860 * inside the folio. 2861 */ 2862 cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); 2863 while (cur <= folio_end) { 2864 struct extent_state *cached_state = NULL; 2865 u64 extent_start = cluster->boundary[*cluster_nr] - offset; 2866 u64 extent_end = get_cluster_boundary_end(cluster, 2867 *cluster_nr) - offset; 2868 u64 clamped_start = max(folio_start, extent_start); 2869 u64 clamped_end = min(folio_end, extent_end); 2870 u32 clamped_len = clamped_end + 1 - clamped_start; 2871 2872 /* Reserve metadata for this range */ 2873 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 2874 clamped_len, clamped_len, 2875 false); 2876 if (ret) 2877 goto release_folio; 2878 2879 /* Mark the range delalloc and dirty for later writeback */ 2880 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, 2881 clamped_end, &cached_state); 2882 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, 2883 clamped_end, 0, &cached_state); 2884 if (ret) { 2885 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 2886 clamped_start, clamped_end, 2887 EXTENT_LOCKED | EXTENT_BOUNDARY, 2888 &cached_state); 2889 btrfs_delalloc_release_metadata(BTRFS_I(inode), 2890 clamped_len, true); 2891 btrfs_delalloc_release_extents(BTRFS_I(inode), 2892 clamped_len); 2893 goto release_folio; 2894 } 2895 btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); 2896 2897 /* 2898 * Set the boundary if it's inside the folio. 2899 * Data relocation requires the destination extents to have the 2900 * same size as the source. 2901 * EXTENT_BOUNDARY bit prevents current extent from being merged 2902 * with previous extent. 2903 */ 2904 if (in_range(cluster->boundary[*cluster_nr] - offset, 2905 folio_start, folio_size(folio))) { 2906 u64 boundary_start = cluster->boundary[*cluster_nr] - 2907 offset; 2908 u64 boundary_end = boundary_start + 2909 fs_info->sectorsize - 1; 2910 2911 btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree, 2912 boundary_start, boundary_end, 2913 EXTENT_BOUNDARY, NULL); 2914 } 2915 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 2916 &cached_state); 2917 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 2918 cur += clamped_len; 2919 2920 /* Crossed extent end, go to next extent */ 2921 if (cur >= extent_end) { 2922 (*cluster_nr)++; 2923 /* Just finished the last extent of the cluster, exit. 
*/ 2924 if (*cluster_nr >= cluster->nr) 2925 break; 2926 } 2927 } 2928 folio_unlock(folio); 2929 folio_put(folio); 2930 2931 balance_dirty_pages_ratelimited(inode->i_mapping); 2932 btrfs_throttle(fs_info); 2933 if (btrfs_should_cancel_balance(fs_info)) 2934 ret = -ECANCELED; 2935 *file_offset_ret = folio_end + 1; 2936 return ret; 2937 2938 release_folio: 2939 folio_unlock(folio); 2940 folio_put(folio); 2941 return ret; 2942 } 2943 2944 static int relocate_file_extent_cluster(struct reloc_control *rc) 2945 { 2946 struct inode *inode = rc->data_inode; 2947 const struct file_extent_cluster *cluster = &rc->cluster; 2948 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 2949 u64 cur_file_offset = cluster->start - offset; 2950 struct file_ra_state *ra; 2951 int cluster_nr = 0; 2952 int ret = 0; 2953 2954 if (!cluster->nr) 2955 return 0; 2956 2957 ra = kzalloc(sizeof(*ra), GFP_NOFS); 2958 if (!ra) 2959 return -ENOMEM; 2960 2961 ret = prealloc_file_extent_cluster(rc); 2962 if (ret) 2963 goto out; 2964 2965 file_ra_state_init(ra, inode->i_mapping); 2966 2967 ret = setup_relocation_extent_mapping(rc); 2968 if (ret) 2969 goto out; 2970 2971 while (cur_file_offset < cluster->end - offset) { 2972 ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset); 2973 if (ret) 2974 break; 2975 } 2976 if (ret == 0) 2977 WARN_ON(cluster_nr != cluster->nr); 2978 out: 2979 kfree(ra); 2980 return ret; 2981 } 2982 2983 static noinline_for_stack int relocate_data_extent(struct reloc_control *rc, 2984 const struct btrfs_key *extent_key) 2985 { 2986 struct inode *inode = rc->data_inode; 2987 struct file_extent_cluster *cluster = &rc->cluster; 2988 int ret; 2989 struct btrfs_root *root = BTRFS_I(inode)->root; 2990 2991 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 2992 ret = relocate_file_extent_cluster(rc); 2993 if (ret) 2994 return ret; 2995 cluster->nr = 0; 2996 } 2997 2998 /* 2999 * Under simple quotas, we set root->relocation_src_root when we find 3000 * the extent. If adjacent extents have different owners, we can't merge 3001 * them while relocating. Handle this by storing the owning root that 3002 * started a cluster and if we see an extent from a different root break 3003 * cluster formation (just like the above case of non-adjacent extents). 3004 * 3005 * Without simple quotas, relocation_src_root is always 0, so we should 3006 * never see a mismatch, and it should have no effect on relocation 3007 * clusters. 3008 */ 3009 if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { 3010 u64 tmp = root->relocation_src_root; 3011 3012 /* 3013 * root->relocation_src_root is the state that actually affects 3014 * the preallocation we do here, so set it to the root owning 3015 * the cluster we need to relocate. 3016 */ 3017 root->relocation_src_root = cluster->owning_root; 3018 ret = relocate_file_extent_cluster(rc); 3019 if (ret) 3020 return ret; 3021 cluster->nr = 0; 3022 /* And reset it back for the current extent's owning root. 
*/ 3023 root->relocation_src_root = tmp; 3024 } 3025 3026 if (!cluster->nr) { 3027 cluster->start = extent_key->objectid; 3028 cluster->owning_root = root->relocation_src_root; 3029 } 3030 else 3031 BUG_ON(cluster->nr >= MAX_EXTENTS); 3032 cluster->end = extent_key->objectid + extent_key->offset - 1; 3033 cluster->boundary[cluster->nr] = extent_key->objectid; 3034 cluster->nr++; 3035 3036 if (cluster->nr >= MAX_EXTENTS) { 3037 ret = relocate_file_extent_cluster(rc); 3038 if (ret) 3039 return ret; 3040 cluster->nr = 0; 3041 } 3042 return 0; 3043 } 3044 3045 /* 3046 * helper to add a tree block to the list. 3047 * the major work is getting the generation and level of the block 3048 */ 3049 static int add_tree_block(struct reloc_control *rc, 3050 const struct btrfs_key *extent_key, 3051 struct btrfs_path *path, 3052 struct rb_root *blocks) 3053 { 3054 struct extent_buffer *eb; 3055 struct btrfs_extent_item *ei; 3056 struct btrfs_tree_block_info *bi; 3057 struct tree_block *block; 3058 struct rb_node *rb_node; 3059 u32 item_size; 3060 int level = -1; 3061 u64 generation; 3062 u64 owner = 0; 3063 3064 eb = path->nodes[0]; 3065 item_size = btrfs_item_size(eb, path->slots[0]); 3066 3067 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3068 item_size >= sizeof(*ei) + sizeof(*bi)) { 3069 unsigned long ptr = 0, end; 3070 3071 ei = btrfs_item_ptr(eb, path->slots[0], 3072 struct btrfs_extent_item); 3073 end = (unsigned long)ei + item_size; 3074 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3075 bi = (struct btrfs_tree_block_info *)(ei + 1); 3076 level = btrfs_tree_block_level(eb, bi); 3077 ptr = (unsigned long)(bi + 1); 3078 } else { 3079 level = (int)extent_key->offset; 3080 ptr = (unsigned long)(ei + 1); 3081 } 3082 generation = btrfs_extent_generation(eb, ei); 3083 3084 /* 3085 * We're reading random blocks without knowing their owner ahead 3086 * of time. This is ok most of the time, as all reloc roots and 3087 * fs roots have the same lock type. However normal trees do 3088 * not, and the only way to know ahead of time is to read the 3089 * inline ref offset. We know it's an fs root if 3090 * 3091 * 1. There's more than one ref. 3092 * 2. There's a SHARED_DATA_REF_KEY set. 3093 * 3. FULL_BACKREF is set on the flags. 3094 * 3095 * Otherwise it's safe to assume that the ref offset == the 3096 * owner of this block, so we can use that when calling 3097 * read_tree_block. 
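		 *
		 * As an illustration, the single-ref item layout decoded
		 * below is:
		 *
		 *	[ extent item | (tree block info) | inline ref ... ]
		 *	                                      ^ptr          ^end
		 *
		 * and the fast path condition is, informally:
		 *
		 *	refs == 1 && !FULL_BACKREF && type == TREE_BLOCK_REF_KEY
		 *		=> owner = inline ref offset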
3098 */ 3099 if (btrfs_extent_refs(eb, ei) == 1 && 3100 !(btrfs_extent_flags(eb, ei) & 3101 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3102 ptr < end) { 3103 struct btrfs_extent_inline_ref *iref; 3104 int type; 3105 3106 iref = (struct btrfs_extent_inline_ref *)ptr; 3107 type = btrfs_get_extent_inline_ref_type(eb, iref, 3108 BTRFS_REF_TYPE_BLOCK); 3109 if (type == BTRFS_REF_TYPE_INVALID) 3110 return -EINVAL; 3111 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3112 owner = btrfs_extent_inline_ref_offset(eb, iref); 3113 } 3114 } else { 3115 btrfs_print_leaf(eb); 3116 btrfs_err(rc->block_group->fs_info, 3117 "unrecognized tree backref at tree block %llu slot %u", 3118 eb->start, path->slots[0]); 3119 btrfs_release_path(path); 3120 return -EUCLEAN; 3121 } 3122 3123 btrfs_release_path(path); 3124 3125 BUG_ON(level == -1); 3126 3127 block = kmalloc(sizeof(*block), GFP_NOFS); 3128 if (!block) 3129 return -ENOMEM; 3130 3131 block->bytenr = extent_key->objectid; 3132 block->key.objectid = rc->extent_root->fs_info->nodesize; 3133 block->key.offset = generation; 3134 block->level = level; 3135 block->key_ready = false; 3136 block->owner = owner; 3137 3138 rb_node = rb_simple_insert(blocks, &block->simple_node); 3139 if (rb_node) 3140 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3141 -EEXIST); 3142 3143 return 0; 3144 } 3145 3146 /* 3147 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3148 */ 3149 static int __add_tree_block(struct reloc_control *rc, 3150 u64 bytenr, u32 blocksize, 3151 struct rb_root *blocks) 3152 { 3153 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3154 BTRFS_PATH_AUTO_FREE(path); 3155 struct btrfs_key key; 3156 int ret; 3157 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3158 3159 if (tree_block_processed(bytenr, rc)) 3160 return 0; 3161 3162 if (rb_simple_search(blocks, bytenr)) 3163 return 0; 3164 3165 path = btrfs_alloc_path(); 3166 if (!path) 3167 return -ENOMEM; 3168 again: 3169 key.objectid = bytenr; 3170 if (skinny) { 3171 key.type = BTRFS_METADATA_ITEM_KEY; 3172 key.offset = (u64)-1; 3173 } else { 3174 key.type = BTRFS_EXTENT_ITEM_KEY; 3175 key.offset = blocksize; 3176 } 3177 3178 path->search_commit_root = 1; 3179 path->skip_locking = 1; 3180 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3181 if (ret < 0) 3182 return ret; 3183 3184 if (ret > 0 && skinny) { 3185 if (path->slots[0]) { 3186 path->slots[0]--; 3187 btrfs_item_key_to_cpu(path->nodes[0], &key, 3188 path->slots[0]); 3189 if (key.objectid == bytenr && 3190 (key.type == BTRFS_METADATA_ITEM_KEY || 3191 (key.type == BTRFS_EXTENT_ITEM_KEY && 3192 key.offset == blocksize))) 3193 ret = 0; 3194 } 3195 3196 if (ret) { 3197 skinny = false; 3198 btrfs_release_path(path); 3199 goto again; 3200 } 3201 } 3202 if (ret) { 3203 ASSERT(ret == 1); 3204 btrfs_print_leaf(path->nodes[0]); 3205 btrfs_err(fs_info, 3206 "tree block extent item (%llu) is not found in extent tree", 3207 bytenr); 3208 WARN_ON(1); 3209 return -EINVAL; 3210 } 3211 3212 return add_tree_block(rc, &key, path, blocks); 3213 } 3214 3215 static int delete_block_group_cache(struct btrfs_block_group *block_group, 3216 struct inode *inode, 3217 u64 ino) 3218 { 3219 struct btrfs_fs_info *fs_info = block_group->fs_info; 3220 struct btrfs_root *root = fs_info->tree_root; 3221 struct btrfs_trans_handle *trans; 3222 struct btrfs_inode *btrfs_inode; 3223 int ret = 0; 3224 3225 if (inode) 3226 goto truncate; 3227 3228 btrfs_inode = btrfs_iget(ino, root); 3229 if (IS_ERR(btrfs_inode)) 3230 return -ENOENT; 3231 
inode = &btrfs_inode->vfs_inode; 3232 3233 truncate: 3234 ret = btrfs_check_trunc_cache_free_space(fs_info, 3235 &fs_info->global_block_rsv); 3236 if (ret) 3237 goto out; 3238 3239 trans = btrfs_join_transaction(root); 3240 if (IS_ERR(trans)) { 3241 ret = PTR_ERR(trans); 3242 goto out; 3243 } 3244 3245 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3246 3247 btrfs_end_transaction(trans); 3248 btrfs_btree_balance_dirty(fs_info); 3249 out: 3250 iput(inode); 3251 return ret; 3252 } 3253 3254 /* 3255 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3256 * cache inode, to avoid free space cache data extent blocking data relocation. 3257 */ 3258 static int delete_v1_space_cache(struct extent_buffer *leaf, 3259 struct btrfs_block_group *block_group, 3260 u64 data_bytenr) 3261 { 3262 u64 space_cache_ino; 3263 struct btrfs_file_extent_item *ei; 3264 struct btrfs_key key; 3265 bool found = false; 3266 int i; 3267 int ret; 3268 3269 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3270 return 0; 3271 3272 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3273 u8 type; 3274 3275 btrfs_item_key_to_cpu(leaf, &key, i); 3276 if (key.type != BTRFS_EXTENT_DATA_KEY) 3277 continue; 3278 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3279 type = btrfs_file_extent_type(leaf, ei); 3280 3281 if ((type == BTRFS_FILE_EXTENT_REG || 3282 type == BTRFS_FILE_EXTENT_PREALLOC) && 3283 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3284 found = true; 3285 space_cache_ino = key.objectid; 3286 break; 3287 } 3288 } 3289 if (!found) 3290 return -ENOENT; 3291 ret = delete_block_group_cache(block_group, NULL, space_cache_ino); 3292 return ret; 3293 } 3294 3295 /* 3296 * helper to find all tree blocks that reference a given data extent 3297 */ 3298 static noinline_for_stack int add_data_references(struct reloc_control *rc, 3299 const struct btrfs_key *extent_key, 3300 struct btrfs_path *path, 3301 struct rb_root *blocks) 3302 { 3303 struct btrfs_backref_walk_ctx ctx = { 0 }; 3304 struct ulist_iterator leaf_uiter; 3305 struct ulist_node *ref_node = NULL; 3306 const u32 blocksize = rc->extent_root->fs_info->nodesize; 3307 int ret = 0; 3308 3309 btrfs_release_path(path); 3310 3311 ctx.bytenr = extent_key->objectid; 3312 ctx.skip_inode_ref_list = true; 3313 ctx.fs_info = rc->extent_root->fs_info; 3314 3315 ret = btrfs_find_all_leafs(&ctx); 3316 if (ret < 0) 3317 return ret; 3318 3319 ULIST_ITER_INIT(&leaf_uiter); 3320 while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { 3321 struct btrfs_tree_parent_check check = { 0 }; 3322 struct extent_buffer *eb; 3323 3324 eb = read_tree_block(ctx.fs_info, ref_node->val, &check); 3325 if (IS_ERR(eb)) { 3326 ret = PTR_ERR(eb); 3327 break; 3328 } 3329 ret = delete_v1_space_cache(eb, rc->block_group, 3330 extent_key->objectid); 3331 free_extent_buffer(eb); 3332 if (ret < 0) 3333 break; 3334 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3335 if (ret < 0) 3336 break; 3337 } 3338 if (ret < 0) 3339 free_block_list(blocks); 3340 ulist_free(ctx.refs); 3341 return ret; 3342 } 3343 3344 /* 3345 * helper to find next unprocessed extent 3346 */ 3347 static noinline_for_stack 3348 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3349 struct btrfs_key *extent_key) 3350 { 3351 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3352 struct btrfs_key key; 3353 struct extent_buffer *leaf; 3354 u64 start, end, last; 3355 int ret; 3356 3357 last = rc->block_group->start + 
rc->block_group->length; 3358 while (1) { 3359 bool block_found; 3360 3361 cond_resched(); 3362 if (rc->search_start >= last) { 3363 ret = 1; 3364 break; 3365 } 3366 3367 key.objectid = rc->search_start; 3368 key.type = BTRFS_EXTENT_ITEM_KEY; 3369 key.offset = 0; 3370 3371 path->search_commit_root = 1; 3372 path->skip_locking = 1; 3373 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3374 0, 0); 3375 if (ret < 0) 3376 break; 3377 next: 3378 leaf = path->nodes[0]; 3379 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3380 ret = btrfs_next_leaf(rc->extent_root, path); 3381 if (ret != 0) 3382 break; 3383 leaf = path->nodes[0]; 3384 } 3385 3386 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3387 if (key.objectid >= last) { 3388 ret = 1; 3389 break; 3390 } 3391 3392 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3393 key.type != BTRFS_METADATA_ITEM_KEY) { 3394 path->slots[0]++; 3395 goto next; 3396 } 3397 3398 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3399 key.objectid + key.offset <= rc->search_start) { 3400 path->slots[0]++; 3401 goto next; 3402 } 3403 3404 if (key.type == BTRFS_METADATA_ITEM_KEY && 3405 key.objectid + fs_info->nodesize <= 3406 rc->search_start) { 3407 path->slots[0]++; 3408 goto next; 3409 } 3410 3411 block_found = btrfs_find_first_extent_bit(&rc->processed_blocks, 3412 key.objectid, &start, &end, 3413 EXTENT_DIRTY, NULL); 3414 3415 if (block_found && start <= key.objectid) { 3416 btrfs_release_path(path); 3417 rc->search_start = end + 1; 3418 } else { 3419 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3420 rc->search_start = key.objectid + key.offset; 3421 else 3422 rc->search_start = key.objectid + 3423 fs_info->nodesize; 3424 memcpy(extent_key, &key, sizeof(key)); 3425 return 0; 3426 } 3427 } 3428 btrfs_release_path(path); 3429 return ret; 3430 } 3431 3432 static void set_reloc_control(struct reloc_control *rc) 3433 { 3434 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3435 3436 mutex_lock(&fs_info->reloc_mutex); 3437 fs_info->reloc_ctl = rc; 3438 mutex_unlock(&fs_info->reloc_mutex); 3439 } 3440 3441 static void unset_reloc_control(struct reloc_control *rc) 3442 { 3443 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3444 3445 mutex_lock(&fs_info->reloc_mutex); 3446 fs_info->reloc_ctl = NULL; 3447 mutex_unlock(&fs_info->reloc_mutex); 3448 } 3449 3450 static noinline_for_stack 3451 int prepare_to_relocate(struct reloc_control *rc) 3452 { 3453 struct btrfs_trans_handle *trans; 3454 int ret; 3455 3456 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3457 BTRFS_BLOCK_RSV_TEMP); 3458 if (!rc->block_rsv) 3459 return -ENOMEM; 3460 3461 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3462 rc->search_start = rc->block_group->start; 3463 rc->extents_found = 0; 3464 rc->nodes_relocated = 0; 3465 rc->merging_rsv_size = 0; 3466 rc->reserved_bytes = 0; 3467 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3468 RELOCATION_RESERVED_NODES; 3469 ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, 3470 rc->block_rsv, rc->block_rsv->size, 3471 BTRFS_RESERVE_FLUSH_ALL); 3472 if (ret) 3473 return ret; 3474 3475 rc->create_reloc_tree = true; 3476 set_reloc_control(rc); 3477 3478 trans = btrfs_join_transaction(rc->extent_root); 3479 if (IS_ERR(trans)) { 3480 unset_reloc_control(rc); 3481 /* 3482 * extent tree is not a ref_cow tree and has no reloc_root to 3483 * cleanup. And callers are responsible to free the above 3484 * block rsv. 
3485 */ 3486 return PTR_ERR(trans); 3487 } 3488 3489 ret = btrfs_commit_transaction(trans); 3490 if (ret) 3491 unset_reloc_control(rc); 3492 3493 return ret; 3494 } 3495 3496 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3497 { 3498 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3499 struct rb_root blocks = RB_ROOT; 3500 struct btrfs_key key; 3501 struct btrfs_trans_handle *trans = NULL; 3502 BTRFS_PATH_AUTO_FREE(path); 3503 struct btrfs_extent_item *ei; 3504 u64 flags; 3505 int ret; 3506 int err = 0; 3507 int progress = 0; 3508 3509 path = btrfs_alloc_path(); 3510 if (!path) 3511 return -ENOMEM; 3512 path->reada = READA_FORWARD; 3513 3514 ret = prepare_to_relocate(rc); 3515 if (ret) { 3516 err = ret; 3517 goto out_free; 3518 } 3519 3520 while (1) { 3521 rc->reserved_bytes = 0; 3522 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 3523 rc->block_rsv->size, 3524 BTRFS_RESERVE_FLUSH_ALL); 3525 if (ret) { 3526 err = ret; 3527 break; 3528 } 3529 progress++; 3530 trans = btrfs_start_transaction(rc->extent_root, 0); 3531 if (IS_ERR(trans)) { 3532 err = PTR_ERR(trans); 3533 trans = NULL; 3534 break; 3535 } 3536 restart: 3537 if (rc->backref_cache.last_trans != trans->transid) 3538 btrfs_backref_release_cache(&rc->backref_cache); 3539 rc->backref_cache.last_trans = trans->transid; 3540 3541 ret = find_next_extent(rc, path, &key); 3542 if (ret < 0) 3543 err = ret; 3544 if (ret != 0) 3545 break; 3546 3547 rc->extents_found++; 3548 3549 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3550 struct btrfs_extent_item); 3551 flags = btrfs_extent_flags(path->nodes[0], ei); 3552 3553 /* 3554 * If we are relocating a simple quota owned extent item, we 3555 * need to note the owner on the reloc data root so that when 3556 * we allocate the replacement item, we can attribute it to the 3557 * correct eventual owner (rather than the reloc data root). 
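		 *
		 * relocate_data_extent() also watches this value: when the
		 * owning root changes between adjacent extents, the current
		 * file extent cluster is flushed first, since extents with
		 * different owners must not share a cluster.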
3558 */ 3559 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { 3560 struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; 3561 u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, 3562 path->nodes[0], 3563 path->slots[0]); 3564 3565 root->relocation_src_root = owning_root_id; 3566 } 3567 3568 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3569 ret = add_tree_block(rc, &key, path, &blocks); 3570 } else if (rc->stage == UPDATE_DATA_PTRS && 3571 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3572 ret = add_data_references(rc, &key, path, &blocks); 3573 } else { 3574 btrfs_release_path(path); 3575 ret = 0; 3576 } 3577 if (ret < 0) { 3578 err = ret; 3579 break; 3580 } 3581 3582 if (!RB_EMPTY_ROOT(&blocks)) { 3583 ret = relocate_tree_blocks(trans, rc, &blocks); 3584 if (ret < 0) { 3585 if (ret != -EAGAIN) { 3586 err = ret; 3587 break; 3588 } 3589 rc->extents_found--; 3590 rc->search_start = key.objectid; 3591 } 3592 } 3593 3594 btrfs_end_transaction_throttle(trans); 3595 btrfs_btree_balance_dirty(fs_info); 3596 trans = NULL; 3597 3598 if (rc->stage == MOVE_DATA_EXTENTS && 3599 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3600 rc->found_file_extent = true; 3601 ret = relocate_data_extent(rc, &key); 3602 if (ret < 0) { 3603 err = ret; 3604 break; 3605 } 3606 } 3607 if (btrfs_should_cancel_balance(fs_info)) { 3608 err = -ECANCELED; 3609 break; 3610 } 3611 } 3612 if (trans && progress && err == -ENOSPC) { 3613 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3614 if (ret == 1) { 3615 err = 0; 3616 progress = 0; 3617 goto restart; 3618 } 3619 } 3620 3621 btrfs_release_path(path); 3622 btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL); 3623 3624 if (trans) { 3625 btrfs_end_transaction_throttle(trans); 3626 btrfs_btree_balance_dirty(fs_info); 3627 } 3628 3629 if (!err) { 3630 ret = relocate_file_extent_cluster(rc); 3631 if (ret < 0) 3632 err = ret; 3633 } 3634 3635 rc->create_reloc_tree = false; 3636 set_reloc_control(rc); 3637 3638 btrfs_backref_release_cache(&rc->backref_cache); 3639 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3640 3641 /* 3642 * Even in the case when the relocation is cancelled, we should all go 3643 * through prepare_to_merge() and merge_reloc_roots(). 
3644 * 3645 * For error (including cancelled balance), prepare_to_merge() will 3646 * mark all reloc trees orphan, then queue them for cleanup in 3647 * merge_reloc_roots() 3648 */ 3649 err = prepare_to_merge(rc, err); 3650 3651 merge_reloc_roots(rc); 3652 3653 rc->merge_reloc_tree = false; 3654 unset_reloc_control(rc); 3655 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3656 3657 /* get rid of pinned extents */ 3658 trans = btrfs_join_transaction(rc->extent_root); 3659 if (IS_ERR(trans)) { 3660 err = PTR_ERR(trans); 3661 goto out_free; 3662 } 3663 ret = btrfs_commit_transaction(trans); 3664 if (ret && !err) 3665 err = ret; 3666 out_free: 3667 ret = clean_dirty_subvols(rc); 3668 if (ret < 0 && !err) 3669 err = ret; 3670 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3671 return err; 3672 } 3673 3674 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3675 struct btrfs_root *root, u64 objectid) 3676 { 3677 BTRFS_PATH_AUTO_FREE(path); 3678 struct btrfs_inode_item *item; 3679 struct extent_buffer *leaf; 3680 int ret; 3681 3682 path = btrfs_alloc_path(); 3683 if (!path) 3684 return -ENOMEM; 3685 3686 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3687 if (ret) 3688 return ret; 3689 3690 leaf = path->nodes[0]; 3691 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3692 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3693 btrfs_set_inode_generation(leaf, item, 1); 3694 btrfs_set_inode_size(leaf, item, 0); 3695 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3696 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3697 BTRFS_INODE_PREALLOC); 3698 return 0; 3699 } 3700 3701 static void delete_orphan_inode(struct btrfs_trans_handle *trans, 3702 struct btrfs_root *root, u64 objectid) 3703 { 3704 BTRFS_PATH_AUTO_FREE(path); 3705 struct btrfs_key key; 3706 int ret = 0; 3707 3708 path = btrfs_alloc_path(); 3709 if (!path) { 3710 ret = -ENOMEM; 3711 goto out; 3712 } 3713 3714 key.objectid = objectid; 3715 key.type = BTRFS_INODE_ITEM_KEY; 3716 key.offset = 0; 3717 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3718 if (ret) { 3719 if (ret > 0) 3720 ret = -ENOENT; 3721 goto out; 3722 } 3723 ret = btrfs_del_item(trans, root, path); 3724 out: 3725 if (ret) 3726 btrfs_abort_transaction(trans, ret); 3727 } 3728 3729 /* 3730 * helper to create inode for data relocation. 
3731 * the inode is in data relocation tree and its link count is 0 3732 */ 3733 static noinline_for_stack struct inode *create_reloc_inode( 3734 const struct btrfs_block_group *group) 3735 { 3736 struct btrfs_fs_info *fs_info = group->fs_info; 3737 struct btrfs_inode *inode = NULL; 3738 struct btrfs_trans_handle *trans; 3739 struct btrfs_root *root; 3740 u64 objectid; 3741 int ret = 0; 3742 3743 root = btrfs_grab_root(fs_info->data_reloc_root); 3744 trans = btrfs_start_transaction(root, 6); 3745 if (IS_ERR(trans)) { 3746 btrfs_put_root(root); 3747 return ERR_CAST(trans); 3748 } 3749 3750 ret = btrfs_get_free_objectid(root, &objectid); 3751 if (ret) 3752 goto out; 3753 3754 ret = __insert_orphan_inode(trans, root, objectid); 3755 if (ret) 3756 goto out; 3757 3758 inode = btrfs_iget(objectid, root); 3759 if (IS_ERR(inode)) { 3760 delete_orphan_inode(trans, root, objectid); 3761 ret = PTR_ERR(inode); 3762 inode = NULL; 3763 goto out; 3764 } 3765 inode->reloc_block_group_start = group->start; 3766 3767 ret = btrfs_orphan_add(trans, inode); 3768 out: 3769 btrfs_put_root(root); 3770 btrfs_end_transaction(trans); 3771 btrfs_btree_balance_dirty(fs_info); 3772 if (ret) { 3773 if (inode) 3774 iput(&inode->vfs_inode); 3775 return ERR_PTR(ret); 3776 } 3777 return &inode->vfs_inode; 3778 } 3779 3780 /* 3781 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3782 * has been requested meanwhile and don't start in that case. 3783 * 3784 * Return: 3785 * 0 success 3786 * -EINPROGRESS operation is already in progress, that's probably a bug 3787 * -ECANCELED cancellation request was set before the operation started 3788 */ 3789 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3790 { 3791 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3792 /* This should not happen */ 3793 btrfs_err(fs_info, "reloc already running, cannot start"); 3794 return -EINPROGRESS; 3795 } 3796 3797 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3798 btrfs_info(fs_info, "chunk relocation canceled on start"); 3799 /* 3800 * On cancel, clear all requests but let the caller mark 3801 * the end after cleanup operations. 3802 */ 3803 atomic_set(&fs_info->reloc_cancel_req, 0); 3804 return -ECANCELED; 3805 } 3806 return 0; 3807 } 3808 3809 /* 3810 * Mark end of chunk relocation that is cancellable and wake any waiters. 
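 *
 * Always paired with reloc_chunk_start(); both callers below follow the
 * same shape (sketch):
 *
 *	ret = reloc_chunk_start(fs_info);
 *	if (ret < 0)
 *		goto out;
 *	...				// do the relocation work
 *	reloc_chunk_end(fs_info);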
3811 */ 3812 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3813 { 3814 /* Requested after start, clear bit first so any waiters can continue */ 3815 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3816 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3817 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3818 atomic_set(&fs_info->reloc_cancel_req, 0); 3819 } 3820 3821 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3822 { 3823 struct reloc_control *rc; 3824 3825 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3826 if (!rc) 3827 return NULL; 3828 3829 INIT_LIST_HEAD(&rc->reloc_roots); 3830 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3831 btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); 3832 rc->reloc_root_tree.rb_root = RB_ROOT; 3833 spin_lock_init(&rc->reloc_root_tree.lock); 3834 btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); 3835 return rc; 3836 } 3837 3838 static void free_reloc_control(struct reloc_control *rc) 3839 { 3840 struct mapping_node *node, *tmp; 3841 3842 free_reloc_roots(&rc->reloc_roots); 3843 rbtree_postorder_for_each_entry_safe(node, tmp, 3844 &rc->reloc_root_tree.rb_root, rb_node) 3845 kfree(node); 3846 3847 kfree(rc); 3848 } 3849 3850 /* 3851 * Print the block group being relocated 3852 */ 3853 static void describe_relocation(struct btrfs_block_group *block_group) 3854 { 3855 char buf[128] = "NONE"; 3856 3857 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3858 3859 btrfs_info(block_group->fs_info, "relocating block group %llu flags %s", 3860 block_group->start, buf); 3861 } 3862 3863 static const char *stage_to_string(enum reloc_stage stage) 3864 { 3865 if (stage == MOVE_DATA_EXTENTS) 3866 return "move data extents"; 3867 if (stage == UPDATE_DATA_PTRS) 3868 return "update data pointers"; 3869 return "unknown"; 3870 } 3871 3872 /* 3873 * function to relocate all extents in a block group. 3874 */ 3875 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, 3876 bool verbose) 3877 { 3878 struct btrfs_block_group *bg; 3879 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); 3880 struct reloc_control *rc; 3881 struct inode *inode; 3882 struct btrfs_path *path; 3883 int ret; 3884 int rw = 0; 3885 int err = 0; 3886 3887 /* 3888 * This only gets set if we had a half-deleted snapshot on mount. We 3889 * cannot allow relocation to start while we're still trying to clean up 3890 * these pending deletions. 3891 */ 3892 ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); 3893 if (ret) 3894 return ret; 3895 3896 /* We may have been woken up by close_ctree, so bail if we're closing. */ 3897 if (btrfs_fs_closing(fs_info)) 3898 return -EINTR; 3899 3900 bg = btrfs_lookup_block_group(fs_info, group_start); 3901 if (!bg) 3902 return -ENOENT; 3903 3904 /* 3905 * Relocation of a data block group creates ordered extents. Without 3906 * sb_start_write(), we can freeze the filesystem while unfinished 3907 * ordered extents are left. Such ordered extents can cause a deadlock 3908 * e.g. when syncfs() is waiting for their completion but they can't 3909 * finish because they block when joining a transaction, due to the 3910 * fact that the freeze locks are being held in write mode. 
3911 */ 3912 if (bg->flags & BTRFS_BLOCK_GROUP_DATA) 3913 ASSERT(sb_write_started(fs_info->sb)); 3914 3915 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 3916 btrfs_put_block_group(bg); 3917 return -ETXTBSY; 3918 } 3919 3920 rc = alloc_reloc_control(fs_info); 3921 if (!rc) { 3922 btrfs_put_block_group(bg); 3923 return -ENOMEM; 3924 } 3925 3926 ret = reloc_chunk_start(fs_info); 3927 if (ret < 0) { 3928 err = ret; 3929 goto out_put_bg; 3930 } 3931 3932 rc->extent_root = extent_root; 3933 rc->block_group = bg; 3934 3935 ret = btrfs_inc_block_group_ro(rc->block_group, true); 3936 if (ret) { 3937 err = ret; 3938 goto out; 3939 } 3940 rw = 1; 3941 3942 path = btrfs_alloc_path(); 3943 if (!path) { 3944 err = -ENOMEM; 3945 goto out; 3946 } 3947 3948 inode = lookup_free_space_inode(rc->block_group, path); 3949 btrfs_free_path(path); 3950 3951 if (!IS_ERR(inode)) 3952 ret = delete_block_group_cache(rc->block_group, inode, 0); 3953 else 3954 ret = PTR_ERR(inode); 3955 3956 if (ret && ret != -ENOENT) { 3957 err = ret; 3958 goto out; 3959 } 3960 3961 rc->data_inode = create_reloc_inode(rc->block_group); 3962 if (IS_ERR(rc->data_inode)) { 3963 err = PTR_ERR(rc->data_inode); 3964 rc->data_inode = NULL; 3965 goto out; 3966 } 3967 3968 if (verbose) 3969 describe_relocation(rc->block_group); 3970 3971 btrfs_wait_block_group_reservations(rc->block_group); 3972 btrfs_wait_nocow_writers(rc->block_group); 3973 btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group); 3974 3975 ret = btrfs_zone_finish(rc->block_group); 3976 WARN_ON(ret && ret != -EAGAIN); 3977 3978 while (1) { 3979 enum reloc_stage finishes_stage; 3980 3981 mutex_lock(&fs_info->cleaner_mutex); 3982 ret = relocate_block_group(rc); 3983 mutex_unlock(&fs_info->cleaner_mutex); 3984 if (ret < 0) 3985 err = ret; 3986 3987 finishes_stage = rc->stage; 3988 /* 3989 * We may have gotten ENOSPC after we already dirtied some 3990 * extents. If writeout happens while we're relocating a 3991 * different block group we could end up hitting the 3992 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 3993 * btrfs_reloc_cow_block. Make sure we write everything out 3994 * properly so we don't trip over this problem, and then break 3995 * out of the loop if we hit an error. 
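		 *
		 * Roughly, the stage handover below is:
		 *
		 *	btrfs_wait_ordered_range()	// settle dirtied data
		 *	invalidate_mapping_pages()	// drop stale pages
		 *	rc->stage = UPDATE_DATA_PTRS;	// second pass fixes refs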
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		if (rc->extents_found == 0)
			break;

		if (verbose)
			btrfs_info(fs_info, "found %llu extents, stage: %s",
				   rc->extents_found,
				   stage_to_string(finishes_stage));
	}

	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(rc->block_group->used > 0);
out:
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
out_put_bg:
	btrfs_put_block_group(bg);
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
	return err;
}

/*
 * Mark a reloc root as garbage: clear its drop progress and set its root
 * refs to 0, so that recovery deletes it instead of merging it.
 */
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	btrfs_set_root_drop_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}

/*
 * Recover relocation interrupted by a system crash.
 *
 * This function resumes merging reloc trees with the corresponding fs trees,
 * which is important to keep the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret2;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0) {
				/*
				 * No reloc root items to the left of the
				 * search key.  Clear the "not found" return
				 * value before leaving the loop so it isn't
				 * returned as an error.
				 */
				ret = 0;
				break;
			}
			path->slots[0]--;
		}
		ret = 0;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
		if (IS_ERR(reloc_root)) {
			ret = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT)
					goto out;
				ret = mark_garbage_root(reloc_root);
				if (ret < 0)
					goto out;
				ret = 0;
			} else {
				btrfs_put_root(fs_root);
			}
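
			/*
			 * At this point the reloc root either still has its
			 * source subvolume alive, or it has just been marked
			 * as garbage (root refs == 0) so that it gets deleted
			 * instead of merged further below.
			 */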
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0)
		goto out_end;

	rc->extent_root = btrfs_extent_root(fs_info, 0);

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root,
					      root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			ret = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		ret = __add_reloc_root(reloc_root);
		ASSERT(ret != -EEXIST);
		if (ret) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	ret = btrfs_commit_transaction(trans);
	if (ret)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_clean;
	}
	ret = btrfs_commit_transaction(trans);
out_clean:
	ret2 = clean_dirty_subvols(rc);
	if (ret2 < 0 && !ret)
		ret = ret2;
out_unset:
	unset_reloc_control(rc);
out_end:
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (ret == 0) {
		/* Cleanup orphan inodes in the data relocation tree. */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		ret = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return ret;
}

/*
 * Helper to add ordered checksums for data relocation.
 *
 * Cloning the checksums properly handles the nodatasum extents, and also
 * saves the CPU time that would otherwise be spent recalculating them.
 */
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
	LIST_HEAD(list);
	int ret;

	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
				      disk_bytenr + ordered->num_bytes - 1,
				      &list, false);
	if (ret < 0) {
		btrfs_mark_ordered_extent_error(ordered);
		return ret;
	}

	while (!list_empty(&list)) {
		struct btrfs_ordered_sum *sums =
			list_first_entry(&list, struct btrfs_ordered_sum, list);

		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
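		 * The rebase below keeps each checksum at the same relative
		 * offset within the extent: with made-up example numbers, a
		 * csum found at the old logical disk_bytenr + 4K ends up at
		 * the new ordered->disk_bytenr + 4K.
		 *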
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to, say, the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
		btrfs_add_ordered_sum(ordered, sums);
	}

	return 0;
}

/*
 * Called when a tree block is COWed during relocation.  Keep the backref
 * cache in sync with the new block location and, for leaves in the
 * UPDATE_DATA_PTRS stage, rewrite the file extent pointers.
 */
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  const struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];

		/*
		 * If node->bytenr != buf->start and node->new_bytenr !=
		 * buf->start then we've got the wrong backref node for what
		 * we expected to see here and the cache is incorrect.
		 */
		if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
			btrfs_err(fs_info,
				  "bytenr %llu was found but our backref cache was expecting %llu or %llu",
				  buf->start, node->bytenr, node->new_bytenr);
			return -EUCLEAN;
		}

		btrfs_backref_drop_node_buffer(node);
		refcount_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * Called before creating a snapshot.  It calculates the metadata reservation
 * required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * Relocation is in the stage of merging trees.  The space used by
	 * merging a reloc tree is twice the size of the relocated tree nodes
	 * in the worst case: half for COWing the reloc tree, half for COWing
	 * the fs tree.  The space used by COWing the reloc tree will be freed
	 * after the tree is dropped.  If we create a snapshot, COWing the fs
	 * tree may use more space than it frees, so we need to reserve extra
	 * space.
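	 *
	 * In other words, out of the worst case 2 * nodes_relocated bytes,
	 * only the fs tree half can be a net consumer of space after the
	 * snapshot, so adding nodes_relocated to the reservation below is
	 * sufficient.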
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}

/*
 * Called after the snapshot is created.  It migrates the block reservation
 * and creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(), we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root() */
		btrfs_put_root(reloc_root);
		return ret;
	}
	new_root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * Get the current bytenr for the block group which is being relocated.
 *
 * Return U64_MAX if no relocation is running.
 */
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
	u64 logical = U64_MAX;

	lockdep_assert_held(&fs_info->reloc_mutex);

	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
		logical = fs_info->reloc_ctl->block_group->start;
	return logical;
}