// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space compaction,
 * and the balance routine to spread chunks over devices.
 *
 *		Before		|		After
 * ------------------------------------------------------------------
 * BG A: 10 data extents	| BG A: deleted
 * BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 * BG C:  1 data extent		| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after the swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
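 *
 * For example, a balance run such as:
 *
 *   $ btrfs balance start -dusage=50 <mnt>
 *
 * ends up in btrfs_relocate_block_group(), which drives
 * relocate_block_group() for each selected chunk, so the stages above are
 * repeated once per relocated block group.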
 */

#define RELOCATION_RESERVED_NODES	256
/*
 * Map the address of a tree root to its tree.
 */
struct mapping_node {
	union {
		/* Use rb_simple_node for search/insert */
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};

		struct rb_simple_node simple_node;
	};
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * Represents a tree block to process.
 */
struct tree_block {
	union {
		/* Use rb_simple_node for search/insert */
		struct {
			struct rb_node rb_node;
			u64 bytenr;
		};

		struct rb_simple_node simple_node;
	};
	u64 owner;
	struct btrfs_key key;
	u8 level;
	bool key_ready;
};

#define MAX_EXTENTS	128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
	u64 owning_root;
};

/* Stages of data relocation. */
enum reloc_stage {
	MOVE_DATA_EXTENTS,
	UPDATE_DATA_PTRS
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks that have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	enum reloc_stage stage;
	bool create_reloc_tree;
	bool merge_reloc_tree;
	bool found_file_extent;
};

static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr,
				     node->bytenr + blocksize - 1, EXTENT_DIRTY,
				     NULL);
	}
	node->processed = 1;
}

/*
 * Walk up the backref nodes until we reach the node that represents the
 * tree root.
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
					list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}
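
/*
 * A sketch of how walk_up_backref() above and walk_down_backref() below are
 * combined by callers such as select_one_root(): @edges acts as an explicit
 * stack of the path taken, so every reference path from a node up to a tree
 * root is visited exactly once. Roughly:
 *
 *	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
 *	int index = 0;
 *	struct btrfs_backref_node *next = node;
 *
 *	while (1) {
 *		next = walk_up_backref(next, edges, &index);
 *		(next is now a tree root reached via one reference path)
 *		next = walk_down_backref(edges, &index);
 *		if (!next)
 *			break;
 *	}
 */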
/*
 * Walk down the backref nodes to find the start of the next reference path.
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge,
					list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}

static bool reloc_root_is_dead(const struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after the swap is considered dead, and thus not valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root. But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(const struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	/* This root has been merged with its reloc tree, we can ignore it. */
	if (reloc_root_is_dead(root))
		return true;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return false;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return false;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the backref
	 * node for the fs tree root is useless for relocation.
	 */
	return true;
}

/*
 * Find a reloc tree by the address of its tree root.
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeps nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes. */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* Cleanup the lower edges. */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_first_entry(&cur->lower, struct btrfs_backref_edge,
						list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue it for cleanup. */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation. */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build a backref tree for a given tree block. The root of the backref tree
 * corresponds to the tree block, and the leaves of the backref tree correspond
 * to the roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when a tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
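 *
 * For example, a block B referenced by parent nodes P1 and P2 belonging to
 * tree roots T1 and T2 produces a backref tree shaped like this (a sketch):
 *
 *	T1    T2	<- leaves: roots of the b-trees referencing B
 *	 \    /
 *	 P1  P2		<- upper level blocks containing a pointer to B
 *	  \  /
 *	   B		<- root of the backref tree: the block itself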
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct btrfs_trans_handle *trans,
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		ret = -ENOMEM;
		goto out;
	}

	cur = node;

	/* Breadth-first search to build the backref cache */
	do {
		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
						  node_key, cur);
		if (ret < 0)
			goto out;

		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process.
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0)
		goto out;

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_free_path(iter->path);
	kfree(iter);
	btrfs_free_path(path);
	if (ret) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(ret);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}

/*
 * Helper to add an 'address of tree root -> reloc tree' mapping.
 */
static int __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			  "Duplicate root found for start=%llu while inserting into relocation tree",
			  node->bytenr);
		kfree(node);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}
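
/*
 * Lifecycle of the mapping above (a summary of the helpers around here):
 * __add_reloc_root() inserts the mapping keyed by the commit root bytenr when
 * the reloc tree is created, __update_reloc_root() re-keys it when a
 * transaction commit changes the reloc root's node, and __del_reloc_root()
 * removes it once the reloc tree is merged or abandoned.
 */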
/*
 * Helper to delete an 'address of tree root -> reloc tree' mapping.
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node AUTO_KFREE(node);
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list. There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
}

/*
 * Helper to update the 'address of tree root -> reloc tree' mapping.
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item AUTO_KFREE(root_item);
	struct btrfs_key root_key;
	int ret = 0;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (btrfs_root_id(root) == objectid) {
		u64 commit_root_gen;

		/*
		 * Relocation will wait for the cleaner thread, and any
		 * half-dropped subvolume will be fully cleaned up at mount
		 * time. So here we shouldn't hit a subvolume with non-zero
		 * drop_progress.
		 *
		 * If this isn't the case, error out since it can make us
		 * attempt to drop references for extents that were already
		 * dropped before.
		 */
		if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
			struct btrfs_key cpu_key;

			btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
			btrfs_err(fs_info,
	"cannot relocate partially dropped subvolume %llu, drop progress key " BTRFS_KEY_FMT,
				  objectid, BTRFS_KEY_FMT_VALUE(&cpu_key));
			return ERR_PTR(-EUCLEAN);
		}

		/* Called by btrfs_init_reloc_root() */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			return ERR_PTR(ret);

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) and when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * Called by btrfs_reloc_post_snapshot_hook().
		 * The source tree is a reloc tree, and all tree blocks
		 * modified after it was created have the RELOC flag set in
		 * their headers. So it's OK to not update the
		 * 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			return ERR_PTR(ret);
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails (i.e. 'goto abort').
	 */

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (btrfs_root_id(root) == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto abort;

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	btrfs_set_root_last_trans(reloc_root, trans->transid);
	return reloc_root;

abort:
	btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}

/*
 * Create a reloc tree for a given fs tree. The reloc tree is just a
 * snapshot of the fs tree with the special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree.
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important.  We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root. This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		return 0;
	}

	/*
	 * We are merging reloc roots, so we do not need new reloc trees. Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * Update the root item of a reloc tree.
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root. We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until the current relocation finishes */
	if (fs_info->reloc_ctl && fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * Get the new location of the data.
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
	ret = btrfs_lookup_file_extent(NULL, root, path,
				       btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		return ret;
	if (ret > 0)
		return -ENOENT;

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi))
		return -EINVAL;

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	return 0;
}
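
/*
 * File offsets in the data reloc inode mirror offsets within the target
 * block group, which is why the lookup above uses
 * (bytenr - reloc_block_group_start). E.g. for a block group starting at
 * bytenr 1G, the relocated copy of an old extent at bytenr 1G + 8M is
 * recorded at file offset 8M, and its file extent item supplies the new
 * disk bytenr.
 */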
/*
 * Update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* Reloc trees always use full backref */
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * If we are modifying a block in the fs tree, wait for
		 * read_folio to complete and drop the extent cache.
		 */
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = btrfs_find_first_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(inode) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = btrfs_find_first_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(inode) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				/* Take mmap lock to serialize with reflinks. */
				if (!down_read_trylock(&inode->i_mmap_lock))
					continue;
				ret = btrfs_try_lock_extent(&inode->io_tree, key.offset,
							    end, &cached_state);
				if (!ret) {
					up_read(&inode->i_mmap_lock);
					continue;
				}

				btrfs_drop_extent_map_range(inode, key.offset, end, true);
				btrfs_unlock_extent(&inode->io_tree, key.offset, end,
						    &cached_state);
				up_read(&inode->i_mmap_lock);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
					       int slot, const struct btrfs_path *path,
					       int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;

	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * Try to replace tree blocks in the fs tree with the new blocks
 * in the reloc tree. Tree blocks that haven't been modified since the
 * reloc tree was created can be replaced.
 *
 * If a block was replaced, level of the block + 1 is returned.
 * If no block got replaced, 0 is returned. If there are other
 * errors, a negative error number is returned.
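 *
 * As an illustration (a sketch, not an exact trace), swapping one block via
 * the pointers in the two trees' parents at the level above looks like:
 *
 *	before:	fs tree: P(fs) -> B(old)    reloc tree: P(rl) -> B(new)
 *	after:	fs tree: P(fs) -> B(new)    reloc tree: P(rl) -> B(old)
 *
 * with the matching backref updates queued as delayed refs.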
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, 0, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							 path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
								path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * Swap blocks in the fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = btrfs_root_id(src);
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(dest);
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * Helper to find the next relocated block in the reloc tree.
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * Walk down the reloc tree to find the relocated block of lowest level.
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * Invalidate the extent cache for file extents whose keys are in the range
 * [min_key, max_key).
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   const struct btrfs_key *min_key,
				   const struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		if (inode)
			iput(&inode->vfs_inode);

		if (objectid > max_key->objectid)
			break;

		inode = btrfs_find_first_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(inode);

		if (ino > max_key->objectid) {
			iput(&inode->vfs_inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->vfs_inode.i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* The lock_extent waits for read_folio to complete. */
		btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(inode, start, end, true);
		btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
	}
	return 0;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert the current subvolume into reloc_control::dirty_subvol_roots.
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree. */
	ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
	       sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}

static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root. If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, false, true);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, false, true);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}

/*
 * Merge the relocated tree blocks in the reloc tree with the corresponding
 * fs tree.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		refcount_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between the reloc tree and the subvolume tree. Thus for
	 * tree block COW, we COW at most from level 1 to root level for each
	 * tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * and * 2 since we have two trees to COW. E.g. with a 16K nodesize
	 * and root level 3, that is 16K * 3 * 2 = 96K.
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list. At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * Save the merging progress in the drop_progress.
		 * This is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * Handle the case where only one block in the fs tree needs to be
	 * relocated and the block is the tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}
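
/*
 * The merge phase as a whole: prepare_to_merge() below reserves metadata
 * space and sets root refs to 1 on each reloc root, merge_reloc_roots()
 * repeatedly calls merge_reloc_root() above for every affected subvolume,
 * and clean_dirty_subvols() finally drops the now dead reloc trees.
 */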
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_first_entry(&rc->reloc_roots,
					      struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (IS_ERR(root)) {
			/*
			 * Even if we have an error we need this reloc root
			 * back on our list so we can clean up properly.
			 */
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
			if (!err)
				err = PTR_ERR(root);
			break;
		}

		if (unlikely(root->reloc_root != reloc_root)) {
			if (root->reloc_root) {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(root->reloc_root),
					  root->reloc_root->root_key.type,
					  root->reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &root->reloc_root->root_item),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			} else {
				btrfs_err(fs_info,
"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
					  btrfs_root_id(root),
					  btrfs_root_id(reloc_root),
					  reloc_root->root_key.type,
					  reloc_root->root_key.offset,
					  btrfs_root_generation(
						  &reloc_root->root_item));
			}
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(root);
			btrfs_abort_transaction(trans, -EUCLEAN);
			if (!err)
				err = -EUCLEAN;
			break;
		}

		/*
		 * Set the reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging.
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		ret = btrfs_update_reloc_root(trans, root);

		/*
		 * Even if we have an error we need this reloc root back on our
		 * list so we can clean up properly.
		 */
		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);

		if (unlikely(ret)) {
			btrfs_abort_transaction(trans, ret);
			if (!err)
				err = ret;
			break;
		}
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		err = btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}

static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
	struct btrfs_root *reloc_root, *tmp;

	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
		__del_reloc_root(reloc_root);
}

static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_root *root;
	struct btrfs_root *reloc_root;
	LIST_HEAD(reloc_roots);
	int found = 0;
	int ret = 0;
again:
	root = rc->extent_root;

	/*
	 * This serializes us with btrfs_record_root_in_transaction;
	 * we have to make sure nobody is in the middle of
	 * adding their roots to the list while we are
	 * doing this splice.
	 */
	mutex_lock(&fs_info->reloc_mutex);
	list_splice_init(&rc->reloc_roots, &reloc_roots);
	mutex_unlock(&fs_info->reloc_mutex);

	while (!list_empty(&reloc_roots)) {
		found = 1;
		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			if (WARN_ON(IS_ERR(root))) {
				/*
				 * For recovery we read the fs roots on mount,
				 * and if we didn't find the root then we marked
				 * the reloc root as a garbage root. For normal
				 * relocation obviously the root should exist in
				 * memory. However there's no reason we can't
				 * handle the error properly here just in case.
				 */
				ret = PTR_ERR(root);
				goto out;
			}
			if (WARN_ON(root->reloc_root != reloc_root)) {
				/*
				 * This can happen if on-disk metadata has some
				 * corruption, e.g. a bad reloc tree key offset.
				 */
				ret = -EINVAL;
				goto out;
			}
			ret = merge_reloc_root(rc, root);
			btrfs_put_root(root);
			if (ret) {
				if (list_empty(&reloc_root->root_list))
					list_add_tail(&reloc_root->root_list,
						      &reloc_roots);
				goto out;
			}
		} else {
			if (!IS_ERR(root)) {
				if (root->reloc_root == reloc_root) {
					root->reloc_root = NULL;
					btrfs_put_root(reloc_root);
				}
				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
					  &root->state);
				btrfs_put_root(root);
			}

			list_del_init(&reloc_root->root_list);
			/* Don't forget to queue this reloc root for cleanup */
			list_add_tail(&reloc_root->reloc_dirty_list,
				      &rc->dirty_subvol_roots);
		}
	}

	if (found) {
		found = 0;
		goto again;
	}
out:
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret, NULL);
		free_reloc_roots(&reloc_roots);

		/* New reloc roots may be added */
		mutex_lock(&fs_info->reloc_mutex);
		list_splice_init(&rc->reloc_roots, &reloc_roots);
		mutex_unlock(&fs_info->reloc_mutex);
		free_reloc_roots(&reloc_roots);
	}

	/*
	 * We used to have
	 *
	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
	 *
	 * here, but it's wrong.  If we fail to start the transaction in
	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
	 * have actually been removed from the reloc_root_tree rb tree. This is
	 * fine because we're bailing here, and we hold a reference on the root
	 * for the list that holds it, so these roots will be cleaned up when we
	 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
	 * will be cleaned up on unmount.
	 *
	 * The remaining nodes will be cleaned up by free_reloc_control.
	 */
}

static void free_block_list(struct rb_root *blocks)
{
	struct tree_block *block;
	struct rb_node *rb_node;

	while ((rb_node = rb_first(blocks))) {
		block = rb_entry(rb_node, struct tree_block, rb_node);
		rb_erase(rb_node, blocks);
		kfree(block);
	}
}

static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
				      struct btrfs_root *reloc_root)
{
	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
	struct btrfs_root *root;
	int ret;

	if (btrfs_get_root_last_trans(reloc_root) == trans->transid)
		return 0;

	root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);

	/*
	 * This should succeed, since we can't have a reloc root without having
	 * already looked up the actual root and created the reloc root for this
	 * root.
	 *
	 * However if there's some sort of corruption where we have a ref to a
	 * reloc root without a corresponding root, this could return -ENOENT.
	 */
	if (IS_ERR(root)) {
		DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root));
		return PTR_ERR(root);
	}
	if (unlikely(root->reloc_root != reloc_root)) {
		DEBUG_WARN("unexpected reloc root found");
		btrfs_err(fs_info,
			  "root %llu has two reloc roots associated with it",
			  reloc_root->root_key.offset);
		btrfs_put_root(root);
		return -EUCLEAN;
	}
	ret = btrfs_record_root_in_trans(trans, root);
	btrfs_put_root(root);

	return ret;
}

static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
				     struct reloc_control *rc,
				     struct btrfs_backref_node *node,
				     struct btrfs_backref_edge *edges[])
{
	struct btrfs_backref_node *next;
	struct btrfs_root *root;
	int index = 0;
	int ret;

	next = walk_up_backref(node, edges, &index);
	root = next->root;

	/*
	 * If there is no root, then our references for this block are
	 * incomplete, as we should be able to walk all the way up to a block
	 * that is owned by a root.
	 *
	 * This path is only for SHAREABLE roots, so if we come upon a
	 * non-SHAREABLE root then we have backrefs that resolve improperly.
	 *
	 * Both of these cases indicate file system corruption, or a bug in the
	 * backref walking code.
1986 */ 1987 if (unlikely(!root)) { 1988 btrfs_err(trans->fs_info, 1989 "bytenr %llu doesn't have a backref path ending in a root", 1990 node->bytenr); 1991 return ERR_PTR(-EUCLEAN); 1992 } 1993 if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) { 1994 btrfs_err(trans->fs_info, 1995 "bytenr %llu has multiple refs with one ending in a non-shareable root", 1996 node->bytenr); 1997 return ERR_PTR(-EUCLEAN); 1998 } 1999 2000 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { 2001 ret = record_reloc_root_in_trans(trans, root); 2002 if (ret) 2003 return ERR_PTR(ret); 2004 goto found; 2005 } 2006 2007 ret = btrfs_record_root_in_trans(trans, root); 2008 if (ret) 2009 return ERR_PTR(ret); 2010 root = root->reloc_root; 2011 2012 /* 2013 * We could have raced with another thread which failed, so 2014 * root->reloc_root may not be set; return -ENOENT in this case. 2015 */ 2016 if (!root) 2017 return ERR_PTR(-ENOENT); 2018 2019 if (unlikely(next->new_bytenr)) { 2020 /* 2021 * We just created the reloc root, so we shouldn't have 2022 * ->new_bytenr set yet. If it is, then we have multiple roots 2023 * pointing at the same bytenr, which indicates corruption, or 2024 * we've made a mistake in the backref walking code. 2025 */ 2026 ASSERT(next->new_bytenr == 0); 2027 btrfs_err(trans->fs_info, 2028 "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", 2029 node->bytenr, next->bytenr); 2030 return ERR_PTR(-EUCLEAN); 2031 } 2032 2033 next->new_bytenr = root->node->start; 2034 btrfs_put_root(next->root); 2035 next->root = btrfs_grab_root(root); 2036 ASSERT(next->root); 2037 mark_block_processed(rc, next); 2038 found: 2039 next = node; 2040 /* setup backref node path for btrfs_reloc_cow_block */ 2041 while (1) { 2042 rc->backref_cache.path[next->level] = next; 2043 if (--index < 0) 2044 break; 2045 next = edges[index]->node[UPPER]; 2046 } 2047 return root; 2048 } 2049 2050 /* 2051 * Select a tree root for relocation. 2052 * 2053 * Return NULL if the block is not shareable. We should use do_relocation() in 2054 * this case. 2055 * 2056 * Return a tree root pointer if the block is shareable. 2057 * Return -ENOENT if the block is the root of a reloc tree. 2058 */ 2059 static noinline_for_stack 2060 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2061 { 2062 struct btrfs_backref_node *next; 2063 struct btrfs_root *root; 2064 struct btrfs_root *fs_root = NULL; 2065 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2066 int index = 0; 2067 2068 next = node; 2069 while (1) { 2070 cond_resched(); 2071 next = walk_up_backref(next, edges, &index); 2072 root = next->root; 2073 2074 /* 2075 * This can occur if we have incomplete extent refs leading all 2076 * the way up a particular path; in this case return -EUCLEAN.
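 *
 * For illustration, assuming a damaged extent tree: if a path built
 * by the backref walk looks like
 *
 *	leaf -> N1 -> N2	(N2 has no upper edge and no root set)
 *
 * then walk_up_backref() stops at N2 with next->root == NULL, and we
 * refuse to relocate a block whose ownership cannot be resolved.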
2077 */ 2078 if (unlikely(!root)) 2079 return ERR_PTR(-EUCLEAN); 2080 2081 /* No other choice for non-shareable tree */ 2082 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 2083 return root; 2084 2085 if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) 2086 fs_root = root; 2087 2088 if (next != node) 2089 return NULL; 2090 2091 next = walk_down_backref(edges, &index); 2092 if (!next || next->level <= node->level) 2093 break; 2094 } 2095 2096 if (!fs_root) 2097 return ERR_PTR(-ENOENT); 2098 return fs_root; 2099 } 2100 2101 static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, 2102 struct btrfs_backref_node *node) 2103 { 2104 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2105 struct btrfs_backref_node *next = node; 2106 struct btrfs_backref_edge *edge; 2107 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2108 u64 num_bytes = 0; 2109 int index = 0; 2110 2111 BUG_ON(node->processed); 2112 2113 while (next) { 2114 cond_resched(); 2115 while (1) { 2116 if (next->processed) 2117 break; 2118 2119 num_bytes += fs_info->nodesize; 2120 2121 if (list_empty(&next->upper)) 2122 break; 2123 2124 edge = list_first_entry(&next->upper, struct btrfs_backref_edge, 2125 list[LOWER]); 2126 edges[index++] = edge; 2127 next = edge->node[UPPER]; 2128 } 2129 next = walk_down_backref(edges, &index); 2130 } 2131 return num_bytes; 2132 } 2133 2134 static int refill_metadata_space(struct btrfs_trans_handle *trans, 2135 struct reloc_control *rc, u64 num_bytes) 2136 { 2137 struct btrfs_fs_info *fs_info = trans->fs_info; 2138 int ret; 2139 2140 trans->block_rsv = rc->block_rsv; 2141 rc->reserved_bytes += num_bytes; 2142 2143 /* 2144 * We are under a transaction here so we can only do limited flushing. 2145 * If we get an ENOSPC just kick back -EAGAIN so we know to drop the 2146 * transaction and try to refill when we can flush all the things. 2147 */ 2148 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, 2149 BTRFS_RESERVE_FLUSH_LIMIT); 2150 if (ret) { 2151 u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2152 2153 while (tmp <= rc->reserved_bytes) 2154 tmp <<= 1; 2155 /* 2156 * Only one thread can access block_rsv at this point, 2157 * so we don't need to hold a lock to protect it. 2158 * We grow the reservation size here to allow enough 2159 * space for relocation, and we will return earlier in 2160 * the ENOSPC case. 2161 */ 2162 rc->block_rsv->size = tmp + fs_info->nodesize * 2163 RELOCATION_RESERVED_NODES; 2164 return -EAGAIN; 2165 } 2166 2167 return 0; 2168 } 2169 2170 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2171 struct reloc_control *rc, 2172 struct btrfs_backref_node *node) 2173 { 2174 u64 num_bytes; 2175 2176 num_bytes = calcu_metadata_size(rc, node) * 2; 2177 return refill_metadata_space(trans, rc, num_bytes); 2178 } 2179 2180 /* 2181 * relocate a tree block, and then update pointers in upper level 2182 * blocks that reference the block to point to the new location. 2183 * 2184 * if called by link_to_upper, the block has already been relocated. 2185 * in that case this function just updates pointers.
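 *
 * A sketch of the two call modes, as used by the callers below:
 *
 *	do_relocation(trans, rc, node, key, path, 1);
 *		from relocate_tree_block(): first relocation of @node
 *	do_relocation(trans, rc, node, key, path, 0);
 *		from link_to_upper(): @node->eb already points at the new
 *		block, only the parent pointers are updated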
2186 */ 2187 static int do_relocation(struct btrfs_trans_handle *trans, 2188 struct reloc_control *rc, 2189 struct btrfs_backref_node *node, 2190 struct btrfs_key *key, 2191 struct btrfs_path *path, int lowest) 2192 { 2193 struct btrfs_backref_node *upper; 2194 struct btrfs_backref_edge *edge; 2195 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2196 struct btrfs_root *root; 2197 struct extent_buffer *eb; 2198 u32 blocksize; 2199 u64 bytenr; 2200 int slot; 2201 int ret = 0; 2202 2203 /* 2204 * If we are lowest then this is the first time we're processing this 2205 * block, and thus shouldn't have an eb associated with it yet. 2206 */ 2207 ASSERT(!lowest || !node->eb); 2208 2209 path->lowest_level = node->level + 1; 2210 rc->backref_cache.path[node->level] = node; 2211 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2212 cond_resched(); 2213 2214 upper = edge->node[UPPER]; 2215 root = select_reloc_root(trans, rc, upper, edges); 2216 if (IS_ERR(root)) { 2217 ret = PTR_ERR(root); 2218 goto next; 2219 } 2220 2221 if (upper->eb && !upper->locked) { 2222 if (!lowest) { 2223 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2224 if (ret < 0) 2225 goto next; 2226 BUG_ON(ret); 2227 bytenr = btrfs_node_blockptr(upper->eb, slot); 2228 if (node->eb->start == bytenr) 2229 goto next; 2230 } 2231 btrfs_backref_drop_node_buffer(upper); 2232 } 2233 2234 if (!upper->eb) { 2235 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2236 if (ret) { 2237 if (ret > 0) 2238 ret = -ENOENT; 2239 2240 btrfs_release_path(path); 2241 break; 2242 } 2243 2244 if (!upper->eb) { 2245 upper->eb = path->nodes[upper->level]; 2246 path->nodes[upper->level] = NULL; 2247 } else { 2248 BUG_ON(upper->eb != path->nodes[upper->level]); 2249 } 2250 2251 upper->locked = 1; 2252 path->locks[upper->level] = 0; 2253 2254 slot = path->slots[upper->level]; 2255 btrfs_release_path(path); 2256 } else { 2257 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2258 if (ret < 0) 2259 goto next; 2260 BUG_ON(ret); 2261 } 2262 2263 bytenr = btrfs_node_blockptr(upper->eb, slot); 2264 if (lowest) { 2265 if (unlikely(bytenr != node->bytenr)) { 2266 btrfs_err(root->fs_info, 2267 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2268 bytenr, node->bytenr, slot, 2269 upper->eb->start); 2270 ret = -EIO; 2271 goto next; 2272 } 2273 } else { 2274 if (node->eb->start == bytenr) 2275 goto next; 2276 } 2277 2278 blocksize = root->fs_info->nodesize; 2279 eb = btrfs_read_node_slot(upper->eb, slot); 2280 if (IS_ERR(eb)) { 2281 ret = PTR_ERR(eb); 2282 goto next; 2283 } 2284 btrfs_tree_lock(eb); 2285 2286 if (!node->eb) { 2287 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2288 slot, &eb, BTRFS_NESTING_COW); 2289 btrfs_tree_unlock(eb); 2290 free_extent_buffer(eb); 2291 if (ret < 0) 2292 goto next; 2293 /* 2294 * We've just COWed this block, it should have updated 2295 * the correct backref node entry. 
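 *
 * This works because select_reloc_root() recorded the nodes along
 * this path in rc->backref_cache.path[]; the COW hook,
 * btrfs_reloc_cow_block(), looks the node up there and points
 * node->eb at the freshly COWed buffer before we get here.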
2296 */ 2297 ASSERT(node->eb == eb); 2298 } else { 2299 struct btrfs_ref ref = { 2300 .action = BTRFS_ADD_DELAYED_REF, 2301 .bytenr = node->eb->start, 2302 .num_bytes = blocksize, 2303 .parent = upper->eb->start, 2304 .owning_root = btrfs_header_owner(upper->eb), 2305 .ref_root = btrfs_header_owner(upper->eb), 2306 }; 2307 2308 btrfs_set_node_blockptr(upper->eb, slot, 2309 node->eb->start); 2310 btrfs_set_node_ptr_generation(upper->eb, slot, 2311 trans->transid); 2312 btrfs_mark_buffer_dirty(trans, upper->eb); 2313 2314 btrfs_init_tree_ref(&ref, node->level, 2315 btrfs_root_id(root), false); 2316 ret = btrfs_inc_extent_ref(trans, &ref); 2317 if (!ret) 2318 ret = btrfs_drop_subtree(trans, root, eb, 2319 upper->eb); 2320 if (unlikely(ret)) 2321 btrfs_abort_transaction(trans, ret); 2322 } 2323 next: 2324 if (!upper->pending) 2325 btrfs_backref_drop_node_buffer(upper); 2326 else 2327 btrfs_backref_unlock_node_buffer(upper); 2328 if (ret) 2329 break; 2330 } 2331 2332 if (!ret && node->pending) { 2333 btrfs_backref_drop_node_buffer(node); 2334 list_del_init(&node->list); 2335 node->pending = 0; 2336 } 2337 2338 path->lowest_level = 0; 2339 2340 /* 2341 * We should have allocated all of our space in the block rsv and thus 2342 * shouldn't hit -ENOSPC. 2343 */ 2344 ASSERT(ret != -ENOSPC); 2345 return ret; 2346 } 2347 2348 static int link_to_upper(struct btrfs_trans_handle *trans, 2349 struct reloc_control *rc, 2350 struct btrfs_backref_node *node, 2351 struct btrfs_path *path) 2352 { 2353 struct btrfs_key key; 2354 2355 btrfs_node_key_to_cpu(node->eb, &key, 0); 2356 return do_relocation(trans, rc, node, &key, path, 0); 2357 } 2358 2359 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2360 struct reloc_control *rc, 2361 struct btrfs_path *path, int err) 2362 { 2363 LIST_HEAD(list); 2364 struct btrfs_backref_cache *cache = &rc->backref_cache; 2365 struct btrfs_backref_node *node; 2366 int level; 2367 int ret; 2368 2369 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2370 while (!list_empty(&cache->pending[level])) { 2371 node = list_first_entry(&cache->pending[level], 2372 struct btrfs_backref_node, list); 2373 list_move_tail(&node->list, &list); 2374 BUG_ON(!node->pending); 2375 2376 if (!err) { 2377 ret = link_to_upper(trans, rc, node, path); 2378 if (ret < 0) 2379 err = ret; 2380 } 2381 } 2382 list_splice_init(&list, &cache->pending[level]); 2383 } 2384 return err; 2385 } 2386 2387 /* 2388 * mark a block and all blocks that directly or indirectly reference 2389 * the block as processed.
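 *
 * This is the same depth-first walk over the ->upper edges that
 * calcu_metadata_size() does above: visit the first unprocessed
 * upper edge of each node, and backtrack with walk_down_backref()
 * to pick up the next reference path.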
2390 */ 2391 static void update_processed_blocks(struct reloc_control *rc, 2392 struct btrfs_backref_node *node) 2393 { 2394 struct btrfs_backref_node *next = node; 2395 struct btrfs_backref_edge *edge; 2396 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2397 int index = 0; 2398 2399 while (next) { 2400 cond_resched(); 2401 while (1) { 2402 if (next->processed) 2403 break; 2404 2405 mark_block_processed(rc, next); 2406 2407 if (list_empty(&next->upper)) 2408 break; 2409 2410 edge = list_first_entry(&next->upper, struct btrfs_backref_edge, 2411 list[LOWER]); 2412 edges[index++] = edge; 2413 next = edge->node[UPPER]; 2414 } 2415 next = walk_down_backref(edges, &index); 2416 } 2417 } 2418 2419 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2420 { 2421 u32 blocksize = rc->extent_root->fs_info->nodesize; 2422 2423 if (btrfs_test_range_bit(&rc->processed_blocks, bytenr, 2424 bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) 2425 return 1; 2426 return 0; 2427 } 2428 2429 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2430 struct tree_block *block) 2431 { 2432 struct btrfs_tree_parent_check check = { 2433 .level = block->level, 2434 .owner_root = block->owner, 2435 .transid = block->key.offset 2436 }; 2437 struct extent_buffer *eb; 2438 2439 eb = read_tree_block(fs_info, block->bytenr, &check); 2440 if (IS_ERR(eb)) 2441 return PTR_ERR(eb); 2442 if (unlikely(!extent_buffer_uptodate(eb))) { 2443 free_extent_buffer(eb); 2444 return -EIO; 2445 } 2446 if (block->level == 0) 2447 btrfs_item_key_to_cpu(eb, &block->key, 0); 2448 else 2449 btrfs_node_key_to_cpu(eb, &block->key, 0); 2450 free_extent_buffer(eb); 2451 block->key_ready = true; 2452 return 0; 2453 } 2454 2455 /* 2456 * helper function to relocate a tree block 2457 */ 2458 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2459 struct reloc_control *rc, 2460 struct btrfs_backref_node *node, 2461 struct btrfs_key *key, 2462 struct btrfs_path *path) 2463 { 2464 struct btrfs_root *root; 2465 int ret = 0; 2466 2467 if (!node) 2468 return 0; 2469 2470 /* 2471 * If we fail here we want to drop our backref_node because we are going 2472 * to start over and regenerate the tree for it. 2473 */ 2474 ret = reserve_metadata_space(trans, rc, node); 2475 if (ret) 2476 goto out; 2477 2478 BUG_ON(node->processed); 2479 root = select_one_root(node); 2480 if (IS_ERR(root)) { 2481 ret = PTR_ERR(root); 2482 2483 /* See explanation in select_one_root for the -EUCLEAN case. */ 2484 ASSERT(ret == -ENOENT); 2485 if (ret == -ENOENT) { 2486 ret = 0; 2487 update_processed_blocks(rc, node); 2488 } 2489 goto out; 2490 } 2491 2492 if (root) { 2493 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2494 /* 2495 * This block was the root block of a root, and this is 2496 * the first time we're processing the block and thus it 2497 * should not have had the ->new_bytenr modified. 2498 * 2499 * However in the case of corruption we could have 2500 * multiple refs pointing to the same block improperly, 2501 * and thus we would trip over these checks. ASSERT() 2502 * for the developer case, because it could indicate a 2503 * bug in the backref code, however error out for a 2504 * normal user in the case of corruption. 
2505 */ 2506 ASSERT(node->new_bytenr == 0); 2507 if (unlikely(node->new_bytenr)) { 2508 btrfs_err(root->fs_info, 2509 "bytenr %llu has improper references to it", 2510 node->bytenr); 2511 ret = -EUCLEAN; 2512 goto out; 2513 } 2514 ret = btrfs_record_root_in_trans(trans, root); 2515 if (ret) 2516 goto out; 2517 /* 2518 * Another thread could have failed, need to check if we 2519 * have reloc_root actually set. 2520 */ 2521 if (!root->reloc_root) { 2522 ret = -ENOENT; 2523 goto out; 2524 } 2525 root = root->reloc_root; 2526 node->new_bytenr = root->node->start; 2527 btrfs_put_root(node->root); 2528 node->root = btrfs_grab_root(root); 2529 ASSERT(node->root); 2530 } else { 2531 btrfs_err(root->fs_info, 2532 "bytenr %llu resolved to a non-shareable root", 2533 node->bytenr); 2534 ret = -EUCLEAN; 2535 goto out; 2536 } 2537 if (!ret) 2538 update_processed_blocks(rc, node); 2539 } else { 2540 ret = do_relocation(trans, rc, node, key, path, 1); 2541 } 2542 out: 2543 if (ret || node->level == 0) 2544 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2545 return ret; 2546 } 2547 2548 static int relocate_cowonly_block(struct btrfs_trans_handle *trans, 2549 struct reloc_control *rc, struct tree_block *block, 2550 struct btrfs_path *path) 2551 { 2552 struct btrfs_fs_info *fs_info = trans->fs_info; 2553 struct btrfs_root *root; 2554 u64 num_bytes; 2555 int nr_levels; 2556 int ret; 2557 2558 root = btrfs_get_fs_root(fs_info, block->owner, true); 2559 if (IS_ERR(root)) 2560 return PTR_ERR(root); 2561 2562 nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1; 2563 2564 num_bytes = fs_info->nodesize * nr_levels; 2565 ret = refill_metadata_space(trans, rc, num_bytes); 2566 if (ret) { 2567 btrfs_put_root(root); 2568 return ret; 2569 } 2570 path->lowest_level = block->level; 2571 if (root == root->fs_info->chunk_root) 2572 btrfs_reserve_chunk_metadata(trans, false); 2573 2574 ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1); 2575 path->lowest_level = 0; 2576 btrfs_release_path(path); 2577 2578 if (root == root->fs_info->chunk_root) 2579 btrfs_trans_release_chunk_metadata(trans); 2580 if (ret > 0) 2581 ret = 0; 2582 btrfs_put_root(root); 2583 2584 return ret; 2585 } 2586 2587 /* 2588 * relocate a list of blocks 2589 */ 2590 static noinline_for_stack 2591 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2592 struct reloc_control *rc, struct rb_root *blocks) 2593 { 2594 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2595 struct btrfs_backref_node *node; 2596 struct btrfs_path *path; 2597 struct tree_block *block; 2598 struct tree_block *next; 2599 int ret = 0; 2600 2601 path = btrfs_alloc_path(); 2602 if (!path) { 2603 ret = -ENOMEM; 2604 goto out_free_blocks; 2605 } 2606 2607 /* Kick in readahead for tree blocks with missing keys */ 2608 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2609 if (!block->key_ready) 2610 btrfs_readahead_tree_block(fs_info, block->bytenr, 2611 block->owner, 0, 2612 block->level); 2613 } 2614 2615 /* Get first keys */ 2616 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2617 if (!block->key_ready) { 2618 ret = get_tree_block_key(fs_info, block); 2619 if (ret) 2620 goto out_free_path; 2621 } 2622 } 2623 2624 /* Do tree relocation */ 2625 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2626 /* 2627 * For COWonly blocks, or the data reloc tree, we only need to 2628 * COW down to the block, there's no need to generate a backref 2629 * tree. 
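 *
 * E.g. a block owned by the csum tree (a COW-only tree), or by the
 * data reloc tree, takes the short relocate_cowonly_block() path
 * below; only blocks owned by shareable subvolume trees need the
 * full backref tree machinery.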
2630 */ 2631 if (block->owner && 2632 (!btrfs_is_fstree(block->owner) || 2633 block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) { 2634 ret = relocate_cowonly_block(trans, rc, block, path); 2635 if (ret) 2636 break; 2637 continue; 2638 } 2639 2640 node = build_backref_tree(trans, rc, &block->key, 2641 block->level, block->bytenr); 2642 if (IS_ERR(node)) { 2643 ret = PTR_ERR(node); 2644 goto out; 2645 } 2646 2647 ret = relocate_tree_block(trans, rc, node, &block->key, 2648 path); 2649 if (ret < 0) 2650 break; 2651 } 2652 out: 2653 ret = finish_pending_nodes(trans, rc, path, ret); 2654 2655 out_free_path: 2656 btrfs_free_path(path); 2657 out_free_blocks: 2658 free_block_list(blocks); 2659 return ret; 2660 } 2661 2662 static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc) 2663 { 2664 const struct file_extent_cluster *cluster = &rc->cluster; 2665 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2666 u64 alloc_hint = 0; 2667 u64 start; 2668 u64 end; 2669 u64 offset = inode->reloc_block_group_start; 2670 u64 num_bytes; 2671 int nr; 2672 int ret = 0; 2673 u64 prealloc_start = cluster->start - offset; 2674 u64 prealloc_end = cluster->end - offset; 2675 u64 cur_offset = prealloc_start; 2676 2677 /* 2678 * For blocksize < folio size case (either bs < page size or large folios), 2679 * beyond i_size, all blocks are filled with zero. 2680 * 2681 * If the current cluster covers the above range, btrfs_do_readpage() 2682 * will skip the read, and relocate_one_folio() will later writeback 2683 * the padding zeros as new data, causing data corruption. 2684 * 2685 * Here we have to invalidate the cache covering our cluster. 2686 */ 2687 ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start, 2688 prealloc_end); 2689 if (ret < 0) 2690 return ret; 2691 2692 BUG_ON(cluster->start != cluster->boundary[0]); 2693 ret = btrfs_alloc_data_chunk_ondemand(inode, 2694 prealloc_end + 1 - prealloc_start); 2695 if (ret) 2696 return ret; 2697 2698 btrfs_inode_lock(inode, 0); 2699 for (nr = 0; nr < cluster->nr; nr++) { 2700 struct extent_state *cached_state = NULL; 2701 2702 start = cluster->boundary[nr] - offset; 2703 if (nr + 1 < cluster->nr) 2704 end = cluster->boundary[nr + 1] - 1 - offset; 2705 else 2706 end = cluster->end - offset; 2707 2708 btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); 2709 num_bytes = end + 1 - start; 2710 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2711 num_bytes, num_bytes, 2712 end + 1, &alloc_hint); 2713 cur_offset = end + 1; 2714 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 2715 if (ret) 2716 break; 2717 } 2718 btrfs_inode_unlock(inode, 0); 2719 2720 if (cur_offset < prealloc_end) 2721 btrfs_free_reserved_data_space_noquota(inode, 2722 prealloc_end + 1 - cur_offset); 2723 return ret; 2724 } 2725 2726 static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc) 2727 { 2728 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2729 struct extent_map *em; 2730 struct extent_state *cached_state = NULL; 2731 u64 offset = inode->reloc_block_group_start; 2732 u64 start = rc->cluster.start - offset; 2733 u64 end = rc->cluster.end - offset; 2734 int ret = 0; 2735 2736 em = btrfs_alloc_extent_map(); 2737 if (!em) 2738 return -ENOMEM; 2739 2740 em->start = start; 2741 em->len = end + 1 - start; 2742 em->disk_bytenr = rc->cluster.start; 2743 em->disk_num_bytes = em->len; 2744 em->ram_bytes = em->len; 2745 em->flags |= EXTENT_FLAG_PINNED; 2746 2747 
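	/*
	 * A rough picture of what this pinned mapping buys us: file range
	 * [start, end] of the reloc inode maps 1:1 onto the cluster's old
	 * disk bytenr, so reads during relocation pull in the existing
	 * data, while EXTENT_FLAG_PINNED keeps the mapping from being
	 * merged or dropped until writeback moves it to the new location.
	 */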
btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); 2748 ret = btrfs_replace_extent_map_range(inode, em, false); 2749 btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); 2750 btrfs_free_extent_map(em); 2751 2752 return ret; 2753 } 2754 2755 /* 2756 * Allow error injection to test balance/relocation cancellation 2757 */ 2758 noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info) 2759 { 2760 return atomic_read(&fs_info->balance_cancel_req) || 2761 atomic_read(&fs_info->reloc_cancel_req) || 2762 fatal_signal_pending(current); 2763 } 2764 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 2765 2766 static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, 2767 int cluster_nr) 2768 { 2769 /* Last extent, use cluster end directly */ 2770 if (cluster_nr >= cluster->nr - 1) 2771 return cluster->end; 2772 2773 /* Use next boundary start */ 2774 return cluster->boundary[cluster_nr + 1] - 1; 2775 } 2776 2777 static int relocate_one_folio(struct reloc_control *rc, 2778 struct file_ra_state *ra, 2779 int *cluster_nr, u64 *file_offset_ret) 2780 { 2781 const struct file_extent_cluster *cluster = &rc->cluster; 2782 struct inode *inode = rc->data_inode; 2783 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2784 const u64 orig_file_offset = *file_offset_ret; 2785 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 2786 const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT; 2787 const pgoff_t index = orig_file_offset >> PAGE_SHIFT; 2788 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 2789 struct folio *folio; 2790 u64 folio_start; 2791 u64 folio_end; 2792 u64 cur; 2793 int ret; 2794 const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); 2795 2796 ASSERT(index <= last_index); 2797 again: 2798 folio = filemap_lock_folio(inode->i_mapping, index); 2799 if (IS_ERR(folio)) { 2800 2801 /* 2802 * On relocation we're doing readahead on the relocation inode, 2803 * but if the filesystem is backed by a RAID stripe tree we can 2804 * get ENOENT (e.g. due to preallocated extents not being 2805 * mapped in the RST) from the lookup. 2806 * 2807 * But readahead doesn't handle the error and submits invalid 2808 * reads to the device, causing assertion failures. 2809 */ 2810 if (!use_rst) 2811 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 2812 index, last_index + 1 - index); 2813 folio = __filemap_get_folio(inode->i_mapping, index, 2814 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, 2815 mask); 2816 if (IS_ERR(folio)) 2817 return PTR_ERR(folio); 2818 } 2819 2820 if (folio_test_readahead(folio) && !use_rst) 2821 page_cache_async_readahead(inode->i_mapping, ra, NULL, 2822 folio, last_index + 1 - index); 2823 2824 if (!folio_test_uptodate(folio)) { 2825 btrfs_read_folio(NULL, folio); 2826 folio_lock(folio); 2827 if (unlikely(!folio_test_uptodate(folio))) { 2828 ret = -EIO; 2829 goto release_folio; 2830 } 2831 if (folio->mapping != inode->i_mapping) { 2832 folio_unlock(folio); 2833 folio_put(folio); 2834 goto again; 2835 } 2836 } 2837 2838 /* 2839 * We could have lost folio private when we dropped the lock to read the 2840 * folio above; make sure we call set_folio_extent_mapped() here so we have all 2841 * of the subpage blocksize stuff we need in place.
2842 */ 2843 ret = set_folio_extent_mapped(folio); 2844 if (ret < 0) 2845 goto release_folio; 2846 2847 folio_start = folio_pos(folio); 2848 folio_end = folio_start + folio_size(folio) - 1; 2849 2850 /* 2851 * Start from the cluster, as for subpage case, the cluster can start 2852 * inside the folio. 2853 */ 2854 cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); 2855 while (cur <= folio_end) { 2856 struct extent_state *cached_state = NULL; 2857 u64 extent_start = cluster->boundary[*cluster_nr] - offset; 2858 u64 extent_end = get_cluster_boundary_end(cluster, 2859 *cluster_nr) - offset; 2860 u64 clamped_start = max(folio_start, extent_start); 2861 u64 clamped_end = min(folio_end, extent_end); 2862 u32 clamped_len = clamped_end + 1 - clamped_start; 2863 2864 /* Reserve metadata for this range */ 2865 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 2866 clamped_len, clamped_len, 2867 false); 2868 if (ret) 2869 goto release_folio; 2870 2871 /* Mark the range delalloc and dirty for later writeback */ 2872 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, 2873 clamped_end, &cached_state); 2874 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, 2875 clamped_end, 0, &cached_state); 2876 if (ret) { 2877 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 2878 clamped_start, clamped_end, 2879 EXTENT_LOCKED | EXTENT_BOUNDARY, 2880 &cached_state); 2881 btrfs_delalloc_release_metadata(BTRFS_I(inode), 2882 clamped_len, true); 2883 btrfs_delalloc_release_extents(BTRFS_I(inode), 2884 clamped_len); 2885 goto release_folio; 2886 } 2887 btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); 2888 2889 /* 2890 * Set the boundary if it's inside the folio. 2891 * Data relocation requires the destination extents to have the 2892 * same size as the source. 2893 * EXTENT_BOUNDARY bit prevents current extent from being merged 2894 * with previous extent. 2895 */ 2896 if (in_range(cluster->boundary[*cluster_nr] - offset, 2897 folio_start, folio_size(folio))) { 2898 u64 boundary_start = cluster->boundary[*cluster_nr] - 2899 offset; 2900 u64 boundary_end = boundary_start + 2901 fs_info->sectorsize - 1; 2902 2903 btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree, 2904 boundary_start, boundary_end, 2905 EXTENT_BOUNDARY, NULL); 2906 } 2907 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 2908 &cached_state); 2909 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 2910 cur += clamped_len; 2911 2912 /* Crossed extent end, go to next extent */ 2913 if (cur >= extent_end) { 2914 (*cluster_nr)++; 2915 /* Just finished the last extent of the cluster, exit. 
*/ 2916 if (*cluster_nr >= cluster->nr) 2917 break; 2918 } 2919 } 2920 folio_unlock(folio); 2921 folio_put(folio); 2922 2923 balance_dirty_pages_ratelimited(inode->i_mapping); 2924 btrfs_throttle(fs_info); 2925 if (btrfs_should_cancel_balance(fs_info)) 2926 ret = -ECANCELED; 2927 *file_offset_ret = folio_end + 1; 2928 return ret; 2929 2930 release_folio: 2931 folio_unlock(folio); 2932 folio_put(folio); 2933 return ret; 2934 } 2935 2936 static int relocate_file_extent_cluster(struct reloc_control *rc) 2937 { 2938 struct inode *inode = rc->data_inode; 2939 const struct file_extent_cluster *cluster = &rc->cluster; 2940 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 2941 u64 cur_file_offset = cluster->start - offset; 2942 struct file_ra_state AUTO_KFREE(ra); 2943 int cluster_nr = 0; 2944 int ret = 0; 2945 2946 if (!cluster->nr) 2947 return 0; 2948 2949 ra = kzalloc(sizeof(*ra), GFP_NOFS); 2950 if (!ra) 2951 return -ENOMEM; 2952 2953 ret = prealloc_file_extent_cluster(rc); 2954 if (ret) 2955 return ret; 2956 2957 file_ra_state_init(ra, inode->i_mapping); 2958 2959 ret = setup_relocation_extent_mapping(rc); 2960 if (ret) 2961 return ret; 2962 2963 while (cur_file_offset < cluster->end - offset) { 2964 ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset); 2965 if (ret) 2966 break; 2967 } 2968 if (ret == 0) 2969 WARN_ON(cluster_nr != cluster->nr); 2970 return ret; 2971 } 2972 2973 static noinline_for_stack int relocate_data_extent(struct reloc_control *rc, 2974 const struct btrfs_key *extent_key) 2975 { 2976 struct inode *inode = rc->data_inode; 2977 struct file_extent_cluster *cluster = &rc->cluster; 2978 int ret; 2979 struct btrfs_root *root = BTRFS_I(inode)->root; 2980 2981 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 2982 ret = relocate_file_extent_cluster(rc); 2983 if (ret) 2984 return ret; 2985 cluster->nr = 0; 2986 } 2987 2988 /* 2989 * Under simple quotas, we set root->relocation_src_root when we find 2990 * the extent. If adjacent extents have different owners, we can't merge 2991 * them while relocating. Handle this by storing the owning root that 2992 * started a cluster and if we see an extent from a different root break 2993 * cluster formation (just like the above case of non-adjacent extents). 2994 * 2995 * Without simple quotas, relocation_src_root is always 0, so we should 2996 * never see a mismatch, and it should have no effect on relocation 2997 * clusters. 2998 */ 2999 if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { 3000 u64 tmp = root->relocation_src_root; 3001 3002 /* 3003 * root->relocation_src_root is the state that actually affects 3004 * the preallocation we do here, so set it to the root owning 3005 * the cluster we need to relocate. 3006 */ 3007 root->relocation_src_root = cluster->owning_root; 3008 ret = relocate_file_extent_cluster(rc); 3009 if (ret) 3010 return ret; 3011 cluster->nr = 0; 3012 /* And reset it back for the current extent's owning root. 
*/ 3013 root->relocation_src_root = tmp; 3014 } 3015 3016 if (!cluster->nr) { 3017 cluster->start = extent_key->objectid; 3018 cluster->owning_root = root->relocation_src_root; 3019 } 3020 else 3021 BUG_ON(cluster->nr >= MAX_EXTENTS); 3022 cluster->end = extent_key->objectid + extent_key->offset - 1; 3023 cluster->boundary[cluster->nr] = extent_key->objectid; 3024 cluster->nr++; 3025 3026 if (cluster->nr >= MAX_EXTENTS) { 3027 ret = relocate_file_extent_cluster(rc); 3028 if (ret) 3029 return ret; 3030 cluster->nr = 0; 3031 } 3032 return 0; 3033 } 3034 3035 /* 3036 * helper to add a tree block to the list. 3037 * the major work is getting the generation and level of the block 3038 */ 3039 static int add_tree_block(struct reloc_control *rc, 3040 const struct btrfs_key *extent_key, 3041 struct btrfs_path *path, 3042 struct rb_root *blocks) 3043 { 3044 struct extent_buffer *eb; 3045 struct btrfs_extent_item *ei; 3046 struct btrfs_tree_block_info *bi; 3047 struct tree_block *block; 3048 struct rb_node *rb_node; 3049 u32 item_size; 3050 int level = -1; 3051 u64 generation; 3052 u64 owner = 0; 3053 3054 eb = path->nodes[0]; 3055 item_size = btrfs_item_size(eb, path->slots[0]); 3056 3057 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3058 item_size >= sizeof(*ei) + sizeof(*bi)) { 3059 unsigned long ptr = 0, end; 3060 3061 ei = btrfs_item_ptr(eb, path->slots[0], 3062 struct btrfs_extent_item); 3063 end = (unsigned long)ei + item_size; 3064 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3065 bi = (struct btrfs_tree_block_info *)(ei + 1); 3066 level = btrfs_tree_block_level(eb, bi); 3067 ptr = (unsigned long)(bi + 1); 3068 } else { 3069 level = (int)extent_key->offset; 3070 ptr = (unsigned long)(ei + 1); 3071 } 3072 generation = btrfs_extent_generation(eb, ei); 3073 3074 /* 3075 * We're reading random blocks without knowing their owner ahead 3076 * of time. This is ok most of the time, as all reloc roots and 3077 * fs roots have the same lock type. However normal trees do 3078 * not, and the only way to know ahead of time is to read the 3079 * inline ref offset. We know it's an fs root if 3080 * 3081 * 1. There's more than one ref. 3082 * 2. There's a SHARED_DATA_REF_KEY set. 3083 * 3. FULL_BACKREF is set on the flags. 3084 * 3085 * Otherwise it's safe to assume that the ref offset == the 3086 * owner of this block, so we can use that when calling 3087 * read_tree_block. 
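 *
 * For the common single-ref case accepted below, the extent item
 * looks roughly like this (skinny metadata, illustrative values):
 *
 *	key (bytenr, METADATA_ITEM_KEY, level)
 *		btrfs_extent_item: refs 1, no FULL_BACKREF flag
 *		inline ref: type TREE_BLOCK_REF_KEY, offset = root objectid
 *
 * which is why the inline ref offset can stand in for the owner.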
3088 */ 3089 if (btrfs_extent_refs(eb, ei) == 1 && 3090 !(btrfs_extent_flags(eb, ei) & 3091 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3092 ptr < end) { 3093 struct btrfs_extent_inline_ref *iref; 3094 int type; 3095 3096 iref = (struct btrfs_extent_inline_ref *)ptr; 3097 type = btrfs_get_extent_inline_ref_type(eb, iref, 3098 BTRFS_REF_TYPE_BLOCK); 3099 if (type == BTRFS_REF_TYPE_INVALID) 3100 return -EINVAL; 3101 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3102 owner = btrfs_extent_inline_ref_offset(eb, iref); 3103 } 3104 } else { 3105 btrfs_print_leaf(eb); 3106 btrfs_err(rc->block_group->fs_info, 3107 "unrecognized tree backref at tree block %llu slot %u", 3108 eb->start, path->slots[0]); 3109 btrfs_release_path(path); 3110 return -EUCLEAN; 3111 } 3112 3113 btrfs_release_path(path); 3114 3115 BUG_ON(level == -1); 3116 3117 block = kmalloc(sizeof(*block), GFP_NOFS); 3118 if (!block) 3119 return -ENOMEM; 3120 3121 block->bytenr = extent_key->objectid; 3122 block->key.objectid = rc->extent_root->fs_info->nodesize; 3123 block->key.offset = generation; 3124 block->level = level; 3125 block->key_ready = false; 3126 block->owner = owner; 3127 3128 rb_node = rb_simple_insert(blocks, &block->simple_node); 3129 if (rb_node) 3130 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3131 -EEXIST); 3132 3133 return 0; 3134 } 3135 3136 /* 3137 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3138 */ 3139 static int __add_tree_block(struct reloc_control *rc, 3140 u64 bytenr, u32 blocksize, 3141 struct rb_root *blocks) 3142 { 3143 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3144 BTRFS_PATH_AUTO_FREE(path); 3145 struct btrfs_key key; 3146 int ret; 3147 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3148 3149 if (tree_block_processed(bytenr, rc)) 3150 return 0; 3151 3152 if (rb_simple_search(blocks, bytenr)) 3153 return 0; 3154 3155 path = btrfs_alloc_path(); 3156 if (!path) 3157 return -ENOMEM; 3158 again: 3159 key.objectid = bytenr; 3160 if (skinny) { 3161 key.type = BTRFS_METADATA_ITEM_KEY; 3162 key.offset = (u64)-1; 3163 } else { 3164 key.type = BTRFS_EXTENT_ITEM_KEY; 3165 key.offset = blocksize; 3166 } 3167 3168 path->search_commit_root = true; 3169 path->skip_locking = true; 3170 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3171 if (ret < 0) 3172 return ret; 3173 3174 if (ret > 0 && skinny) { 3175 if (path->slots[0]) { 3176 path->slots[0]--; 3177 btrfs_item_key_to_cpu(path->nodes[0], &key, 3178 path->slots[0]); 3179 if (key.objectid == bytenr && 3180 (key.type == BTRFS_METADATA_ITEM_KEY || 3181 (key.type == BTRFS_EXTENT_ITEM_KEY && 3182 key.offset == blocksize))) 3183 ret = 0; 3184 } 3185 3186 if (ret) { 3187 skinny = false; 3188 btrfs_release_path(path); 3189 goto again; 3190 } 3191 } 3192 if (ret) { 3193 ASSERT(ret == 1); 3194 btrfs_print_leaf(path->nodes[0]); 3195 btrfs_err(fs_info, 3196 "tree block extent item (%llu) is not found in extent tree", 3197 bytenr); 3198 WARN_ON(1); 3199 return -EINVAL; 3200 } 3201 3202 return add_tree_block(rc, &key, path, blocks); 3203 } 3204 3205 static int delete_block_group_cache(struct btrfs_block_group *block_group, 3206 struct inode *inode, 3207 u64 ino) 3208 { 3209 struct btrfs_fs_info *fs_info = block_group->fs_info; 3210 struct btrfs_root *root = fs_info->tree_root; 3211 struct btrfs_trans_handle *trans; 3212 struct btrfs_inode *btrfs_inode; 3213 int ret = 0; 3214 3215 if (inode) 3216 goto truncate; 3217 3218 btrfs_inode = btrfs_iget(ino, root); 3219 if (IS_ERR(btrfs_inode)) 3220 return 
-ENOENT; 3221 inode = &btrfs_inode->vfs_inode; 3222 3223 truncate: 3224 ret = btrfs_check_trunc_cache_free_space(fs_info, 3225 &fs_info->global_block_rsv); 3226 if (ret) 3227 goto out; 3228 3229 trans = btrfs_join_transaction(root); 3230 if (IS_ERR(trans)) { 3231 ret = PTR_ERR(trans); 3232 goto out; 3233 } 3234 3235 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3236 3237 btrfs_end_transaction(trans); 3238 btrfs_btree_balance_dirty(fs_info); 3239 out: 3240 iput(inode); 3241 return ret; 3242 } 3243 3244 /* 3245 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3246 * cache inode, to avoid free space cache data extent blocking data relocation. 3247 */ 3248 static int delete_v1_space_cache(struct extent_buffer *leaf, 3249 struct btrfs_block_group *block_group, 3250 u64 data_bytenr) 3251 { 3252 u64 space_cache_ino; 3253 struct btrfs_file_extent_item *ei; 3254 struct btrfs_key key; 3255 bool found = false; 3256 int i; 3257 int ret; 3258 3259 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3260 return 0; 3261 3262 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3263 u8 type; 3264 3265 btrfs_item_key_to_cpu(leaf, &key, i); 3266 if (key.type != BTRFS_EXTENT_DATA_KEY) 3267 continue; 3268 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3269 type = btrfs_file_extent_type(leaf, ei); 3270 3271 if ((type == BTRFS_FILE_EXTENT_REG || 3272 type == BTRFS_FILE_EXTENT_PREALLOC) && 3273 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3274 found = true; 3275 space_cache_ino = key.objectid; 3276 break; 3277 } 3278 } 3279 if (!found) 3280 return -ENOENT; 3281 ret = delete_block_group_cache(block_group, NULL, space_cache_ino); 3282 return ret; 3283 } 3284 3285 /* 3286 * helper to find all tree blocks that reference a given data extent 3287 */ 3288 static noinline_for_stack int add_data_references(struct reloc_control *rc, 3289 const struct btrfs_key *extent_key, 3290 struct btrfs_path *path, 3291 struct rb_root *blocks) 3292 { 3293 struct btrfs_backref_walk_ctx ctx = { 0 }; 3294 struct ulist_iterator leaf_uiter; 3295 struct ulist_node *ref_node = NULL; 3296 const u32 blocksize = rc->extent_root->fs_info->nodesize; 3297 int ret = 0; 3298 3299 btrfs_release_path(path); 3300 3301 ctx.bytenr = extent_key->objectid; 3302 ctx.skip_inode_ref_list = true; 3303 ctx.fs_info = rc->extent_root->fs_info; 3304 3305 ret = btrfs_find_all_leafs(&ctx); 3306 if (ret < 0) 3307 return ret; 3308 3309 ULIST_ITER_INIT(&leaf_uiter); 3310 while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { 3311 struct btrfs_tree_parent_check check = { 0 }; 3312 struct extent_buffer *eb; 3313 3314 eb = read_tree_block(ctx.fs_info, ref_node->val, &check); 3315 if (IS_ERR(eb)) { 3316 ret = PTR_ERR(eb); 3317 break; 3318 } 3319 ret = delete_v1_space_cache(eb, rc->block_group, 3320 extent_key->objectid); 3321 free_extent_buffer(eb); 3322 if (ret < 0) 3323 break; 3324 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3325 if (ret < 0) 3326 break; 3327 } 3328 if (ret < 0) 3329 free_block_list(blocks); 3330 ulist_free(ctx.refs); 3331 return ret; 3332 } 3333 3334 /* 3335 * helper to find next unprocessed extent 3336 */ 3337 static noinline_for_stack 3338 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3339 struct btrfs_key *extent_key) 3340 { 3341 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3342 struct btrfs_key key; 3343 struct extent_buffer *leaf; 3344 u64 start, end, last; 3345 int ret; 3346 3347 last = rc->block_group->start + 
rc->block_group->length; 3348 while (1) { 3349 bool block_found; 3350 3351 cond_resched(); 3352 if (rc->search_start >= last) { 3353 ret = 1; 3354 break; 3355 } 3356 3357 key.objectid = rc->search_start; 3358 key.type = BTRFS_EXTENT_ITEM_KEY; 3359 key.offset = 0; 3360 3361 path->search_commit_root = true; 3362 path->skip_locking = true; 3363 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3364 0, 0); 3365 if (ret < 0) 3366 break; 3367 next: 3368 leaf = path->nodes[0]; 3369 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3370 ret = btrfs_next_leaf(rc->extent_root, path); 3371 if (ret != 0) 3372 break; 3373 leaf = path->nodes[0]; 3374 } 3375 3376 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3377 if (key.objectid >= last) { 3378 ret = 1; 3379 break; 3380 } 3381 3382 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3383 key.type != BTRFS_METADATA_ITEM_KEY) { 3384 path->slots[0]++; 3385 goto next; 3386 } 3387 3388 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3389 key.objectid + key.offset <= rc->search_start) { 3390 path->slots[0]++; 3391 goto next; 3392 } 3393 3394 if (key.type == BTRFS_METADATA_ITEM_KEY && 3395 key.objectid + fs_info->nodesize <= 3396 rc->search_start) { 3397 path->slots[0]++; 3398 goto next; 3399 } 3400 3401 block_found = btrfs_find_first_extent_bit(&rc->processed_blocks, 3402 key.objectid, &start, &end, 3403 EXTENT_DIRTY, NULL); 3404 3405 if (block_found && start <= key.objectid) { 3406 btrfs_release_path(path); 3407 rc->search_start = end + 1; 3408 } else { 3409 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3410 rc->search_start = key.objectid + key.offset; 3411 else 3412 rc->search_start = key.objectid + 3413 fs_info->nodesize; 3414 memcpy(extent_key, &key, sizeof(key)); 3415 return 0; 3416 } 3417 } 3418 btrfs_release_path(path); 3419 return ret; 3420 } 3421 3422 static void set_reloc_control(struct reloc_control *rc) 3423 { 3424 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3425 3426 mutex_lock(&fs_info->reloc_mutex); 3427 fs_info->reloc_ctl = rc; 3428 mutex_unlock(&fs_info->reloc_mutex); 3429 } 3430 3431 static void unset_reloc_control(struct reloc_control *rc) 3432 { 3433 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3434 3435 mutex_lock(&fs_info->reloc_mutex); 3436 fs_info->reloc_ctl = NULL; 3437 mutex_unlock(&fs_info->reloc_mutex); 3438 } 3439 3440 static noinline_for_stack 3441 int prepare_to_relocate(struct reloc_control *rc) 3442 { 3443 struct btrfs_trans_handle *trans; 3444 int ret; 3445 3446 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3447 BTRFS_BLOCK_RSV_TEMP); 3448 if (!rc->block_rsv) 3449 return -ENOMEM; 3450 3451 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3452 rc->search_start = rc->block_group->start; 3453 rc->extents_found = 0; 3454 rc->nodes_relocated = 0; 3455 rc->merging_rsv_size = 0; 3456 rc->reserved_bytes = 0; 3457 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3458 RELOCATION_RESERVED_NODES; 3459 ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, 3460 rc->block_rsv, rc->block_rsv->size, 3461 BTRFS_RESERVE_FLUSH_ALL); 3462 if (ret) 3463 return ret; 3464 3465 rc->create_reloc_tree = true; 3466 set_reloc_control(rc); 3467 3468 trans = btrfs_join_transaction(rc->extent_root); 3469 if (IS_ERR(trans)) { 3470 unset_reloc_control(rc); 3471 /* 3472 * extent tree is not a ref_cow tree and has no reloc_root to 3473 * cleanup. And callers are responsible to free the above 3474 * block rsv. 
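 *
 * (For example, relocate_block_group() frees rc->block_rsv itself on
 * this error; there is no reloc root teardown to do because the
 * extent root, being COW-only, never gets one.)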
3475 */ 3476 return PTR_ERR(trans); 3477 } 3478 3479 ret = btrfs_commit_transaction(trans); 3480 if (ret) 3481 unset_reloc_control(rc); 3482 3483 return ret; 3484 } 3485 3486 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3487 { 3488 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3489 struct rb_root blocks = RB_ROOT; 3490 struct btrfs_key key; 3491 struct btrfs_trans_handle *trans = NULL; 3492 BTRFS_PATH_AUTO_FREE(path); 3493 struct btrfs_extent_item *ei; 3494 u64 flags; 3495 int ret; 3496 int err = 0; 3497 int progress = 0; 3498 3499 path = btrfs_alloc_path(); 3500 if (!path) 3501 return -ENOMEM; 3502 path->reada = READA_FORWARD; 3503 3504 ret = prepare_to_relocate(rc); 3505 if (ret) { 3506 err = ret; 3507 goto out_free; 3508 } 3509 3510 while (1) { 3511 rc->reserved_bytes = 0; 3512 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 3513 rc->block_rsv->size, 3514 BTRFS_RESERVE_FLUSH_ALL); 3515 if (ret) { 3516 err = ret; 3517 break; 3518 } 3519 progress++; 3520 trans = btrfs_start_transaction(rc->extent_root, 0); 3521 if (IS_ERR(trans)) { 3522 err = PTR_ERR(trans); 3523 trans = NULL; 3524 break; 3525 } 3526 restart: 3527 if (rc->backref_cache.last_trans != trans->transid) 3528 btrfs_backref_release_cache(&rc->backref_cache); 3529 rc->backref_cache.last_trans = trans->transid; 3530 3531 ret = find_next_extent(rc, path, &key); 3532 if (ret < 0) 3533 err = ret; 3534 if (ret != 0) 3535 break; 3536 3537 rc->extents_found++; 3538 3539 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3540 struct btrfs_extent_item); 3541 flags = btrfs_extent_flags(path->nodes[0], ei); 3542 3543 /* 3544 * If we are relocating a simple quota owned extent item, we 3545 * need to note the owner on the reloc data root so that when 3546 * we allocate the replacement item, we can attribute it to the 3547 * correct eventual owner (rather than the reloc data root). 
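 *
 * E.g. (hypothetical ids): if the extent item records owning root
 * 257, we note 257 in root->relocation_src_root so the replacement
 * extent is attributed to qgroup 257 rather than to the data reloc
 * tree itself.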
3548 */ 3549 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { 3550 struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; 3551 u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, 3552 path->nodes[0], 3553 path->slots[0]); 3554 3555 root->relocation_src_root = owning_root_id; 3556 } 3557 3558 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3559 ret = add_tree_block(rc, &key, path, &blocks); 3560 } else if (rc->stage == UPDATE_DATA_PTRS && 3561 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3562 ret = add_data_references(rc, &key, path, &blocks); 3563 } else { 3564 btrfs_release_path(path); 3565 ret = 0; 3566 } 3567 if (ret < 0) { 3568 err = ret; 3569 break; 3570 } 3571 3572 if (!RB_EMPTY_ROOT(&blocks)) { 3573 ret = relocate_tree_blocks(trans, rc, &blocks); 3574 if (ret < 0) { 3575 if (ret != -EAGAIN) { 3576 err = ret; 3577 break; 3578 } 3579 rc->extents_found--; 3580 rc->search_start = key.objectid; 3581 } 3582 } 3583 3584 btrfs_end_transaction_throttle(trans); 3585 btrfs_btree_balance_dirty(fs_info); 3586 trans = NULL; 3587 3588 if (rc->stage == MOVE_DATA_EXTENTS && 3589 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3590 rc->found_file_extent = true; 3591 ret = relocate_data_extent(rc, &key); 3592 if (ret < 0) { 3593 err = ret; 3594 break; 3595 } 3596 } 3597 if (btrfs_should_cancel_balance(fs_info)) { 3598 err = -ECANCELED; 3599 break; 3600 } 3601 } 3602 if (trans && progress && err == -ENOSPC) { 3603 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3604 if (ret == 1) { 3605 err = 0; 3606 progress = 0; 3607 goto restart; 3608 } 3609 } 3610 3611 btrfs_release_path(path); 3612 btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL); 3613 3614 if (trans) { 3615 btrfs_end_transaction_throttle(trans); 3616 btrfs_btree_balance_dirty(fs_info); 3617 } 3618 3619 if (!err) { 3620 ret = relocate_file_extent_cluster(rc); 3621 if (ret < 0) 3622 err = ret; 3623 } 3624 3625 rc->create_reloc_tree = false; 3626 set_reloc_control(rc); 3627 3628 btrfs_backref_release_cache(&rc->backref_cache); 3629 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3630 3631 /* 3632 * Even in the case when the relocation is cancelled, we should all go 3633 * through prepare_to_merge() and merge_reloc_roots(). 
3634 * 3635 * For error (including cancelled balance), prepare_to_merge() will 3636 * mark all reloc trees orphan, then queue them for cleanup in 3637 * merge_reloc_roots() 3638 */ 3639 err = prepare_to_merge(rc, err); 3640 3641 merge_reloc_roots(rc); 3642 3643 rc->merge_reloc_tree = false; 3644 unset_reloc_control(rc); 3645 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3646 3647 /* get rid of pinned extents */ 3648 trans = btrfs_join_transaction(rc->extent_root); 3649 if (IS_ERR(trans)) { 3650 err = PTR_ERR(trans); 3651 goto out_free; 3652 } 3653 ret = btrfs_commit_transaction(trans); 3654 if (ret && !err) 3655 err = ret; 3656 out_free: 3657 ret = clean_dirty_subvols(rc); 3658 if (ret < 0 && !err) 3659 err = ret; 3660 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3661 return err; 3662 } 3663 3664 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3665 struct btrfs_root *root, u64 objectid) 3666 { 3667 BTRFS_PATH_AUTO_FREE(path); 3668 struct btrfs_inode_item *item; 3669 struct extent_buffer *leaf; 3670 int ret; 3671 3672 path = btrfs_alloc_path(); 3673 if (!path) 3674 return -ENOMEM; 3675 3676 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3677 if (ret) 3678 return ret; 3679 3680 leaf = path->nodes[0]; 3681 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3682 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3683 btrfs_set_inode_generation(leaf, item, 1); 3684 btrfs_set_inode_size(leaf, item, 0); 3685 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3686 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3687 BTRFS_INODE_PREALLOC); 3688 return 0; 3689 } 3690 3691 static void delete_orphan_inode(struct btrfs_trans_handle *trans, 3692 struct btrfs_root *root, u64 objectid) 3693 { 3694 BTRFS_PATH_AUTO_FREE(path); 3695 struct btrfs_key key; 3696 int ret = 0; 3697 3698 path = btrfs_alloc_path(); 3699 if (!path) { 3700 ret = -ENOMEM; 3701 goto out; 3702 } 3703 3704 key.objectid = objectid; 3705 key.type = BTRFS_INODE_ITEM_KEY; 3706 key.offset = 0; 3707 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3708 if (ret) { 3709 if (ret > 0) 3710 ret = -ENOENT; 3711 goto out; 3712 } 3713 ret = btrfs_del_item(trans, root, path); 3714 out: 3715 if (ret) 3716 btrfs_abort_transaction(trans, ret); 3717 } 3718 3719 /* 3720 * helper to create inode for data relocation. 
3721 * the inode is in data relocation tree and its link count is 0 3722 */ 3723 static noinline_for_stack struct inode *create_reloc_inode( 3724 const struct btrfs_block_group *group) 3725 { 3726 struct btrfs_fs_info *fs_info = group->fs_info; 3727 struct btrfs_inode *inode = NULL; 3728 struct btrfs_trans_handle *trans; 3729 struct btrfs_root *root; 3730 u64 objectid; 3731 int ret = 0; 3732 3733 root = btrfs_grab_root(fs_info->data_reloc_root); 3734 trans = btrfs_start_transaction(root, 6); 3735 if (IS_ERR(trans)) { 3736 btrfs_put_root(root); 3737 return ERR_CAST(trans); 3738 } 3739 3740 ret = btrfs_get_free_objectid(root, &objectid); 3741 if (ret) 3742 goto out; 3743 3744 ret = __insert_orphan_inode(trans, root, objectid); 3745 if (ret) 3746 goto out; 3747 3748 inode = btrfs_iget(objectid, root); 3749 if (IS_ERR(inode)) { 3750 delete_orphan_inode(trans, root, objectid); 3751 ret = PTR_ERR(inode); 3752 inode = NULL; 3753 goto out; 3754 } 3755 inode->reloc_block_group_start = group->start; 3756 3757 ret = btrfs_orphan_add(trans, inode); 3758 out: 3759 btrfs_put_root(root); 3760 btrfs_end_transaction(trans); 3761 btrfs_btree_balance_dirty(fs_info); 3762 if (ret) { 3763 if (inode) 3764 iput(&inode->vfs_inode); 3765 return ERR_PTR(ret); 3766 } 3767 return &inode->vfs_inode; 3768 } 3769 3770 /* 3771 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3772 * has been requested meanwhile and don't start in that case. 3773 * NOTE: if this returns an error, reloc_chunk_end() must not be called. 3774 * 3775 * Return: 3776 * 0 success 3777 * -EINPROGRESS operation is already in progress, that's probably a bug 3778 * -ECANCELED cancellation request was set before the operation started 3779 */ 3780 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3781 { 3782 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3783 /* This should not happen */ 3784 btrfs_err(fs_info, "reloc already running, cannot start"); 3785 return -EINPROGRESS; 3786 } 3787 3788 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3789 btrfs_info(fs_info, "chunk relocation canceled on start"); 3790 /* On cancel, clear all requests. */ 3791 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3792 atomic_set(&fs_info->reloc_cancel_req, 0); 3793 return -ECANCELED; 3794 } 3795 return 0; 3796 } 3797 3798 /* 3799 * Mark end of chunk relocation that is cancellable and wake any waiters. 3800 * NOTE: call only if a previous call to reloc_chunk_start() succeeded. 
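 *
 * Sketch of the expected pairing (cf. btrfs_relocate_block_group()):
 *
 *	ret = reloc_chunk_start(fs_info);
 *	if (ret < 0)
 *		goto out_put_bg;	(skips reloc_chunk_end())
 *	...
 *	reloc_chunk_end(fs_info);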
3801 */ 3802 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3803 { 3804 ASSERT(test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)); 3805 /* Requested after start, clear bit first so any waiters can continue */ 3806 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3807 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3808 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3809 atomic_set(&fs_info->reloc_cancel_req, 0); 3810 } 3811 3812 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3813 { 3814 struct reloc_control *rc; 3815 3816 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3817 if (!rc) 3818 return NULL; 3819 3820 INIT_LIST_HEAD(&rc->reloc_roots); 3821 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 3822 btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); 3823 rc->reloc_root_tree.rb_root = RB_ROOT; 3824 spin_lock_init(&rc->reloc_root_tree.lock); 3825 btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); 3826 return rc; 3827 } 3828 3829 static void free_reloc_control(struct reloc_control *rc) 3830 { 3831 struct mapping_node *node, *tmp; 3832 3833 free_reloc_roots(&rc->reloc_roots); 3834 rbtree_postorder_for_each_entry_safe(node, tmp, 3835 &rc->reloc_root_tree.rb_root, rb_node) 3836 kfree(node); 3837 3838 kfree(rc); 3839 } 3840 3841 /* 3842 * Print the block group being relocated 3843 */ 3844 static void describe_relocation(struct btrfs_block_group *block_group) 3845 { 3846 char buf[128] = "NONE"; 3847 3848 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 3849 3850 btrfs_info(block_group->fs_info, "relocating block group %llu flags %s", 3851 block_group->start, buf); 3852 } 3853 3854 static const char *stage_to_string(enum reloc_stage stage) 3855 { 3856 if (stage == MOVE_DATA_EXTENTS) 3857 return "move data extents"; 3858 if (stage == UPDATE_DATA_PTRS) 3859 return "update data pointers"; 3860 return "unknown"; 3861 } 3862 3863 /* 3864 * function to relocate all extents in a block group. 3865 */ 3866 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, 3867 bool verbose) 3868 { 3869 struct btrfs_block_group *bg; 3870 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); 3871 struct reloc_control *rc; 3872 struct inode *inode; 3873 struct btrfs_path *path; 3874 int ret; 3875 bool bg_is_ro = false; 3876 3877 /* 3878 * This only gets set if we had a half-deleted snapshot on mount. We 3879 * cannot allow relocation to start while we're still trying to clean up 3880 * these pending deletions. 3881 */ 3882 ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); 3883 if (ret) 3884 return ret; 3885 3886 /* We may have been woken up by close_ctree, so bail if we're closing. */ 3887 if (btrfs_fs_closing(fs_info)) 3888 return -EINTR; 3889 3890 bg = btrfs_lookup_block_group(fs_info, group_start); 3891 if (!bg) 3892 return -ENOENT; 3893 3894 /* 3895 * Relocation of a data block group creates ordered extents. Without 3896 * sb_start_write(), we can freeze the filesystem while unfinished 3897 * ordered extents are left. Such ordered extents can cause a deadlock 3898 * e.g. when syncfs() is waiting for their completion but they can't 3899 * finish because they block when joining a transaction, due to the 3900 * fact that the freeze locks are being held in write mode. 
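 *
 * Callers that relocate a data block group (e.g. balance via the
 * ioctl path) are expected to already hold sb_start_write(); the
 * ASSERT below checks exactly that.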
3901 */ 3902 if (bg->flags & BTRFS_BLOCK_GROUP_DATA) 3903 ASSERT(sb_write_started(fs_info->sb)); 3904 3905 if (btrfs_pinned_by_swapfile(fs_info, bg)) { 3906 btrfs_put_block_group(bg); 3907 return -ETXTBSY; 3908 } 3909 3910 rc = alloc_reloc_control(fs_info); 3911 if (!rc) { 3912 btrfs_put_block_group(bg); 3913 return -ENOMEM; 3914 } 3915 3916 ret = reloc_chunk_start(fs_info); 3917 if (ret < 0) 3918 goto out_put_bg; 3919 3920 rc->extent_root = extent_root; 3921 rc->block_group = bg; 3922 3923 ret = btrfs_inc_block_group_ro(rc->block_group, true); 3924 if (ret) 3925 goto out; 3926 bg_is_ro = true; 3927 3928 path = btrfs_alloc_path(); 3929 if (!path) { 3930 ret = -ENOMEM; 3931 goto out; 3932 } 3933 3934 inode = lookup_free_space_inode(rc->block_group, path); 3935 btrfs_free_path(path); 3936 3937 if (!IS_ERR(inode)) 3938 ret = delete_block_group_cache(rc->block_group, inode, 0); 3939 else 3940 ret = PTR_ERR(inode); 3941 3942 if (ret && ret != -ENOENT) 3943 goto out; 3944 3945 rc->data_inode = create_reloc_inode(rc->block_group); 3946 if (IS_ERR(rc->data_inode)) { 3947 ret = PTR_ERR(rc->data_inode); 3948 rc->data_inode = NULL; 3949 goto out; 3950 } 3951 3952 if (verbose) 3953 describe_relocation(rc->block_group); 3954 3955 btrfs_wait_block_group_reservations(rc->block_group); 3956 btrfs_wait_nocow_writers(rc->block_group); 3957 btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group); 3958 3959 ret = btrfs_zone_finish(rc->block_group); 3960 WARN_ON(ret && ret != -EAGAIN); 3961 3962 while (1) { 3963 enum reloc_stage finishes_stage; 3964 3965 mutex_lock(&fs_info->cleaner_mutex); 3966 ret = relocate_block_group(rc); 3967 mutex_unlock(&fs_info->cleaner_mutex); 3968 3969 finishes_stage = rc->stage; 3970 /* 3971 * We may have gotten ENOSPC after we already dirtied some 3972 * extents. If writeout happens while we're relocating a 3973 * different block group we could end up hitting the 3974 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 3975 * btrfs_reloc_cow_block. Make sure we write everything out 3976 * properly so we don't trip over this problem, and then break 3977 * out of the loop if we hit an error. 
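 *
 * The overall shape of the relocation passes driven by this loop:
 *
 *	MOVE_DATA_EXTENTS	copy data, relocate tree blocks
 *	  -> wait for ordered extents, invalidate the mapping
 *	UPDATE_DATA_PTRS	rewrite references to point at the copies
 *
 * The loop exits once a pass finds no extents at all.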

static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	btrfs_set_root_drop_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}
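
/*
 * Note (editorial summary): after mark_garbage_root() the reloc root's
 * root_item has zero refs and reset drop progress, so the recovery code
 * below no longer tries to re-attach it to a fs tree; the orphaned reloc
 * root is queued on rc->reloc_roots and is eventually dropped instead of
 * merged.
 */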

/*
 * Recover relocation interrupted by a system crash.
 *
 * This function resumes merging reloc trees with their corresponding fs
 * trees, which is important for preserving the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret2;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		ret = 0;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
		if (IS_ERR(reloc_root)) {
			ret = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT)
					goto out;
				ret = mark_garbage_root(reloc_root);
				if (ret < 0)
					goto out;
				ret = 0;
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0)
		goto out_end;

	rc->extent_root = btrfs_extent_root(fs_info, 0);

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			ret = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		ret = __add_reloc_root(reloc_root);
		ASSERT(ret != -EEXIST);
		if (ret) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	ret = btrfs_commit_transaction(trans);
	if (ret)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_clean;
	}
	ret = btrfs_commit_transaction(trans);
out_clean:
	ret2 = clean_dirty_subvols(rc);
	if (ret2 < 0 && !ret)
		ret = ret2;
out_unset:
	unset_reloc_control(rc);
	reloc_chunk_end(fs_info);
out_end:
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (ret == 0) {
		/* Cleanup orphan inode in the data relocation tree. */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		ret = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return ret;
}
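
/*
 * Editorial note: this recovery path is invoked during mount, when root
 * items with objectid BTRFS_TREE_RELOC_OBJECTID survive in the tree root,
 * i.e. a relocation was interrupted by a crash or forced shutdown.  The
 * exact call site lives in the mount code (disk-io.c) and may differ
 * between kernel versions.
 */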

/*
 * Helper to add ordered checksums for data relocation.
 *
 * Cloning the checksums properly handles the nodatasum extents, and it
 * also saves the CPU time that would be spent recalculating them.
 */
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
	LIST_HEAD(list);
	int ret;

	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
				      disk_bytenr + ordered->num_bytes - 1,
				      &list, false);
	if (ret < 0) {
		btrfs_mark_ordered_extent_error(ordered);
		return ret;
	}

	while (!list_empty(&list)) {
		struct btrfs_ordered_sum *sums =
			list_first_entry(&list, struct btrfs_ordered_sum, list);

		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
		btrfs_add_ordered_sum(ordered, sums);
	}

	return 0;
}
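
/*
 * Worked example of the remap above (illustrative numbers): suppose the
 * extent's old location gives disk_bytenr == 1M, the relocated copy was
 * written at ordered->disk_bytenr == 5M, and a csum item covers bytes
 * starting at the old logical 1M + 64K.  Then:
 *
 *	sums->logical = 5M + (1M + 64K) - 1M = 5M + 64K
 *
 * i.e. each csum keeps its offset within the extent, rebased onto the
 * new disk location.
 */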

int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  const struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];

		/*
		 * If node->bytenr != buf->start and node->new_bytenr !=
		 * buf->start then we've got the wrong backref node for what we
		 * expected to see here and the cache is incorrect.
		 */
		if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
			btrfs_err(fs_info,
"bytenr %llu was found but our backref cache was expecting %llu or %llu",
				  buf->start, node->bytenr, node->new_bytenr);
			return -EUCLEAN;
		}

		btrfs_backref_drop_node_buffer(node);
		refcount_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * Called before creating a snapshot. It calculates the metadata
 * reservation required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * Relocation is in the stage of merging trees. The space used by
	 * merging a reloc tree is twice the size of the relocated tree nodes
	 * in the worst case: half for COWing the reloc tree, half for COWing
	 * the fs tree. The space used by COWing the reloc tree will be freed
	 * after the tree is dropped. If we create a snapshot, COWing the fs
	 * tree may use more space than it frees, so we need to reserve extra
	 * space.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}

/*
 * Called after a snapshot is created. It migrates the block reservation
 * and creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(), we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	new_root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}
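
/*
 * Worked example of the two hooks above (illustrative numbers): with
 * rc->nodes_relocated == 1M, btrfs_reloc_pre_snapshot() makes snapshot
 * creation reserve an extra 1M, covering the worst case where COWing the
 * fs tree during the merge frees less space than it uses.
 * btrfs_reloc_post_snapshot() then moves that 1M from the snapshot's
 * block_rsv into rc->block_rsv and bumps rc->merging_rsv_size, so the
 * merge of the new snapshot's reloc tree is paid for up front.
 */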

/*
 * Get the current bytenr for the block group which is being relocated.
 *
 * Return U64_MAX if no relocation is running.
 */
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
	u64 logical = U64_MAX;

	lockdep_assert_held(&fs_info->reloc_mutex);

	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
		logical = fs_info->reloc_ctl->block_group->start;
	return logical;
}
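
/*
 * Illustrative usage sketch (hypothetical caller; not a function in this
 * file): the helper must be called with fs_info->reloc_mutex held, as the
 * lockdep assertion above enforces, e.g.:
 *
 *	u64 logical;
 *
 *	mutex_lock(&fs_info->reloc_mutex);
 *	logical = btrfs_get_reloc_bg_bytenr(fs_info);
 *	mutex_unlock(&fs_info->reloc_mutex);
 *	if (logical != U64_MAX)
 *		// a relocation is running, its block group starts at @logical
 */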