// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space compaction,
 * and the balance routine to spread chunks over devices.
 *
 *		Before		|		After
 * ------------------------------------------------------------------
 * BG A: 10 data extents	| BG A: deleted
 * BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 * BG C:  1 data extent		| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will get
 *      its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after the swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
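 *
 * As a usage illustration (a hedged sketch, not part of this file; the exact
 * call chain lives in the balance code, e.g. volumes.c), relocation is
 * typically entered through the balance ioctl, from userspace something like:
 *
 *   btrfs balance start -dconvert=raid1 <mount>
 *
 * which selects block groups by the given filters and eventually calls
 * btrfs_relocate_block_group() -> relocate_block_group() for each of them.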
 */

#define RELOCATION_RESERVED_NODES	256
/*
 * Map address of tree root to tree.
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};

/*
 * Represent a tree block to process.
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	u64 owner;
	struct btrfs_key key;
	u8 level;
	bool key_ready;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
	u64 owning_root;
};

/* Stages of data relocation. */
enum reloc_stage {
	MOVE_DATA_EXTENTS,
	UPDATE_DATA_PTRS
};

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks that have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	enum reloc_stage stage;
	bool create_reloc_tree;
	bool merge_reloc_tree;
	bool found_file_extent;
};

static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		set_extent_bit(&rc->processed_blocks, node->bytenr,
			       node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
	}
	node->processed = 1;
}

/*
 * Walk up backref nodes until we reach a node that represents a tree root.
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * Walk down backref nodes to find the start of the next reference path.
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}

static void update_backref_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;

	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (rb_node)
		btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
}

/*
 * Update the backref cache after a transaction commit.
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int level = 0;

	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * Detached nodes are used to avoid unnecessary backref lookup.
	 * A transaction commit changes the extent tree, so the detached nodes
	 * are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		btrfs_backref_cleanup_node(cache, node);
	}

	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct btrfs_backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * Some nodes can be left in the pending list if there were errors
	 * during processing of the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}

static bool reloc_root_is_dead(const struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols() and
	 * btrfs_update_reloc_root(). We need to see the updated bit before
	 * trying to access reloc_root.
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after swap is considered dead, thus not considered valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root. But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(const struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return false;

	/* This root has been merged with its reloc tree, we can ignore it. */
	if (reloc_root_is_dead(root))
		return true;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return false;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return false;
	/*
	 * If there is a reloc tree and it was created in the previous
	 * transaction, backref lookup can find the reloc tree, so the backref
	 * node for the fs tree root is useless for relocation.
	 */
	return true;
}

/*
 * Find the reloc tree by the address of the tree root.
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

/*
 * For useless nodes, do two major cleanups:
 *
 * - Cleanup the children edges and nodes
 *   If a child node is also an orphan (no parent) during cleanup, then the
 *   child node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeps nodes detached
 *   For nodes, the node is still cached as "detached".
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				       list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes. */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* The node is the lowest node. */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges. */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					  struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also an orphan, queue it for cleanup. */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation. */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build a backref tree for a given tree block. The root of the backref tree
 * corresponds to the tree block, and the leaves of the backref tree
 * correspond to the roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when a tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs for
 * all upper level blocks that directly/indirectly reference the block are also
 * cached.
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct btrfs_trans_handle *trans,
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF. */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		ret = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build the backref cache. */
	do {
		ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
						  node_key, cur);
		if (ret < 0)
			goto out;

		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process.
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes. */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0)
		goto out;

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_free_path(iter->path);
	kfree(iter);
	btrfs_free_path(path);
	if (ret) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(ret);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}

/*
 * Helper to add a backref node for the newly created snapshot. The backref
 * node is created by cloning the backref node that corresponds to the root
 * of the source tree.
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      const struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = btrfs_backref_alloc_edge(cache);
			if (!new_edge)
				goto fail;

			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
						new_node, LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		btrfs_backref_free_edge(cache, new_edge);
	}
	btrfs_backref_free_node(cache, new_node);
	return -ENOMEM;
}

/*
 * Helper to add an 'address of tree root -> reloc tree' mapping.
 */
static int __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			  "Duplicate root found for start=%llu while inserting into relocation tree",
			  node->bytenr);
		kfree(node);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * Helper to delete the 'address of tree root -> reloc tree' mapping.
 */
static void __del_reloc_root(struct btrfs_root *root)
{
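	/*
	 * Drop the rb tree mapping for this root first; whether the list
	 * reference is also dropped is decided under trans_lock below.
	 */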
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list. There are a
	 * lot of places where the pattern is to splice rc->reloc_roots,
	 * process the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots. If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because whoever is
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * Helper to update the 'address of tree root -> reloc tree' mapping.
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (btrfs_root_id(root) == objectid) {
		u64 commit_root_gen;

		/* Called by btrfs_init_reloc_root(). */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) or when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * Called by btrfs_reloc_post_snapshot_hook(). The source tree
		 * is a reloc tree, and all tree blocks modified after it was
		 * created have the RELOC flag set in their headers, so it's OK
		 * to not update 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (btrfs_root_id(root) == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	btrfs_set_root_last_trans(reloc_root, trans->transid);
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}

/*
 * Create a reloc tree for a given fs tree. The reloc tree is just a snapshot
 * of the fs tree with a special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has a reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree.
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important. We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for
	 * the reloc root. This means that we have to do this for the entire
	 * life of the reloc root, regardless of which stage of the relocation
	 * we are in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees. Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, btrfs_root_id(root));
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * Update the root item of the reloc tree.
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root. We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until the current relocation finishes. */
	if (fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * Get the new location of data.
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	bytenr -= BTRFS_I(reloc_inode)->reloc_block_group_start;
	ret = btrfs_lookup_file_extent(NULL, root, path,
				       btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Update file extent items in the tree leaf to point to the new locations.
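 *
 * A minimal sketch of the mapping, following get_new_location() above: the
 * data reloc inode holds the relocated copy of the whole block group, so for
 * an old disk_bytenr the new location is simply the disk_bytenr of the file
 * extent found at file offset (old disk_bytenr - reloc_block_group_start).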
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* Reloc trees always use full backref. */
	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * If we are modifying a block in the fs tree, wait for
		 * read_folio to complete and drop the extent cache.
		 */
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = btrfs_find_first_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(inode) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = btrfs_find_first_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(inode) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				/* Take mmap lock to serialize with reflinks. */
				if (!down_read_trylock(&inode->i_mmap_lock))
					continue;
				ret = try_lock_extent(&inode->io_tree, key.offset,
						      end, &cached_state);
				if (!ret) {
					up_read(&inode->i_mmap_lock);
					continue;
				}

				btrfs_drop_extent_map_range(inode, key.offset, end, true);
				unlock_extent(&inode->io_tree, key.offset, end,
					      &cached_state);
				up_read(&inode->i_mmap_lock);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = bytenr;
		ref.num_bytes = num_bytes;
		ref.parent = parent;
		ref.owning_root = btrfs_root_id(root);
		ref.ref_root = btrfs_header_owner(leaf);
		btrfs_init_data_ref(&ref, key.objectid, key.offset,
				    btrfs_root_id(root), false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(trans, leaf);
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
					       int slot, const struct btrfs_path *path,
					       int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;

	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * Try to replace tree blocks in the fs tree with the new blocks in the reloc
 * tree. Tree blocks that haven't been modified since the reloc tree was
 * created can be replaced.
 *
 * If a block was replaced, the level of the block + 1 is returned. If no
 * block was replaced, 0 is returned. If there are other errors, a negative
 * error number is returned.
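 *
 * Note the ref bookkeeping in the swap below: after the blockptr/generation
 * pair is exchanged between the two trees, four delayed refs are queued so
 * that each tree adds a ref for the block it received and drops a ref for
 * the block it gave away.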
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, 0, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							 path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Inform qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * Swap blocks in the fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(trans, parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(trans, path->nodes[level]);

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = btrfs_root_id(src);
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		ref.action = BTRFS_ADD_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = btrfs_root_id(dest);
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = new_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = path->nodes[level]->start;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(src);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		/* We don't know the real owning_root, use 0. */
		ref.action = BTRFS_DROP_DELAYED_REF;
		ref.bytenr = old_bytenr;
		ref.num_bytes = blocksize;
		ref.parent = 0;
		ref.owning_root = 0;
		ref.ref_root = btrfs_root_id(dest);
		btrfs_init_tree_ref(&ref, level - 1, 0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * Helper to find the next relocated block in the reloc tree.
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * Walk down the reloc tree to find the relocated block at the lowest level.
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * Invalidate the extent cache for file extents whose keys are in the range
 * [min_key, max_key).
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   const struct btrfs_key *min_key,
				   const struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		if (inode)
			iput(&inode->vfs_inode);

		if (objectid > max_key->objectid)
			break;

		inode = btrfs_find_first_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(inode);

		if (ino > max_key->objectid) {
			iput(&inode->vfs_inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->vfs_inode.i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* lock_extent() waits for read_folio to complete. */
		lock_extent(&inode->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(inode, start, end, true);
		unlock_extent(&inode->io_tree, start, end, &cached_state);
	}
	return 0;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert the current subvolume into reloc_control::dirty_subvol_roots.
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree. */
	ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
	       sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}

static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root. */
			struct btrfs_root *reloc_root = root->reloc_root;

			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot() drops the ref we hold
				 * for ->reloc_root. If it fails, however, we
				 * must drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up. */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}

/*
 * Merge the relocated tree blocks in the reloc tree with the corresponding
 * fs tree.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		atomic_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between the reloc tree and the subvolume tree. Thus for
	 * tree block COW, we COW at most from level 1 to root level for each
	 * tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * multiplied by 2 since we have two trees to COW.
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list. At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		btrfs_set_root_last_trans(reloc_root, trans->transid);
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * Save the merging progress in the drop_progress.
		 * This is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * Handle the case where only one block, the tree root, in the fs tree
	 * needs to be relocated.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}

static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

	mutex_lock(&fs_info->reloc_mutex);
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_entry(rc->reloc_roots.next,
					struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
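		/*
		 * reloc_root->root_key.offset is the objectid of the fs tree
		 * this reloc tree was created for; the lookup above can still
		 * fail, e.g. on -ENOMEM or corrupted metadata.
		 */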
if (IS_ERR(root)) { 1874 /* 1875 * Even if we have an error we need this reloc root 1876 * back on our list so we can clean up properly. 1877 */ 1878 list_add(&reloc_root->root_list, &reloc_roots); 1879 btrfs_abort_transaction(trans, (int)PTR_ERR(root)); 1880 if (!err) 1881 err = PTR_ERR(root); 1882 break; 1883 } 1884 1885 if (unlikely(root->reloc_root != reloc_root)) { 1886 if (root->reloc_root) { 1887 btrfs_err(fs_info, 1888 "reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu", 1889 btrfs_root_id(root), 1890 btrfs_root_id(root->reloc_root), 1891 root->reloc_root->root_key.type, 1892 root->reloc_root->root_key.offset, 1893 btrfs_root_generation( 1894 &root->reloc_root->root_item), 1895 btrfs_root_id(reloc_root), 1896 reloc_root->root_key.type, 1897 reloc_root->root_key.offset, 1898 btrfs_root_generation( 1899 &reloc_root->root_item)); 1900 } else { 1901 btrfs_err(fs_info, 1902 "reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu", 1903 btrfs_root_id(root), 1904 btrfs_root_id(reloc_root), 1905 reloc_root->root_key.type, 1906 reloc_root->root_key.offset, 1907 btrfs_root_generation( 1908 &reloc_root->root_item)); 1909 } 1910 list_add(&reloc_root->root_list, &reloc_roots); 1911 btrfs_put_root(root); 1912 btrfs_abort_transaction(trans, -EUCLEAN); 1913 if (!err) 1914 err = -EUCLEAN; 1915 break; 1916 } 1917 1918 /* 1919 * set reference count to 1, so btrfs_recover_relocation 1920 * knows it should resumes merging 1921 */ 1922 if (!err) 1923 btrfs_set_root_refs(&reloc_root->root_item, 1); 1924 ret = btrfs_update_reloc_root(trans, root); 1925 1926 /* 1927 * Even if we have an error we need this reloc root back on our 1928 * list so we can clean up properly. 
1929 */ 1930 list_add(&reloc_root->root_list, &reloc_roots); 1931 btrfs_put_root(root); 1932 1933 if (ret) { 1934 btrfs_abort_transaction(trans, ret); 1935 if (!err) 1936 err = ret; 1937 break; 1938 } 1939 } 1940 1941 list_splice(&reloc_roots, &rc->reloc_roots); 1942 1943 if (!err) 1944 err = btrfs_commit_transaction(trans); 1945 else 1946 btrfs_end_transaction(trans); 1947 return err; 1948 } 1949 1950 static noinline_for_stack 1951 void free_reloc_roots(struct list_head *list) 1952 { 1953 struct btrfs_root *reloc_root, *tmp; 1954 1955 list_for_each_entry_safe(reloc_root, tmp, list, root_list) 1956 __del_reloc_root(reloc_root); 1957 } 1958 1959 static noinline_for_stack 1960 void merge_reloc_roots(struct reloc_control *rc) 1961 { 1962 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 1963 struct btrfs_root *root; 1964 struct btrfs_root *reloc_root; 1965 LIST_HEAD(reloc_roots); 1966 int found = 0; 1967 int ret = 0; 1968 again: 1969 root = rc->extent_root; 1970 1971 /* 1972 * this serializes us with btrfs_record_root_in_transaction, 1973 * we have to make sure nobody is in the middle of 1974 * adding their roots to the list while we are 1975 * doing this splice 1976 */ 1977 mutex_lock(&fs_info->reloc_mutex); 1978 list_splice_init(&rc->reloc_roots, &reloc_roots); 1979 mutex_unlock(&fs_info->reloc_mutex); 1980 1981 while (!list_empty(&reloc_roots)) { 1982 found = 1; 1983 reloc_root = list_entry(reloc_roots.next, 1984 struct btrfs_root, root_list); 1985 1986 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, 1987 false); 1988 if (btrfs_root_refs(&reloc_root->root_item) > 0) { 1989 if (WARN_ON(IS_ERR(root))) { 1990 /* 1991 * For recovery we read the fs roots on mount, 1992 * and if we didn't find the root then we marked 1993 * the reloc root as a garbage root. For normal 1994 * relocation obviously the root should exist in 1995 * memory. However there's no reason we can't 1996 * handle the error properly here just in case. 1997 */ 1998 ret = PTR_ERR(root); 1999 goto out; 2000 } 2001 if (WARN_ON(root->reloc_root != reloc_root)) { 2002 /* 2003 * This can happen if on-disk metadata has some 2004 * corruption, e.g. bad reloc tree key offset. 2005 */ 2006 ret = -EINVAL; 2007 goto out; 2008 } 2009 ret = merge_reloc_root(rc, root); 2010 btrfs_put_root(root); 2011 if (ret) { 2012 if (list_empty(&reloc_root->root_list)) 2013 list_add_tail(&reloc_root->root_list, 2014 &reloc_roots); 2015 goto out; 2016 } 2017 } else { 2018 if (!IS_ERR(root)) { 2019 if (root->reloc_root == reloc_root) { 2020 root->reloc_root = NULL; 2021 btrfs_put_root(reloc_root); 2022 } 2023 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, 2024 &root->state); 2025 btrfs_put_root(root); 2026 } 2027 2028 list_del_init(&reloc_root->root_list); 2029 /* Don't forget to queue this reloc root for cleanup */ 2030 list_add_tail(&reloc_root->reloc_dirty_list, 2031 &rc->dirty_subvol_roots); 2032 } 2033 } 2034 2035 if (found) { 2036 found = 0; 2037 goto again; 2038 } 2039 out: 2040 if (ret) { 2041 btrfs_handle_fs_error(fs_info, ret, NULL); 2042 free_reloc_roots(&reloc_roots); 2043 2044 /* new reloc root may be added */ 2045 mutex_lock(&fs_info->reloc_mutex); 2046 list_splice_init(&rc->reloc_roots, &reloc_roots); 2047 mutex_unlock(&fs_info->reloc_mutex); 2048 free_reloc_roots(&reloc_roots); 2049 } 2050 2051 /* 2052 * We used to have 2053 * 2054 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2055 * 2056 * here, but it's wrong. 
If we fail to start the transaction in 2057 * prepare_to_merge() we will have only 0 ref reloc roots, none of which 2058 * have actually been removed from the reloc_root_tree rb tree. This is 2059 * fine because we're bailing here, and we hold a reference on the root 2060 * for the list that holds it, so these roots will be cleaned up when we 2061 * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root 2062 * will be cleaned up on unmount. 2063 * 2064 * The remaining nodes will be cleaned up by free_reloc_control. 2065 */ 2066 } 2067 2068 static void free_block_list(struct rb_root *blocks) 2069 { 2070 struct tree_block *block; 2071 struct rb_node *rb_node; 2072 while ((rb_node = rb_first(blocks))) { 2073 block = rb_entry(rb_node, struct tree_block, rb_node); 2074 rb_erase(rb_node, blocks); 2075 kfree(block); 2076 } 2077 } 2078 2079 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, 2080 struct btrfs_root *reloc_root) 2081 { 2082 struct btrfs_fs_info *fs_info = reloc_root->fs_info; 2083 struct btrfs_root *root; 2084 int ret; 2085 2086 if (btrfs_get_root_last_trans(reloc_root) == trans->transid) 2087 return 0; 2088 2089 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); 2090 2091 /* 2092 * This should succeed, since we can't have a reloc root without having 2093 * already looked up the actual root and created the reloc root for this 2094 * root. 2095 * 2096 * However if there's some sort of corruption where we have a ref to a 2097 * reloc root without a corresponding root this could return ENOENT. 2098 */ 2099 if (IS_ERR(root)) { 2100 ASSERT(0); 2101 return PTR_ERR(root); 2102 } 2103 if (root->reloc_root != reloc_root) { 2104 ASSERT(0); 2105 btrfs_err(fs_info, 2106 "root %llu has two reloc roots associated with it", 2107 reloc_root->root_key.offset); 2108 btrfs_put_root(root); 2109 return -EUCLEAN; 2110 } 2111 ret = btrfs_record_root_in_trans(trans, root); 2112 btrfs_put_root(root); 2113 2114 return ret; 2115 } 2116 2117 static noinline_for_stack 2118 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2119 struct reloc_control *rc, 2120 struct btrfs_backref_node *node, 2121 struct btrfs_backref_edge *edges[]) 2122 { 2123 struct btrfs_backref_node *next; 2124 struct btrfs_root *root; 2125 int index = 0; 2126 int ret; 2127 2128 next = node; 2129 while (1) { 2130 cond_resched(); 2131 next = walk_up_backref(next, edges, &index); 2132 root = next->root; 2133 2134 /* 2135 * If there is no root, then our references for this block are 2136 * incomplete, as we should be able to walk all the way up to a 2137 * block that is owned by a root. 2138 * 2139 * This path is only for SHAREABLE roots, so if we come upon a 2140 * non-SHAREABLE root then we have backrefs that resolve 2141 * improperly. 2142 * 2143 * Both of these cases indicate file system corruption, or a bug 2144 * in the backref walking code. 
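 *
 * As an illustration (no extra state in the code), a healthy walk for a
 * shareable tree block looks like:
 *
 *   leaf -> node -> ... -> node whose root is a SHAREABLE subvolume tree
 *
 * A NULL root mid-chain means an extent backref is missing; resolving to
 * a non-SHAREABLE root (e.g. the extent tree) means a backref points
 * somewhere it never legitimately could.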
2145 */ 2146 if (!root) { 2147 ASSERT(0); 2148 btrfs_err(trans->fs_info, 2149 "bytenr %llu doesn't have a backref path ending in a root", 2150 node->bytenr); 2151 return ERR_PTR(-EUCLEAN); 2152 } 2153 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2154 ASSERT(0); 2155 btrfs_err(trans->fs_info, 2156 "bytenr %llu has multiple refs with one ending in a non-shareable root", 2157 node->bytenr); 2158 return ERR_PTR(-EUCLEAN); 2159 } 2160 2161 if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { 2162 ret = record_reloc_root_in_trans(trans, root); 2163 if (ret) 2164 return ERR_PTR(ret); 2165 break; 2166 } 2167 2168 ret = btrfs_record_root_in_trans(trans, root); 2169 if (ret) 2170 return ERR_PTR(ret); 2171 root = root->reloc_root; 2172 2173 /* 2174 * We could have raced with another thread which failed, so 2175 * root->reloc_root may not be set, return ENOENT in this case. 2176 */ 2177 if (!root) 2178 return ERR_PTR(-ENOENT); 2179 2180 if (next->new_bytenr != root->node->start) { 2181 /* 2182 * We just created the reloc root, so we shouldn't have 2183 * ->new_bytenr set and this shouldn't be in the changed 2184 * list. If it is then we have multiple roots pointing 2185 * at the same bytenr which indicates corruption, or 2186 * we've made a mistake in the backref walking code. 2187 */ 2188 ASSERT(next->new_bytenr == 0); 2189 ASSERT(list_empty(&next->list)); 2190 if (next->new_bytenr || !list_empty(&next->list)) { 2191 btrfs_err(trans->fs_info, 2192 "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", 2193 node->bytenr, next->bytenr); 2194 return ERR_PTR(-EUCLEAN); 2195 } 2196 2197 next->new_bytenr = root->node->start; 2198 btrfs_put_root(next->root); 2199 next->root = btrfs_grab_root(root); 2200 ASSERT(next->root); 2201 list_add_tail(&next->list, 2202 &rc->backref_cache.changed); 2203 mark_block_processed(rc, next); 2204 break; 2205 } 2206 2207 WARN_ON(1); 2208 root = NULL; 2209 next = walk_down_backref(edges, &index); 2210 if (!next || next->level <= node->level) 2211 break; 2212 } 2213 if (!root) { 2214 /* 2215 * This can happen if there's fs corruption or if there's a bug 2216 * in the backref lookup code. 2217 */ 2218 ASSERT(0); 2219 return ERR_PTR(-ENOENT); 2220 } 2221 2222 next = node; 2223 /* setup backref node path for btrfs_reloc_cow_block */ 2224 while (1) { 2225 rc->backref_cache.path[next->level] = next; 2226 if (--index < 0) 2227 break; 2228 next = edges[index]->node[UPPER]; 2229 } 2230 return root; 2231 } 2232 2233 /* 2234 * Select a tree root for relocation. 2235 * 2236 * Return NULL if the block is not shareable. We should use do_relocation() in 2237 * this case. 2238 * 2239 * Return a tree root pointer if the block is shareable. 2240 * Return -ENOENT if the block is root of reloc tree. 2241 */ 2242 static noinline_for_stack 2243 struct btrfs_root *select_one_root(struct btrfs_backref_node *node) 2244 { 2245 struct btrfs_backref_node *next; 2246 struct btrfs_root *root; 2247 struct btrfs_root *fs_root = NULL; 2248 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2249 int index = 0; 2250 2251 next = node; 2252 while (1) { 2253 cond_resched(); 2254 next = walk_up_backref(next, edges, &index); 2255 root = next->root; 2256 2257 /* 2258 * This can occur if we have incomplete extent refs leading all 2259 * the way up a particular path, in this case return -EUCLEAN. 
2260 */ 2261 if (!root) 2262 return ERR_PTR(-EUCLEAN); 2263 2264 /* No other choice for non-shareable tree */ 2265 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 2266 return root; 2267 2268 if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) 2269 fs_root = root; 2270 2271 if (next != node) 2272 return NULL; 2273 2274 next = walk_down_backref(edges, &index); 2275 if (!next || next->level <= node->level) 2276 break; 2277 } 2278 2279 if (!fs_root) 2280 return ERR_PTR(-ENOENT); 2281 return fs_root; 2282 } 2283 2284 static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, 2285 struct btrfs_backref_node *node) 2286 { 2287 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2288 struct btrfs_backref_node *next = node; 2289 struct btrfs_backref_edge *edge; 2290 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2291 u64 num_bytes = 0; 2292 int index = 0; 2293 2294 BUG_ON(node->processed); 2295 2296 while (next) { 2297 cond_resched(); 2298 while (1) { 2299 if (next->processed) 2300 break; 2301 2302 num_bytes += fs_info->nodesize; 2303 2304 if (list_empty(&next->upper)) 2305 break; 2306 2307 edge = list_entry(next->upper.next, 2308 struct btrfs_backref_edge, list[LOWER]); 2309 edges[index++] = edge; 2310 next = edge->node[UPPER]; 2311 } 2312 next = walk_down_backref(edges, &index); 2313 } 2314 return num_bytes; 2315 } 2316 2317 static int reserve_metadata_space(struct btrfs_trans_handle *trans, 2318 struct reloc_control *rc, 2319 struct btrfs_backref_node *node) 2320 { 2321 struct btrfs_root *root = rc->extent_root; 2322 struct btrfs_fs_info *fs_info = root->fs_info; 2323 u64 num_bytes; 2324 int ret; 2325 u64 tmp; 2326 2327 num_bytes = calcu_metadata_size(rc, node) * 2; 2328 2329 trans->block_rsv = rc->block_rsv; 2330 rc->reserved_bytes += num_bytes; 2331 2332 /* 2333 * We are under a transaction here so we can only do limited flushing. 2334 * If we get an enospc just kick back -EAGAIN so we know to drop the 2335 * transaction and try to refill when we can flush all the things. 2336 */ 2337 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes, 2338 BTRFS_RESERVE_FLUSH_LIMIT); 2339 if (ret) { 2340 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; 2341 while (tmp <= rc->reserved_bytes) 2342 tmp <<= 1; 2343 /* 2344 * Only one thread can access block_rsv at this point, 2345 * so we don't need to hold a lock to protect it. 2346 * We expand the reservation size here to allow enough 2347 * space for relocation, and we will return earlier in 2348 * the ENOSPC case. 2349 */ 2350 rc->block_rsv->size = tmp + fs_info->nodesize * 2351 RELOCATION_RESERVED_NODES; 2352 return -EAGAIN; 2353 } 2354 2355 return 0; 2356 } 2357 2358 /* 2359 * relocate a block tree, and then update pointers in upper level 2360 * blocks that reference the block to point to the new location. 2361 * 2362 * if called by link_to_upper, the block has already been relocated. 2363 * in that case this function just updates pointers.
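 *
 * A rough sketch of the two modes (illustrative only):
 *
 *   lowest == 1: upper --ptr--> node still at its old bytenr;
 *                btrfs_cow_block() relocates the block, and the COW
 *                callback updates the backref node for us.
 *
 *   lowest == 0: upper --stale ptr--> old bytenr, while node->eb already
 *                sits at the new location; rewrite the pointer in upper
 *                and move the extent refs over with
 *                btrfs_inc_extent_ref() + btrfs_drop_subtree().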
2364 */ 2365 static int do_relocation(struct btrfs_trans_handle *trans, 2366 struct reloc_control *rc, 2367 struct btrfs_backref_node *node, 2368 struct btrfs_key *key, 2369 struct btrfs_path *path, int lowest) 2370 { 2371 struct btrfs_backref_node *upper; 2372 struct btrfs_backref_edge *edge; 2373 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2374 struct btrfs_root *root; 2375 struct extent_buffer *eb; 2376 u32 blocksize; 2377 u64 bytenr; 2378 int slot; 2379 int ret = 0; 2380 2381 /* 2382 * If we are lowest then this is the first time we're processing this 2383 * block, and thus shouldn't have an eb associated with it yet. 2384 */ 2385 ASSERT(!lowest || !node->eb); 2386 2387 path->lowest_level = node->level + 1; 2388 rc->backref_cache.path[node->level] = node; 2389 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2390 cond_resched(); 2391 2392 upper = edge->node[UPPER]; 2393 root = select_reloc_root(trans, rc, upper, edges); 2394 if (IS_ERR(root)) { 2395 ret = PTR_ERR(root); 2396 goto next; 2397 } 2398 2399 if (upper->eb && !upper->locked) { 2400 if (!lowest) { 2401 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2402 if (ret < 0) 2403 goto next; 2404 BUG_ON(ret); 2405 bytenr = btrfs_node_blockptr(upper->eb, slot); 2406 if (node->eb->start == bytenr) 2407 goto next; 2408 } 2409 btrfs_backref_drop_node_buffer(upper); 2410 } 2411 2412 if (!upper->eb) { 2413 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2414 if (ret) { 2415 if (ret > 0) 2416 ret = -ENOENT; 2417 2418 btrfs_release_path(path); 2419 break; 2420 } 2421 2422 if (!upper->eb) { 2423 upper->eb = path->nodes[upper->level]; 2424 path->nodes[upper->level] = NULL; 2425 } else { 2426 BUG_ON(upper->eb != path->nodes[upper->level]); 2427 } 2428 2429 upper->locked = 1; 2430 path->locks[upper->level] = 0; 2431 2432 slot = path->slots[upper->level]; 2433 btrfs_release_path(path); 2434 } else { 2435 ret = btrfs_bin_search(upper->eb, 0, key, &slot); 2436 if (ret < 0) 2437 goto next; 2438 BUG_ON(ret); 2439 } 2440 2441 bytenr = btrfs_node_blockptr(upper->eb, slot); 2442 if (lowest) { 2443 if (bytenr != node->bytenr) { 2444 btrfs_err(root->fs_info, 2445 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu", 2446 bytenr, node->bytenr, slot, 2447 upper->eb->start); 2448 ret = -EIO; 2449 goto next; 2450 } 2451 } else { 2452 if (node->eb->start == bytenr) 2453 goto next; 2454 } 2455 2456 blocksize = root->fs_info->nodesize; 2457 eb = btrfs_read_node_slot(upper->eb, slot); 2458 if (IS_ERR(eb)) { 2459 ret = PTR_ERR(eb); 2460 goto next; 2461 } 2462 btrfs_tree_lock(eb); 2463 2464 if (!node->eb) { 2465 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2466 slot, &eb, BTRFS_NESTING_COW); 2467 btrfs_tree_unlock(eb); 2468 free_extent_buffer(eb); 2469 if (ret < 0) 2470 goto next; 2471 /* 2472 * We've just COWed this block, it should have updated 2473 * the correct backref node entry. 
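 * (The update is performed by the btrfs_reloc_cow_block() hook invoked
 * from the COW path, which finds this node via rc->backref_cache.path,
 * the path filled in by select_reloc_root() above.)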
2474 */ 2475 ASSERT(node->eb == eb); 2476 } else { 2477 struct btrfs_ref ref = { 2478 .action = BTRFS_ADD_DELAYED_REF, 2479 .bytenr = node->eb->start, 2480 .num_bytes = blocksize, 2481 .parent = upper->eb->start, 2482 .owning_root = btrfs_header_owner(upper->eb), 2483 .ref_root = btrfs_header_owner(upper->eb), 2484 }; 2485 2486 btrfs_set_node_blockptr(upper->eb, slot, 2487 node->eb->start); 2488 btrfs_set_node_ptr_generation(upper->eb, slot, 2489 trans->transid); 2490 btrfs_mark_buffer_dirty(trans, upper->eb); 2491 2492 btrfs_init_tree_ref(&ref, node->level, 2493 btrfs_root_id(root), false); 2494 ret = btrfs_inc_extent_ref(trans, &ref); 2495 if (!ret) 2496 ret = btrfs_drop_subtree(trans, root, eb, 2497 upper->eb); 2498 if (ret) 2499 btrfs_abort_transaction(trans, ret); 2500 } 2501 next: 2502 if (!upper->pending) 2503 btrfs_backref_drop_node_buffer(upper); 2504 else 2505 btrfs_backref_unlock_node_buffer(upper); 2506 if (ret) 2507 break; 2508 } 2509 2510 if (!ret && node->pending) { 2511 btrfs_backref_drop_node_buffer(node); 2512 list_move_tail(&node->list, &rc->backref_cache.changed); 2513 node->pending = 0; 2514 } 2515 2516 path->lowest_level = 0; 2517 2518 /* 2519 * We should have allocated all of our space in the block rsv and thus 2520 * shouldn't ENOSPC. 2521 */ 2522 ASSERT(ret != -ENOSPC); 2523 return ret; 2524 } 2525 2526 static int link_to_upper(struct btrfs_trans_handle *trans, 2527 struct reloc_control *rc, 2528 struct btrfs_backref_node *node, 2529 struct btrfs_path *path) 2530 { 2531 struct btrfs_key key; 2532 2533 btrfs_node_key_to_cpu(node->eb, &key, 0); 2534 return do_relocation(trans, rc, node, &key, path, 0); 2535 } 2536 2537 static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2538 struct reloc_control *rc, 2539 struct btrfs_path *path, int err) 2540 { 2541 LIST_HEAD(list); 2542 struct btrfs_backref_cache *cache = &rc->backref_cache; 2543 struct btrfs_backref_node *node; 2544 int level; 2545 int ret; 2546 2547 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2548 while (!list_empty(&cache->pending[level])) { 2549 node = list_entry(cache->pending[level].next, 2550 struct btrfs_backref_node, list); 2551 list_move_tail(&node->list, &list); 2552 BUG_ON(!node->pending); 2553 2554 if (!err) { 2555 ret = link_to_upper(trans, rc, node, path); 2556 if (ret < 0) 2557 err = ret; 2558 } 2559 } 2560 list_splice_init(&list, &cache->pending[level]); 2561 } 2562 return err; 2563 } 2564 2565 /* 2566 * mark a block and all blocks that directly/indirectly reference it 2567 * as processed.
2568 */ 2569 static void update_processed_blocks(struct reloc_control *rc, 2570 struct btrfs_backref_node *node) 2571 { 2572 struct btrfs_backref_node *next = node; 2573 struct btrfs_backref_edge *edge; 2574 struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2575 int index = 0; 2576 2577 while (next) { 2578 cond_resched(); 2579 while (1) { 2580 if (next->processed) 2581 break; 2582 2583 mark_block_processed(rc, next); 2584 2585 if (list_empty(&next->upper)) 2586 break; 2587 2588 edge = list_entry(next->upper.next, 2589 struct btrfs_backref_edge, list[LOWER]); 2590 edges[index++] = edge; 2591 next = edge->node[UPPER]; 2592 } 2593 next = walk_down_backref(edges, &index); 2594 } 2595 } 2596 2597 static int tree_block_processed(u64 bytenr, struct reloc_control *rc) 2598 { 2599 u32 blocksize = rc->extent_root->fs_info->nodesize; 2600 2601 if (test_range_bit(&rc->processed_blocks, bytenr, 2602 bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) 2603 return 1; 2604 return 0; 2605 } 2606 2607 static int get_tree_block_key(struct btrfs_fs_info *fs_info, 2608 struct tree_block *block) 2609 { 2610 struct btrfs_tree_parent_check check = { 2611 .level = block->level, 2612 .owner_root = block->owner, 2613 .transid = block->key.offset 2614 }; 2615 struct extent_buffer *eb; 2616 2617 eb = read_tree_block(fs_info, block->bytenr, &check); 2618 if (IS_ERR(eb)) 2619 return PTR_ERR(eb); 2620 if (!extent_buffer_uptodate(eb)) { 2621 free_extent_buffer(eb); 2622 return -EIO; 2623 } 2624 if (block->level == 0) 2625 btrfs_item_key_to_cpu(eb, &block->key, 0); 2626 else 2627 btrfs_node_key_to_cpu(eb, &block->key, 0); 2628 free_extent_buffer(eb); 2629 block->key_ready = true; 2630 return 0; 2631 } 2632 2633 /* 2634 * helper function to relocate a tree block 2635 */ 2636 static int relocate_tree_block(struct btrfs_trans_handle *trans, 2637 struct reloc_control *rc, 2638 struct btrfs_backref_node *node, 2639 struct btrfs_key *key, 2640 struct btrfs_path *path) 2641 { 2642 struct btrfs_root *root; 2643 int ret = 0; 2644 2645 if (!node) 2646 return 0; 2647 2648 /* 2649 * If we fail here we want to drop our backref_node because we are going 2650 * to start over and regenerate the tree for it. 2651 */ 2652 ret = reserve_metadata_space(trans, rc, node); 2653 if (ret) 2654 goto out; 2655 2656 BUG_ON(node->processed); 2657 root = select_one_root(node); 2658 if (IS_ERR(root)) { 2659 ret = PTR_ERR(root); 2660 2661 /* See explanation in select_one_root for the -EUCLEAN case. */ 2662 ASSERT(ret == -ENOENT); 2663 if (ret == -ENOENT) { 2664 ret = 0; 2665 update_processed_blocks(rc, node); 2666 } 2667 goto out; 2668 } 2669 2670 if (root) { 2671 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { 2672 /* 2673 * This block was the root block of a root, and this is 2674 * the first time we're processing the block and thus it 2675 * should not have had the ->new_bytenr modified and 2676 * should have not been included on the changed list. 2677 * 2678 * However in the case of corruption we could have 2679 * multiple refs pointing to the same block improperly, 2680 * and thus we would trip over these checks. ASSERT() 2681 * for the developer case, because it could indicate a 2682 * bug in the backref code, however error out for a 2683 * normal user in the case of corruption. 
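 * (This mirrors select_reloc_root() above: ASSERT() to be loud for
 * developers, -EUCLEAN to fail gracefully for users.)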
2684 */ 2685 ASSERT(node->new_bytenr == 0); 2686 ASSERT(list_empty(&node->list)); 2687 if (node->new_bytenr || !list_empty(&node->list)) { 2688 btrfs_err(root->fs_info, 2689 "bytenr %llu has improper references to it", 2690 node->bytenr); 2691 ret = -EUCLEAN; 2692 goto out; 2693 } 2694 ret = btrfs_record_root_in_trans(trans, root); 2695 if (ret) 2696 goto out; 2697 /* 2698 * Another thread could have failed, need to check if we 2699 * have reloc_root actually set. 2700 */ 2701 if (!root->reloc_root) { 2702 ret = -ENOENT; 2703 goto out; 2704 } 2705 root = root->reloc_root; 2706 node->new_bytenr = root->node->start; 2707 btrfs_put_root(node->root); 2708 node->root = btrfs_grab_root(root); 2709 ASSERT(node->root); 2710 list_add_tail(&node->list, &rc->backref_cache.changed); 2711 } else { 2712 path->lowest_level = node->level; 2713 if (root == root->fs_info->chunk_root) 2714 btrfs_reserve_chunk_metadata(trans, false); 2715 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2716 btrfs_release_path(path); 2717 if (root == root->fs_info->chunk_root) 2718 btrfs_trans_release_chunk_metadata(trans); 2719 if (ret > 0) 2720 ret = 0; 2721 } 2722 if (!ret) 2723 update_processed_blocks(rc, node); 2724 } else { 2725 ret = do_relocation(trans, rc, node, key, path, 1); 2726 } 2727 out: 2728 if (ret || node->level == 0 || node->cowonly) 2729 btrfs_backref_cleanup_node(&rc->backref_cache, node); 2730 return ret; 2731 } 2732 2733 /* 2734 * relocate a list of blocks 2735 */ 2736 static noinline_for_stack 2737 int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2738 struct reloc_control *rc, struct rb_root *blocks) 2739 { 2740 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 2741 struct btrfs_backref_node *node; 2742 struct btrfs_path *path; 2743 struct tree_block *block; 2744 struct tree_block *next; 2745 int ret = 0; 2746 2747 path = btrfs_alloc_path(); 2748 if (!path) { 2749 ret = -ENOMEM; 2750 goto out_free_blocks; 2751 } 2752 2753 /* Kick in readahead for tree blocks with missing keys */ 2754 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2755 if (!block->key_ready) 2756 btrfs_readahead_tree_block(fs_info, block->bytenr, 2757 block->owner, 0, 2758 block->level); 2759 } 2760 2761 /* Get first keys */ 2762 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2763 if (!block->key_ready) { 2764 ret = get_tree_block_key(fs_info, block); 2765 if (ret) 2766 goto out_free_path; 2767 } 2768 } 2769 2770 /* Do tree relocation */ 2771 rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { 2772 node = build_backref_tree(trans, rc, &block->key, 2773 block->level, block->bytenr); 2774 if (IS_ERR(node)) { 2775 ret = PTR_ERR(node); 2776 goto out; 2777 } 2778 2779 ret = relocate_tree_block(trans, rc, node, &block->key, 2780 path); 2781 if (ret < 0) 2782 break; 2783 } 2784 out: 2785 ret = finish_pending_nodes(trans, rc, path, ret); 2786 2787 out_free_path: 2788 btrfs_free_path(path); 2789 out_free_blocks: 2790 free_block_list(blocks); 2791 return ret; 2792 } 2793 2794 static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control *rc) 2795 { 2796 const struct file_extent_cluster *cluster = &rc->cluster; 2797 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2798 u64 alloc_hint = 0; 2799 u64 start; 2800 u64 end; 2801 u64 offset = inode->reloc_block_group_start; 2802 u64 num_bytes; 2803 int nr; 2804 int ret = 0; 2805 u64 i_size = i_size_read(&inode->vfs_inode); 2806 u64 prealloc_start = cluster->start - offset; 2807 u64 prealloc_end = 
cluster->end - offset; 2808 u64 cur_offset = prealloc_start; 2809 2810 /* 2811 * For subpage case, previous i_size may not be aligned to PAGE_SIZE. 2812 * This means the range [i_size, PAGE_END + 1) is filled with zeros by 2813 * btrfs_do_readpage() call of previously relocated file cluster. 2814 * 2815 * If the current cluster starts in the above range, btrfs_do_readpage() 2816 * will skip the read, and relocate_one_folio() will later writeback 2817 * the padding zeros as new data, causing data corruption. 2818 * 2819 * Here we have to manually invalidate the range (i_size, PAGE_END + 1). 2820 */ 2821 if (!PAGE_ALIGNED(i_size)) { 2822 struct address_space *mapping = inode->vfs_inode.i_mapping; 2823 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2824 const u32 sectorsize = fs_info->sectorsize; 2825 struct folio *folio; 2826 2827 ASSERT(sectorsize < PAGE_SIZE); 2828 ASSERT(IS_ALIGNED(i_size, sectorsize)); 2829 2830 /* 2831 * Subpage can't handle page with DIRTY but without UPTODATE 2832 * bit as it can lead to the following deadlock: 2833 * 2834 * btrfs_read_folio() 2835 * | Page already *locked* 2836 * |- btrfs_lock_and_flush_ordered_range() 2837 * |- btrfs_start_ordered_extent() 2838 * |- extent_write_cache_pages() 2839 * |- lock_page() 2840 * We try to lock the page we already hold. 2841 * 2842 * Here we just writeback the whole data reloc inode, so that 2843 * we will be ensured to have no dirty range in the page, and 2844 * are safe to clear the uptodate bits. 2845 * 2846 * This shouldn't cause too much overhead, as we need to write 2847 * the data back anyway. 2848 */ 2849 ret = filemap_write_and_wait(mapping); 2850 if (ret < 0) 2851 return ret; 2852 2853 clear_extent_bits(&inode->io_tree, i_size, 2854 round_up(i_size, PAGE_SIZE) - 1, 2855 EXTENT_UPTODATE); 2856 folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); 2857 /* 2858 * If page is freed we don't need to do anything then, as we 2859 * will re-read the whole page anyway. 
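 *
 * A concrete example (illustrative numbers, assuming 4K sectorsize and
 * 64K pages): with i_size at 20K, btrfs_do_readpage() zero-fills
 * 20K..64K of the last folio. If the next cluster begins at 24K and the
 * folio stayed UPTODATE, the read would be skipped and the padding
 * zeros written back as relocated data. Clearing UPTODATE on that tail
 * range forces a re-read of the real on-disk data.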
2860 */ 2861 if (!IS_ERR(folio)) { 2862 btrfs_subpage_clear_uptodate(fs_info, folio, i_size, 2863 round_up(i_size, PAGE_SIZE) - i_size); 2864 folio_unlock(folio); 2865 folio_put(folio); 2866 } 2867 } 2868 2869 BUG_ON(cluster->start != cluster->boundary[0]); 2870 ret = btrfs_alloc_data_chunk_ondemand(inode, 2871 prealloc_end + 1 - prealloc_start); 2872 if (ret) 2873 return ret; 2874 2875 btrfs_inode_lock(inode, 0); 2876 for (nr = 0; nr < cluster->nr; nr++) { 2877 struct extent_state *cached_state = NULL; 2878 2879 start = cluster->boundary[nr] - offset; 2880 if (nr + 1 < cluster->nr) 2881 end = cluster->boundary[nr + 1] - 1 - offset; 2882 else 2883 end = cluster->end - offset; 2884 2885 lock_extent(&inode->io_tree, start, end, &cached_state); 2886 num_bytes = end + 1 - start; 2887 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, 2888 num_bytes, num_bytes, 2889 end + 1, &alloc_hint); 2890 cur_offset = end + 1; 2891 unlock_extent(&inode->io_tree, start, end, &cached_state); 2892 if (ret) 2893 break; 2894 } 2895 btrfs_inode_unlock(inode, 0); 2896 2897 if (cur_offset < prealloc_end) 2898 btrfs_free_reserved_data_space_noquota(inode->root->fs_info, 2899 prealloc_end + 1 - cur_offset); 2900 return ret; 2901 } 2902 2903 static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_control *rc) 2904 { 2905 struct btrfs_inode *inode = BTRFS_I(rc->data_inode); 2906 struct extent_map *em; 2907 struct extent_state *cached_state = NULL; 2908 u64 offset = inode->reloc_block_group_start; 2909 u64 start = rc->cluster.start - offset; 2910 u64 end = rc->cluster.end - offset; 2911 int ret = 0; 2912 2913 em = alloc_extent_map(); 2914 if (!em) 2915 return -ENOMEM; 2916 2917 em->start = start; 2918 em->len = end + 1 - start; 2919 em->disk_bytenr = rc->cluster.start; 2920 em->disk_num_bytes = em->len; 2921 em->ram_bytes = em->len; 2922 em->flags |= EXTENT_FLAG_PINNED; 2923 2924 lock_extent(&inode->io_tree, start, end, &cached_state); 2925 ret = btrfs_replace_extent_map_range(inode, em, false); 2926 unlock_extent(&inode->io_tree, start, end, &cached_state); 2927 free_extent_map(em); 2928 2929 return ret; 2930 } 2931 2932 /* 2933 * Allow error injection to test balance/relocation cancellation 2934 */ 2935 noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info) 2936 { 2937 return atomic_read(&fs_info->balance_cancel_req) || 2938 atomic_read(&fs_info->reloc_cancel_req) || 2939 fatal_signal_pending(current); 2940 } 2941 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); 2942 2943 static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, 2944 int cluster_nr) 2945 { 2946 /* Last extent, use cluster end directly */ 2947 if (cluster_nr >= cluster->nr - 1) 2948 return cluster->end; 2949 2950 /* Use next boundary start */ 2951 return cluster->boundary[cluster_nr + 1] - 1; 2952 } 2953 2954 static int relocate_one_folio(struct reloc_control *rc, 2955 struct file_ra_state *ra, 2956 int *cluster_nr, unsigned long index) 2957 { 2958 const struct file_extent_cluster *cluster = &rc->cluster; 2959 struct inode *inode = rc->data_inode; 2960 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 2961 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 2962 const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; 2963 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 2964 struct folio *folio; 2965 u64 folio_start; 2966 u64 folio_end; 2967 u64 cur; 2968 int ret; 2969 const bool use_rst = btrfs_need_stripe_tree_update(fs_info,
rc->block_group->flags); 2970 2971 ASSERT(index <= last_index); 2972 folio = filemap_lock_folio(inode->i_mapping, index); 2973 if (IS_ERR(folio)) { 2974 2975 /* 2976 * On relocation we're doing readahead on the relocation inode, 2977 * but if the filesystem is backed by a RAID stripe tree we can 2978 * get ENOENT (e.g. due to preallocated extents not being 2979 * mapped in the RST) from the lookup. 2980 * 2981 * But readahead doesn't handle the error and submits invalid 2982 * reads to the device, causing assertion failures. 2983 */ 2984 if (!use_rst) 2985 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 2986 index, last_index + 1 - index); 2987 folio = __filemap_get_folio(inode->i_mapping, index, 2988 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, 2989 mask); 2990 if (IS_ERR(folio)) 2991 return PTR_ERR(folio); 2992 } 2993 2994 WARN_ON(folio_order(folio)); 2995 2996 if (folio_test_readahead(folio) && !use_rst) 2997 page_cache_async_readahead(inode->i_mapping, ra, NULL, 2998 folio, last_index + 1 - index); 2999 3000 if (!folio_test_uptodate(folio)) { 3001 btrfs_read_folio(NULL, folio); 3002 folio_lock(folio); 3003 if (!folio_test_uptodate(folio)) { 3004 ret = -EIO; 3005 goto release_folio; 3006 } 3007 } 3008 3009 /* 3010 * We could have lost folio private when we dropped the lock to read the 3011 * folio above; make sure we call set_folio_extent_mapped() here so we 3012 * have all of the subpage blocksize stuff we need in place. 3013 */ 3014 ret = set_folio_extent_mapped(folio); 3015 if (ret < 0) 3016 goto release_folio; 3017 3018 folio_start = folio_pos(folio); 3019 folio_end = folio_start + PAGE_SIZE - 1; 3020 3021 /* 3022 * Start from the cluster boundary, as for the subpage case the cluster 3023 * can start inside the folio. 3024 */ 3025 cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); 3026 while (cur <= folio_end) { 3027 struct extent_state *cached_state = NULL; 3028 u64 extent_start = cluster->boundary[*cluster_nr] - offset; 3029 u64 extent_end = get_cluster_boundary_end(cluster, 3030 *cluster_nr) - offset; 3031 u64 clamped_start = max(folio_start, extent_start); 3032 u64 clamped_end = min(folio_end, extent_end); 3033 u32 clamped_len = clamped_end + 1 - clamped_start; 3034 3035 /* Reserve metadata for this range */ 3036 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 3037 clamped_len, clamped_len, 3038 false); 3039 if (ret) 3040 goto release_folio; 3041 3042 /* Mark the range delalloc and dirty for later writeback */ 3043 lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 3044 &cached_state); 3045 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, 3046 clamped_end, 0, &cached_state); 3047 if (ret) { 3048 clear_extent_bit(&BTRFS_I(inode)->io_tree, 3049 clamped_start, clamped_end, 3050 EXTENT_LOCKED | EXTENT_BOUNDARY, 3051 &cached_state); 3052 btrfs_delalloc_release_metadata(BTRFS_I(inode), 3053 clamped_len, true); 3054 btrfs_delalloc_release_extents(BTRFS_I(inode), 3055 clamped_len); 3056 goto release_folio; 3057 } 3058 btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); 3059 3060 /* 3061 * Set the boundary if it's inside the folio. 3062 * Data relocation requires the destination extents to have the 3063 * same size as the source. 3064 * EXTENT_BOUNDARY bit prevents current extent from being merged 3065 * with previous extent.
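 *
 * For example (illustrative offsets): a cluster with boundaries at file
 * offsets 0 and 128K came from two source extents. Setting
 * EXTENT_BOUNDARY on the first sector at 128K makes delalloc carve out
 * 0..128K-1 and 128K..end as two separate extents, mirroring the source
 * layout instead of merging them into one.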
3066 */ 3067 if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { 3068 u64 boundary_start = cluster->boundary[*cluster_nr] - 3069 offset; 3070 u64 boundary_end = boundary_start + 3071 fs_info->sectorsize - 1; 3072 3073 set_extent_bit(&BTRFS_I(inode)->io_tree, 3074 boundary_start, boundary_end, 3075 EXTENT_BOUNDARY, NULL); 3076 } 3077 unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, 3078 &cached_state); 3079 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); 3080 cur += clamped_len; 3081 3082 /* Crossed extent end, go to next extent */ 3083 if (cur >= extent_end) { 3084 (*cluster_nr)++; 3085 /* Just finished the last extent of the cluster, exit. */ 3086 if (*cluster_nr >= cluster->nr) 3087 break; 3088 } 3089 } 3090 folio_unlock(folio); 3091 folio_put(folio); 3092 3093 balance_dirty_pages_ratelimited(inode->i_mapping); 3094 btrfs_throttle(fs_info); 3095 if (btrfs_should_cancel_balance(fs_info)) 3096 ret = -ECANCELED; 3097 return ret; 3098 3099 release_folio: 3100 folio_unlock(folio); 3101 folio_put(folio); 3102 return ret; 3103 } 3104 3105 static int relocate_file_extent_cluster(struct reloc_control *rc) 3106 { 3107 struct inode *inode = rc->data_inode; 3108 const struct file_extent_cluster *cluster = &rc->cluster; 3109 u64 offset = BTRFS_I(inode)->reloc_block_group_start; 3110 unsigned long index; 3111 unsigned long last_index; 3112 struct file_ra_state *ra; 3113 int cluster_nr = 0; 3114 int ret = 0; 3115 3116 if (!cluster->nr) 3117 return 0; 3118 3119 ra = kzalloc(sizeof(*ra), GFP_NOFS); 3120 if (!ra) 3121 return -ENOMEM; 3122 3123 ret = prealloc_file_extent_cluster(rc); 3124 if (ret) 3125 goto out; 3126 3127 file_ra_state_init(ra, inode->i_mapping); 3128 3129 ret = setup_relocation_extent_mapping(rc); 3130 if (ret) 3131 goto out; 3132 3133 last_index = (cluster->end - offset) >> PAGE_SHIFT; 3134 for (index = (cluster->start - offset) >> PAGE_SHIFT; 3135 index <= last_index && !ret; index++) 3136 ret = relocate_one_folio(rc, ra, &cluster_nr, index); 3137 if (ret == 0) 3138 WARN_ON(cluster_nr != cluster->nr); 3139 out: 3140 kfree(ra); 3141 return ret; 3142 } 3143 3144 static noinline_for_stack int relocate_data_extent(struct reloc_control *rc, 3145 const struct btrfs_key *extent_key) 3146 { 3147 struct inode *inode = rc->data_inode; 3148 struct file_extent_cluster *cluster = &rc->cluster; 3149 int ret; 3150 struct btrfs_root *root = BTRFS_I(inode)->root; 3151 3152 if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { 3153 ret = relocate_file_extent_cluster(rc); 3154 if (ret) 3155 return ret; 3156 cluster->nr = 0; 3157 } 3158 3159 /* 3160 * Under simple quotas, we set root->relocation_src_root when we find 3161 * the extent. If adjacent extents have different owners, we can't merge 3162 * them while relocating. Handle this by storing the owning root that 3163 * started a cluster and if we see an extent from a different root break 3164 * cluster formation (just like the above case of non-adjacent extents). 3165 * 3166 * Without simple quotas, relocation_src_root is always 0, so we should 3167 * never see a mismatch, and it should have no effect on relocation 3168 * clusters. 3169 */ 3170 if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { 3171 u64 tmp = root->relocation_src_root; 3172 3173 /* 3174 * root->relocation_src_root is the state that actually affects 3175 * the preallocation we do here, so set it to the root owning 3176 * the cluster we need to relocate. 
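 * E.g. (hypothetical ids) if the queued cluster is owned by subvolume
 * 256 while the current extent is owned by 257: flush the cluster with
 * relocation_src_root = 256, then restore 257 so the current extent can
 * start its own cluster.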
3177 */ 3178 root->relocation_src_root = cluster->owning_root; 3179 ret = relocate_file_extent_cluster(rc); 3180 if (ret) 3181 return ret; 3182 cluster->nr = 0; 3183 /* And reset it back for the current extent's owning root. */ 3184 root->relocation_src_root = tmp; 3185 } 3186 3187 if (!cluster->nr) { 3188 cluster->start = extent_key->objectid; 3189 cluster->owning_root = root->relocation_src_root; 3190 } 3191 else 3192 BUG_ON(cluster->nr >= MAX_EXTENTS); 3193 cluster->end = extent_key->objectid + extent_key->offset - 1; 3194 cluster->boundary[cluster->nr] = extent_key->objectid; 3195 cluster->nr++; 3196 3197 if (cluster->nr >= MAX_EXTENTS) { 3198 ret = relocate_file_extent_cluster(rc); 3199 if (ret) 3200 return ret; 3201 cluster->nr = 0; 3202 } 3203 return 0; 3204 } 3205 3206 /* 3207 * helper to add a tree block to the list. 3208 * the major work is getting the generation and level of the block 3209 */ 3210 static int add_tree_block(struct reloc_control *rc, 3211 const struct btrfs_key *extent_key, 3212 struct btrfs_path *path, 3213 struct rb_root *blocks) 3214 { 3215 struct extent_buffer *eb; 3216 struct btrfs_extent_item *ei; 3217 struct btrfs_tree_block_info *bi; 3218 struct tree_block *block; 3219 struct rb_node *rb_node; 3220 u32 item_size; 3221 int level = -1; 3222 u64 generation; 3223 u64 owner = 0; 3224 3225 eb = path->nodes[0]; 3226 item_size = btrfs_item_size(eb, path->slots[0]); 3227 3228 if (extent_key->type == BTRFS_METADATA_ITEM_KEY || 3229 item_size >= sizeof(*ei) + sizeof(*bi)) { 3230 unsigned long ptr = 0, end; 3231 3232 ei = btrfs_item_ptr(eb, path->slots[0], 3233 struct btrfs_extent_item); 3234 end = (unsigned long)ei + item_size; 3235 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { 3236 bi = (struct btrfs_tree_block_info *)(ei + 1); 3237 level = btrfs_tree_block_level(eb, bi); 3238 ptr = (unsigned long)(bi + 1); 3239 } else { 3240 level = (int)extent_key->offset; 3241 ptr = (unsigned long)(ei + 1); 3242 } 3243 generation = btrfs_extent_generation(eb, ei); 3244 3245 /* 3246 * We're reading random blocks without knowing their owner ahead 3247 * of time. This is ok most of the time, as all reloc roots and 3248 * fs roots have the same lock type. However normal trees do 3249 * not, and the only way to know ahead of time is to read the 3250 * inline ref offset. We know it's an fs root if 3251 * 3252 * 1. There's more than one ref. 3253 * 2. There's a SHARED_DATA_REF_KEY set. 3254 * 3. FULL_BACKREF is set on the flags. 3255 * 3256 * Otherwise it's safe to assume that the ref offset == the 3257 * owner of this block, so we can use that when calling 3258 * read_tree_block. 
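 *
 * Illustrative layout of the safe single-ref case:
 *
 *   key (bytenr, METADATA_ITEM, level)
 *     btrfs_extent_item { refs = 1, flags without FULL_BACKREF }
 *     inline ref: TREE_BLOCK_REF_KEY, offset = owning root objectid
 *
 * where the inline ref offset doubles as the owner hint we pass to
 * read_tree_block().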
3259 */ 3260 if (btrfs_extent_refs(eb, ei) == 1 && 3261 !(btrfs_extent_flags(eb, ei) & 3262 BTRFS_BLOCK_FLAG_FULL_BACKREF) && 3263 ptr < end) { 3264 struct btrfs_extent_inline_ref *iref; 3265 int type; 3266 3267 iref = (struct btrfs_extent_inline_ref *)ptr; 3268 type = btrfs_get_extent_inline_ref_type(eb, iref, 3269 BTRFS_REF_TYPE_BLOCK); 3270 if (type == BTRFS_REF_TYPE_INVALID) 3271 return -EINVAL; 3272 if (type == BTRFS_TREE_BLOCK_REF_KEY) 3273 owner = btrfs_extent_inline_ref_offset(eb, iref); 3274 } 3275 } else { 3276 btrfs_print_leaf(eb); 3277 btrfs_err(rc->block_group->fs_info, 3278 "unrecognized tree backref at tree block %llu slot %u", 3279 eb->start, path->slots[0]); 3280 btrfs_release_path(path); 3281 return -EUCLEAN; 3282 } 3283 3284 btrfs_release_path(path); 3285 3286 BUG_ON(level == -1); 3287 3288 block = kmalloc(sizeof(*block), GFP_NOFS); 3289 if (!block) 3290 return -ENOMEM; 3291 3292 block->bytenr = extent_key->objectid; 3293 block->key.objectid = rc->extent_root->fs_info->nodesize; 3294 block->key.offset = generation; 3295 block->level = level; 3296 block->key_ready = false; 3297 block->owner = owner; 3298 3299 rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); 3300 if (rb_node) 3301 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, 3302 -EEXIST); 3303 3304 return 0; 3305 } 3306 3307 /* 3308 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY 3309 */ 3310 static int __add_tree_block(struct reloc_control *rc, 3311 u64 bytenr, u32 blocksize, 3312 struct rb_root *blocks) 3313 { 3314 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3315 struct btrfs_path *path; 3316 struct btrfs_key key; 3317 int ret; 3318 bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 3319 3320 if (tree_block_processed(bytenr, rc)) 3321 return 0; 3322 3323 if (rb_simple_search(blocks, bytenr)) 3324 return 0; 3325 3326 path = btrfs_alloc_path(); 3327 if (!path) 3328 return -ENOMEM; 3329 again: 3330 key.objectid = bytenr; 3331 if (skinny) { 3332 key.type = BTRFS_METADATA_ITEM_KEY; 3333 key.offset = (u64)-1; 3334 } else { 3335 key.type = BTRFS_EXTENT_ITEM_KEY; 3336 key.offset = blocksize; 3337 } 3338 3339 path->search_commit_root = 1; 3340 path->skip_locking = 1; 3341 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); 3342 if (ret < 0) 3343 goto out; 3344 3345 if (ret > 0 && skinny) { 3346 if (path->slots[0]) { 3347 path->slots[0]--; 3348 btrfs_item_key_to_cpu(path->nodes[0], &key, 3349 path->slots[0]); 3350 if (key.objectid == bytenr && 3351 (key.type == BTRFS_METADATA_ITEM_KEY || 3352 (key.type == BTRFS_EXTENT_ITEM_KEY && 3353 key.offset == blocksize))) 3354 ret = 0; 3355 } 3356 3357 if (ret) { 3358 skinny = false; 3359 btrfs_release_path(path); 3360 goto again; 3361 } 3362 } 3363 if (ret) { 3364 ASSERT(ret == 1); 3365 btrfs_print_leaf(path->nodes[0]); 3366 btrfs_err(fs_info, 3367 "tree block extent item (%llu) is not found in extent tree", 3368 bytenr); 3369 WARN_ON(1); 3370 ret = -EINVAL; 3371 goto out; 3372 } 3373 3374 ret = add_tree_block(rc, &key, path, blocks); 3375 out: 3376 btrfs_free_path(path); 3377 return ret; 3378 } 3379 3380 static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3381 struct btrfs_block_group *block_group, 3382 struct inode *inode, 3383 u64 ino) 3384 { 3385 struct btrfs_root *root = fs_info->tree_root; 3386 struct btrfs_trans_handle *trans; 3387 int ret = 0; 3388 3389 if (inode) 3390 goto truncate; 3391 3392 inode = btrfs_iget(ino, root); 3393 if (IS_ERR(inode)) 3394 return -ENOENT; 
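/*
 * Truncate the v1 free space cache inode so its data extents no longer
 * block relocation of this block group (see delete_v1_space_cache()
 * below).
 */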
3395 3396 truncate: 3397 ret = btrfs_check_trunc_cache_free_space(fs_info, 3398 &fs_info->global_block_rsv); 3399 if (ret) 3400 goto out; 3401 3402 trans = btrfs_join_transaction(root); 3403 if (IS_ERR(trans)) { 3404 ret = PTR_ERR(trans); 3405 goto out; 3406 } 3407 3408 ret = btrfs_truncate_free_space_cache(trans, block_group, inode); 3409 3410 btrfs_end_transaction(trans); 3411 btrfs_btree_balance_dirty(fs_info); 3412 out: 3413 iput(inode); 3414 return ret; 3415 } 3416 3417 /* 3418 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the 3419 * cache inode, to avoid free space cache data extent blocking data relocation. 3420 */ 3421 static int delete_v1_space_cache(struct extent_buffer *leaf, 3422 struct btrfs_block_group *block_group, 3423 u64 data_bytenr) 3424 { 3425 u64 space_cache_ino; 3426 struct btrfs_file_extent_item *ei; 3427 struct btrfs_key key; 3428 bool found = false; 3429 int i; 3430 int ret; 3431 3432 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3433 return 0; 3434 3435 for (i = 0; i < btrfs_header_nritems(leaf); i++) { 3436 u8 type; 3437 3438 btrfs_item_key_to_cpu(leaf, &key, i); 3439 if (key.type != BTRFS_EXTENT_DATA_KEY) 3440 continue; 3441 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); 3442 type = btrfs_file_extent_type(leaf, ei); 3443 3444 if ((type == BTRFS_FILE_EXTENT_REG || 3445 type == BTRFS_FILE_EXTENT_PREALLOC) && 3446 btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) { 3447 found = true; 3448 space_cache_ino = key.objectid; 3449 break; 3450 } 3451 } 3452 if (!found) 3453 return -ENOENT; 3454 ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, 3455 space_cache_ino); 3456 return ret; 3457 } 3458 3459 /* 3460 * helper to find all tree blocks that reference a given data extent 3461 */ 3462 static noinline_for_stack int add_data_references(struct reloc_control *rc, 3463 const struct btrfs_key *extent_key, 3464 struct btrfs_path *path, 3465 struct rb_root *blocks) 3466 { 3467 struct btrfs_backref_walk_ctx ctx = { 0 }; 3468 struct ulist_iterator leaf_uiter; 3469 struct ulist_node *ref_node = NULL; 3470 const u32 blocksize = rc->extent_root->fs_info->nodesize; 3471 int ret = 0; 3472 3473 btrfs_release_path(path); 3474 3475 ctx.bytenr = extent_key->objectid; 3476 ctx.skip_inode_ref_list = true; 3477 ctx.fs_info = rc->extent_root->fs_info; 3478 3479 ret = btrfs_find_all_leafs(&ctx); 3480 if (ret < 0) 3481 return ret; 3482 3483 ULIST_ITER_INIT(&leaf_uiter); 3484 while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { 3485 struct btrfs_tree_parent_check check = { 0 }; 3486 struct extent_buffer *eb; 3487 3488 eb = read_tree_block(ctx.fs_info, ref_node->val, &check); 3489 if (IS_ERR(eb)) { 3490 ret = PTR_ERR(eb); 3491 break; 3492 } 3493 ret = delete_v1_space_cache(eb, rc->block_group, 3494 extent_key->objectid); 3495 free_extent_buffer(eb); 3496 if (ret < 0) 3497 break; 3498 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks); 3499 if (ret < 0) 3500 break; 3501 } 3502 if (ret < 0) 3503 free_block_list(blocks); 3504 ulist_free(ctx.refs); 3505 return ret; 3506 } 3507 3508 /* 3509 * helper to find next unprocessed extent 3510 */ 3511 static noinline_for_stack 3512 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, 3513 struct btrfs_key *extent_key) 3514 { 3515 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3516 struct btrfs_key key; 3517 struct extent_buffer *leaf; 3518 u64 start, end, last; 3519 int ret; 3520 3521 last = rc->block_group->start + rc->block_group->length; 3522 
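/*
 * Scan EXTENT_ITEM/METADATA_ITEM keys from rc->search_start to the end
 * of the block group, skipping ranges already recorded in
 * rc->processed_blocks.
 */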
while (1) { 3523 bool block_found; 3524 3525 cond_resched(); 3526 if (rc->search_start >= last) { 3527 ret = 1; 3528 break; 3529 } 3530 3531 key.objectid = rc->search_start; 3532 key.type = BTRFS_EXTENT_ITEM_KEY; 3533 key.offset = 0; 3534 3535 path->search_commit_root = 1; 3536 path->skip_locking = 1; 3537 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 3538 0, 0); 3539 if (ret < 0) 3540 break; 3541 next: 3542 leaf = path->nodes[0]; 3543 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 3544 ret = btrfs_next_leaf(rc->extent_root, path); 3545 if (ret != 0) 3546 break; 3547 leaf = path->nodes[0]; 3548 } 3549 3550 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3551 if (key.objectid >= last) { 3552 ret = 1; 3553 break; 3554 } 3555 3556 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3557 key.type != BTRFS_METADATA_ITEM_KEY) { 3558 path->slots[0]++; 3559 goto next; 3560 } 3561 3562 if (key.type == BTRFS_EXTENT_ITEM_KEY && 3563 key.objectid + key.offset <= rc->search_start) { 3564 path->slots[0]++; 3565 goto next; 3566 } 3567 3568 if (key.type == BTRFS_METADATA_ITEM_KEY && 3569 key.objectid + fs_info->nodesize <= 3570 rc->search_start) { 3571 path->slots[0]++; 3572 goto next; 3573 } 3574 3575 block_found = find_first_extent_bit(&rc->processed_blocks, 3576 key.objectid, &start, &end, 3577 EXTENT_DIRTY, NULL); 3578 3579 if (block_found && start <= key.objectid) { 3580 btrfs_release_path(path); 3581 rc->search_start = end + 1; 3582 } else { 3583 if (key.type == BTRFS_EXTENT_ITEM_KEY) 3584 rc->search_start = key.objectid + key.offset; 3585 else 3586 rc->search_start = key.objectid + 3587 fs_info->nodesize; 3588 memcpy(extent_key, &key, sizeof(key)); 3589 return 0; 3590 } 3591 } 3592 btrfs_release_path(path); 3593 return ret; 3594 } 3595 3596 static void set_reloc_control(struct reloc_control *rc) 3597 { 3598 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3599 3600 mutex_lock(&fs_info->reloc_mutex); 3601 fs_info->reloc_ctl = rc; 3602 mutex_unlock(&fs_info->reloc_mutex); 3603 } 3604 3605 static void unset_reloc_control(struct reloc_control *rc) 3606 { 3607 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3608 3609 mutex_lock(&fs_info->reloc_mutex); 3610 fs_info->reloc_ctl = NULL; 3611 mutex_unlock(&fs_info->reloc_mutex); 3612 } 3613 3614 static noinline_for_stack 3615 int prepare_to_relocate(struct reloc_control *rc) 3616 { 3617 struct btrfs_trans_handle *trans; 3618 int ret; 3619 3620 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, 3621 BTRFS_BLOCK_RSV_TEMP); 3622 if (!rc->block_rsv) 3623 return -ENOMEM; 3624 3625 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3626 rc->search_start = rc->block_group->start; 3627 rc->extents_found = 0; 3628 rc->nodes_relocated = 0; 3629 rc->merging_rsv_size = 0; 3630 rc->reserved_bytes = 0; 3631 rc->block_rsv->size = rc->extent_root->fs_info->nodesize * 3632 RELOCATION_RESERVED_NODES; 3633 ret = btrfs_block_rsv_refill(rc->extent_root->fs_info, 3634 rc->block_rsv, rc->block_rsv->size, 3635 BTRFS_RESERVE_FLUSH_ALL); 3636 if (ret) 3637 return ret; 3638 3639 rc->create_reloc_tree = true; 3640 set_reloc_control(rc); 3641 3642 trans = btrfs_join_transaction(rc->extent_root); 3643 if (IS_ERR(trans)) { 3644 unset_reloc_control(rc); 3645 /* 3646 * extent tree is not a ref_cow tree and has no reloc_root to 3647 * cleanup. And callers are responsible to free the above 3648 * block rsv. 
3649 */ 3650 return PTR_ERR(trans); 3651 } 3652 3653 ret = btrfs_commit_transaction(trans); 3654 if (ret) 3655 unset_reloc_control(rc); 3656 3657 return ret; 3658 } 3659 3660 static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3661 { 3662 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3663 struct rb_root blocks = RB_ROOT; 3664 struct btrfs_key key; 3665 struct btrfs_trans_handle *trans = NULL; 3666 struct btrfs_path *path; 3667 struct btrfs_extent_item *ei; 3668 u64 flags; 3669 int ret; 3670 int err = 0; 3671 int progress = 0; 3672 3673 path = btrfs_alloc_path(); 3674 if (!path) 3675 return -ENOMEM; 3676 path->reada = READA_FORWARD; 3677 3678 ret = prepare_to_relocate(rc); 3679 if (ret) { 3680 err = ret; 3681 goto out_free; 3682 } 3683 3684 while (1) { 3685 rc->reserved_bytes = 0; 3686 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, 3687 rc->block_rsv->size, 3688 BTRFS_RESERVE_FLUSH_ALL); 3689 if (ret) { 3690 err = ret; 3691 break; 3692 } 3693 progress++; 3694 trans = btrfs_start_transaction(rc->extent_root, 0); 3695 if (IS_ERR(trans)) { 3696 err = PTR_ERR(trans); 3697 trans = NULL; 3698 break; 3699 } 3700 restart: 3701 if (update_backref_cache(trans, &rc->backref_cache)) { 3702 btrfs_end_transaction(trans); 3703 trans = NULL; 3704 continue; 3705 } 3706 3707 ret = find_next_extent(rc, path, &key); 3708 if (ret < 0) 3709 err = ret; 3710 if (ret != 0) 3711 break; 3712 3713 rc->extents_found++; 3714 3715 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3716 struct btrfs_extent_item); 3717 flags = btrfs_extent_flags(path->nodes[0], ei); 3718 3719 /* 3720 * If we are relocating a simple quota owned extent item, we 3721 * need to note the owner on the reloc data root so that when 3722 * we allocate the replacement item, we can attribute it to the 3723 * correct eventual owner (rather than the reloc data root). 
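 * The owner is read from the extent item just below and stashed in
 * relocation_src_root; relocate_data_extent() then uses it to keep each
 * cluster homogeneous per owning root.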
3724 */ 3725 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { 3726 struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; 3727 u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, 3728 path->nodes[0], 3729 path->slots[0]); 3730 3731 root->relocation_src_root = owning_root_id; 3732 } 3733 3734 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3735 ret = add_tree_block(rc, &key, path, &blocks); 3736 } else if (rc->stage == UPDATE_DATA_PTRS && 3737 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3738 ret = add_data_references(rc, &key, path, &blocks); 3739 } else { 3740 btrfs_release_path(path); 3741 ret = 0; 3742 } 3743 if (ret < 0) { 3744 err = ret; 3745 break; 3746 } 3747 3748 if (!RB_EMPTY_ROOT(&blocks)) { 3749 ret = relocate_tree_blocks(trans, rc, &blocks); 3750 if (ret < 0) { 3751 if (ret != -EAGAIN) { 3752 err = ret; 3753 break; 3754 } 3755 rc->extents_found--; 3756 rc->search_start = key.objectid; 3757 } 3758 } 3759 3760 btrfs_end_transaction_throttle(trans); 3761 btrfs_btree_balance_dirty(fs_info); 3762 trans = NULL; 3763 3764 if (rc->stage == MOVE_DATA_EXTENTS && 3765 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3766 rc->found_file_extent = true; 3767 ret = relocate_data_extent(rc, &key); 3768 if (ret < 0) { 3769 err = ret; 3770 break; 3771 } 3772 } 3773 if (btrfs_should_cancel_balance(fs_info)) { 3774 err = -ECANCELED; 3775 break; 3776 } 3777 } 3778 if (trans && progress && err == -ENOSPC) { 3779 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags); 3780 if (ret == 1) { 3781 err = 0; 3782 progress = 0; 3783 goto restart; 3784 } 3785 } 3786 3787 btrfs_release_path(path); 3788 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); 3789 3790 if (trans) { 3791 btrfs_end_transaction_throttle(trans); 3792 btrfs_btree_balance_dirty(fs_info); 3793 } 3794 3795 if (!err) { 3796 ret = relocate_file_extent_cluster(rc); 3797 if (ret < 0) 3798 err = ret; 3799 } 3800 3801 rc->create_reloc_tree = false; 3802 set_reloc_control(rc); 3803 3804 btrfs_backref_release_cache(&rc->backref_cache); 3805 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3806 3807 /* 3808 * Even in the case when the relocation is cancelled, we should all go 3809 * through prepare_to_merge() and merge_reloc_roots(). 
3810 * 3811 * For error (including cancelled balance), prepare_to_merge() will 3812 * mark all reloc trees orphan, then queue them for cleanup in 3813 * merge_reloc_roots() 3814 */ 3815 err = prepare_to_merge(rc, err); 3816 3817 merge_reloc_roots(rc); 3818 3819 rc->merge_reloc_tree = false; 3820 unset_reloc_control(rc); 3821 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3822 3823 /* get rid of pinned extents */ 3824 trans = btrfs_join_transaction(rc->extent_root); 3825 if (IS_ERR(trans)) { 3826 err = PTR_ERR(trans); 3827 goto out_free; 3828 } 3829 ret = btrfs_commit_transaction(trans); 3830 if (ret && !err) 3831 err = ret; 3832 out_free: 3833 ret = clean_dirty_subvols(rc); 3834 if (ret < 0 && !err) 3835 err = ret; 3836 btrfs_free_block_rsv(fs_info, rc->block_rsv); 3837 btrfs_free_path(path); 3838 return err; 3839 } 3840 3841 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, 3842 struct btrfs_root *root, u64 objectid) 3843 { 3844 struct btrfs_path *path; 3845 struct btrfs_inode_item *item; 3846 struct extent_buffer *leaf; 3847 int ret; 3848 3849 path = btrfs_alloc_path(); 3850 if (!path) 3851 return -ENOMEM; 3852 3853 ret = btrfs_insert_empty_inode(trans, root, path, objectid); 3854 if (ret) 3855 goto out; 3856 3857 leaf = path->nodes[0]; 3858 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); 3859 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3860 btrfs_set_inode_generation(leaf, item, 1); 3861 btrfs_set_inode_size(leaf, item, 0); 3862 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3863 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3864 BTRFS_INODE_PREALLOC); 3865 btrfs_mark_buffer_dirty(trans, leaf); 3866 out: 3867 btrfs_free_path(path); 3868 return ret; 3869 } 3870 3871 static void delete_orphan_inode(struct btrfs_trans_handle *trans, 3872 struct btrfs_root *root, u64 objectid) 3873 { 3874 struct btrfs_path *path; 3875 struct btrfs_key key; 3876 int ret = 0; 3877 3878 path = btrfs_alloc_path(); 3879 if (!path) { 3880 ret = -ENOMEM; 3881 goto out; 3882 } 3883 3884 key.objectid = objectid; 3885 key.type = BTRFS_INODE_ITEM_KEY; 3886 key.offset = 0; 3887 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3888 if (ret) { 3889 if (ret > 0) 3890 ret = -ENOENT; 3891 goto out; 3892 } 3893 ret = btrfs_del_item(trans, root, path); 3894 out: 3895 if (ret) 3896 btrfs_abort_transaction(trans, ret); 3897 btrfs_free_path(path); 3898 } 3899 3900 /* 3901 * helper to create inode for data relocation. 
3902 * the inode is in data relocation tree and its link count is 0 3903 */ 3904 static noinline_for_stack struct inode *create_reloc_inode( 3905 struct btrfs_fs_info *fs_info, 3906 const struct btrfs_block_group *group) 3907 { 3908 struct inode *inode = NULL; 3909 struct btrfs_trans_handle *trans; 3910 struct btrfs_root *root; 3911 u64 objectid; 3912 int ret = 0; 3913 3914 root = btrfs_grab_root(fs_info->data_reloc_root); 3915 trans = btrfs_start_transaction(root, 6); 3916 if (IS_ERR(trans)) { 3917 btrfs_put_root(root); 3918 return ERR_CAST(trans); 3919 } 3920 3921 ret = btrfs_get_free_objectid(root, &objectid); 3922 if (ret) 3923 goto out; 3924 3925 ret = __insert_orphan_inode(trans, root, objectid); 3926 if (ret) 3927 goto out; 3928 3929 inode = btrfs_iget(objectid, root); 3930 if (IS_ERR(inode)) { 3931 delete_orphan_inode(trans, root, objectid); 3932 ret = PTR_ERR(inode); 3933 inode = NULL; 3934 goto out; 3935 } 3936 BTRFS_I(inode)->reloc_block_group_start = group->start; 3937 3938 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 3939 out: 3940 btrfs_put_root(root); 3941 btrfs_end_transaction(trans); 3942 btrfs_btree_balance_dirty(fs_info); 3943 if (ret) { 3944 iput(inode); 3945 inode = ERR_PTR(ret); 3946 } 3947 return inode; 3948 } 3949 3950 /* 3951 * Mark start of chunk relocation that is cancellable. Check if the cancellation 3952 * has been requested meanwhile and don't start in that case. 3953 * 3954 * Return: 3955 * 0 success 3956 * -EINPROGRESS operation is already in progress, that's probably a bug 3957 * -ECANCELED cancellation request was set before the operation started 3958 */ 3959 static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3960 { 3961 if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3962 /* This should not happen */ 3963 btrfs_err(fs_info, "reloc already running, cannot start"); 3964 return -EINPROGRESS; 3965 } 3966 3967 if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3968 btrfs_info(fs_info, "chunk relocation canceled on start"); 3969 /* 3970 * On cancel, clear all requests but let the caller mark 3971 * the end after cleanup operations. 3972 */ 3973 atomic_set(&fs_info->reloc_cancel_req, 0); 3974 return -ECANCELED; 3975 } 3976 return 0; 3977 } 3978 3979 /* 3980 * Mark end of chunk relocation that is cancellable and wake any waiters. 
3981 */ 3982 static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3983 { 3984 /* Requested after start, clear bit first so any waiters can continue */ 3985 if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3986 btrfs_info(fs_info, "chunk relocation canceled during operation"); 3987 clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3988 atomic_set(&fs_info->reloc_cancel_req, 0); 3989 } 3990 3991 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3992 { 3993 struct reloc_control *rc; 3994 3995 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3996 if (!rc) 3997 return NULL; 3998 3999 INIT_LIST_HEAD(&rc->reloc_roots); 4000 INIT_LIST_HEAD(&rc->dirty_subvol_roots); 4001 btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); 4002 rc->reloc_root_tree.rb_root = RB_ROOT; 4003 spin_lock_init(&rc->reloc_root_tree.lock); 4004 extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); 4005 return rc; 4006 } 4007 4008 static void free_reloc_control(struct reloc_control *rc) 4009 { 4010 struct mapping_node *node, *tmp; 4011 4012 free_reloc_roots(&rc->reloc_roots); 4013 rbtree_postorder_for_each_entry_safe(node, tmp, 4014 &rc->reloc_root_tree.rb_root, rb_node) 4015 kfree(node); 4016 4017 kfree(rc); 4018 } 4019 4020 /* 4021 * Print the block group being relocated 4022 */ 4023 static void describe_relocation(struct btrfs_block_group *block_group) 4024 { 4025 char buf[128] = {'\0'}; 4026 4027 btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); 4028 4029 btrfs_info(block_group->fs_info, "relocating block group %llu flags %s", 4030 block_group->start, buf); 4031 } 4032 4033 static const char *stage_to_string(enum reloc_stage stage) 4034 { 4035 if (stage == MOVE_DATA_EXTENTS) 4036 return "move data extents"; 4037 if (stage == UPDATE_DATA_PTRS) 4038 return "update data pointers"; 4039 return "unknown"; 4040 } 4041 4042 /* 4043 * function to relocate all extents in a block group. 4044 */ 4045 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) 4046 { 4047 struct btrfs_block_group *bg; 4048 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); 4049 struct reloc_control *rc; 4050 struct inode *inode; 4051 struct btrfs_path *path; 4052 int ret; 4053 int rw = 0; 4054 int err = 0; 4055 4056 /* 4057 * This only gets set if we had a half-deleted snapshot on mount. We 4058 * cannot allow relocation to start while we're still trying to clean up 4059 * these pending deletions. 4060 */ 4061 ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE); 4062 if (ret) 4063 return ret; 4064 4065 /* We may have been woken up by close_ctree, so bail if we're closing. */ 4066 if (btrfs_fs_closing(fs_info)) 4067 return -EINTR; 4068 4069 bg = btrfs_lookup_block_group(fs_info, group_start); 4070 if (!bg) 4071 return -ENOENT; 4072 4073 /* 4074 * Relocation of a data block group creates ordered extents. Without 4075 * sb_start_write(), we can freeze the filesystem while unfinished 4076 * ordered extents are left. Such ordered extents can cause a deadlock 4077 * e.g. when syncfs() is waiting for their completion but they can't 4078 * finish because they block when joining a transaction, due to the 4079 * fact that the freeze locks are being held in write mode. 
/*
 * Relocate all extents in a block group.
 */
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
	struct btrfs_block_group *bg;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
	struct reloc_control *rc;
	struct inode *inode;
	struct btrfs_path *path;
	int ret;
	int rw = 0;
	int err = 0;

	/*
	 * This only gets set if we had a half-deleted snapshot on mount.  We
	 * cannot allow relocation to start while we're still trying to clean up
	 * these pending deletions.
	 */
	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
	if (ret)
		return ret;

	/* We may have been woken up by close_ctree, so bail if we're closing. */
	if (btrfs_fs_closing(fs_info))
		return -EINTR;

	bg = btrfs_lookup_block_group(fs_info, group_start);
	if (!bg)
		return -ENOENT;

	/*
	 * Relocation of a data block group creates ordered extents.  Without
	 * sb_start_write(), we can freeze the filesystem while unfinished
	 * ordered extents are left. Such ordered extents can cause a deadlock
	 * e.g. when syncfs() is waiting for their completion but they can't
	 * finish because they block when joining a transaction, due to the
	 * fact that the freeze locks are being held in write mode.
	 */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
		ASSERT(sb_write_started(fs_info->sb));

	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
		btrfs_put_block_group(bg);
		return -ETXTBSY;
	}

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		btrfs_put_block_group(bg);
		return -ENOMEM;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0) {
		err = ret;
		goto out_put_bg;
	}

	rc->extent_root = extent_root;
	rc->block_group = bg;

	ret = btrfs_inc_block_group_ro(rc->block_group, true);
	if (ret) {
		err = ret;
		goto out;
	}
	rw = 1;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(rc->block_group, path);
	btrfs_free_path(path);

	if (!IS_ERR(inode))
		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
	else
		ret = PTR_ERR(inode);

	if (ret && ret != -ENOENT) {
		err = ret;
		goto out;
	}

	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
	if (IS_ERR(rc->data_inode)) {
		err = PTR_ERR(rc->data_inode);
		rc->data_inode = NULL;
		goto out;
	}

	describe_relocation(rc->block_group);

	btrfs_wait_block_group_reservations(rc->block_group);
	btrfs_wait_nocow_writers(rc->block_group);
	btrfs_wait_ordered_roots(fs_info, U64_MAX, rc->block_group);

	ret = btrfs_zone_finish(rc->block_group);
	WARN_ON(ret && ret != -EAGAIN);
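
	/*
	 * Main relocation loop: each pass runs relocate_block_group() and
	 * counts the extents it found.  The stage normally advances from
	 * MOVE_DATA_EXTENTS to UPDATE_DATA_PTRS after a pass that saw file
	 * extents, and the loop terminates once a pass finds no extents.
	 */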
	while (1) {
		enum reloc_stage finishes_stage;

		mutex_lock(&fs_info->cleaner_mutex);
		ret = relocate_block_group(rc);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0)
			err = ret;

		finishes_stage = rc->stage;
		/*
		 * We may have gotten ENOSPC after we already dirtied some
		 * extents.  If writeout happens while we're relocating a
		 * different block group we could end up hitting the
		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
		 * btrfs_reloc_cow_block.  Make sure we write everything out
		 * properly so we don't trip over this problem, and then break
		 * out of the loop if we hit an error.
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		if (rc->extents_found == 0)
			break;

		btrfs_info(fs_info, "found %llu extents, stage: %s",
			   rc->extents_found, stage_to_string(finishes_stage));
	}

	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(rc->block_group->used > 0);
out:
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
out_put_bg:
	btrfs_put_block_group(bg);
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
	return err;
}

static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
	       sizeof(root->root_item.drop_progress));
	btrfs_set_root_drop_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}
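
/*
 * Sketch of the reloc root enumeration used below (documentation only):
 * reloc root items all have objectid BTRFS_TREE_RELOC_OBJECTID and carry the
 * owning subvolume's id in key.offset, so they are walked highest offset
 * first:
 *
 *	key = (BTRFS_TREE_RELOC_OBJECTID, BTRFS_ROOT_ITEM_KEY, (u64)-1);
 *	loop: search, step one slot back, process the item, key.offset--;
 *	stop at key.offset == 0 or when a key outside the range is found.
 */
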
/*
 * recover relocation interrupted by a system crash.
 *
 * this function resumes merging reloc trees with corresponding fs trees.
 * this is important for keeping the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret2;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		ret = 0;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
		if (IS_ERR(reloc_root)) {
			ret = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT)
					goto out;
				ret = mark_garbage_root(reloc_root);
				if (ret < 0)
					goto out;
				ret = 0;
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		ret = -ENOMEM;
		goto out;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0)
		goto out_end;

	rc->extent_root = btrfs_extent_root(fs_info, 0);

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = true;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			ret = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		ret = __add_reloc_root(reloc_root);
		ASSERT(ret != -EEXIST);
		if (ret) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	ret = btrfs_commit_transaction(trans);
	if (ret)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_clean;
	}
	ret = btrfs_commit_transaction(trans);
out_clean:
	ret2 = clean_dirty_subvols(rc);
	if (ret2 < 0 && !ret)
		ret = ret2;
out_unset:
	unset_reloc_control(rc);
out_end:
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (ret == 0) {
		/* clean up the orphan inode in the data relocation tree */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		ret = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return ret;
}
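
/*
 * Worked example for the csum rebasing done in btrfs_reloc_clone_csums()
 * below (all numbers made up): assume the relocated block group starts at
 * 1 GiB and the ordered extent covers file offset 4 MiB of the data reloc
 * inode.  The old disk bytenr is then 1 GiB + 4 MiB, and a csum item found
 * at logical 1 GiB + 4 MiB + 64 KiB describes data 64 KiB into the range.
 * If the new location (ordered->disk_bytenr) is 3 GiB, that csum is rebased
 * to 3 GiB + 64 KiB, i.e.:
 *
 *	sums->logical = new_disk_bytenr + (old_sum_logical - old_disk_bytenr);
 */
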
/*
 * helper to add ordered checksums for data relocation.
 *
 * cloning the checksums properly handles the nodatasum extents. it also saves
 * the CPU time that would otherwise be spent recalculating the checksums.
 */
int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 disk_bytenr = ordered->file_offset + inode->reloc_block_group_start;
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
	LIST_HEAD(list);
	int ret;

	ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
				      disk_bytenr + ordered->num_bytes - 1,
				      &list, false);
	if (ret < 0) {
		btrfs_mark_ordered_extent_error(ordered);
		return ret;
	}

	while (!list_empty(&list)) {
		struct btrfs_ordered_sum *sums =
			list_entry(list.next, struct btrfs_ordered_sum, list);

		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
		btrfs_add_ordered_sum(ordered, sums);
	}

	return 0;
}

int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  const struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		btrfs_backref_drop_node_buffer(node);
		atomic_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}

/*
 * called before creating a snapshot. it calculates the metadata reservation
 * required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * relocation is in the stage of merging trees. the space
	 * used by merging a reloc tree is twice the size of
	 * relocated tree nodes in the worst case: half for cowing
	 * the reloc tree, half for cowing the fs tree. the space
	 * used by cowing the reloc tree will be freed after the
	 * tree is dropped. if we create a snapshot, cowing the fs
	 * tree may use more space than it frees. so we need to
	 * reserve extra space.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}
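
/*
 * Reservation arithmetic sketch (illustrative numbers): if rc->nodes_relocated
 * is 16 MiB when a snapshot is created during merging, then
 * btrfs_reloc_pre_snapshot() above asks the snapshot to reserve 16 MiB more,
 * and btrfs_reloc_post_snapshot() below migrates those 16 MiB from
 * pending->block_rsv into rc->block_rsv to pay for the extra tree block COW.
 */
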
/*
 * called after the snapshot is created. it migrates the block reservation
 * and creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(), we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			      struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root));
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	new_root->reloc_root = btrfs_grab_root(reloc_root);

	if (rc->create_reloc_tree)
		ret = clone_backref_node(trans, rc, root, reloc_root);
	return ret;
}

/*
 * Get the current bytenr for the block group which is being relocated.
 *
 * Return U64_MAX if no running relocation.
 */
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
{
	u64 logical = U64_MAX;

	lockdep_assert_held(&fs_info->reloc_mutex);

	if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
		logical = fs_info->reloc_ctl->block_group->start;
	return logical;
}
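
/*
 * Illustrative caller sketch for btrfs_get_reloc_bg_bytenr() (documentation
 * only, hypothetical caller): readers of the relocation state take
 * fs_info->reloc_mutex around the query:
 *
 *	mutex_lock(&fs_info->reloc_mutex);
 *	bytenr = btrfs_get_reloc_bg_bytenr(fs_info);
 *	mutex_unlock(&fs_info->reloc_mutex);
 *	if (bytenr != U64_MAX)
 *		... the block group starting at bytenr is being relocated ...
 */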