// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added; we search for extents newer
	 * than this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * it needs to be passed in from the higher layer
	 * (i.e. inode_should_defrag()).
	 */
	u32 extent_thresh;
};

static int compare_inode_defrag(const struct inode_defrag *defrag1,
				const struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node);
	const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node);

	return compare_inode_defrag(new_defrag, existing_defrag);
}

/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 */
static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
				     struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct rb_node *node;

	node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp);
	if (node) {
		struct inode_defrag *entry;

		entry = rb_entry(node, struct inode_defrag, rb_node);
		/*
		 * If we're reinserting an entry for an old defrag run, make
		 * sure to lower the transid of our existing record.
		 */
		if (defrag->transid < entry->transid)
			entry->transid = defrag->transid;
		entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh);
		return -EEXIST;
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	return 0;
}

static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return false;

	if (btrfs_fs_closing(fs_info))
		return false;

	return true;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled. No errors
 * are returned, as they're not considered fatal.
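 *
 * The fs_info->defrag_inodes_lock is taken internally, so the caller does not
 * need to hold any locks.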
 */
void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	int ret;

	if (!need_auto_defrag(fs_info))
		return;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = btrfs_get_root_last_trans(root);
	defrag->root = btrfs_root_id(root);
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and the inode is then evicted
		 * from memory and re-read, the new in-memory inode won't have
		 * the IN_DEFRAG flag set. In that case we may still find an
		 * existing defrag record in the tree.
		 */
		ret = btrfs_insert_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		entry = rb_entry_safe(parent, struct inode_defrag, rb_node);
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag, *next;

	spin_lock(&fs_info->defrag_inodes_lock);

	rbtree_postorder_for_each_entry_safe(defrag, next,
					     &fs_info->defrag_inodes, rb_node)
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

	fs_info->defrag_inodes = RB_ROOT;

	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				  struct inode_defrag *defrag,
				  struct file_ra_state *ra)
{
	struct btrfs_root *inode_root;
	struct btrfs_inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

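	/* Stop once we have defragged up to the current i_size of the inode. */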
	if (cur >= i_size_read(&inode->vfs_inode)) {
		iput(&inode->vfs_inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;
	file_ra_state_init(ra, inode->vfs_inode.i_mapping);

	scoped_guard(super_write, fs_info->sb)
		ret = btrfs_defrag_file(inode, ra, &range,
					defrag->transid, BTRFS_DEFRAG_BATCH);
	iput(&inode->vfs_inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		struct file_ra_state ra = { 0 };

		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		btrfs_run_defrag_inode(fs_info, defrag, &ra);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Check if two block addresses are close; used by defrag.
 *
 * "Close" means the gap between the two blocks is less than 32K. For example,
 * with a 16K blocksize, blocks at 0 and 40K are close (24K gap), while blocks
 * at 0 and 64K are not (48K gap).
 */
static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
		return true;
	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
		return true;
	return false;
}

/*
 * Go through all the leaves pointed to by a node and reallocate them so that
 * disk order is close to key order.
 */
static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *parent,
			      int start_slot, u64 *last_ret,
			      struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u32 blocksize = fs_info->nodesize;
	const int end_slot = btrfs_header_nritems(parent) - 1;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	int ret = 0;
	bool progress_passed = false;

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (btrfs_header_nritems(parent) <= 1)
		return 0;

	for (int i = start_slot; i <= end_slot; i++) {
		struct extent_buffer *cur;
		struct btrfs_disk_key disk_key;
		u64 blocknr;
		u64 other;
		bool close = true;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = true;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
					    &cur, search_start,
					    min(16 * blocksize,
						(end_slot - i) * blocksize),
					    BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return ret;
}

/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to better reflect disk order.
 */
static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf.
	 * btrfs_realloc_node() will lock all leaves from path->nodes[1], so
	 * set lowest_level to 1 to avoid a deadlock later (attempting to
	 * write lock an already write locked leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	ASSERT(path->locks[1] != 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing; this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag a given btree. Every leaf in the btree is read and defragmented.
 */
int btrfs_defrag_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
		return 0;

	while (1) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		cond_resched();

		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
			break;

		if (btrfs_defrag_cancelled(fs_info)) {
			btrfs_debug(fs_info, "defrag_root cancelled");
			ret = -EAGAIN;
			break;
		}
	}
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
	return ret;
}

/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than the given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = btrfs_alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely not the extent we want */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 * |	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->disk_bytenr = EXTENT_MAP_HOLE;
			em->disk_num_bytes = 0;
			em->ram_bytes = 0;
			em->offset = 0;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 * |<- file extent ->|	|
		 *			\- start
		 *
		 * We haven't reached start, search next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	btrfs_free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	btrfs_free_extent_map(em);
	return ERR_PTR(ret);
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent; in that case we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * This is because even if we have adjacent extents that are contiguous
	 * and compatible (same type and flags), we still want to defrag them
	 * so that we use less metadata (extent items in the extent tree and
	 * file extent items in the inode's subvolume tree).
	 */
	if (em && (em->flags & EXTENT_FLAG_MERGED)) {
		btrfs_free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			btrfs_lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			btrfs_unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (btrfs_extent_map_is_compressed(em))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag but
	 * the next one will not be a target.
	 * This will just cause extra IO without really reducing the fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (next->flags & EXTENT_FLAG_PREALLOC)
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging the current
	 * extent makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	btrfs_free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared; we don't wait for writeback on each individual page here.
 */
static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 lock_start;
	u64 lock_end;
	struct extent_state *cached_state = NULL;
	struct folio *folio;
	int ret;

again:
	/* TODO: Add fgp order flags when large folios are fully enabled. */
	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
	if (IS_ERR(folio))
		return folio;

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
	 *
	 * The IO for such large folios is not fully tested, thus return
	 * an error to reject such folios unless it's an experimental build.
	 *
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(ret);
	}

	lock_start = folio_pos(folio);
	lock_end = folio_next_pos(folio) - 1;
	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio));
		btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
		if (!ordered)
			break;

		folio_unlock(folio);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		folio_lock(folio);
		/*
		 * We unlocked the folio above, so we need to check if it was
		 * released or not.
		 */
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
	if (!folio_test_uptodate(folio)) {
		btrfs_read_folio(NULL, folio);
		folio_lock(folio);
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
		if (unlikely(!folio_test_uptodate(folio))) {
			folio_unlock(folio);
			folio_put(folio);
			return ERR_PTR(-EIO);
		}
	}
	return folio;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Collect all valid target extents.
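 * Each collected target is appended to @target_list as a struct
 * defrag_target_range describing a [start, start + len) file range.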
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:	   only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression or no-compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the extent lock is already held on the range
 * @target_list:   list of collected target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if it will result in a regular
		 * extent. This is for users who want to convert inline extents
		 * to regular ones through the max_inline= mount option.
		 */
		if (em->disk_bytenr == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip holes and preallocated extents. */
		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
		    (em->flags & EXTENT_FLAG_PREALLOC))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there are dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
						EXTENT_DELALLOC))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (em->len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the inline extent
		 * will not be defragged.
		 * So if an inline extent passed all the above checks, just add
		 * it for defrag, so it can be converted to a regular extent.
		 */
		if (em->disk_bytenr == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_last_entry(target_list,
					       struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(btrfs_extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_last_entry(target_list,
					       struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			btrfs_free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = btrfs_extent_map_end(em);
		btrfs_free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

#define CLUSTER_SIZE	(SZ_256K)
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));

/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @folios:	locked folios covering the defrag range
 * @nr_pages:	number of locked folios
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct folio **folios, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	int ret = 0;

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1,
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			       EXTENT_DEFRAG, cached_state);
	btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1,
			     EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);

	/*
	 * Update the page status.
	 * Due to possible large folios, we have to check all folios one by one.
	 */
	for (int i = 0; i < nr_pages && folios[i]; i++) {
		struct folio *folio = folios[i];

		if (!folio)
			break;
		if (start >= folio_next_pos(folio) ||
		    start + len <= folio_pos(folio))
			continue;
		btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
		btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct folio **folios;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 cur = start;
	const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) -
				      (start >> PAGE_SHIFT) + 1;
	int ret = 0;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
	if (!folios)
		return -ENOMEM;

	/* Prepare all pages */
	for (int i = 0; cur < start + len && i < nr_pages; i++) {
		folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT);
		if (IS_ERR(folios[i])) {
			ret = PTR_ERR(folios[i]);
			folios[i] = NULL;
			goto free_folios;
		}
		cur = folio_next_pos(folios[i]);
	}
	for (int i = 0; i < nr_pages; i++) {
		if (!folios[i])
			break;
		folio_wait_writeback(folios[i]);
	}

	/* We should get at least one folio. */
	ASSERT(folios[0]);
	/* Lock the pages range */
	btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
	/*
	 * Now we have a consistent view of the extent map, re-check
	 * which range really needs to be defragged.
	 *
	 * And this time we have the extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause a deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
free_folios:
	for (int i = 0; i < nr_pages; i++) {
		if (!folios[i])
			break;
		folio_unlock(folios[i]);
		folio_put(folios[i]);
	}
	kfree(folios);
	return ret;
}

static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
					  (max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		page_cache_sync_readahead(inode->vfs_inode.i_mapping,
					  ra, NULL, entry->start >> PAGE_SHIFT,
					  ((entry->start + range_len - 1) >> PAGE_SHIFT) -
					  (entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}

/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be
 * updated to indicate the file offset where the next defrag should start.
 * (Mostly for autodefrag, which sets @max_to_defrag and thus may exit early
 * without defragging the whole range.)
 */
int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(&inode->vfs_inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS);
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int compress_level = 0;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	ASSERT(ra);

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) {
			if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES)
				return -EINVAL;
			if (range->compress.type) {
				compress_type = range->compress.type;
				compress_level = range->compress.level;
				if (!btrfs_compress_level_valid(compress_type, compress_level))
					return -EINVAL;
			}
		} else {
			if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
				return -EINVAL;
			if (range->compress_type)
				compress_type = range->compress_type;
		}
	} else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) {
		compress_type = BTRFS_DEFRAG_DONT_COMPRESS;
		compress_level = 1;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->vfs_inode.i_mapping->writeback_index)
		inode->vfs_inode.i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at a page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
				(SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(inode, 0);
		if (IS_SWAPFILE(&inode->vfs_inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(inode, 0);
			break;
		}
		if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(inode, 0);
			break;
		}
		if (do_compress || no_compress) {
			inode->defrag_compress = compress_type;
			inode->defrag_compress_level = compress_level;
		}
		ret = defrag_one_cluster(inode, ra, cur,
					 cluster_end + 1 - cur, extent_thresh,
					 newer_than, do_compress || no_compress,
					 &sectors_defragged,
					 max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping);

		btrfs_inode_unlock(inode, 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors, for the compression case
		 * they need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->vfs_inode.i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &inode->runtime_flags))
				filemap_flush(inode->vfs_inode.i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress || no_compress) {
		btrfs_inode_lock(inode, 0);
		inode->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(inode, 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0, 0, NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}