// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * it needs to be passed in from the higher layer
	 * (i.e. inode_should_defrag()).
	 */
	u32 extent_thresh;
};

static int compare_inode_defrag(const struct inode_defrag *defrag1,
				const struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node);
	const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node);

	return compare_inode_defrag(new_defrag, existing_defrag);
}

/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 */
static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
				     struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct rb_node *node;

	node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp);
	if (node) {
		struct inode_defrag *entry;

		entry = rb_entry(node, struct inode_defrag, rb_node);
		/*
		 * If we're reinserting an entry for an old defrag run, make
		 * sure to lower the transid of our existing record.
		 */
		if (defrag->transid < entry->transid)
			entry->transid = defrag->transid;
		entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh);
		return -EEXIST;
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	return 0;
}

static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return false;

	if (btrfs_fs_closing(fs_info))
		return false;

	return true;
}
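
/*
 * The defrag tree is keyed by (root objectid, inode number), in that order.
 * For illustration, with arbitrary example values:
 *
 *   (root=5, ino=257) < (root=5, ino=260) < (root=7, ino=256)
 *
 * This lets btrfs_pick_defrag_inode() below resume a scan from any
 * (root, ino) cursor by simply taking the next entry in key order.
 */
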
/*
 * Insert a defrag record for this inode if auto defrag is enabled. No errors
 * are returned, as they're not considered fatal.
 */
void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	int ret;

	if (!need_auto_defrag(fs_info))
		return;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = btrfs_get_root_last_trans(root);
	defrag->root = btrfs_root_id(root);
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and then the inode is evicted
		 * and re-read, the new in-memory inode won't have the flag
		 * set. In that case we may find an existing defrag record in
		 * the tree.
		 */
		ret = btrfs_insert_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		entry = rb_entry_safe(parent, struct inode_defrag, rb_node);
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag, *next;

	spin_lock(&fs_info->defrag_inodes_lock);

	rbtree_postorder_for_each_entry_safe(defrag, next,
					     &fs_info->defrag_inodes, rb_node)
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

	fs_info->defrag_inodes = RB_ROOT;

	spin_unlock(&fs_info->defrag_inodes_lock);
}
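
/*
 * btrfs_run_defrag_inode() below works in chunks: each call into
 * btrfs_defrag_file() is capped at BTRFS_DEFRAG_BATCH sectors, and the
 * remount/auto-defrag checks are redone between chunks. This is a summary of
 * the loop below, not a limit enforced anywhere else; the intent appears to
 * be keeping a single heavily fragmented inode from holding the defrag loop
 * for too long.
 */
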
#define BTRFS_DEFRAG_BATCH	1024

static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				  struct inode_defrag *defrag,
				  struct file_ra_state *ra)
{
	struct btrfs_root *inode_root;
	struct btrfs_inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(&inode->vfs_inode)) {
		iput(&inode->vfs_inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;
	file_ra_state_init(ra, inode->vfs_inode.i_mapping);

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(&inode->vfs_inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		struct file_ra_state ra = { 0 };

		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		btrfs_run_defrag_inode(fs_info, defrag, &ra);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Check if two block addresses are close, used by defrag.
 */
static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
		return true;
	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
		return true;
	return false;
}
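
/*
 * For example (made-up byte addresses, 16K nodesize): blocks at 1M and
 * 1M + 40K are "close" because the gap between them is 40K - 16K = 24K,
 * which is below the 32K threshold, while blocks at 1M and 1M + 64K are
 * not (gap of 48K). Only the distance matters, not which block comes first.
 */
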
/*
 * Go through all the leaves pointed to by a node and reallocate them so that
 * disk order is close to key order.
 */
static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *parent,
			      int start_slot, u64 *last_ret,
			      struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u32 blocksize = fs_info->nodesize;
	const int end_slot = btrfs_header_nritems(parent) - 1;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	int ret = 0;
	bool progress_passed = false;

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (btrfs_header_nritems(parent) <= 1)
		return 0;

	for (int i = start_slot; i <= end_slot; i++) {
		struct extent_buffer *cur;
		struct btrfs_disk_key disk_key;
		u64 blocknr;
		u64 other;
		bool close = true;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = true;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
					    &cur, search_start,
					    min(16 * blocksize,
						(end_slot - i) * blocksize),
					    BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return ret;
}
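
/*
 * In other words: btrfs_realloc_node() walks the children of @parent in key
 * order and COWs any child whose block address is not within 32K (see
 * close_blocks()) of either neighbour, so that a later key-order read of
 * those leaves also touches blocks that are roughly sequential on disk.
 */
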
/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to reallocate them so that their disk order
 * better reflects key order.
 */
static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
	 * deadlock later (attempting to write lock an already write locked
	 * leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	ASSERT(path->locks[1] != 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag a given btree. Every leaf in the btree is read and defragmented.
 */
int btrfs_defrag_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
		return 0;

	while (1) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		cond_resched();

		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
			break;

		if (btrfs_defrag_cancelled(fs_info)) {
			btrfs_debug(fs_info, "defrag_root cancelled");
			ret = -EAGAIN;
			break;
		}
	}
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
	return ret;
}
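
/*
 * The way the two functions above cooperate: btrfs_defrag_leaves() processes
 * one level-1 node's worth of leaves per call and returns -EAGAIN with
 * root->defrag_progress set to the key it should continue from, while
 * btrfs_defrag_root() starts a fresh transaction for every pass and keeps
 * looping until it gets something other than -EAGAIN or the filesystem
 * starts closing.
 */
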
/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than the given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = btrfs_alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely no extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 *	|	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->disk_bytenr = EXTENT_MAP_HOLE;
			em->disk_num_bytes = 0;
			em->ram_bytes = 0;
			em->offset = 0;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 *	|<- file extent ->|	|
		 *				\- start
		 *
		 * We haven't reached @start yet, search the next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	btrfs_free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	btrfs_free_extent_map(em);
	return ERR_PTR(ret);
}
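
/*
 * Note that a hole at @start is not reported as "not found": the loop above
 * returns an extent map with disk_bytenr == EXTENT_MAP_HOLE covering the gap
 * up to the next file extent item found by the search, and the callers below
 * check for that case explicitly.
 */
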
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case, we need to re-search
	 * the tree to get the original em for defrag.
	 *
	 * This is because even if we have adjacent extents that are contiguous
	 * and compatible (same type and flags), we still want to defrag them
	 * so that we use less metadata (extent items in the extent tree and
	 * file extent items in the inode's subvolume tree).
	 */
	if (em && (em->flags & EXTENT_FLAG_MERGED)) {
		btrfs_free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			btrfs_lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			btrfs_unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (btrfs_extent_map_is_compressed(em))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag but
	 * the next one will not be a target.
	 * This will just cause extra IO without really reducing the fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (next->flags & EXTENT_FLAG_PREALLOC)
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging the current
	 * extent makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	btrfs_free_extent_map(next);
	return ret;
}
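
/*
 * For example, with the default 256K threshold set up by btrfs_defrag_file():
 * a 32K next extent of a new enough generation counts as a mergeable target,
 * while a 1M next extent does not, since an extent already above the
 * threshold will not itself be defragged and rewriting the current extent
 * just to sit next to it would not reduce the extent count.
 */
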
/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared; we don't wait for writeback on each individual page here.
 */
static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 lock_start;
	u64 lock_end;
	struct extent_state *cached_state = NULL;
	struct folio *folio;
	int ret;

again:
	/* TODO: Add FGP order flags when large folios are fully enabled. */
	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
	if (IS_ERR(folio))
		return folio;

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
	 *
	 * The IO for such large folios is not fully tested, thus return
	 * an error to reject such folios unless it's an experimental build.
	 *
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(ret);
	}

	lock_start = folio_pos(folio);
	lock_end = folio_end(folio) - 1;
	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio));
		btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
		if (!ordered)
			break;

		folio_unlock(folio);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		folio_lock(folio);
		/*
		 * We unlocked the folio above, so we need to check if it was
		 * released or not.
		 */
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
	if (!folio_test_uptodate(folio)) {
		btrfs_read_folio(NULL, folio);
		folio_lock(folio);
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
		if (!folio_test_uptodate(folio)) {
			folio_unlock(folio);
			folio_put(folio);
			return ERR_PTR(-EIO);
		}
	}
	return folio;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};
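
/*
 * A defrag_target_range describes one contiguous byte range that
 * defrag_collect_targets() below decided is worth rewriting. Adjacent
 * targets are merged as they are collected, so the list normally stays
 * short rather than holding one entry per small extent.
 */
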
/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression or no-compression;
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   whether the range is already extent locked
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if doing so would result in a
		 * regular extent. This is for users who want to convert
		 * inline extents to regular ones through the max_inline=
		 * mount option.
		 */
		if (em->disk_bytenr == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip holes and preallocated extents. */
		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
		    (em->flags & EXTENT_FLAG_PREALLOC))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there are dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
						EXTENT_DELALLOC))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (em->len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the inline extent
		 * would not be defragged. So if an inline extent passed all
		 * the above checks, just add it for defrag so it gets
		 * converted to a regular extent.
		 */
		if (em->disk_bytenr == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_last_entry(target_list,
					       struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(btrfs_extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_last_entry(target_list,
					       struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			btrfs_free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = btrfs_extent_map_end(em);
		btrfs_free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

#define CLUSTER_SIZE	(SZ_256K)
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
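
/*
 * Defrag works on clusters of at most CLUSTER_SIZE (256K) bytes at a time:
 * btrfs_defrag_file() below carves the requested range into such clusters,
 * and defrag_one_range() never locks more than CLUSTER_SIZE / PAGE_SIZE
 * pages at once. With 4K pages that is 64 pages per cluster (the page size
 * is architecture dependent, the 256K cluster size is not).
 */
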
/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @folios:	locked folios covering the defrag range
 * @nr_pages:	number of locked folios
 *
 * Caller should ensure:
 *
 * - Folios are prepared
 *   Folios should be locked, with no ordered extent in the range and
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct folio **folios, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	int ret = 0;

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1,
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			       EXTENT_DEFRAG, cached_state);
	btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1,
			     EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);

	/*
	 * Update the page status.
	 * Due to possible large folios, we have to check all folios one by one.
	 */
	for (int i = 0; i < nr_pages && folios[i]; i++) {
		struct folio *folio = folios[i];

		if (!folio)
			break;
		if (start >= folio_end(folio) || start + len <= folio_pos(folio))
			continue;
		btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
		btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}
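
/*
 * Note that defrag_one_locked_target() does not move any data by itself: it
 * reserves delalloc space, marks the range with EXTENT_DELALLOC |
 * EXTENT_DEFRAG and dirties the covered folios. The actual relocation
 * happens later, when writeback writes the dirtied range out as new
 * (ideally contiguous) extents.
 */
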
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct folio **folios;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 cur = start;
	const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) -
				      (start >> PAGE_SHIFT) + 1;
	int ret = 0;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
	if (!folios)
		return -ENOMEM;

	/* Prepare all pages */
	for (int i = 0; cur < start + len && i < nr_pages; i++) {
		folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT);
		if (IS_ERR(folios[i])) {
			ret = PTR_ERR(folios[i]);
			folios[i] = NULL;
			goto free_folios;
		}
		cur = folio_end(folios[i]);
	}
	for (int i = 0; i < nr_pages; i++) {
		if (!folios[i])
			break;
		folio_wait_writeback(folios[i]);
	}

	/* We should get at least one folio. */
	ASSERT(folios[0]);
	/* Lock the pages range */
	btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
	/*
	 * Now we have a consistent view about the extent map, re-check
	 * which range really needs to be defragged.
	 *
	 * And this time we have extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
free_folios:
	for (int i = 0; i < nr_pages; i++) {
		if (!folios[i])
			break;
		folio_unlock(folios[i]);
		folio_put(folios[i]);
	}
	kfree(folios);
	return ret;
}

static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
					  (max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}
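
/*
 * The target collection above happens twice on purpose: defrag_one_cluster()
 * scans without the extent lock to drive readahead and size each range, then
 * defrag_one_range() re-collects the targets with the range locked so the
 * final decision is made against a consistent extent map.
 */
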
/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged; if 0, the whole
 *		   inode will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be
 * updated to indicate the file offset where the next defrag should start.
 * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early
 * without defragging the whole range.)
 */
int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(&inode->vfs_inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS);
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int compress_level = 0;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	ASSERT(ra);

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) {
			if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES)
				return -EINVAL;
			if (range->compress.type) {
				compress_type = range->compress.type;
				compress_level = range->compress.level;
				if (!btrfs_compress_level_valid(compress_type, compress_level))
					return -EINVAL;
			}
		} else {
			if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
				return -EINVAL;
			if (range->compress_type)
				compress_type = range->compress_type;
		}
	} else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) {
		compress_type = BTRFS_DEFRAG_DONT_COMPRESS;
		compress_level = 1;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->vfs_inode.i_mapping->writeback_index)
		inode->vfs_inode.i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at a page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(inode, 0);
		if (IS_SWAPFILE(&inode->vfs_inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(inode, 0);
			break;
		}
		if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(inode, 0);
			break;
		}
		if (do_compress || no_compress) {
			inode->defrag_compress = compress_type;
			inode->defrag_compress_level = compress_level;
		}
		ret = defrag_one_cluster(inode, ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress || no_compress,
				&sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping);

		btrfs_inode_unlock(inode, 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors; for the compression case
		 * they need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->vfs_inode.i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &inode->runtime_flags))
				filemap_flush(inode->vfs_inode.i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress || no_compress) {
		btrfs_inode_lock(inode, 0);
		inode->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(inode, 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0, 0, NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}