// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"
#include "compression.h"

/* Slab cache for struct inode_defrag records queued by autodefrag. */
static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	/* Node in fs_info->defrag_inodes, keyed by (root, ino). */
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from higher layer.
	 * (aka, inode_should_defrag())
	 */
	u32 extent_thresh;
};

/*
 * Total order for defrag records: compare by root objectid first, then by
 * inode number.  Returns <0, 0 or >0 in the usual comparator convention.
 */
static int compare_inode_defrag(const struct inode_defrag *defrag1,
				const struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/* rb_find_add() adapter around compare_inode_defrag(). */
static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing)
{
	const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node);
	const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node);

	return compare_inode_defrag(new_defrag, existing_defrag);
}

/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * Returns 0 on a fresh insertion (marking the inode as queued), or -EEXIST
 * when a record for this (root, ino) already exists, in which case the
 * existing record's transid and extent_thresh are tightened instead.
 */
static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
				     struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct rb_node *node;

	node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp);
	if (node) {
		struct inode_defrag *entry;

		entry = rb_entry(node, struct inode_defrag, rb_node);
		/*
		 * If we're reinserting an entry for an old defrag run, make
		 * sure to lower the transid of our existing record.
		 */
		if (defrag->transid < entry->transid)
			entry->transid = defrag->transid;
		entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh);
		return -EEXIST;
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	return 0;
}

/* Whether autodefrag should run: mount option enabled and fs not closing. */
static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return false;

	if (btrfs_fs_closing(fs_info))
		return false;

	return true;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled. No errors
 * returned as they're not considered fatal.
 */
void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	int ret;

	if (!need_auto_defrag(fs_info))
		return;

	/* Fast path: inode already queued, nothing to do. */
	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = btrfs_get_root_last_trans(root);
	defrag->root = btrfs_root_id(root);
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	/* Re-check the flag under the lock to avoid a racing double insert. */
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set IN_DEFRAG flag and evict the inode from memory,
		 * and then re-read this inode, this new inode doesn't have
		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
		 */
		ret = btrfs_insert_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

/*
 * Pick the defraggable inode that we want, if it doesn't exist, we will get the
 * next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	/*
	 * No exact match.  If the search stopped at an entry smaller than
	 * (root, ino), step forward to the in-order successor; rb_entry_safe()
	 * yields NULL when there is none.
	 */
	if (parent && compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		entry = rb_entry_safe(parent, struct inode_defrag, rb_node);
	}
out:
	/* Detach the record; the caller now owns (and must free) it. */
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

/* Free every queued defrag record, e.g. when disabling autodefrag or unmounting. */
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag, *next;

	spin_lock(&fs_info->defrag_inodes_lock);

	rbtree_postorder_for_each_entry_safe(defrag, next,
					     &fs_info->defrag_inodes, rb_node)
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

	fs_info->defrag_inodes = RB_ROOT;

	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

/*
 * Defrag one queued inode, in batches of BTRFS_DEFRAG_BATCH sectors, until
 * the whole file has been processed or an error occurs.
 *
 * Always consumes @defrag (frees it back to the slab cache).
 */
static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				  struct inode_defrag *defrag,
				  struct file_ra_state *ra)
{
	struct btrfs_root *inode_root;
	struct btrfs_inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	/* Bail out early when a remount or unmount is in progress. */
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	/* Done once we've walked past EOF. */
	if (cur >= i_size_read(&inode->vfs_inode)) {
		iput(&inode->vfs_inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;
	file_ra_state_init(ra, inode->vfs_inode.i_mapping);

	scoped_guard(super_write, fs_info->sb)
		ret = btrfs_defrag_file(inode, ra, &range,
					defrag->transid, BTRFS_DEFRAG_BATCH);
	iput(&inode->vfs_inode);

	if (ret < 0)
		goto cleanup;

	/*
	 * btrfs_defrag_file() advanced range.start to the last scanned
	 * position; make sure we always make forward progress by at least
	 * one sector.
	 */
	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		struct file_ra_state ra = { 0 };

		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			/*
			 * Nothing at or after our cursor: wrap around to the
			 * start of the tree once, then stop when it is empty.
			 */
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		/* Advance the cursor past the record we just picked. */
		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		btrfs_run_defrag_inode(fs_info, defrag, &ra);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Check if two blocks addresses are close, used by defrag.
 */
static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
		return true;
	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
		return true;
	return false;
}

/*
 * Go through all the leaves pointed to by a node and reallocate them so that
 * disk order is close to key order.
 */
static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *parent,
			      int start_slot, u64 *last_ret,
			      struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u32 blocksize = fs_info->nodesize;
	const int end_slot = btrfs_header_nritems(parent) - 1;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	int ret = 0;
	bool progress_passed = false;

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (btrfs_header_nritems(parent) <= 1)
		return 0;

	for (int i = start_slot; i <= end_slot; i++) {
		struct extent_buffer *cur;
		struct btrfs_disk_key disk_key;
		u64 blocknr;
		u64 other;
		bool close = true;

		btrfs_node_key(parent, &disk_key, i);
		/* Skip slots before the resume point recorded in @progress. */
		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = true;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		/* Leave the leaf alone if it's already close to a neighbour on disk. */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		/* Force a COW near search_start to move the leaf closer on disk. */
		btrfs_tree_lock(cur);
		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
					    &cur, search_start,
					    min(16 * blocksize,
						(end_slot - i) * blocksize),
					    BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return ret;
}

/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to
 * better reflect disk order
 */

static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	/* Only shareable (subvolume) roots get their leaves defragged. */
	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	/* A single-leaf tree has nothing to reorder. */
	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		/* First pass: record the tree's last key so we know when to stop. */
		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		/* Resume from where the previous -EAGAIN round stopped. */
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = true;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		/* Nothing newer than the oldest generation: we're done. */
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leafs from path->nodes[1], so set lowest_level to 1 to avoid later
	 * a deadlock (attempting to write lock an already write locked leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	ASSERT(path->locks[1] != 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks == true,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		/* More work remains: save progress and ask to be called again. */
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		/* Keep -EAGAIN only while progress is still before defrag_max. */
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag a given btree. Every leaf in the btree is read and defragmented.
 */
int btrfs_defrag_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	/* Only one defrag of a given root at a time. */
	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
		return 0;

	while (1) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		/* Returns -EAGAIN while there are more leaves to process. */
		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		cond_resched();

		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
			break;

		if (btrfs_defrag_cancelled(fs_info)) {
			btrfs_debug(fs_info, "defrag_root cancelled");
			ret = -EAGAIN;
			break;
		}
	}
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
	return ret;
}

/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than given
 *   minimal generation threshold (@newer_than).
 *
 * Return valid em if we find a file extent matching the requirement.
 * Return NULL if we can not find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
606 */ 607 static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, 608 u64 start, u64 newer_than) 609 { 610 struct btrfs_root *root = inode->root; 611 struct btrfs_file_extent_item *fi; 612 BTRFS_PATH_AUTO_RELEASE(path); 613 struct extent_map *em; 614 struct btrfs_key key; 615 u64 ino = btrfs_ino(inode); 616 int ret; 617 618 em = btrfs_alloc_extent_map(); 619 if (!em) { 620 ret = -ENOMEM; 621 goto err; 622 } 623 624 key.objectid = ino; 625 key.type = BTRFS_EXTENT_DATA_KEY; 626 key.offset = start; 627 628 if (newer_than) { 629 ret = btrfs_search_forward(root, &key, &path, newer_than); 630 if (ret < 0) 631 goto err; 632 /* Can't find anything newer */ 633 if (ret > 0) 634 goto not_found; 635 } else { 636 ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); 637 if (ret < 0) 638 goto err; 639 } 640 if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { 641 /* 642 * If btrfs_search_slot() makes path to point beyond nritems, 643 * we should not have an empty leaf, as this inode must at 644 * least have its INODE_ITEM. 
645 */ 646 ASSERT(btrfs_header_nritems(path.nodes[0])); 647 path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; 648 } 649 btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); 650 /* Perfect match, no need to go one slot back */ 651 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && 652 key.offset == start) 653 goto iterate; 654 655 /* We didn't find a perfect match, needs to go one slot back */ 656 if (path.slots[0] > 0) { 657 btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); 658 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) 659 path.slots[0]--; 660 } 661 662 iterate: 663 /* Iterate through the path to find a file extent covering @start */ 664 while (true) { 665 u64 extent_end; 666 667 if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) 668 goto next; 669 670 btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); 671 672 /* 673 * We may go one slot back to INODE_REF/XATTR item, then 674 * need to go forward until we reach an EXTENT_DATA. 675 * But we should still has the correct ino as key.objectid. 676 */ 677 if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) 678 goto next; 679 680 /* It's beyond our target range, definitely not extent found */ 681 if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) 682 goto not_found; 683 684 /* 685 * | |<- File extent ->| 686 * \- start 687 * 688 * This means there is a hole between start and key.offset. 689 */ 690 if (key.offset > start) { 691 em->start = start; 692 em->disk_bytenr = EXTENT_MAP_HOLE; 693 em->disk_num_bytes = 0; 694 em->ram_bytes = 0; 695 em->offset = 0; 696 em->len = key.offset - start; 697 break; 698 } 699 700 fi = btrfs_item_ptr(path.nodes[0], path.slots[0], 701 struct btrfs_file_extent_item); 702 extent_end = btrfs_file_extent_end(&path); 703 704 /* 705 * |<- file extent ->| | 706 * \- start 707 * 708 * We haven't reached start, search next slot. 
709 */ 710 if (extent_end <= start) 711 goto next; 712 713 /* Now this extent covers @start, convert it to em */ 714 btrfs_extent_item_to_extent_map(inode, &path, fi, em); 715 break; 716 next: 717 ret = btrfs_next_item(root, &path); 718 if (ret < 0) 719 goto err; 720 if (ret > 0) 721 goto not_found; 722 } 723 return em; 724 725 not_found: 726 btrfs_free_extent_map(em); 727 return NULL; 728 729 err: 730 btrfs_free_extent_map(em); 731 return ERR_PTR(ret); 732 } 733 734 static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, 735 u64 newer_than, bool locked) 736 { 737 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 738 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 739 struct extent_map *em; 740 const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; 741 742 /* 743 * Hopefully we have this extent in the tree already, try without the 744 * full extent lock. 745 */ 746 read_lock(&em_tree->lock); 747 em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize); 748 read_unlock(&em_tree->lock); 749 750 /* 751 * We can get a merged extent, in that case, we need to re-search 752 * tree to get the original em for defrag. 753 * 754 * This is because even if we have adjacent extents that are contiguous 755 * and compatible (same type and flags), we still want to defrag them 756 * so that we use less metadata (extent items in the extent tree and 757 * file extent items in the inode's subvolume tree). 758 */ 759 if (em && (em->flags & EXTENT_FLAG_MERGED)) { 760 btrfs_free_extent_map(em); 761 em = NULL; 762 } 763 764 if (!em) { 765 struct extent_state *cached = NULL; 766 u64 end = start + sectorsize - 1; 767 768 /* Get the big lock and read metadata off disk. 
*/ 769 if (!locked) 770 btrfs_lock_extent(io_tree, start, end, &cached); 771 em = defrag_get_extent(BTRFS_I(inode), start, newer_than); 772 if (!locked) 773 btrfs_unlock_extent(io_tree, start, end, &cached); 774 775 if (IS_ERR(em)) 776 return NULL; 777 } 778 779 return em; 780 } 781 782 static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, 783 const struct extent_map *em) 784 { 785 if (btrfs_extent_map_is_compressed(em)) 786 return BTRFS_MAX_COMPRESSED; 787 return fs_info->max_extent_size; 788 } 789 790 static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, 791 u32 extent_thresh, u64 newer_than, bool locked) 792 { 793 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 794 struct extent_map *next; 795 const u64 em_end = btrfs_extent_map_end(em); 796 bool ret = false; 797 798 /* This is the last extent */ 799 if (em_end >= i_size_read(inode)) 800 return false; 801 802 /* 803 * Here we need to pass @newer_then when checking the next extent, or 804 * we will hit a case we mark current extent for defrag, but the next 805 * one will not be a target. 806 * This will just cause extra IO without really reducing the fragments. 807 */ 808 next = defrag_lookup_extent(inode, em_end, newer_than, locked); 809 /* No more em or hole */ 810 if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE) 811 goto out; 812 if (next->flags & EXTENT_FLAG_PREALLOC) 813 goto out; 814 /* 815 * If the next extent is at its max capacity, defragging current extent 816 * makes no sense, as the total number of extents won't change. 817 */ 818 if (next->len >= get_extent_max_capacity(fs_info, em)) 819 goto out; 820 /* Skip older extent */ 821 if (next->generation < newer_than) 822 goto out; 823 /* Also check extent size */ 824 if (next->len >= extent_thresh) 825 goto out; 826 827 ret = true; 828 out: 829 btrfs_free_extent_map(next); 830 return ret; 831 } 832 833 /* 834 * Prepare one page to be defragged. 
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared, here we don't do writeback wait for each page.
 */
static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 lock_start;
	u64 lock_end;
	struct extent_state *cached_state = NULL;
	struct folio *folio;
	int ret;

again:
	/* TODO: Add order fgp order flags when large folios are fully enabled. */
	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
	if (IS_ERR(folio))
		return folio;

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
	 *
	 * The IO for such large folios is not fully tested, thus return
	 * an error to reject such folios unless it's an experimental build.
	 *
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(ret);
	}

	lock_start = folio_pos(folio);
	lock_end = folio_next_pos(folio) - 1;
	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio));
		btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
		if (!ordered)
			break;

		/* Must drop the folio lock before waiting on the ordered extent. */
		folio_unlock(folio);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		folio_lock(folio);
		/*
		 * We unlocked the folio above, so we need check if it was
		 * released or not.
		 */
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
	if (!folio_test_uptodate(folio)) {
		btrfs_read_folio(NULL, folio);
		folio_lock(folio);
		/* Re-check: the folio may have been truncated while unlocked. */
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
		if (unlikely(!folio_test_uptodate(folio))) {
			folio_unlock(folio);
			folio_put(folio);
			return ERR_PTR(-EIO);
		}
	}
	return folio;
}

/* A contiguous file range selected for defragmentation. */
struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression or no-compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the range has already held extent lock
 * @target_list:   list of targets file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if it will cause a regular extent.
		 * This is for users who want to convert inline extents to
		 * regular ones through max_inline= mount option.
		 */
		if (em->disk_bytenr == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip holes and preallocated extents. */
		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
		    (em->flags & EXTENT_FLAG_PREALLOC))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there's dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
						EXTENT_DELALLOC))
			goto next;

		/*
		 * For do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (em->len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at its max capacity, this is mostly for
		 * compressed extents, which max cap is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and not defragged.
		 * So if an inline extent passed all above checks, just add it
		 * for defrag, and be converted to regular extents.
		 */
		if (em->disk_bytenr == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_last_entry(target_list,
					       struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		/* Clamp the range to the requested [start, start + len) window. */
		range_len = min(btrfs_extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_last_entry(target_list,
					       struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			btrfs_free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = btrfs_extent_map_end(em);
		btrfs_free_extent_map(em);
	}
	if (ret < 0) {
		/* On error, hand back an empty list and free what we collected. */
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

#define CLUSTER_SIZE	(SZ_256K)
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));

/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @pages:	locked pages covering the defrag range
 * @nr_pages:	number of locked pages
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 *
 * Return: 0 on success, or the error from delalloc space reservation.
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct folio **folios, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	int ret = 0;

	/* Reserve data (and metadata) space for the range before dirtying it. */
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	/*
	 * Reset any stale delalloc/accounting/defrag bits, then re-mark the
	 * range delalloc + defrag so writeback will rewrite it contiguously.
	 */
	btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1,
			       EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			       EXTENT_DEFRAG, cached_state);
	btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1,
			     EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);

	/*
	 * Update the page status.
	 * Due to possible large folios, we have to check all folios one by one.
	 */
	for (int i = 0; i < nr_pages && folios[i]; i++) {
		struct folio *folio = folios[i];

		/* NOTE(review): redundant with the loop condition above. */
		if (!folio)
			break;
		/* Skip folios entirely outside [start, start + len). */
		if (start >= folio_next_pos(folio) ||
		    start + len <= folio_pos(folio))
			continue;
		btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
		btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
	}
	/*
	 * NOTE(review): presumably this drops the outstanding-extent metadata
	 * reservation taken above while the dirtied range keeps its data
	 * reservation for writeback — confirm against
	 * btrfs_delalloc_reserve_space() semantics.
	 */
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

/*
 * Defrag all target extents inside the range [@start, @start + @len).
 *
 * Prepares and locks the covering folios, locks the extent range, re-collects
 * the defrag targets under the lock (so the extent map view is consistent),
 * and marks each target for delalloc writeback via defrag_one_locked_target().
 *
 * Return: 0 on success, <0 on error (e.g. -ENOMEM, or folio preparation
 * failure).
 */
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct folio **folios;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 cur = start;
	/* Number of pages covering [start, start + len), inclusive of both ends. */
	const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) -
				      (start >> PAGE_SHIFT) + 1;
	int ret = 0;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
	if (!folios)
		return -ENOMEM;

	/* Prepare all pages */
	for (int i = 0; cur < start + len && i < nr_pages; i++) {
		folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT);
		if (IS_ERR(folios[i])) {
			ret = PTR_ERR(folios[i]);
			/* NULL-terminate so the cleanup loops stop here. */
			folios[i] = NULL;
			goto free_folios;
		}
		/* Large folios may cover more than one page; skip past them. */
		cur = folio_next_pos(folios[i]);
	}
	for (int i = 0; i < nr_pages; i++) {
		if (!folios[i])
			break;
		folio_wait_writeback(folios[i]);
	}

	/* We should get at least one folio. */
	ASSERT(folios[0]);
	/* Lock the pages range */
	btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
	/*
	 * Now we have a consistent view about the extent map, re-check
	 * which range really needs to be defragged.
	 *
	 * And this time we have extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	/* Free the target list whether or not the loop above failed midway. */
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	/* Unlock exactly the range locked above. */
	btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
free_folios:
	for (int i = 0; i < nr_pages; i++) {
		if (!folios[i])
			break;
		folio_unlock(folios[i]);
		folio_put(folios[i]);
	}
	kfree(folios);
	return ret;
}

/*
 * Defrag one cluster starting at @start, at most @len bytes.
 *
 * Collects candidate ranges without the extent lock held (cheap first pass),
 * then defrags each candidate via defrag_one_range(), honoring the
 * @max_sectors quota and updating @sectors_defragged / @last_scanned_ret.
 *
 * Return: 0 on success, 1 if the @max_sectors quota was reached, <0 on error.
 */
static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		/* Clamp this range so we never exceed the remaining quota. */
		if (max_sectors)
			range_len = min_t(u32, range_len,
					  (max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		page_cache_sync_readahead(inode->vfs_inode.i_mapping,
					  ra, NULL, entry->start >> PAGE_SHIFT,
					  ((entry->start + range_len - 1) >> PAGE_SHIFT) -
					  (entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}

/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be updated
 * to indicate the file offset where next defrag should be started at.
 * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
 * defragging all the range).
1355 */ 1356 int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, 1357 struct btrfs_ioctl_defrag_range_args *range, 1358 u64 newer_than, unsigned long max_to_defrag) 1359 { 1360 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1361 unsigned long sectors_defragged = 0; 1362 u64 isize = i_size_read(&inode->vfs_inode); 1363 u64 cur; 1364 u64 last_byte; 1365 bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); 1366 bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS); 1367 int compress_type = BTRFS_COMPRESS_ZLIB; 1368 int compress_level = 0; 1369 int ret = 0; 1370 u32 extent_thresh = range->extent_thresh; 1371 pgoff_t start_index; 1372 1373 ASSERT(ra); 1374 1375 if (isize == 0) 1376 return 0; 1377 1378 if (range->start >= isize) 1379 return -EINVAL; 1380 1381 if (do_compress) { 1382 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) { 1383 if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES) 1384 return -EINVAL; 1385 if (range->compress.type) { 1386 compress_type = range->compress.type; 1387 compress_level = range->compress.level; 1388 if (!btrfs_compress_level_valid(compress_type, compress_level)) 1389 return -EINVAL; 1390 } 1391 } else { 1392 if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) 1393 return -EINVAL; 1394 if (range->compress_type) 1395 compress_type = range->compress_type; 1396 } 1397 } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) { 1398 compress_type = BTRFS_DEFRAG_DONT_COMPRESS; 1399 compress_level = 1; 1400 } 1401 1402 if (extent_thresh == 0) 1403 extent_thresh = SZ_256K; 1404 1405 if (range->start + range->len > range->start) { 1406 /* Got a specific range */ 1407 last_byte = min(isize, range->start + range->len); 1408 } else { 1409 /* Defrag until file end */ 1410 last_byte = isize; 1411 } 1412 1413 /* Align the range */ 1414 cur = round_down(range->start, fs_info->sectorsize); 1415 last_byte = round_up(last_byte, fs_info->sectorsize) - 1; 1416 1417 /* 1418 * Make writeback start 
from the beginning of the range, so that the 1419 * defrag range can be written sequentially. 1420 */ 1421 start_index = cur >> PAGE_SHIFT; 1422 if (start_index < inode->vfs_inode.i_mapping->writeback_index) 1423 inode->vfs_inode.i_mapping->writeback_index = start_index; 1424 1425 while (cur < last_byte) { 1426 const unsigned long prev_sectors_defragged = sectors_defragged; 1427 u64 last_scanned = cur; 1428 u64 cluster_end; 1429 1430 if (btrfs_defrag_cancelled(fs_info)) { 1431 ret = -EAGAIN; 1432 break; 1433 } 1434 1435 /* We want the cluster end at page boundary when possible */ 1436 cluster_end = (((cur >> PAGE_SHIFT) + 1437 (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; 1438 cluster_end = min(cluster_end, last_byte); 1439 1440 btrfs_inode_lock(inode, 0); 1441 if (IS_SWAPFILE(&inode->vfs_inode)) { 1442 ret = -ETXTBSY; 1443 btrfs_inode_unlock(inode, 0); 1444 break; 1445 } 1446 if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) { 1447 btrfs_inode_unlock(inode, 0); 1448 break; 1449 } 1450 if (do_compress || no_compress) { 1451 inode->defrag_compress = compress_type; 1452 inode->defrag_compress_level = compress_level; 1453 } 1454 ret = defrag_one_cluster(inode, ra, cur, 1455 cluster_end + 1 - cur, extent_thresh, 1456 newer_than, do_compress || no_compress, 1457 §ors_defragged, 1458 max_to_defrag, &last_scanned); 1459 1460 if (sectors_defragged > prev_sectors_defragged) 1461 balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping); 1462 1463 btrfs_inode_unlock(inode, 0); 1464 if (ret < 0) 1465 break; 1466 cur = max(cluster_end + 1, last_scanned); 1467 if (ret > 0) { 1468 ret = 0; 1469 break; 1470 } 1471 cond_resched(); 1472 } 1473 1474 /* 1475 * Update range.start for autodefrag, this will indicate where to start 1476 * in next run. 1477 */ 1478 range->start = cur; 1479 if (sectors_defragged) { 1480 /* 1481 * We have defragged some sectors, for compression case they 1482 * need to be written back immediately. 
1483 */ 1484 if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { 1485 filemap_flush(inode->vfs_inode.i_mapping); 1486 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1487 &inode->runtime_flags)) 1488 filemap_flush(inode->vfs_inode.i_mapping); 1489 } 1490 if (range->compress_type == BTRFS_COMPRESS_LZO) 1491 btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); 1492 else if (range->compress_type == BTRFS_COMPRESS_ZSTD) 1493 btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); 1494 ret = sectors_defragged; 1495 } 1496 if (do_compress || no_compress) { 1497 btrfs_inode_lock(inode, 0); 1498 inode->defrag_compress = BTRFS_COMPRESS_NONE; 1499 btrfs_inode_unlock(inode, 0); 1500 } 1501 return ret; 1502 } 1503 1504 void __cold btrfs_auto_defrag_exit(void) 1505 { 1506 kmem_cache_destroy(btrfs_inode_defrag_cachep); 1507 } 1508 1509 int __init btrfs_auto_defrag_init(void) 1510 { 1511 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", 1512 sizeof(struct inode_defrag), 0, 0, NULL); 1513 if (!btrfs_inode_defrag_cachep) 1514 return -ENOMEM; 1515 1516 return 0; 1517 } 1518