1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include "ctree.h" 8 #include "disk-io.h" 9 #include "transaction.h" 10 #include "locking.h" 11 #include "accessors.h" 12 #include "messages.h" 13 #include "delalloc-space.h" 14 #include "subpage.h" 15 #include "defrag.h" 16 #include "file-item.h" 17 #include "super.h" 18 #include "compression.h" 19 20 static struct kmem_cache *btrfs_inode_defrag_cachep; 21 22 /* 23 * When auto defrag is enabled we queue up these defrag structs to remember 24 * which inodes need defragging passes. 25 */ 26 struct inode_defrag { 27 struct rb_node rb_node; 28 /* Inode number */ 29 u64 ino; 30 /* 31 * Transid where the defrag was added, we search for extents newer than 32 * this. 33 */ 34 u64 transid; 35 36 /* Root objectid */ 37 u64 root; 38 39 /* 40 * The extent size threshold for autodefrag. 41 * 42 * This value is different for compressed/non-compressed extents, thus 43 * needs to be passed from higher layer. 44 * (aka, inode_should_defrag()) 45 */ 46 u32 extent_thresh; 47 }; 48 49 static int compare_inode_defrag(const struct inode_defrag *defrag1, 50 const struct inode_defrag *defrag2) 51 { 52 if (defrag1->root > defrag2->root) 53 return 1; 54 else if (defrag1->root < defrag2->root) 55 return -1; 56 else if (defrag1->ino > defrag2->ino) 57 return 1; 58 else if (defrag1->ino < defrag2->ino) 59 return -1; 60 else 61 return 0; 62 } 63 64 static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing) 65 { 66 const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node); 67 const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node); 68 69 return compare_inode_defrag(new_defrag, existing_defrag); 70 } 71 72 /* 73 * Insert a record for an inode into the defrag tree. The lock must be held 74 * already. 75 * 76 * If you're inserting a record for an older transid than an existing record, 77 * the transid already in the tree is lowered. 78 */ 79 static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, 80 struct inode_defrag *defrag) 81 { 82 struct btrfs_fs_info *fs_info = inode->root->fs_info; 83 struct rb_node *node; 84 85 node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp); 86 if (node) { 87 struct inode_defrag *entry; 88 89 entry = rb_entry(node, struct inode_defrag, rb_node); 90 /* 91 * If we're reinserting an entry for an old defrag run, make 92 * sure to lower the transid of our existing record. 93 */ 94 if (defrag->transid < entry->transid) 95 entry->transid = defrag->transid; 96 entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh); 97 return -EEXIST; 98 } 99 set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); 100 return 0; 101 } 102 103 static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info) 104 { 105 if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) 106 return false; 107 108 if (btrfs_fs_closing(fs_info)) 109 return false; 110 111 return true; 112 } 113 114 /* 115 * Insert a defrag record for this inode if auto defrag is enabled. No errors 116 * returned as they're not considered fatal. 117 */ 118 void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh) 119 { 120 struct btrfs_root *root = inode->root; 121 struct btrfs_fs_info *fs_info = root->fs_info; 122 struct inode_defrag *defrag; 123 int ret; 124 125 if (!need_auto_defrag(fs_info)) 126 return; 127 128 if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) 129 return; 130 131 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); 132 if (!defrag) 133 return; 134 135 defrag->ino = btrfs_ino(inode); 136 defrag->transid = btrfs_get_root_last_trans(root); 137 defrag->root = btrfs_root_id(root); 138 defrag->extent_thresh = extent_thresh; 139 140 spin_lock(&fs_info->defrag_inodes_lock); 141 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { 142 /* 143 * If we set IN_DEFRAG flag and evict the inode from memory, 144 * and then re-read this inode, this new inode doesn't have 145 * IN_DEFRAG flag. At the case, we may find the existed defrag. 146 */ 147 ret = btrfs_insert_inode_defrag(inode, defrag); 148 if (ret) 149 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 150 } else { 151 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 152 } 153 spin_unlock(&fs_info->defrag_inodes_lock); 154 } 155 156 /* 157 * Pick the defraggable inode that we want, if it doesn't exist, we will get the 158 * next one. 159 */ 160 static struct inode_defrag *btrfs_pick_defrag_inode( 161 struct btrfs_fs_info *fs_info, u64 root, u64 ino) 162 { 163 struct inode_defrag *entry = NULL; 164 struct inode_defrag tmp; 165 struct rb_node *p; 166 struct rb_node *parent = NULL; 167 int ret; 168 169 tmp.ino = ino; 170 tmp.root = root; 171 172 spin_lock(&fs_info->defrag_inodes_lock); 173 p = fs_info->defrag_inodes.rb_node; 174 while (p) { 175 parent = p; 176 entry = rb_entry(parent, struct inode_defrag, rb_node); 177 178 ret = compare_inode_defrag(&tmp, entry); 179 if (ret < 0) 180 p = parent->rb_left; 181 else if (ret > 0) 182 p = parent->rb_right; 183 else 184 goto out; 185 } 186 187 if (parent && compare_inode_defrag(&tmp, entry) > 0) { 188 parent = rb_next(parent); 189 entry = rb_entry_safe(parent, struct inode_defrag, rb_node); 190 } 191 out: 192 if (entry) 193 rb_erase(parent, &fs_info->defrag_inodes); 194 spin_unlock(&fs_info->defrag_inodes_lock); 195 return entry; 196 } 197 198 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) 199 { 200 struct inode_defrag *defrag, *next; 201 202 spin_lock(&fs_info->defrag_inodes_lock); 203 204 rbtree_postorder_for_each_entry_safe(defrag, next, 205 &fs_info->defrag_inodes, rb_node) 206 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 207 208 fs_info->defrag_inodes = RB_ROOT; 209 210 spin_unlock(&fs_info->defrag_inodes_lock); 211 } 212 213 #define BTRFS_DEFRAG_BATCH 1024 214 215 static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, 216 struct inode_defrag *defrag, 217 struct file_ra_state *ra) 218 { 219 struct btrfs_root *inode_root; 220 struct btrfs_inode *inode; 221 struct btrfs_ioctl_defrag_range_args range; 222 int ret = 0; 223 u64 cur = 0; 224 225 again: 226 if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) 227 goto cleanup; 228 if (!need_auto_defrag(fs_info)) 229 goto cleanup; 230 231 /* Get the inode */ 232 inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); 233 if (IS_ERR(inode_root)) { 234 ret = PTR_ERR(inode_root); 235 goto cleanup; 236 } 237 238 inode = btrfs_iget(defrag->ino, inode_root); 239 btrfs_put_root(inode_root); 240 if (IS_ERR(inode)) { 241 ret = PTR_ERR(inode); 242 goto cleanup; 243 } 244 245 if (cur >= i_size_read(&inode->vfs_inode)) { 246 iput(&inode->vfs_inode); 247 goto cleanup; 248 } 249 250 /* Do a chunk of defrag */ 251 clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); 252 memset(&range, 0, sizeof(range)); 253 range.len = (u64)-1; 254 range.start = cur; 255 range.extent_thresh = defrag->extent_thresh; 256 file_ra_state_init(ra, inode->vfs_inode.i_mapping); 257 258 scoped_guard(super_write, fs_info->sb) 259 ret = btrfs_defrag_file(inode, ra, &range, 260 defrag->transid, BTRFS_DEFRAG_BATCH); 261 iput(&inode->vfs_inode); 262 263 if (ret < 0) 264 goto cleanup; 265 266 cur = max(cur + fs_info->sectorsize, range.start); 267 goto again; 268 269 cleanup: 270 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 271 return ret; 272 } 273 274 /* 275 * Run through the list of inodes in the FS that need defragging. 276 */ 277 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) 278 { 279 struct inode_defrag *defrag; 280 u64 first_ino = 0; 281 u64 root_objectid = 0; 282 283 atomic_inc(&fs_info->defrag_running); 284 while (1) { 285 struct file_ra_state ra = { 0 }; 286 287 /* Pause the auto defragger. */ 288 if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) 289 break; 290 291 if (!need_auto_defrag(fs_info)) 292 break; 293 294 /* find an inode to defrag */ 295 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino); 296 if (!defrag) { 297 if (root_objectid || first_ino) { 298 root_objectid = 0; 299 first_ino = 0; 300 continue; 301 } else { 302 break; 303 } 304 } 305 306 first_ino = defrag->ino + 1; 307 root_objectid = defrag->root; 308 309 btrfs_run_defrag_inode(fs_info, defrag, &ra); 310 } 311 atomic_dec(&fs_info->defrag_running); 312 313 /* 314 * During unmount, we use the transaction_wait queue to wait for the 315 * defragger to stop. 316 */ 317 wake_up(&fs_info->transaction_wait); 318 return 0; 319 } 320 321 /* 322 * Check if two blocks addresses are close, used by defrag. 323 */ 324 static bool close_blocks(u64 blocknr, u64 other, u32 blocksize) 325 { 326 if (blocknr < other && other - (blocknr + blocksize) < SZ_32K) 327 return true; 328 if (blocknr > other && blocknr - (other + blocksize) < SZ_32K) 329 return true; 330 return false; 331 } 332 333 /* 334 * Go through all the leaves pointed to by a node and reallocate them so that 335 * disk order is close to key order. 336 */ 337 static int btrfs_realloc_node(struct btrfs_trans_handle *trans, 338 struct btrfs_root *root, 339 struct extent_buffer *parent, 340 int start_slot, u64 *last_ret, 341 struct btrfs_key *progress) 342 { 343 struct btrfs_fs_info *fs_info = root->fs_info; 344 const u32 blocksize = fs_info->nodesize; 345 const int end_slot = btrfs_header_nritems(parent) - 1; 346 u64 search_start = *last_ret; 347 u64 last_block = 0; 348 int ret = 0; 349 bool progress_passed = false; 350 351 /* 352 * COWing must happen through a running transaction, which always 353 * matches the current fs generation (it's a transaction with a state 354 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs 355 * into error state to prevent the commit of any transaction. 356 */ 357 if (unlikely(trans->transaction != fs_info->running_transaction || 358 trans->transid != fs_info->generation)) { 359 btrfs_abort_transaction(trans, -EUCLEAN); 360 btrfs_crit(fs_info, 361 "unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu", 362 parent->start, btrfs_root_id(root), trans->transid, 363 fs_info->running_transaction->transid, 364 fs_info->generation); 365 return -EUCLEAN; 366 } 367 368 if (btrfs_header_nritems(parent) <= 1) 369 return 0; 370 371 for (int i = start_slot; i <= end_slot; i++) { 372 struct extent_buffer *cur; 373 struct btrfs_disk_key disk_key; 374 u64 blocknr; 375 u64 other; 376 bool close = true; 377 378 btrfs_node_key(parent, &disk_key, i); 379 if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0) 380 continue; 381 382 progress_passed = true; 383 blocknr = btrfs_node_blockptr(parent, i); 384 if (last_block == 0) 385 last_block = blocknr; 386 387 if (i > 0) { 388 other = btrfs_node_blockptr(parent, i - 1); 389 close = close_blocks(blocknr, other, blocksize); 390 } 391 if (!close && i < end_slot) { 392 other = btrfs_node_blockptr(parent, i + 1); 393 close = close_blocks(blocknr, other, blocksize); 394 } 395 if (close) { 396 last_block = blocknr; 397 continue; 398 } 399 400 cur = btrfs_read_node_slot(parent, i); 401 if (IS_ERR(cur)) 402 return PTR_ERR(cur); 403 if (search_start == 0) 404 search_start = last_block; 405 406 btrfs_tree_lock(cur); 407 ret = btrfs_force_cow_block(trans, root, cur, parent, i, 408 &cur, search_start, 409 min(16 * blocksize, 410 (end_slot - i) * blocksize), 411 BTRFS_NESTING_COW); 412 if (ret) { 413 btrfs_tree_unlock(cur); 414 free_extent_buffer(cur); 415 break; 416 } 417 search_start = cur->start; 418 last_block = cur->start; 419 *last_ret = search_start; 420 btrfs_tree_unlock(cur); 421 free_extent_buffer(cur); 422 } 423 return ret; 424 } 425 426 /* 427 * Defrag all the leaves in a given btree. 428 * Read all the leaves and try to get key order to 429 * better reflect disk order 430 */ 431 432 static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 433 struct btrfs_root *root) 434 { 435 struct btrfs_path *path = NULL; 436 struct btrfs_key key; 437 int ret = 0; 438 int wret; 439 int level; 440 int next_key_ret = 0; 441 u64 last_ret = 0; 442 443 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 444 goto out; 445 446 path = btrfs_alloc_path(); 447 if (!path) { 448 ret = -ENOMEM; 449 goto out; 450 } 451 452 level = btrfs_header_level(root->node); 453 454 if (level == 0) 455 goto out; 456 457 if (root->defrag_progress.objectid == 0) { 458 struct extent_buffer *root_node; 459 u32 nritems; 460 461 root_node = btrfs_lock_root_node(root); 462 nritems = btrfs_header_nritems(root_node); 463 root->defrag_max.objectid = 0; 464 /* from above we know this is not a leaf */ 465 btrfs_node_key_to_cpu(root_node, &root->defrag_max, 466 nritems - 1); 467 btrfs_tree_unlock(root_node); 468 free_extent_buffer(root_node); 469 memset(&key, 0, sizeof(key)); 470 } else { 471 memcpy(&key, &root->defrag_progress, sizeof(key)); 472 } 473 474 path->keep_locks = true; 475 476 ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); 477 if (ret < 0) 478 goto out; 479 if (ret > 0) { 480 ret = 0; 481 goto out; 482 } 483 btrfs_release_path(path); 484 /* 485 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all 486 * leafs from path->nodes[1], so set lowest_level to 1 to avoid later 487 * a deadlock (attempting to write lock an already write locked leaf). 488 */ 489 path->lowest_level = 1; 490 wret = btrfs_search_slot(trans, root, &key, path, 0, 1); 491 492 if (wret < 0) { 493 ret = wret; 494 goto out; 495 } 496 if (!path->nodes[1]) { 497 ret = 0; 498 goto out; 499 } 500 /* 501 * The node at level 1 must always be locked when our path has 502 * keep_locks set and lowest_level is 1, regardless of the value of 503 * path->slots[1]. 504 */ 505 ASSERT(path->locks[1] != 0); 506 ret = btrfs_realloc_node(trans, root, 507 path->nodes[1], 0, 508 &last_ret, 509 &root->defrag_progress); 510 if (ret) { 511 WARN_ON(ret == -EAGAIN); 512 goto out; 513 } 514 /* 515 * Now that we reallocated the node we can find the next key. Note that 516 * btrfs_find_next_key() can release our path and do another search 517 * without COWing, this is because even with path->keep_locks == true, 518 * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a 519 * node when path->slots[node_level - 1] does not point to the last 520 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore 521 * we search for the next key after reallocating our node. 522 */ 523 path->slots[1] = btrfs_header_nritems(path->nodes[1]); 524 next_key_ret = btrfs_find_next_key(root, path, &key, 1, 525 BTRFS_OLDEST_GENERATION); 526 if (next_key_ret == 0) { 527 memcpy(&root->defrag_progress, &key, sizeof(key)); 528 ret = -EAGAIN; 529 } 530 out: 531 btrfs_free_path(path); 532 if (ret == -EAGAIN) { 533 if (root->defrag_max.objectid > root->defrag_progress.objectid) 534 goto done; 535 if (root->defrag_max.type > root->defrag_progress.type) 536 goto done; 537 if (root->defrag_max.offset > root->defrag_progress.offset) 538 goto done; 539 ret = 0; 540 } 541 done: 542 if (ret != -EAGAIN) 543 memset(&root->defrag_progress, 0, 544 sizeof(root->defrag_progress)); 545 546 return ret; 547 } 548 549 /* 550 * Defrag a given btree. Every leaf in the btree is read and defragmented. 551 */ 552 int btrfs_defrag_root(struct btrfs_root *root) 553 { 554 struct btrfs_fs_info *fs_info = root->fs_info; 555 int ret; 556 557 if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) 558 return 0; 559 560 while (1) { 561 struct btrfs_trans_handle *trans; 562 563 trans = btrfs_start_transaction(root, 0); 564 if (IS_ERR(trans)) { 565 ret = PTR_ERR(trans); 566 break; 567 } 568 569 ret = btrfs_defrag_leaves(trans, root); 570 571 btrfs_end_transaction(trans); 572 btrfs_btree_balance_dirty(fs_info); 573 cond_resched(); 574 575 if (btrfs_fs_closing(fs_info) || ret != -EAGAIN) 576 break; 577 578 if (btrfs_defrag_cancelled(fs_info)) { 579 btrfs_debug(fs_info, "defrag_root cancelled"); 580 ret = -EAGAIN; 581 break; 582 } 583 } 584 clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); 585 return ret; 586 } 587 588 /* 589 * Defrag specific helper to get an extent map. 590 * 591 * Differences between this and btrfs_get_extent() are: 592 * 593 * - No extent_map will be added to inode->extent_tree 594 * To reduce memory usage in the long run. 595 * 596 * - Extra optimization to skip file extents older than @newer_than 597 * By using btrfs_search_forward() we can skip entire file ranges that 598 * have extents created in past transactions, because btrfs_search_forward() 599 * will not visit leaves and nodes with a generation smaller than given 600 * minimal generation threshold (@newer_than). 601 * 602 * Return valid em if we find a file extent matching the requirement. 603 * Return NULL if we can not find a file extent matching the requirement. 604 * 605 * Return ERR_PTR() for error. 606 */ 607 static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, 608 u64 start, u64 newer_than) 609 { 610 struct btrfs_root *root = inode->root; 611 struct btrfs_file_extent_item *fi; 612 BTRFS_PATH_AUTO_RELEASE(path); 613 struct extent_map *em; 614 struct btrfs_key key; 615 u64 ino = btrfs_ino(inode); 616 int ret; 617 618 em = btrfs_alloc_extent_map(); 619 if (!em) { 620 ret = -ENOMEM; 621 goto err; 622 } 623 624 key.objectid = ino; 625 key.type = BTRFS_EXTENT_DATA_KEY; 626 key.offset = start; 627 628 if (newer_than) { 629 ret = btrfs_search_forward(root, &key, &path, newer_than); 630 if (ret < 0) 631 goto err; 632 /* Can't find anything newer */ 633 if (ret > 0) 634 goto not_found; 635 } else { 636 ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); 637 if (ret < 0) 638 goto err; 639 } 640 if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { 641 /* 642 * If btrfs_search_slot() makes path to point beyond nritems, 643 * we should not have an empty leaf, as this inode must at 644 * least have its INODE_ITEM. 645 */ 646 ASSERT(btrfs_header_nritems(path.nodes[0])); 647 path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; 648 } 649 btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); 650 /* Perfect match, no need to go one slot back */ 651 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && 652 key.offset == start) 653 goto iterate; 654 655 /* We didn't find a perfect match, needs to go one slot back */ 656 if (path.slots[0] > 0) { 657 btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); 658 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) 659 path.slots[0]--; 660 } 661 662 iterate: 663 /* Iterate through the path to find a file extent covering @start */ 664 while (true) { 665 u64 extent_end; 666 667 if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) 668 goto next; 669 670 btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); 671 672 /* 673 * We may go one slot back to INODE_REF/XATTR item, then 674 * need to go forward until we reach an EXTENT_DATA. 675 * But we should still has the correct ino as key.objectid. 676 */ 677 if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) 678 goto next; 679 680 /* It's beyond our target range, definitely not extent found */ 681 if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) 682 goto not_found; 683 684 /* 685 * | |<- File extent ->| 686 * \- start 687 * 688 * This means there is a hole between start and key.offset. 689 */ 690 if (key.offset > start) { 691 em->start = start; 692 em->disk_bytenr = EXTENT_MAP_HOLE; 693 em->disk_num_bytes = 0; 694 em->ram_bytes = 0; 695 em->offset = 0; 696 em->len = key.offset - start; 697 break; 698 } 699 700 fi = btrfs_item_ptr(path.nodes[0], path.slots[0], 701 struct btrfs_file_extent_item); 702 extent_end = btrfs_file_extent_end(&path); 703 704 /* 705 * |<- file extent ->| | 706 * \- start 707 * 708 * We haven't reached start, search next slot. 709 */ 710 if (extent_end <= start) 711 goto next; 712 713 /* Now this extent covers @start, convert it to em */ 714 btrfs_extent_item_to_extent_map(inode, &path, fi, em); 715 break; 716 next: 717 ret = btrfs_next_item(root, &path); 718 if (ret < 0) 719 goto err; 720 if (ret > 0) 721 goto not_found; 722 } 723 return em; 724 725 not_found: 726 btrfs_free_extent_map(em); 727 return NULL; 728 729 err: 730 btrfs_free_extent_map(em); 731 return ERR_PTR(ret); 732 } 733 734 static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, 735 u64 newer_than, bool locked) 736 { 737 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 738 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 739 struct extent_map *em; 740 const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; 741 742 /* 743 * Hopefully we have this extent in the tree already, try without the 744 * full extent lock. 745 */ 746 read_lock(&em_tree->lock); 747 em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize); 748 read_unlock(&em_tree->lock); 749 750 /* 751 * We can get a merged extent, in that case, we need to re-search 752 * tree to get the original em for defrag. 753 * 754 * This is because even if we have adjacent extents that are contiguous 755 * and compatible (same type and flags), we still want to defrag them 756 * so that we use less metadata (extent items in the extent tree and 757 * file extent items in the inode's subvolume tree). 758 */ 759 if (em && (em->flags & EXTENT_FLAG_MERGED)) { 760 btrfs_free_extent_map(em); 761 em = NULL; 762 } 763 764 if (!em) { 765 struct extent_state *cached = NULL; 766 u64 end = start + sectorsize - 1; 767 768 /* Get the big lock and read metadata off disk. */ 769 if (!locked) 770 btrfs_lock_extent(io_tree, start, end, &cached); 771 em = defrag_get_extent(BTRFS_I(inode), start, newer_than); 772 if (!locked) 773 btrfs_unlock_extent(io_tree, start, end, &cached); 774 775 if (IS_ERR(em)) 776 return NULL; 777 } 778 779 return em; 780 } 781 782 static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, 783 const struct extent_map *em) 784 { 785 if (btrfs_extent_map_is_compressed(em)) 786 return BTRFS_MAX_COMPRESSED; 787 return fs_info->max_extent_size; 788 } 789 790 static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, 791 u32 extent_thresh, u64 newer_than, bool locked) 792 { 793 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 794 struct extent_map *next; 795 const u64 em_end = btrfs_extent_map_end(em); 796 bool ret = false; 797 798 /* This is the last extent */ 799 if (em_end >= i_size_read(inode)) 800 return false; 801 802 /* 803 * Here we need to pass @newer_then when checking the next extent, or 804 * we will hit a case we mark current extent for defrag, but the next 805 * one will not be a target. 806 * This will just cause extra IO without really reducing the fragments. 807 */ 808 next = defrag_lookup_extent(inode, em_end, newer_than, locked); 809 /* No more em or hole */ 810 if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE) 811 goto out; 812 if (next->flags & EXTENT_FLAG_PREALLOC) 813 goto out; 814 /* 815 * If the next extent is at its max capacity, defragging current extent 816 * makes no sense, as the total number of extents won't change. 817 */ 818 if (next->len >= get_extent_max_capacity(fs_info, em)) 819 goto out; 820 /* Skip older extent */ 821 if (next->generation < newer_than) 822 goto out; 823 /* Also check extent size */ 824 if (next->len >= extent_thresh) 825 goto out; 826 827 ret = true; 828 out: 829 btrfs_free_extent_map(next); 830 return ret; 831 } 832 833 /* 834 * Prepare one page to be defragged. 835 * 836 * This will ensure: 837 * 838 * - Returned page is locked and has been set up properly. 839 * - No ordered extent exists in the page. 840 * - The page is uptodate. 841 * 842 * NOTE: Caller should also wait for page writeback after the cluster is 843 * prepared, here we don't do writeback wait for each page. 844 */ 845 static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index) 846 { 847 struct address_space *mapping = inode->vfs_inode.i_mapping; 848 gfp_t mask = btrfs_alloc_write_mask(mapping); 849 u64 lock_start; 850 u64 lock_end; 851 struct extent_state *cached_state = NULL; 852 struct folio *folio; 853 int ret; 854 855 again: 856 /* TODO: Add order fgp order flags when large folios are fully enabled. */ 857 folio = __filemap_get_folio(mapping, index, 858 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); 859 if (IS_ERR(folio)) 860 return folio; 861 862 ret = set_folio_extent_mapped(folio); 863 if (ret < 0) { 864 folio_unlock(folio); 865 folio_put(folio); 866 return ERR_PTR(ret); 867 } 868 869 lock_start = folio_pos(folio); 870 lock_end = folio_next_pos(folio) - 1; 871 /* Wait for any existing ordered extent in the range */ 872 while (1) { 873 struct btrfs_ordered_extent *ordered; 874 875 btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state); 876 ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio)); 877 btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state); 878 if (!ordered) 879 break; 880 881 folio_unlock(folio); 882 btrfs_start_ordered_extent(ordered); 883 btrfs_put_ordered_extent(ordered); 884 folio_lock(folio); 885 /* 886 * We unlocked the folio above, so we need check if it was 887 * released or not. 888 */ 889 if (folio->mapping != mapping || !folio->private) { 890 folio_unlock(folio); 891 folio_put(folio); 892 goto again; 893 } 894 } 895 896 /* 897 * Now the page range has no ordered extent any more. Read the page to 898 * make it uptodate. 899 */ 900 if (!folio_test_uptodate(folio)) { 901 btrfs_read_folio(NULL, folio); 902 folio_lock(folio); 903 if (folio->mapping != mapping || !folio->private) { 904 folio_unlock(folio); 905 folio_put(folio); 906 goto again; 907 } 908 if (unlikely(!folio_test_uptodate(folio))) { 909 folio_unlock(folio); 910 folio_put(folio); 911 return ERR_PTR(-EIO); 912 } 913 } 914 return folio; 915 } 916 917 struct defrag_target_range { 918 struct list_head list; 919 u64 start; 920 u64 len; 921 }; 922 923 /* 924 * Collect all valid target extents. 925 * 926 * @start: file offset to lookup 927 * @len: length to lookup 928 * @extent_thresh: file extent size threshold, any extent size >= this value 929 * will be ignored 930 * @newer_than: only defrag extents newer than this value 931 * @do_compress: whether the defrag is doing compression or no-compression 932 * if true, @extent_thresh will be ignored and all regular 933 * file extents meeting @newer_than will be targets. 934 * @locked: if the range has already held extent lock 935 * @target_list: list of targets file extents 936 */ 937 static int defrag_collect_targets(struct btrfs_inode *inode, 938 u64 start, u64 len, u32 extent_thresh, 939 u64 newer_than, bool do_compress, 940 bool locked, struct list_head *target_list, 941 u64 *last_scanned_ret) 942 { 943 struct btrfs_fs_info *fs_info = inode->root->fs_info; 944 bool last_is_target = false; 945 u64 cur = start; 946 int ret = 0; 947 948 while (cur < start + len) { 949 struct extent_map *em; 950 struct defrag_target_range *new; 951 bool next_mergeable = true; 952 u64 range_len; 953 954 last_is_target = false; 955 em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked); 956 if (!em) 957 break; 958 959 /* 960 * If the file extent is an inlined one, we may still want to 961 * defrag it (fallthrough) if it will cause a regular extent. 962 * This is for users who want to convert inline extents to 963 * regular ones through max_inline= mount option. 964 */ 965 if (em->disk_bytenr == EXTENT_MAP_INLINE && 966 em->len <= inode->root->fs_info->max_inline) 967 goto next; 968 969 /* Skip holes and preallocated extents. */ 970 if (em->disk_bytenr == EXTENT_MAP_HOLE || 971 (em->flags & EXTENT_FLAG_PREALLOC)) 972 goto next; 973 974 /* Skip older extent */ 975 if (em->generation < newer_than) 976 goto next; 977 978 /* This em is under writeback, no need to defrag */ 979 if (em->generation == (u64)-1) 980 goto next; 981 982 /* 983 * Our start offset might be in the middle of an existing extent 984 * map, so take that into account. 985 */ 986 range_len = em->len - (cur - em->start); 987 /* 988 * If this range of the extent map is already flagged for delalloc, 989 * skip it, because: 990 * 991 * 1) We could deadlock later, when trying to reserve space for 992 * delalloc, because in case we can't immediately reserve space 993 * the flusher can start delalloc and wait for the respective 994 * ordered extents to complete. The deadlock would happen 995 * because we do the space reservation while holding the range 996 * locked, and starting writeback, or finishing an ordered 997 * extent, requires locking the range; 998 * 999 * 2) If there's delalloc there, it means there's dirty pages for 1000 * which writeback has not started yet (we clean the delalloc 1001 * flag when starting writeback and after creating an ordered 1002 * extent). If we mark pages in an adjacent range for defrag, 1003 * then we will have a larger contiguous range for delalloc, 1004 * very likely resulting in a larger extent after writeback is 1005 * triggered (except in a case of free space fragmentation). 1006 */ 1007 if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, 1008 EXTENT_DELALLOC)) 1009 goto next; 1010 1011 /* 1012 * For do_compress case, we want to compress all valid file 1013 * extents, thus no @extent_thresh or mergeable check. 1014 */ 1015 if (do_compress) 1016 goto add; 1017 1018 /* Skip too large extent */ 1019 if (em->len >= extent_thresh) 1020 goto next; 1021 1022 /* 1023 * Skip extents already at its max capacity, this is mostly for 1024 * compressed extents, which max cap is only 128K. 1025 */ 1026 if (em->len >= get_extent_max_capacity(fs_info, em)) 1027 goto next; 1028 1029 /* 1030 * Normally there are no more extents after an inline one, thus 1031 * @next_mergeable will normally be false and not defragged. 1032 * So if an inline extent passed all above checks, just add it 1033 * for defrag, and be converted to regular extents. 1034 */ 1035 if (em->disk_bytenr == EXTENT_MAP_INLINE) 1036 goto add; 1037 1038 next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, 1039 extent_thresh, newer_than, locked); 1040 if (!next_mergeable) { 1041 struct defrag_target_range *last; 1042 1043 /* Empty target list, no way to merge with last entry */ 1044 if (list_empty(target_list)) 1045 goto next; 1046 last = list_last_entry(target_list, 1047 struct defrag_target_range, list); 1048 /* Not mergeable with last entry */ 1049 if (last->start + last->len != cur) 1050 goto next; 1051 1052 /* Mergeable, fall through to add it to @target_list. */ 1053 } 1054 1055 add: 1056 last_is_target = true; 1057 range_len = min(btrfs_extent_map_end(em), start + len) - cur; 1058 /* 1059 * This one is a good target, check if it can be merged into 1060 * last range of the target list. 1061 */ 1062 if (!list_empty(target_list)) { 1063 struct defrag_target_range *last; 1064 1065 last = list_last_entry(target_list, 1066 struct defrag_target_range, list); 1067 ASSERT(last->start + last->len <= cur); 1068 if (last->start + last->len == cur) { 1069 /* Mergeable, enlarge the last entry */ 1070 last->len += range_len; 1071 goto next; 1072 } 1073 /* Fall through to allocate a new entry */ 1074 } 1075 1076 /* Allocate new defrag_target_range */ 1077 new = kmalloc_obj(*new, GFP_NOFS); 1078 if (!new) { 1079 btrfs_free_extent_map(em); 1080 ret = -ENOMEM; 1081 break; 1082 } 1083 new->start = cur; 1084 new->len = range_len; 1085 list_add_tail(&new->list, target_list); 1086 1087 next: 1088 cur = btrfs_extent_map_end(em); 1089 btrfs_free_extent_map(em); 1090 } 1091 if (ret < 0) { 1092 struct defrag_target_range *entry; 1093 struct defrag_target_range *tmp; 1094 1095 list_for_each_entry_safe(entry, tmp, target_list, list) { 1096 list_del_init(&entry->list); 1097 kfree(entry); 1098 } 1099 } 1100 if (!ret && last_scanned_ret) { 1101 /* 1102 * If the last extent is not a target, the caller can skip to 1103 * the end of that extent. 1104 * Otherwise, we can only go the end of the specified range. 1105 */ 1106 if (!last_is_target) 1107 *last_scanned_ret = max(cur, *last_scanned_ret); 1108 else 1109 *last_scanned_ret = max(start + len, *last_scanned_ret); 1110 } 1111 return ret; 1112 } 1113 1114 #define CLUSTER_SIZE (SZ_256K) 1115 static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); 1116 1117 /* 1118 * Defrag one contiguous target range. 1119 * 1120 * @inode: target inode 1121 * @target: target range to defrag 1122 * @pages: locked pages covering the defrag range 1123 * @nr_pages: number of locked pages 1124 * 1125 * Caller should ensure: 1126 * 1127 * - Pages are prepared 1128 * Pages should be locked, no ordered extent in the pages range, 1129 * no writeback. 1130 * 1131 * - Extent bits are locked 1132 */ 1133 static int defrag_one_locked_target(struct btrfs_inode *inode, 1134 struct defrag_target_range *target, 1135 struct folio **folios, int nr_pages, 1136 struct extent_state **cached_state) 1137 { 1138 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1139 struct extent_changeset *data_reserved = NULL; 1140 const u64 start = target->start; 1141 const u64 len = target->len; 1142 int ret = 0; 1143 1144 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); 1145 if (ret < 0) 1146 return ret; 1147 btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1, 1148 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 1149 EXTENT_DEFRAG, cached_state); 1150 btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1, 1151 EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); 1152 1153 /* 1154 * Update the page status. 1155 * Due to possible large folios, we have to check all folios one by one. 1156 */ 1157 for (int i = 0; i < nr_pages && folios[i]; i++) { 1158 struct folio *folio = folios[i]; 1159 1160 if (!folio) 1161 break; 1162 if (start >= folio_next_pos(folio) || 1163 start + len <= folio_pos(folio)) 1164 continue; 1165 btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); 1166 } 1167 btrfs_delalloc_release_extents(inode, len); 1168 extent_changeset_free(data_reserved); 1169 1170 return ret; 1171 } 1172 1173 static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, 1174 u32 extent_thresh, u64 newer_than, bool do_compress, 1175 u64 *last_scanned_ret) 1176 { 1177 struct extent_state *cached_state = NULL; 1178 struct defrag_target_range *entry; 1179 struct defrag_target_range *tmp; 1180 LIST_HEAD(target_list); 1181 struct folio **folios; 1182 const u32 sectorsize = inode->root->fs_info->sectorsize; 1183 u64 cur = start; 1184 const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) - 1185 (start >> PAGE_SHIFT) + 1; 1186 int ret = 0; 1187 1188 ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); 1189 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); 1190 1191 folios = kzalloc_objs(struct folio *, nr_pages, GFP_NOFS); 1192 if (!folios) 1193 return -ENOMEM; 1194 1195 /* Prepare all pages */ 1196 for (int i = 0; cur < start + len && i < nr_pages; i++) { 1197 folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT); 1198 if (IS_ERR(folios[i])) { 1199 ret = PTR_ERR(folios[i]); 1200 folios[i] = NULL; 1201 goto free_folios; 1202 } 1203 cur = folio_next_pos(folios[i]); 1204 } 1205 for (int i = 0; i < nr_pages; i++) { 1206 if (!folios[i]) 1207 break; 1208 folio_wait_writeback(folios[i]); 1209 } 1210 1211 /* We should get at least one folio. */ 1212 ASSERT(folios[0]); 1213 /* Lock the pages range */ 1214 btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); 1215 /* 1216 * Now we have a consistent view about the extent map, re-check 1217 * which range really needs to be defragged. 1218 * 1219 * And this time we have extent locked already, pass @locked = true 1220 * so that we won't relock the extent range and cause deadlock. 1221 */ 1222 ret = defrag_collect_targets(inode, start, len, extent_thresh, 1223 newer_than, do_compress, true, 1224 &target_list, last_scanned_ret); 1225 if (ret < 0) 1226 goto unlock_extent; 1227 1228 list_for_each_entry(entry, &target_list, list) { 1229 ret = defrag_one_locked_target(inode, entry, folios, nr_pages, 1230 &cached_state); 1231 if (ret < 0) 1232 break; 1233 } 1234 1235 list_for_each_entry_safe(entry, tmp, &target_list, list) { 1236 list_del_init(&entry->list); 1237 kfree(entry); 1238 } 1239 unlock_extent: 1240 btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); 1241 free_folios: 1242 for (int i = 0; i < nr_pages; i++) { 1243 if (!folios[i]) 1244 break; 1245 folio_unlock(folios[i]); 1246 folio_put(folios[i]); 1247 } 1248 kfree(folios); 1249 return ret; 1250 } 1251 1252 static int defrag_one_cluster(struct btrfs_inode *inode, 1253 struct file_ra_state *ra, 1254 u64 start, u32 len, u32 extent_thresh, 1255 u64 newer_than, bool do_compress, 1256 unsigned long *sectors_defragged, 1257 unsigned long max_sectors, 1258 u64 *last_scanned_ret) 1259 { 1260 const u32 sectorsize = inode->root->fs_info->sectorsize; 1261 struct defrag_target_range *entry; 1262 struct defrag_target_range *tmp; 1263 LIST_HEAD(target_list); 1264 int ret; 1265 1266 ret = defrag_collect_targets(inode, start, len, extent_thresh, 1267 newer_than, do_compress, false, 1268 &target_list, NULL); 1269 if (ret < 0) 1270 goto out; 1271 1272 list_for_each_entry(entry, &target_list, list) { 1273 u32 range_len = entry->len; 1274 1275 /* Reached or beyond the limit */ 1276 if (max_sectors && *sectors_defragged >= max_sectors) { 1277 ret = 1; 1278 break; 1279 } 1280 1281 if (max_sectors) 1282 range_len = min_t(u32, range_len, 1283 (max_sectors - *sectors_defragged) * sectorsize); 1284 1285 /* 1286 * If defrag_one_range() has updated last_scanned_ret, 1287 * our range may already be invalid (e.g. hole punched). 1288 * Skip if our range is before last_scanned_ret, as there is 1289 * no need to defrag the range anymore. 1290 */ 1291 if (entry->start + range_len <= *last_scanned_ret) 1292 continue; 1293 1294 page_cache_sync_readahead(inode->vfs_inode.i_mapping, 1295 ra, NULL, entry->start >> PAGE_SHIFT, 1296 ((entry->start + range_len - 1) >> PAGE_SHIFT) - 1297 (entry->start >> PAGE_SHIFT) + 1); 1298 /* 1299 * Here we may not defrag any range if holes are punched before 1300 * we locked the pages. 1301 * But that's fine, it only affects the @sectors_defragged 1302 * accounting. 1303 */ 1304 ret = defrag_one_range(inode, entry->start, range_len, 1305 extent_thresh, newer_than, do_compress, 1306 last_scanned_ret); 1307 if (ret < 0) 1308 break; 1309 *sectors_defragged += range_len >> 1310 inode->root->fs_info->sectorsize_bits; 1311 } 1312 out: 1313 list_for_each_entry_safe(entry, tmp, &target_list, list) { 1314 list_del_init(&entry->list); 1315 kfree(entry); 1316 } 1317 if (ret >= 0) 1318 *last_scanned_ret = max(*last_scanned_ret, start + len); 1319 return ret; 1320 } 1321 1322 /* 1323 * Entry point to file defragmentation. 1324 * 1325 * @inode: inode to be defragged 1326 * @ra: readahead state 1327 * @range: defrag options including range and flags 1328 * @newer_than: minimum transid to defrag 1329 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode 1330 * will be defragged. 1331 * 1332 * Return <0 for error. 1333 * Return >=0 for the number of sectors defragged, and range->start will be updated 1334 * to indicate the file offset where next defrag should be started at. 1335 * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without 1336 * defragging all the range). 1337 */ 1338 int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, 1339 struct btrfs_ioctl_defrag_range_args *range, 1340 u64 newer_than, unsigned long max_to_defrag) 1341 { 1342 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1343 unsigned long sectors_defragged = 0; 1344 u64 isize = i_size_read(&inode->vfs_inode); 1345 u64 cur; 1346 u64 last_byte; 1347 bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); 1348 bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS); 1349 int compress_type = BTRFS_COMPRESS_ZLIB; 1350 int compress_level = 0; 1351 int ret = 0; 1352 u32 extent_thresh = range->extent_thresh; 1353 pgoff_t start_index; 1354 1355 ASSERT(ra); 1356 1357 if (isize == 0) 1358 return 0; 1359 1360 if (range->start >= isize) 1361 return -EINVAL; 1362 1363 if (do_compress) { 1364 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) { 1365 if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES) 1366 return -EINVAL; 1367 if (range->compress.type) { 1368 compress_type = range->compress.type; 1369 compress_level = range->compress.level; 1370 if (!btrfs_compress_level_valid(compress_type, compress_level)) 1371 return -EINVAL; 1372 } 1373 } else { 1374 if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) 1375 return -EINVAL; 1376 if (range->compress_type) 1377 compress_type = range->compress_type; 1378 } 1379 } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) { 1380 compress_type = BTRFS_DEFRAG_DONT_COMPRESS; 1381 compress_level = 1; 1382 } 1383 1384 if (extent_thresh == 0) 1385 extent_thresh = SZ_256K; 1386 1387 if (range->start + range->len > range->start) { 1388 /* Got a specific range */ 1389 last_byte = min(isize, range->start + range->len); 1390 } else { 1391 /* Defrag until file end */ 1392 last_byte = isize; 1393 } 1394 1395 /* Align the range */ 1396 cur = round_down(range->start, fs_info->sectorsize); 1397 last_byte = round_up(last_byte, fs_info->sectorsize) - 1; 1398 1399 /* 1400 * Make writeback start from the beginning of the range, so that the 1401 * defrag range can be written sequentially. 1402 */ 1403 start_index = cur >> PAGE_SHIFT; 1404 if (start_index < inode->vfs_inode.i_mapping->writeback_index) 1405 inode->vfs_inode.i_mapping->writeback_index = start_index; 1406 1407 while (cur < last_byte) { 1408 const unsigned long prev_sectors_defragged = sectors_defragged; 1409 u64 last_scanned = cur; 1410 u64 cluster_end; 1411 1412 if (btrfs_defrag_cancelled(fs_info)) { 1413 ret = -EAGAIN; 1414 break; 1415 } 1416 1417 /* We want the cluster end at page boundary when possible */ 1418 cluster_end = (((cur >> PAGE_SHIFT) + 1419 (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; 1420 cluster_end = min(cluster_end, last_byte); 1421 1422 btrfs_inode_lock(inode, 0); 1423 if (IS_SWAPFILE(&inode->vfs_inode)) { 1424 ret = -ETXTBSY; 1425 btrfs_inode_unlock(inode, 0); 1426 break; 1427 } 1428 if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) { 1429 btrfs_inode_unlock(inode, 0); 1430 break; 1431 } 1432 if (do_compress || no_compress) { 1433 inode->defrag_compress = compress_type; 1434 inode->defrag_compress_level = compress_level; 1435 } 1436 ret = defrag_one_cluster(inode, ra, cur, 1437 cluster_end + 1 - cur, extent_thresh, 1438 newer_than, do_compress || no_compress, 1439 §ors_defragged, 1440 max_to_defrag, &last_scanned); 1441 1442 if (sectors_defragged > prev_sectors_defragged) 1443 balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping); 1444 1445 btrfs_inode_unlock(inode, 0); 1446 if (ret < 0) 1447 break; 1448 cur = max(cluster_end + 1, last_scanned); 1449 if (ret > 0) { 1450 ret = 0; 1451 break; 1452 } 1453 cond_resched(); 1454 } 1455 1456 /* 1457 * Update range.start for autodefrag, this will indicate where to start 1458 * in next run. 1459 */ 1460 range->start = cur; 1461 if (sectors_defragged) { 1462 /* 1463 * We have defragged some sectors, for compression case they 1464 * need to be written back immediately. 1465 */ 1466 if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { 1467 filemap_flush(inode->vfs_inode.i_mapping); 1468 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 1469 &inode->runtime_flags)) 1470 filemap_flush(inode->vfs_inode.i_mapping); 1471 } 1472 if (range->compress_type == BTRFS_COMPRESS_LZO) 1473 btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); 1474 else if (range->compress_type == BTRFS_COMPRESS_ZSTD) 1475 btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); 1476 ret = sectors_defragged; 1477 } 1478 if (do_compress || no_compress) { 1479 btrfs_inode_lock(inode, 0); 1480 inode->defrag_compress = BTRFS_COMPRESS_NONE; 1481 btrfs_inode_unlock(inode, 0); 1482 } 1483 return ret; 1484 } 1485 1486 void __cold btrfs_auto_defrag_exit(void) 1487 { 1488 kmem_cache_destroy(btrfs_inode_defrag_cachep); 1489 } 1490 1491 int __init btrfs_auto_defrag_init(void) 1492 { 1493 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", 1494 sizeof(struct inode_defrag), 0, 0, NULL); 1495 if (!btrfs_inode_defrag_cachep) 1496 return -ENOMEM; 1497 1498 return 0; 1499 } 1500