// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * it needs to be passed in from the higher layer
	 * (i.e. inode_should_defrag()).
	 */
	u32 extent_thresh;
};

static int compare_inode_defrag(const struct inode_defrag *defrag1,
				const struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 */
static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
				     struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}
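
/*
 * Entries in fs_info->defrag_inodes form an rb-tree ordered by (root, ino),
 * see compare_inode_defrag(). They are added by btrfs_add_inode_defrag()
 * below and later picked up (and freed) one by one by
 * btrfs_run_defrag_inodes().
 */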

/*
 * Insert a defrag record for this inode if auto defrag is enabled. No errors
 * are returned as they're not considered fatal.
 */
void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	int ret;

	if (!need_auto_defrag(fs_info))
		return;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = btrfs_get_root_last_trans(root);
	defrag->root = btrfs_root_id(root);
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and evict the inode from memory,
		 * and then re-read this inode, this new inode doesn't have the
		 * IN_DEFRAG flag. In that case, we may find an existing defrag
		 * record in the tree.
		 */
		ret = btrfs_insert_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag, *next;

	spin_lock(&fs_info->defrag_inodes_lock);

	rbtree_postorder_for_each_entry_safe(defrag, next,
					     &fs_info->defrag_inodes, rb_node)
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

	fs_info->defrag_inodes = RB_ROOT;

	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024
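
/*
 * Defrag one inode from the defrag tree. The file is defragged in chunks of
 * at most BTRFS_DEFRAG_BATCH sectors, continuing from where the previous
 * chunk left off, until an error occurs, we reach i_size, or autodefrag is
 * disabled. The inode_defrag record is always freed before returning.
 */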
static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				  struct inode_defrag *defrag,
				  struct file_ra_state *ra)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;
	file_ra_state_init(ra, inode->i_mapping);

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		struct file_ra_state ra = { 0 };

		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		btrfs_run_defrag_inode(fs_info, defrag, &ra);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Check if two block addresses are close; used by defrag.
 */
static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
		return true;
	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
		return true;
	return false;
}
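
/*
 * Example for close_blocks(), assuming a 16K nodesize: block numbers 0 and
 * 32768 are close (32768 - (0 + 16384) = 16384 < SZ_32K), while 0 and 65536
 * are not (65536 - 16384 = 49152 >= SZ_32K).
 */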

/*
 * Go through all the leaves pointed to by a node and reallocate them so that
 * disk order is close to key order.
 */
static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *parent,
			      int start_slot, u64 *last_ret,
			      struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u32 blocksize = fs_info->nodesize;
	const int end_slot = btrfs_header_nritems(parent) - 1;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	int ret = 0;
	bool progress_passed = false;

	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (btrfs_header_nritems(parent) <= 1)
		return 0;

	for (int i = start_slot; i <= end_slot; i++) {
		struct extent_buffer *cur;
		struct btrfs_disk_key disk_key;
		u64 blocknr;
		u64 other;
		bool close = true;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = true;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
					    &cur, search_start,
					    min(16 * blocksize,
						(end_slot - i) * blocksize),
					    BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return ret;
}
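
/*
 * btrfs_defrag_leaves() below reallocates the leaves under one level 1 node
 * per call and records how far it got in root->defrag_progress. A return
 * value of -EAGAIN tells btrfs_defrag_root() to start a new transaction and
 * call it again from that point.
 */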

/*
 * Defrag all the leaves in a given btree. Read all the leaves and try to get
 * key order to better reflect disk order.
 */
static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
	 * later deadlock (attempting to write lock an already write locked
	 * leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	ASSERT(path->locks[1] != 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag a given btree. Every leaf in the btree is read and defragmented.
 */
int btrfs_defrag_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
		return 0;

	while (1) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		cond_resched();

		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
			break;

		if (btrfs_defrag_cancelled(fs_info)) {
			btrfs_debug(fs_info, "defrag_root cancelled");
			ret = -EAGAIN;
			break;
		}
	}
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
	return ret;
}
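
/*
 * Everything below deals with defragmentation of file data (the entry point
 * is btrfs_defrag_file(), used by both the defrag ioctl and autodefrag), as
 * opposed to the btree defrag implemented above.
 */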

/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than the given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely no extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 *	|	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->disk_bytenr = EXTENT_MAP_HOLE;
			em->disk_num_bytes = 0;
			em->ram_bytes = 0;
			em->offset = 0;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 *	|<- file extent ->|	|
		 *				\- start
		 *
		 * We haven't reached start, search the next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case, we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && (em->flags & EXTENT_FLAG_MERGED) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (extent_map_is_compressed(em))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}
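
/*
 * Check whether the extent right after @em is also a suitable defrag target,
 * so that defragging @em has a chance of merging it with its neighbour.
 */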
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag but
	 * the next one will not be a target.
	 * That would just cause extra IO without really reducing the number
	 * of fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (next->flags & EXTENT_FLAG_PREALLOC)
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging the current
	 * extent makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared; we don't do the writeback wait for each page here.
 */
static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct folio *folio;
	int ret;

again:
	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
	if (IS_ERR(folio))
		return folio;

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (folio_test_large(folio)) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_folio_extent_mapped(folio);
	if (ret < 0) {
		folio_unlock(folio);
		folio_put(folio);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		folio_unlock(folio);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		folio_lock(folio);
		/*
		 * We unlocked the folio above, so we need to check if it was
		 * released or not.
		 */
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
	if (!folio_test_uptodate(folio)) {
		btrfs_read_folio(NULL, folio);
		folio_lock(folio);
		if (folio->mapping != mapping || !folio->private) {
			folio_unlock(folio);
			folio_put(folio);
			goto again;
		}
		if (!folio_test_uptodate(folio)) {
			folio_unlock(folio);
			folio_put(folio);
			return ERR_PTR(-EIO);
		}
	}
	return folio;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};
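
/*
 * Targets are collected as contiguous byte ranges. For example, if two
 * adjacent extents at file offsets [0, 64K) and [64K, 96K) both qualify,
 * they end up in a single defrag_target_range with start 0 and len 96K.
 */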

/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   whether the range already has its extent lock held
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if it would result in a regular
		 * extent. This is for users who want to convert inline extents
		 * to regular ones through the max_inline= mount option.
		 */
		if (em->disk_bytenr == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip holes and preallocated extents. */
		if (em->disk_bytenr == EXTENT_MAP_HOLE ||
		    (em->flags & EXTENT_FLAG_PREALLOC))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there's dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
					  EXTENT_DELALLOC))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (em->len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the inline extent
		 * would not be defragged. So if an inline extent passed all
		 * the above checks, just add it for defrag, so it gets
		 * converted to a regular extent.
		 */
		if (em->disk_bytenr == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

#define CLUSTER_SIZE	(SZ_256K)
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));
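
/*
 * File data is defragged in clusters of at most CLUSTER_SIZE bytes; each
 * cluster is collected, locked and marked for delalloc in one go. With 4K
 * pages that is at most 64 folios per cluster.
 */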

/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @folios:	locked folios covering the defrag range
 * @nr_pages:	number of locked folios
 *
 * Caller should ensure:
 *
 * - Folios are prepared
 *   Folios should be locked, with no ordered extent in their range and
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct folio **folios, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = folios[0]->index;
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_bit(&inode->io_tree, start, start + len - 1,
		       EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		folio_clear_checked(folios[i]);
		btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}
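
/*
 * Defrag one contiguous, sector aligned file range: prepare and lock all the
 * folios covering the range, wait for any writeback on them, lock the extent
 * range, re-collect the targets now that we have a stable view, and mark each
 * target for delalloc and defrag via defrag_one_locked_target().
 */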
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct folio **folios;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	folios = kcalloc(nr_pages, sizeof(struct folio *), GFP_NOFS);
	if (!folios)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		folios[i] = defrag_prepare_one_folio(inode, start_index + i);
		if (IS_ERR(folios[i])) {
			ret = PTR_ERR(folios[i]);
			nr_pages = i;
			goto free_folios;
		}
	}
	for (i = 0; i < nr_pages; i++)
		folio_wait_writeback(folios[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view of the extent maps, re-check which
	 * range really needs to be defragged.
	 *
	 * This time we already have the extent range locked, so pass
	 * @locked = true so that we won't relock the extent range and cause
	 * a deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, folios, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_folios:
	for (i = 0; i < nr_pages; i++) {
		folio_unlock(folios[i]);
		folio_put(folios[i]);
	}
	kfree(folios);
	return ret;
}

static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}
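
/*
 * Example: to defrag a whole file from offset 0 with the default 256K extent
 * size threshold (an extent_thresh of 0 falls back to SZ_256K below), a
 * caller can do, similar to what btrfs_run_defrag_inode() does above:
 *
 *	struct btrfs_ioctl_defrag_range_args range = { 0 };
 *
 *	range.len = (u64)-1;
 *	ret = btrfs_defrag_file(inode, ra, &range, 0, 0);
 */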

/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be
 * updated to indicate the file offset where the next defrag should start.
 * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early
 * without defragging the whole range.)
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	ASSERT(ra);

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at a page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
				(SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(BTRFS_I(inode), 0);
		if (IS_SWAPFILE(inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (do_compress)
			BTRFS_I(inode)->defrag_compress = compress_type;
		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress, &sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->i_mapping);

		btrfs_inode_unlock(BTRFS_I(inode), 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	/*
	 * Update range.start for autodefrag; this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors; for the compression case
		 * they need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &BTRFS_I(inode)->runtime_flags))
				filemap_flush(inode->i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress) {
		btrfs_inode_lock(BTRFS_I(inode), 0);
		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(BTRFS_I(inode), 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0, 0, NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}