// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from higher layer.
	 * (aka, inode_should_defrag())
	 */
	u32 extent_thresh;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

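/*
 * Defrag records sort by (root, ino) (see __compare_inode_defrag() above), so
 * btrfs_run_defrag_inodes() drains the tree grouped by subvolume. For example,
 * with made-up values, queued records would be visited in this order:
 *
 *	(root=5,   ino=257)
 *	(root=5,   ino=1021)
 *	(root=256, ino=260)
 */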
/*
 * Insert a defrag record for this inode if auto defrag is enabled.
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and then the inode is evicted
		 * from memory, a re-read copy of the inode will not have the
		 * flag set, so we may still find an existing defrag record in
		 * the tree here.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Pick the defraggable inode that we want; if it doesn't exist, we will get
 * the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

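	/*
	 * Each pass below defrags up to BTRFS_DEFRAG_BATCH sectors of the
	 * inode starting at cur, then advances cur past the defragged area
	 * (using range.start, which btrfs_defrag_file() updates) and jumps
	 * back to "again" until the whole file has been scanned or autodefrag
	 * is no longer needed.
	 */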
	/* Get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* Find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Check if two block addresses are close, used by defrag.
 */
static bool close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < SZ_32K)
		return true;
	if (blocknr > other && blocknr - (other + blocksize) < SZ_32K)
		return true;
	return false;
}

/*
 * Go through all the leaves pointed to by a node and reallocate them so that
 * disk order is close to key order.
 */
static int btrfs_realloc_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *parent,
			      int start_slot, u64 *last_ret,
			      struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u32 blocksize = fs_info->nodesize;
	const int end_slot = btrfs_header_nritems(parent) - 1;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	int ret = 0;
	bool progress_passed = false;

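	/*
	 * In the loop further below, children whose block numbers are already
	 * within ~32K of a neighbour (see close_blocks()) are left in place;
	 * the rest are COWed to new locations near search_start so that disk
	 * order follows key order more closely.
	 */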
	/*
	 * COWing must happen through a running transaction, which always
	 * matches the current fs generation (it's a transaction with a state
	 * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
	 * into error state to prevent the commit of any transaction.
	 */
	if (unlikely(trans->transaction != fs_info->running_transaction ||
		     trans->transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
	"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu",
			   parent->start, btrfs_root_id(root), trans->transid,
			   fs_info->running_transaction->transid,
			   fs_info->generation);
		return -EUCLEAN;
	}

	if (btrfs_header_nritems(parent) <= 1)
		return 0;

	for (int i = start_slot; i <= end_slot; i++) {
		struct extent_buffer *cur;
		struct btrfs_disk_key disk_key;
		u64 blocknr;
		u64 other;
		bool close = true;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = true;
		blocknr = btrfs_node_blockptr(parent, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = btrfs_read_node_slot(parent, i);
		if (IS_ERR(cur))
			return PTR_ERR(cur);
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		ret = btrfs_force_cow_block(trans, root, cur, parent, i,
					    &cur, search_start,
					    min(16 * blocksize,
						(end_slot - i) * blocksize),
					    BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return ret;
}

/*
 * Defrag all the leaves in a given btree. Read all the leaves and try to get
 * key order to better reflect disk order.
 */
static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* From above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
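	/*
	 * The forward search above was done without a transaction handle and
	 * only tells us where to resume. Search again below with @trans and
	 * cow set, so the nodes on the path can be COWed and then modified by
	 * btrfs_realloc_node().
	 */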
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
	 * deadlock later (attempting to write lock an already write locked
	 * leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing. This is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag a given btree. Every leaf in the btree is read and defragmented.
 */
int btrfs_defrag_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
		return 0;

	while (1) {
		struct btrfs_trans_handle *trans;

		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		cond_resched();

		if (btrfs_fs_closing(fs_info) || ret != -EAGAIN)
			break;

		if (btrfs_defrag_cancelled(fs_info)) {
			btrfs_debug(fs_info, "defrag_root cancelled");
			ret = -EAGAIN;
			break;
		}
	}
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
	return ret;
}

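/*
 * btrfs_defrag_leaves() works one chunk at a time: it returns -EAGAIN with the
 * position saved in root->defrag_progress when more leaves remain, so
 * btrfs_defrag_root() above runs it in a fresh transaction per pass and keeps
 * going as long as it returns -EAGAIN (unless the fs is closing or defrag is
 * cancelled).
 */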
/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely no extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 *	|	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->orig_start = start;
			em->block_start = EXTENT_MAP_HOLE;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 *	|<- file extent ->|	|
		 *				\- start
		 *
		 * We haven't reached start, search the next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to an em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag but
	 * the next one will not be a target.
	 * That would just cause extra IO without really reducing the number
	 * of fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or a hole */
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging the current
	 * extent makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared, here we don't do writeback wait for each page.
 */
static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct page *page;
	int ret;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page)
		return ERR_PTR(-ENOMEM);

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (PageCompound(page)) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		unlock_page(page);
		btrfs_start_ordered_extent(ordered);
		btrfs_put_ordered_extent(ordered);
		lock_page(page);
		/*
		 * We unlocked the page above, so we need to check whether it
		 * was released or not.
		 */
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

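/*
 * Example (illustrative values): if defrag_collect_targets() below picks the
 * ranges [0, 64K) and [64K, 128K) of a file, the second one is merged into the
 * first and the target list ends up with a single entry
 * { .start = 0, .len = 128K }.
 */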
/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the extent lock for the range is already held
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fall through) if defragging it will result in a
		 * regular extent. This is for users who want to convert
		 * inline extents to regular ones through the max_inline=
		 * mount option.
		 */
		if (em->block_start == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip holes, delalloc and preallocated extents */
		if (em->block_start == EXTENT_MAP_HOLE ||
		    em->block_start == EXTENT_MAP_DELALLOC ||
		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there's dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
					  EXTENT_DELALLOC))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (range_len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the extent would
		 * not be defragged. So if an inline extent passed all the
		 * above checks, just add it for defrag, so it can be converted
		 * to a regular extent.
		 */
		if (em->block_start == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate a new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

#define CLUSTER_SIZE	(SZ_256K)
static_assert(PAGE_ALIGNED(CLUSTER_SIZE));

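/*
 * With the common 4K page size a 256K cluster is 64 pages, which is the upper
 * bound defrag_one_range() asserts on when it allocates its page array.
 */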
/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @pages:	locked pages covering the defrag range
 * @nr_pages:	number of locked pages
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct page **pages, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = page_index(pages[0]);
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_bit(&inode->io_tree, start, start + len - 1,
		       EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		ClearPageChecked(pages[i]);
		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct page **pages;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		pages[i] = defrag_prepare_one_page(inode, start_index + i);
		if (IS_ERR(pages[i])) {
			ret = PTR_ERR(pages[i]);
			pages[i] = NULL;
			goto free_pages;
		}
	}
	for (i = 0; i < nr_pages; i++)
		wait_on_page_writeback(pages[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view of the extent map, re-check which
	 * range really needs to be defragged.
	 *
	 * And this time we have the extent range locked already, pass
	 * @locked = true so that we won't relock the extent range and cause
	 * a deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_pages:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i]) {
			unlock_page(pages[i]);
			put_page(pages[i]);
		}
	}
	kfree(pages);
	return ret;
}

static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		if (ra)
			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}

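/*
 * Minimal usage sketch (mirrors what __btrfs_run_defrag_inode() above does;
 * the values are only an example):
 *
 *	struct btrfs_ioctl_defrag_range_args range = { 0 };
 *
 *	range.start = 0;
 *	range.len = (u64)-1;		// whole file
 *	range.extent_thresh = SZ_256K;	// same default btrfs_defrag_file() applies
 *	ret = btrfs_defrag_file(inode, NULL, &range, 0, 0);
 *
 * Passing newer_than = 0 and max_to_defrag = 0 means "consider all extents"
 * and "no sector limit", per the documentation below.
 */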
/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state (can be NULL)
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be
 * updated to indicate the file offset where the next defrag should start
 * (mostly for autodefrag, which sets @max_to_defrag and thus may exit early
 * without defragging the whole range).
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	bool ra_allocated = false;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * If we were not given a ra, allocate a readahead context. As
	 * readahead is just an optimization, defrag will work without it so
	 * we don't error out.
	 */
	if (!ra) {
		ra_allocated = true;
		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
		if (ra)
			file_ra_state_init(ra, inode->i_mapping);
	}

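	/*
	 * From here on the work is done in 256K clusters: each iteration of
	 * the loop below collects targets for one cluster and defrags them
	 * while holding the inode lock, which is dropped between clusters.
	 */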
	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at a page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(BTRFS_I(inode), 0);
		if (IS_SWAPFILE(inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (do_compress)
			BTRFS_I(inode)->defrag_compress = compress_type;
		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress, &sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->i_mapping);

		btrfs_inode_unlock(BTRFS_I(inode), 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	if (ra_allocated)
		kfree(ra);
	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors, for the compression case
		 * they need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &BTRFS_I(inode)->runtime_flags))
				filemap_flush(inode->i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress) {
		btrfs_inode_lock(BTRFS_I(inode), 0);
		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(BTRFS_I(inode), 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}