// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from the higher layer (aka, inode_should_defrag()).
	 */
	u32 extent_thresh;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Insert a record for an inode into the defrag tree. The lock must be held
 * already.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
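 *
 * The record is keyed by (root objectid, inode number) in the
 * fs_info->defrag_inodes rbtree; if a record for the inode already exists,
 * __btrfs_add_inode_defrag() merges it by lowering the stored transid and
 * extent_thresh instead of queueing the inode twice.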
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and then evict the inode from
		 * memory, the re-read inode won't have the IN_DEFRAG flag set.
		 * In that case we may still find an existing defrag record in
		 * the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Pick the defrag record for the given root and inode; if it doesn't exist,
 * pick the next one.
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
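	/*
	 * The defrag record only stores the root objectid and the inode
	 * number, so look both up again here; the in-memory inode may have
	 * been evicted since the record was queued.
	 */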
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Defrag all the leaves in a given btree.
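 *
 * Progress is stored in root->defrag_progress so the work can be resumed in a
 * later transaction; -EAGAIN is returned while there is more work left to do.
 *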
 * Read all the leaves and try to get key order to
 * better reflect disk order.
 */

int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
	 * later deadlock (attempting to write lock an already write locked
	 * leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag specific helper to get an extent map.
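 *
 * This is used by defrag_lookup_extent() when there is no usable extent map
 * cached in the inode's extent map tree (either missing, or a merged one
 * whose generation cannot be trusted).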
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than the given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely no extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 * |	|<- File extent ->|
		 *	\- start
		 *
		 * This means there is a hole between start and key.offset.
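		 * In that case we return a hole extent map covering
		 * [start, key.offset); the callers skip hole extents, so this
		 * range will not be picked as a defrag target.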
		 */
		if (key.offset > start) {
			em->start = start;
			em->orig_start = start;
			em->block_start = EXTENT_MAP_HOLE;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 * |<- file extent ->|	|
		 *			\- start
		 *
		 * We haven't reached start, search the next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent map, in that case we need to re-search
	 * the tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag but
	 * the next one will not be a target.
	 * This will just cause extra IO without really reducing the fragments.
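	 *
	 * E.g. if the current extent is newer than @newer_than but the next
	 * one is not, rewriting the current extent cannot merge it with its
	 * neighbour, so the rewrite would only cost IO.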
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging the current
	 * extent makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared; we don't wait for writeback on each page here.
 */
static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct page *page;
	int ret;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page)
		return ERR_PTR(-ENOMEM);

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (PageCompound(page)) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		unlock_page(page);
		btrfs_start_ordered_extent(ordered, 1);
		btrfs_put_ordered_extent(ordered);
		lock_page(page);
		/*
		 * We unlocked the page above, so we need to check if it was
		 * released or not.
		 */
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
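	 *
	 * The read path unlocks the page when the IO completes, so after
	 * re-taking the page lock check that the page was not released from
	 * the mapping in the meantime and start over if it was.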
	 */
	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the extent lock is already held on the range
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if defragging it will result in a
		 * regular extent. This is for users who want to convert
		 * inline extents to regular ones through the max_inline=
		 * mount option.
		 */
		if (em->block_start == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip hole/delalloc/preallocated extents */
		if (em->block_start == EXTENT_MAP_HOLE ||
		    em->block_start == EXTENT_MAP_DELALLOC ||
		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there are dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent).
		 *    If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
				   EXTENT_DELALLOC, 0, NULL))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (range_len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the inline extent
		 * would not be defragged.
		 * So if an inline extent passed all the above checks, just add
		 * it for defrag, so it can be converted to a regular extent.
		 */
		if (em->block_start == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

#define CLUSTER_SIZE	(SZ_256K)
static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));

/*
 * Defrag one contiguous target range.
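 *
 * Here we only reserve the data space, set the defrag and dirty bits on the
 * range and mark the involved pages dirty; the actual IO is done later by
 * normal writeback.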
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @pages:	locked pages covering the defrag range
 * @nr_pages:	number of locked pages
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct page **pages, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = page_index(pages[0]);
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		ClearPageChecked(pages[i]);
		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct page **pages;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		pages[i] = defrag_prepare_one_page(inode, start_index + i);
		if (IS_ERR(pages[i])) {
			ret = PTR_ERR(pages[i]);
			pages[i] = NULL;
			goto free_pages;
		}
	}
	for (i = 0; i < nr_pages; i++)
		wait_on_page_writeback(pages[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view of the extent map, re-check
	 * which range really needs to be defragged.
	 *
	 * And this time we have the extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause a deadlock.
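	 *
	 * Any range that stopped being a target in the meantime (e.g. a hole
	 * punched while we were preparing the pages) simply does not show up
	 * in @target_list and is skipped.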
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_pages:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i]) {
			unlock_page(pages[i]);
			put_page(pages[i]);
		}
	}
	kfree(pages);
	return ret;
}

static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		if (ra)
			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}

/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state (can be NULL)
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be updated
 * to indicate the file offset where the next defrag should start at.
 * (Mostly for autodefrag, which sets @max_to_defrag, so we may exit early
 *  without defragging the whole range.)
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	bool ra_allocated = false;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * If we were not given a ra, allocate a readahead context. As
	 * readahead is just an optimization, defrag will work without it so
	 * we don't error out.
	 */
	if (!ra) {
		ra_allocated = true;
		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
		if (ra)
			file_ra_state_init(ra, inode->i_mapping);
	}

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
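	 *
	 * This is done by rewinding the mapping's writeback_index to the
	 * first page of the defrag range if it is currently past it.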
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(BTRFS_I(inode), 0);
		if (IS_SWAPFILE(inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (do_compress)
			BTRFS_I(inode)->defrag_compress = compress_type;
		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress, &sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->i_mapping);

		btrfs_inode_unlock(BTRFS_I(inode), 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	if (ra_allocated)
		kfree(ra);
	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors, for the compression case
		 * they need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &BTRFS_I(inode)->runtime_flags))
				filemap_flush(inode->i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress) {
		btrfs_inode_lock(BTRFS_I(inode), 0);
		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(BTRFS_I(inode), 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}