// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "transaction.h"
#include "locking.h"
#include "accessors.h"
#include "messages.h"
#include "delalloc-space.h"
#include "subpage.h"
#include "defrag.h"
#include "file-item.h"
#include "super.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;

/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* Inode number */
	u64 ino;
	/*
	 * Transid where the defrag was added, we search for extents newer than
	 * this.
	 */
	u64 transid;

	/* Root objectid */
	u64 root;

	/*
	 * The extent size threshold for autodefrag.
	 *
	 * This value is different for compressed/non-compressed extents, thus
	 * needs to be passed from the higher layer.
	 * (aka, inode_should_defrag())
	 */
	u32 extent_thresh;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Insert a record for an inode into the defrag tree.  The defrag_inodes_lock
 * must already be held.
 *
 * If you're inserting a record for an older transid than an existing record,
 * the transid already in the tree is lowered.
 *
 * If an existing record is found the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			entry->extent_thresh = min(defrag->extent_thresh,
						   entry->extent_thresh);
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * Insert a defrag record for this inode if auto defrag is enabled.
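 *
 * The fs_info->defrag_inodes_lock is taken internally.  @trans may be NULL,
 * in which case the root's last_trans is used as the record's transid.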
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode, u32 extent_thresh)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;
	defrag->extent_thresh = extent_thresh;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and then the inode is evicted
		 * from memory and re-read, the new in-memory inode won't have
		 * the IN_DEFRAG flag set.  In that case we may still find an
		 * existing defrag record in the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Pick the defrag record for the given root and inode; if it doesn't exist,
 * return the next one (in root/inode order).
 */
static struct inode_defrag *btrfs_pick_defrag_inode(
			struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int ret = 0;
	u64 cur = 0;

again:
	if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
		goto cleanup;
	if (!__need_auto_defrag(fs_info))
		goto cleanup;

	/* Get the inode */
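	/*
	 * The subvolume may have been deleted since this defrag record was
	 * queued; in that case the root lookup below fails and we simply
	 * drop the record.
	 */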
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	if (cur >= i_size_read(inode)) {
		iput(inode);
		goto cleanup;
	}

	/* Do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = cur;
	range.extent_thresh = defrag->extent_thresh;

	sb_start_write(fs_info->sb);
	ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	iput(inode);

	if (ret < 0)
		goto cleanup;

	cur = max(cur + fs_info->sectorsize, range.start);
	goto again;

cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * Run through the list of inodes in the FS that need defragging.
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * During unmount, we use the transaction_wait queue to wait for the
	 * defragger to stop.
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/*
 * Defrag all the leaves in a given btree.
 * Read all the leaves and try to get key order to
 * better reflect disk order.
 */

int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
			struct btrfs_root *root)
{
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	int ret = 0;
	int wret;
	int level;
	int next_key_ret = 0;
	u64 last_ret = 0;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		goto out;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	level = btrfs_header_level(root->node);

	if (level == 0)
		goto out;

	if (root->defrag_progress.objectid == 0) {
		struct extent_buffer *root_node;
		u32 nritems;

		root_node = btrfs_lock_root_node(root);
		nritems = btrfs_header_nritems(root_node);
		root->defrag_max.objectid = 0;
		/* from above we know this is not a leaf */
		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
				      nritems - 1);
		btrfs_tree_unlock(root_node);
		free_extent_buffer(root_node);
		memset(&key, 0, sizeof(key));
	} else {
		memcpy(&key, &root->defrag_progress, sizeof(key));
	}

	path->keep_locks = 1;

	ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	btrfs_release_path(path);
	/*
	 * We don't need a lock on a leaf.
	 * btrfs_realloc_node() will lock all leaves from path->nodes[1], so
	 * set lowest_level to 1 to avoid a later deadlock (attempting to
	 * write lock an already write locked leaf).
	 */
	path->lowest_level = 1;
	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);

	if (wret < 0) {
		ret = wret;
		goto out;
	}
	if (!path->nodes[1]) {
		ret = 0;
		goto out;
	}
	/*
	 * The node at level 1 must always be locked when our path has
	 * keep_locks set and lowest_level is 1, regardless of the value of
	 * path->slots[1].
	 */
	BUG_ON(path->locks[1] == 0);
	ret = btrfs_realloc_node(trans, root,
				 path->nodes[1], 0,
				 &last_ret,
				 &root->defrag_progress);
	if (ret) {
		WARN_ON(ret == -EAGAIN);
		goto out;
	}
	/*
	 * Now that we reallocated the node we can find the next key. Note that
	 * btrfs_find_next_key() can release our path and do another search
	 * without COWing, this is because even with path->keep_locks = 1,
	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
	 * node when path->slots[node_level - 1] does not point to the last
	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
	 * we search for the next key after reallocating our node.
	 */
	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
					   BTRFS_OLDEST_GENERATION);
	if (next_key_ret == 0) {
		memcpy(&root->defrag_progress, &key, sizeof(key));
		ret = -EAGAIN;
	}
out:
	btrfs_free_path(path);
	if (ret == -EAGAIN) {
		if (root->defrag_max.objectid > root->defrag_progress.objectid)
			goto done;
		if (root->defrag_max.type > root->defrag_progress.type)
			goto done;
		if (root->defrag_max.offset > root->defrag_progress.offset)
			goto done;
		ret = 0;
	}
done:
	if (ret != -EAGAIN)
		memset(&root->defrag_progress, 0,
		       sizeof(root->defrag_progress));

	return ret;
}

/*
 * Defrag specific helper to get an extent map.
 *
 * Differences between this and btrfs_get_extent() are:
 *
 * - No extent_map will be added to inode->extent_tree
 *   To reduce memory usage in the long run.
 *
 * - Extra optimization to skip file extents older than @newer_than
 *   By using btrfs_search_forward() we can skip entire file ranges that
 *   have extents created in past transactions, because btrfs_search_forward()
 *   will not visit leaves and nodes with a generation smaller than the given
 *   minimal generation threshold (@newer_than).
 *
 * Return a valid em if we find a file extent matching the requirement.
 * Return NULL if we cannot find a file extent matching the requirement.
 *
 * Return ERR_PTR() for error.
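 *
 * The caller is responsible for releasing the returned extent map with
 * free_extent_map().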
 */
static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
					    u64 start, u64 newer_than)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path path = { 0 };
	struct extent_map *em;
	struct btrfs_key key;
	u64 ino = btrfs_ino(inode);
	int ret;

	em = alloc_extent_map();
	if (!em) {
		ret = -ENOMEM;
		goto err;
	}

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = start;

	if (newer_than) {
		ret = btrfs_search_forward(root, &key, &path, newer_than);
		if (ret < 0)
			goto err;
		/* Can't find anything newer */
		if (ret > 0)
			goto not_found;
	} else {
		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
		if (ret < 0)
			goto err;
	}
	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
		/*
		 * If btrfs_search_slot() makes the path point beyond nritems,
		 * we should not have an empty leaf, as this inode must at
		 * least have its INODE_ITEM.
		 */
		ASSERT(btrfs_header_nritems(path.nodes[0]));
		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
	}
	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
	/* Perfect match, no need to go one slot back */
	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
	    key.offset == start)
		goto iterate;

	/* We didn't find a perfect match, need to go one slot back */
	if (path.slots[0] > 0) {
		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
			path.slots[0]--;
	}

iterate:
	/* Iterate through the path to find a file extent covering @start */
	while (true) {
		u64 extent_end;

		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
			goto next;

		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);

		/*
		 * We may go one slot back to an INODE_REF/XATTR item, then
		 * need to go forward until we reach an EXTENT_DATA.
		 * But we should still have the correct ino as key.objectid.
		 */
		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;

		/* It's beyond our target range, definitely no extent found */
		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
			goto not_found;

		/*
		 * |	|<- File extent ->|
		 * \- start
		 *
		 * This means there is a hole between start and key.offset.
		 */
		if (key.offset > start) {
			em->start = start;
			em->orig_start = start;
			em->block_start = EXTENT_MAP_HOLE;
			em->len = key.offset - start;
			break;
		}

		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
				    struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(&path);

		/*
		 * |<- file extent ->|	|
		 *			\- start
		 *
		 * We haven't reached start, search the next slot.
		 */
		if (extent_end <= start)
			goto next;

		/* Now this extent covers @start, convert it to em */
		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
		break;
next:
		ret = btrfs_next_item(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0)
			goto not_found;
	}
	btrfs_release_path(&path);
	return em;

not_found:
	btrfs_release_path(&path);
	free_extent_map(em);
	return NULL;

err:
	btrfs_release_path(&path);
	free_extent_map(em);
	return ERR_PTR(ret);
}

static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
					       u64 newer_than, bool locked)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em;
	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;

	/*
	 * Hopefully we have this extent in the tree already, try without the
	 * full extent lock.
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, sectorsize);
	read_unlock(&em_tree->lock);

	/*
	 * We can get a merged extent, in that case, we need to re-search the
	 * tree to get the original em for defrag.
	 *
	 * If @newer_than is 0 or em::generation < newer_than, we can trust
	 * this em, as either we don't care about the generation, or the
	 * merged extent map will be rejected anyway.
	 */
	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
	    newer_than && em->generation >= newer_than) {
		free_extent_map(em);
		em = NULL;
	}

	if (!em) {
		struct extent_state *cached = NULL;
		u64 end = start + sectorsize - 1;

		/* Get the big lock and read metadata off disk. */
		if (!locked)
			lock_extent(io_tree, start, end, &cached);
		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
		if (!locked)
			unlock_extent(io_tree, start, end, &cached);

		if (IS_ERR(em))
			return NULL;
	}

	return em;
}

static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
				   const struct extent_map *em)
{
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
		return BTRFS_MAX_COMPRESSED;
	return fs_info->max_extent_size;
}

static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
				     u32 extent_thresh, u64 newer_than, bool locked)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *next;
	bool ret = false;

	/* This is the last extent */
	if (em->start + em->len >= i_size_read(inode))
		return false;

	/*
	 * Here we need to pass @newer_than when checking the next extent, or
	 * we will hit a case where we mark the current extent for defrag, but
	 * the next one will not be a target.
	 * This will just cause extra IO without really reducing the fragments.
	 */
	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
	/* No more em or hole */
	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
		goto out;
	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
		goto out;
	/*
	 * If the next extent is at its max capacity, defragging the current
	 * extent makes no sense, as the total number of extents won't change.
	 */
	if (next->len >= get_extent_max_capacity(fs_info, em))
		goto out;
	/* Skip older extent */
	if (next->generation < newer_than)
		goto out;
	/* Also check extent size */
	if (next->len >= extent_thresh)
		goto out;

	ret = true;
out:
	free_extent_map(next);
	return ret;
}

/*
 * Prepare one page to be defragged.
 *
 * This will ensure:
 *
 * - Returned page is locked and has been set up properly.
 * - No ordered extent exists in the page.
 * - The page is uptodate.
 *
 * NOTE: Caller should also wait for page writeback after the cluster is
 * prepared, here we don't do writeback wait for each page.
 */
static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index)
{
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	u64 page_start = (u64)index << PAGE_SHIFT;
	u64 page_end = page_start + PAGE_SIZE - 1;
	struct extent_state *cached_state = NULL;
	struct page *page;
	int ret;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page)
		return ERR_PTR(-ENOMEM);

	/*
	 * Since we can defragment files opened read-only, we can encounter
	 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
	 * can't do I/O using huge pages yet, so return an error for now.
	 * Filesystem transparent huge pages are typically only used for
	 * executables that explicitly enable them, so this isn't very
	 * restrictive.
	 */
	if (PageCompound(page)) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(-ETXTBSY);
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(ret);
	}

	/* Wait for any existing ordered extent in the range */
	while (1) {
		struct btrfs_ordered_extent *ordered;

		lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
		ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
		unlock_extent(&inode->io_tree, page_start, page_end,
			      &cached_state);
		if (!ordered)
			break;

		unlock_page(page);
		btrfs_start_ordered_extent(ordered, 1);
		btrfs_put_ordered_extent(ordered);
		lock_page(page);
		/*
		 * We unlocked the page above, so we need to check if it was
		 * released or not.
		 */
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
	}

	/*
	 * Now the page range has no ordered extent any more. Read the page to
	 * make it uptodate.
	 */
	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (page->mapping != mapping || !PagePrivate(page)) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			unlock_page(page);
			put_page(page);
			return ERR_PTR(-EIO);
		}
	}
	return page;
}

struct defrag_target_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Collect all valid target extents.
 *
 * @start:	   file offset to lookup
 * @len:	   length to lookup
 * @extent_thresh: file extent size threshold, any extent size >= this value
 *		   will be ignored
 * @newer_than:    only defrag extents newer than this value
 * @do_compress:   whether the defrag is doing compression
 *		   if true, @extent_thresh will be ignored and all regular
 *		   file extents meeting @newer_than will be targets.
 * @locked:	   if the extent lock is already held on the range
 * @target_list:   list of target file extents
 */
static int defrag_collect_targets(struct btrfs_inode *inode,
				  u64 start, u64 len, u32 extent_thresh,
				  u64 newer_than, bool do_compress,
				  bool locked, struct list_head *target_list,
				  u64 *last_scanned_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	bool last_is_target = false;
	u64 cur = start;
	int ret = 0;

	while (cur < start + len) {
		struct extent_map *em;
		struct defrag_target_range *new;
		bool next_mergeable = true;
		u64 range_len;

		last_is_target = false;
		em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked);
		if (!em)
			break;

		/*
		 * If the file extent is an inlined one, we may still want to
		 * defrag it (fallthrough) if it will result in a regular
		 * extent.  This is for users who want to convert inline
		 * extents to regular ones through the max_inline= mount
		 * option.
		 */
		if (em->block_start == EXTENT_MAP_INLINE &&
		    em->len <= inode->root->fs_info->max_inline)
			goto next;

		/* Skip hole/delalloc/preallocated extents */
		if (em->block_start == EXTENT_MAP_HOLE ||
		    em->block_start == EXTENT_MAP_DELALLOC ||
		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			goto next;

		/* Skip older extent */
		if (em->generation < newer_than)
			goto next;

		/* This em is under writeback, no need to defrag */
		if (em->generation == (u64)-1)
			goto next;

		/*
		 * Our start offset might be in the middle of an existing extent
		 * map, so take that into account.
		 */
		range_len = em->len - (cur - em->start);
		/*
		 * If this range of the extent map is already flagged for delalloc,
		 * skip it, because:
		 *
		 * 1) We could deadlock later, when trying to reserve space for
		 *    delalloc, because in case we can't immediately reserve space
		 *    the flusher can start delalloc and wait for the respective
		 *    ordered extents to complete. The deadlock would happen
		 *    because we do the space reservation while holding the range
		 *    locked, and starting writeback, or finishing an ordered
		 *    extent, requires locking the range;
		 *
		 * 2) If there's delalloc there, it means there's dirty pages for
		 *    which writeback has not started yet (we clean the delalloc
		 *    flag when starting writeback and after creating an ordered
		 *    extent). If we mark pages in an adjacent range for defrag,
		 *    then we will have a larger contiguous range for delalloc,
		 *    very likely resulting in a larger extent after writeback is
		 *    triggered (except in a case of free space fragmentation).
		 */
		if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
				   EXTENT_DELALLOC, 0, NULL))
			goto next;

		/*
		 * For the do_compress case, we want to compress all valid file
		 * extents, thus no @extent_thresh or mergeable check.
		 */
		if (do_compress)
			goto add;

		/* Skip too large extent */
		if (range_len >= extent_thresh)
			goto next;

		/*
		 * Skip extents already at their max capacity, this is mostly
		 * for compressed extents, whose max capacity is only 128K.
		 */
		if (em->len >= get_extent_max_capacity(fs_info, em))
			goto next;

		/*
		 * Normally there are no more extents after an inline one, thus
		 * @next_mergeable will normally be false and the inline extent
		 * would not be defragged.
		 * So if an inline extent passed all the above checks, just add
		 * it for defrag, so that it can be converted to a regular
		 * extent.
		 */
		if (em->block_start == EXTENT_MAP_INLINE)
			goto add;

		next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
						extent_thresh, newer_than, locked);
		if (!next_mergeable) {
			struct defrag_target_range *last;

			/* Empty target list, no way to merge with last entry */
			if (list_empty(target_list))
				goto next;
			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			/* Not mergeable with last entry */
			if (last->start + last->len != cur)
				goto next;

			/* Mergeable, fall through to add it to @target_list. */
		}

add:
		last_is_target = true;
		range_len = min(extent_map_end(em), start + len) - cur;
		/*
		 * This one is a good target, check if it can be merged into
		 * the last range of the target list.
		 */
		if (!list_empty(target_list)) {
			struct defrag_target_range *last;

			last = list_entry(target_list->prev,
					  struct defrag_target_range, list);
			ASSERT(last->start + last->len <= cur);
			if (last->start + last->len == cur) {
				/* Mergeable, enlarge the last entry */
				last->len += range_len;
				goto next;
			}
			/* Fall through to allocate a new entry */
		}

		/* Allocate new defrag_target_range */
		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (!new) {
			free_extent_map(em);
			ret = -ENOMEM;
			break;
		}
		new->start = cur;
		new->len = range_len;
		list_add_tail(&new->list, target_list);

next:
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (ret < 0) {
		struct defrag_target_range *entry;
		struct defrag_target_range *tmp;

		list_for_each_entry_safe(entry, tmp, target_list, list) {
			list_del_init(&entry->list);
			kfree(entry);
		}
	}
	if (!ret && last_scanned_ret) {
		/*
		 * If the last extent is not a target, the caller can skip to
		 * the end of that extent.
		 * Otherwise, we can only go to the end of the specified range.
		 */
		if (!last_is_target)
			*last_scanned_ret = max(cur, *last_scanned_ret);
		else
			*last_scanned_ret = max(start + len, *last_scanned_ret);
	}
	return ret;
}

#define CLUSTER_SIZE	(SZ_256K)
static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));

/*
 * Defrag one contiguous target range.
 *
 * @inode:	target inode
 * @target:	target range to defrag
 * @pages:	locked pages covering the defrag range
 * @nr_pages:	number of locked pages
 *
 * Caller should ensure:
 *
 * - Pages are prepared
 *   Pages should be locked, no ordered extent in the pages range,
 *   no writeback.
 *
 * - Extent bits are locked
 */
static int defrag_one_locked_target(struct btrfs_inode *inode,
				    struct defrag_target_range *target,
				    struct page **pages, int nr_pages,
				    struct extent_state **cached_state)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_changeset *data_reserved = NULL;
	const u64 start = target->start;
	const u64 len = target->len;
	unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
	unsigned long start_index = start >> PAGE_SHIFT;
	unsigned long first_index = page_index(pages[0]);
	int ret = 0;
	int i;

	ASSERT(last_index - first_index + 1 <= nr_pages);

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
	if (ret < 0)
		return ret;
	clear_extent_bit(&inode->io_tree, start, start + len - 1,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			 EXTENT_DEFRAG, cached_state);
	set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);

	/* Update the page status */
	for (i = start_index - first_index; i <= last_index - first_index; i++) {
		ClearPageChecked(pages[i]);
		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
	}
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(data_reserved);

	return ret;
}

static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
			    u32 extent_thresh, u64 newer_than, bool do_compress,
			    u64 *last_scanned_ret)
{
	struct extent_state *cached_state = NULL;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	struct page **pages;
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	u64 last_index = (start + len - 1) >> PAGE_SHIFT;
	u64 start_index = start >> PAGE_SHIFT;
	unsigned int nr_pages = last_index - start_index + 1;
	int ret = 0;
	int i;

	ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));

	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!pages)
		return -ENOMEM;

	/* Prepare all pages */
	for (i = 0; i < nr_pages; i++) {
		pages[i] = defrag_prepare_one_page(inode, start_index + i);
		if (IS_ERR(pages[i])) {
			ret = PTR_ERR(pages[i]);
			pages[i] = NULL;
			goto free_pages;
		}
	}
	for (i = 0; i < nr_pages; i++)
		wait_on_page_writeback(pages[i]);

	/* Lock the pages range */
	lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		    (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		    &cached_state);
	/*
	 * Now we have a consistent view of the extent map, re-check which
	 * range really needs to be defragged.
	 *
	 * And this time we have the extent locked already, pass @locked = true
	 * so that we won't relock the extent range and cause a deadlock.
	 */
	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, true,
				     &target_list, last_scanned_ret);
	if (ret < 0)
		goto unlock_extent;

	list_for_each_entry(entry, &target_list, list) {
		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
					       &cached_state);
		if (ret < 0)
			break;
	}

	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
unlock_extent:
	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
		      &cached_state);
free_pages:
	for (i = 0; i < nr_pages; i++) {
		if (pages[i]) {
			unlock_page(pages[i]);
			put_page(pages[i]);
		}
	}
	kfree(pages);
	return ret;
}

static int defrag_one_cluster(struct btrfs_inode *inode,
			      struct file_ra_state *ra,
			      u64 start, u32 len, u32 extent_thresh,
			      u64 newer_than, bool do_compress,
			      unsigned long *sectors_defragged,
			      unsigned long max_sectors,
			      u64 *last_scanned_ret)
{
	const u32 sectorsize = inode->root->fs_info->sectorsize;
	struct defrag_target_range *entry;
	struct defrag_target_range *tmp;
	LIST_HEAD(target_list);
	int ret;

	ret = defrag_collect_targets(inode, start, len, extent_thresh,
				     newer_than, do_compress, false,
				     &target_list, NULL);
	if (ret < 0)
		goto out;

	list_for_each_entry(entry, &target_list, list) {
		u32 range_len = entry->len;

		/* Reached or beyond the limit */
		if (max_sectors && *sectors_defragged >= max_sectors) {
			ret = 1;
			break;
		}

		if (max_sectors)
			range_len = min_t(u32, range_len,
				(max_sectors - *sectors_defragged) * sectorsize);

		/*
		 * If defrag_one_range() has updated last_scanned_ret,
		 * our range may already be invalid (e.g. hole punched).
		 * Skip if our range is before last_scanned_ret, as there is
		 * no need to defrag the range anymore.
		 */
		if (entry->start + range_len <= *last_scanned_ret)
			continue;

		if (ra)
			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
				ra, NULL, entry->start >> PAGE_SHIFT,
				((entry->start + range_len - 1) >> PAGE_SHIFT) -
				(entry->start >> PAGE_SHIFT) + 1);
		/*
		 * Here we may not defrag any range if holes are punched before
		 * we locked the pages.
		 * But that's fine, it only affects the @sectors_defragged
		 * accounting.
		 */
		ret = defrag_one_range(inode, entry->start, range_len,
				       extent_thresh, newer_than, do_compress,
				       last_scanned_ret);
		if (ret < 0)
			break;
		*sectors_defragged += range_len >>
				      inode->root->fs_info->sectorsize_bits;
	}
out:
	list_for_each_entry_safe(entry, tmp, &target_list, list) {
		list_del_init(&entry->list);
		kfree(entry);
	}
	if (ret >= 0)
		*last_scanned_ret = max(*last_scanned_ret, start + len);
	return ret;
}

/*
 * Entry point to file defragmentation.
 *
 * @inode:	   inode to be defragged
 * @ra:		   readahead state (can be NULL)
 * @range:	   defrag options including range and flags
 * @newer_than:	   minimum transid to defrag
 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
 *		   will be defragged.
 *
 * Return <0 for error.
 * Return >=0 for the number of sectors defragged, and range->start will be
 * updated to indicate the file offset where the next defrag should start.
 * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early
 * without defragging the whole range).
 */
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
		      struct btrfs_ioctl_defrag_range_args *range,
		      u64 newer_than, unsigned long max_to_defrag)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long sectors_defragged = 0;
	u64 isize = i_size_read(inode);
	u64 cur;
	u64 last_byte;
	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
	bool ra_allocated = false;
	int compress_type = BTRFS_COMPRESS_ZLIB;
	int ret = 0;
	u32 extent_thresh = range->extent_thresh;
	pgoff_t start_index;

	if (isize == 0)
		return 0;

	if (range->start >= isize)
		return -EINVAL;

	if (do_compress) {
		if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
			return -EINVAL;
		if (range->compress_type)
			compress_type = range->compress_type;
	}

	if (extent_thresh == 0)
		extent_thresh = SZ_256K;

	if (range->start + range->len > range->start) {
		/* Got a specific range */
		last_byte = min(isize, range->start + range->len);
	} else {
		/* Defrag until file end */
		last_byte = isize;
	}

	/* Align the range */
	cur = round_down(range->start, fs_info->sectorsize);
	last_byte = round_up(last_byte, fs_info->sectorsize) - 1;

	/*
	 * If we were not given a ra, allocate a readahead context. As
	 * readahead is just an optimization, defrag will work without it so
	 * we don't error out.
	 */
	if (!ra) {
		ra_allocated = true;
		ra = kzalloc(sizeof(*ra), GFP_KERNEL);
		if (ra)
			file_ra_state_init(ra, inode->i_mapping);
	}

	/*
	 * Make writeback start from the beginning of the range, so that the
	 * defrag range can be written sequentially.
	 */
	start_index = cur >> PAGE_SHIFT;
	if (start_index < inode->i_mapping->writeback_index)
		inode->i_mapping->writeback_index = start_index;

	while (cur < last_byte) {
		const unsigned long prev_sectors_defragged = sectors_defragged;
		u64 last_scanned = cur;
		u64 cluster_end;

		if (btrfs_defrag_cancelled(fs_info)) {
			ret = -EAGAIN;
			break;
		}

		/* We want the cluster end at page boundary when possible */
		cluster_end = (((cur >> PAGE_SHIFT) +
			       (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
		cluster_end = min(cluster_end, last_byte);

		btrfs_inode_lock(BTRFS_I(inode), 0);
		if (IS_SWAPFILE(inode)) {
			ret = -ETXTBSY;
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
			btrfs_inode_unlock(BTRFS_I(inode), 0);
			break;
		}
		if (do_compress)
			BTRFS_I(inode)->defrag_compress = compress_type;
		ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
				cluster_end + 1 - cur, extent_thresh,
				newer_than, do_compress, &sectors_defragged,
				max_to_defrag, &last_scanned);

		if (sectors_defragged > prev_sectors_defragged)
			balance_dirty_pages_ratelimited(inode->i_mapping);

		btrfs_inode_unlock(BTRFS_I(inode), 0);
		if (ret < 0)
			break;
		cur = max(cluster_end + 1, last_scanned);
		if (ret > 0) {
			ret = 0;
			break;
		}
		cond_resched();
	}

	if (ra_allocated)
		kfree(ra);
	/*
	 * Update range.start for autodefrag, this will indicate where to start
	 * in the next run.
	 */
	range->start = cur;
	if (sectors_defragged) {
		/*
		 * We have defragged some sectors, for the compression case
		 * they need to be written back immediately.
		 */
		if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
			filemap_flush(inode->i_mapping);
			if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				     &BTRFS_I(inode)->runtime_flags))
				filemap_flush(inode->i_mapping);
		}
		if (range->compress_type == BTRFS_COMPRESS_LZO)
			btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
		else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
		ret = sectors_defragged;
	}
	if (do_compress) {
		btrfs_inode_lock(BTRFS_I(inode), 0);
		BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
		btrfs_inode_unlock(BTRFS_I(inode), 0);
	}
	return ret;
}

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}