1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Red Hat. All rights reserved. 4 */ 5 6 #include <linux/pagemap.h> 7 #include <linux/sched.h> 8 #include <linux/sched/signal.h> 9 #include <linux/slab.h> 10 #include <linux/math64.h> 11 #include <linux/ratelimit.h> 12 #include <linux/error-injection.h> 13 #include <linux/sched/mm.h> 14 #include <linux/string_choices.h> 15 #include "extent-tree.h" 16 #include "fs.h" 17 #include "messages.h" 18 #include "misc.h" 19 #include "free-space-cache.h" 20 #include "transaction.h" 21 #include "disk-io.h" 22 #include "extent_io.h" 23 #include "space-info.h" 24 #include "block-group.h" 25 #include "discard.h" 26 #include "subpage.h" 27 #include "inode-item.h" 28 #include "accessors.h" 29 #include "file-item.h" 30 #include "file.h" 31 #include "super.h" 32 #include "relocation.h" 33 34 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 35 #define MAX_CACHE_BYTES_PER_GIG SZ_64K 36 #define FORCE_EXTENT_THRESHOLD SZ_1M 37 38 static struct kmem_cache *btrfs_free_space_cachep; 39 static struct kmem_cache *btrfs_free_space_bitmap_cachep; 40 41 struct btrfs_trim_range { 42 u64 start; 43 u64 bytes; 44 struct list_head list; 45 }; 46 47 static int link_free_space(struct btrfs_free_space_ctl *ctl, 48 struct btrfs_free_space *info); 49 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 50 struct btrfs_free_space *info, bool update_stat); 51 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 52 struct btrfs_free_space *bitmap_info, u64 *offset, 53 u64 *bytes, bool for_alloc); 54 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 55 struct btrfs_free_space *bitmap_info); 56 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 57 struct btrfs_free_space *info, u64 offset, 58 u64 bytes, bool update_stats); 59 60 static void btrfs_crc32c_final(u32 crc, u8 *result) 61 { 62 put_unaligned_le32(~crc, result); 63 } 64 65 static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 66 { 67 struct btrfs_free_space *info; 68 struct rb_node *node; 69 70 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 71 info = rb_entry(node, struct btrfs_free_space, offset_index); 72 if (!info->bitmap) { 73 unlink_free_space(ctl, info, true); 74 kmem_cache_free(btrfs_free_space_cachep, info); 75 } else { 76 free_bitmap(ctl, info); 77 } 78 79 cond_resched_lock(&ctl->tree_lock); 80 } 81 } 82 83 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 84 struct btrfs_path *path, 85 u64 offset) 86 { 87 struct btrfs_key key; 88 struct btrfs_key location; 89 struct btrfs_disk_key disk_key; 90 struct btrfs_free_space_header *header; 91 struct extent_buffer *leaf; 92 struct btrfs_inode *inode; 93 unsigned nofs_flag; 94 int ret; 95 96 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 97 key.type = 0; 98 key.offset = offset; 99 100 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 101 if (ret < 0) 102 return ERR_PTR(ret); 103 if (ret > 0) { 104 btrfs_release_path(path); 105 return ERR_PTR(-ENOENT); 106 } 107 108 leaf = path->nodes[0]; 109 header = btrfs_item_ptr(leaf, path->slots[0], 110 struct btrfs_free_space_header); 111 btrfs_free_space_key(leaf, header, &disk_key); 112 btrfs_disk_key_to_cpu(&location, &disk_key); 113 btrfs_release_path(path); 114 115 /* 116 * We are often under a trans handle at this point, so we need to make 117 * sure NOFS is set to keep us from deadlocking. 118 */ 119 nofs_flag = memalloc_nofs_save(); 120 inode = btrfs_iget_path(location.objectid, root, path); 121 btrfs_release_path(path); 122 memalloc_nofs_restore(nofs_flag); 123 if (IS_ERR(inode)) 124 return ERR_CAST(inode); 125 126 mapping_set_gfp_mask(inode->vfs_inode.i_mapping, 127 mapping_gfp_constraint(inode->vfs_inode.i_mapping, 128 ~(__GFP_FS | __GFP_HIGHMEM))); 129 130 return &inode->vfs_inode; 131 } 132 133 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, 134 struct btrfs_path *path) 135 { 136 struct btrfs_fs_info *fs_info = block_group->fs_info; 137 struct inode *inode = NULL; 138 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 139 140 spin_lock(&block_group->lock); 141 if (block_group->inode) 142 inode = igrab(&block_group->inode->vfs_inode); 143 spin_unlock(&block_group->lock); 144 if (inode) 145 return inode; 146 147 inode = __lookup_free_space_inode(fs_info->tree_root, path, 148 block_group->start); 149 if (IS_ERR(inode)) 150 return inode; 151 152 spin_lock(&block_group->lock); 153 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 154 btrfs_info(fs_info, "Old style space inode found, converting."); 155 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 156 BTRFS_INODE_NODATACOW; 157 block_group->disk_cache_state = BTRFS_DC_CLEAR; 158 } 159 160 if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) 161 block_group->inode = BTRFS_I(igrab(inode)); 162 spin_unlock(&block_group->lock); 163 164 return inode; 165 } 166 167 static int __create_free_space_inode(struct btrfs_root *root, 168 struct btrfs_trans_handle *trans, 169 struct btrfs_path *path, 170 u64 ino, u64 offset) 171 { 172 struct btrfs_key key; 173 struct btrfs_disk_key disk_key; 174 struct btrfs_free_space_header *header; 175 struct btrfs_inode_item *inode_item; 176 struct extent_buffer *leaf; 177 /* We inline CRCs for the free disk space cache */ 178 const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | 179 BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 180 int ret; 181 182 ret = btrfs_insert_empty_inode(trans, root, path, ino); 183 if (ret) 184 return ret; 185 186 leaf = path->nodes[0]; 187 inode_item = btrfs_item_ptr(leaf, path->slots[0], 188 struct btrfs_inode_item); 189 btrfs_item_key(leaf, &disk_key, path->slots[0]); 190 memzero_extent_buffer(leaf, (unsigned long)inode_item, 191 sizeof(*inode_item)); 192 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 193 btrfs_set_inode_size(leaf, inode_item, 0); 194 btrfs_set_inode_nbytes(leaf, inode_item, 0); 195 btrfs_set_inode_uid(leaf, inode_item, 0); 196 btrfs_set_inode_gid(leaf, inode_item, 0); 197 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 198 btrfs_set_inode_flags(leaf, inode_item, flags); 199 btrfs_set_inode_nlink(leaf, inode_item, 1); 200 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 201 btrfs_set_inode_block_group(leaf, inode_item, offset); 202 btrfs_release_path(path); 203 204 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 205 key.type = 0; 206 key.offset = offset; 207 ret = btrfs_insert_empty_item(trans, root, path, &key, 208 sizeof(struct btrfs_free_space_header)); 209 if (ret < 0) { 210 btrfs_release_path(path); 211 return ret; 212 } 213 214 leaf = path->nodes[0]; 215 header = btrfs_item_ptr(leaf, path->slots[0], 216 struct btrfs_free_space_header); 217 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); 218 btrfs_set_free_space_key(leaf, header, &disk_key); 219 btrfs_release_path(path); 220 221 return 0; 222 } 223 224 int create_free_space_inode(struct btrfs_trans_handle *trans, 225 struct btrfs_block_group *block_group, 226 struct btrfs_path *path) 227 { 228 int ret; 229 u64 ino; 230 231 ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); 232 if (ret < 0) 233 return ret; 234 235 return __create_free_space_inode(trans->fs_info->tree_root, trans, path, 236 ino, block_group->start); 237 } 238 239 /* 240 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode 241 * handles lookup, otherwise it takes ownership and iputs the inode. 242 * Don't reuse an inode pointer after passing it into this function. 243 */ 244 int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, 245 struct inode *inode, 246 struct btrfs_block_group *block_group) 247 { 248 BTRFS_PATH_AUTO_FREE(path); 249 struct btrfs_key key; 250 int ret = 0; 251 252 path = btrfs_alloc_path(); 253 if (!path) 254 return -ENOMEM; 255 256 if (!inode) 257 inode = lookup_free_space_inode(block_group, path); 258 if (IS_ERR(inode)) { 259 if (PTR_ERR(inode) != -ENOENT) 260 ret = PTR_ERR(inode); 261 return ret; 262 } 263 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 264 if (ret) { 265 btrfs_add_delayed_iput(BTRFS_I(inode)); 266 return ret; 267 } 268 clear_nlink(inode); 269 /* One for the block groups ref */ 270 spin_lock(&block_group->lock); 271 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { 272 block_group->inode = NULL; 273 spin_unlock(&block_group->lock); 274 iput(inode); 275 } else { 276 spin_unlock(&block_group->lock); 277 } 278 /* One for the lookup ref */ 279 btrfs_add_delayed_iput(BTRFS_I(inode)); 280 281 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 282 key.type = 0; 283 key.offset = block_group->start; 284 ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, 285 -1, 1); 286 if (ret) { 287 if (ret > 0) 288 ret = 0; 289 return ret; 290 } 291 return btrfs_del_item(trans, trans->fs_info->tree_root, path); 292 } 293 294 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, 295 struct btrfs_block_group *block_group, 296 struct inode *vfs_inode) 297 { 298 struct btrfs_truncate_control control = { 299 .inode = BTRFS_I(vfs_inode), 300 .new_size = 0, 301 .ino = btrfs_ino(BTRFS_I(vfs_inode)), 302 .min_type = BTRFS_EXTENT_DATA_KEY, 303 .clear_extent_range = true, 304 }; 305 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 306 struct btrfs_root *root = inode->root; 307 struct extent_state *cached_state = NULL; 308 int ret = 0; 309 bool locked = false; 310 311 if (block_group) { 312 BTRFS_PATH_AUTO_FREE(path); 313 314 path = btrfs_alloc_path(); 315 if (!path) { 316 ret = -ENOMEM; 317 goto fail; 318 } 319 locked = true; 320 mutex_lock(&trans->transaction->cache_write_mutex); 321 if (!list_empty(&block_group->io_list)) { 322 list_del_init(&block_group->io_list); 323 324 btrfs_wait_cache_io(trans, block_group, path); 325 btrfs_put_block_group(block_group); 326 } 327 328 /* 329 * now that we've truncated the cache away, its no longer 330 * setup or written 331 */ 332 spin_lock(&block_group->lock); 333 block_group->disk_cache_state = BTRFS_DC_CLEAR; 334 spin_unlock(&block_group->lock); 335 } 336 337 btrfs_i_size_write(inode, 0); 338 truncate_pagecache(vfs_inode, 0); 339 340 btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 341 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); 342 343 /* 344 * We skip the throttling logic for free space cache inodes, so we don't 345 * need to check for -EAGAIN. 346 */ 347 ret = btrfs_truncate_inode_items(trans, root, &control); 348 349 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); 350 btrfs_inode_safe_disk_i_size_write(inode, control.last_size); 351 352 btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 353 if (ret) 354 goto fail; 355 356 ret = btrfs_update_inode(trans, inode); 357 358 fail: 359 if (locked) 360 mutex_unlock(&trans->transaction->cache_write_mutex); 361 if (ret) 362 btrfs_abort_transaction(trans, ret); 363 364 return ret; 365 } 366 367 static void readahead_cache(struct inode *inode) 368 { 369 struct file_ra_state ra; 370 pgoff_t last_index; 371 372 file_ra_state_init(&ra, inode->i_mapping); 373 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 374 375 page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); 376 } 377 378 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, 379 int write) 380 { 381 int num_pages; 382 383 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 384 385 /* Make sure we can fit our crcs and generation into the first page */ 386 if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) 387 return -ENOSPC; 388 389 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); 390 391 io_ctl->pages = kzalloc_objs(struct page *, num_pages, GFP_NOFS); 392 if (!io_ctl->pages) 393 return -ENOMEM; 394 395 io_ctl->num_pages = num_pages; 396 io_ctl->fs_info = inode_to_fs_info(inode); 397 io_ctl->inode = inode; 398 399 return 0; 400 } 401 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); 402 403 static void io_ctl_free(struct btrfs_io_ctl *io_ctl) 404 { 405 kfree(io_ctl->pages); 406 io_ctl->pages = NULL; 407 } 408 409 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) 410 { 411 if (io_ctl->cur) { 412 io_ctl->cur = NULL; 413 io_ctl->orig = NULL; 414 } 415 } 416 417 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) 418 { 419 ASSERT(io_ctl->index < io_ctl->num_pages); 420 io_ctl->page = io_ctl->pages[io_ctl->index++]; 421 io_ctl->cur = page_address(io_ctl->page); 422 io_ctl->orig = io_ctl->cur; 423 io_ctl->size = PAGE_SIZE; 424 if (clear) 425 clear_page(io_ctl->cur); 426 } 427 428 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) 429 { 430 int i; 431 432 io_ctl_unmap_page(io_ctl); 433 434 for (i = 0; i < io_ctl->num_pages; i++) { 435 if (io_ctl->pages[i]) { 436 unlock_page(io_ctl->pages[i]); 437 put_page(io_ctl->pages[i]); 438 } 439 } 440 } 441 442 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) 443 { 444 struct folio *folio; 445 struct inode *inode = io_ctl->inode; 446 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 447 int i; 448 449 for (i = 0; i < io_ctl->num_pages; i++) { 450 int ret; 451 452 folio = __filemap_get_folio(inode->i_mapping, i, 453 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, 454 mask); 455 if (IS_ERR(folio)) { 456 io_ctl_drop_pages(io_ctl); 457 return PTR_ERR(folio); 458 } 459 460 ret = set_folio_extent_mapped(folio); 461 if (ret < 0) { 462 folio_unlock(folio); 463 folio_put(folio); 464 io_ctl_drop_pages(io_ctl); 465 return ret; 466 } 467 468 io_ctl->pages[i] = &folio->page; 469 if (uptodate && !folio_test_uptodate(folio)) { 470 btrfs_read_folio(NULL, folio); 471 folio_lock(folio); 472 if (folio->mapping != inode->i_mapping) { 473 btrfs_err(BTRFS_I(inode)->root->fs_info, 474 "free space cache page truncated"); 475 io_ctl_drop_pages(io_ctl); 476 return -EIO; 477 } 478 if (!folio_test_uptodate(folio)) { 479 btrfs_err(BTRFS_I(inode)->root->fs_info, 480 "error reading free space cache"); 481 io_ctl_drop_pages(io_ctl); 482 return -EIO; 483 } 484 } 485 } 486 487 for (i = 0; i < io_ctl->num_pages; i++) 488 clear_page_dirty_for_io(io_ctl->pages[i]); 489 490 return 0; 491 } 492 493 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 494 { 495 io_ctl_map_page(io_ctl, 1); 496 497 /* 498 * Skip the csum areas. If we don't check crcs then we just have a 499 * 64bit chunk at the front of the first page. 500 */ 501 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 502 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 503 504 put_unaligned_le64(generation, io_ctl->cur); 505 io_ctl->cur += sizeof(u64); 506 } 507 508 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 509 { 510 u64 cache_gen; 511 512 /* 513 * Skip the crc area. If we don't check crcs then we just have a 64bit 514 * chunk at the front of the first page. 515 */ 516 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 517 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 518 519 cache_gen = get_unaligned_le64(io_ctl->cur); 520 if (cache_gen != generation) { 521 btrfs_err_rl(io_ctl->fs_info, 522 "space cache generation (%llu) does not match inode (%llu)", 523 cache_gen, generation); 524 io_ctl_unmap_page(io_ctl); 525 return -EIO; 526 } 527 io_ctl->cur += sizeof(u64); 528 return 0; 529 } 530 531 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) 532 { 533 u32 *tmp; 534 u32 crc = ~(u32)0; 535 unsigned offset = 0; 536 537 if (index == 0) 538 offset = sizeof(u32) * io_ctl->num_pages; 539 540 crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 541 btrfs_crc32c_final(crc, (u8 *)&crc); 542 io_ctl_unmap_page(io_ctl); 543 tmp = page_address(io_ctl->pages[0]); 544 tmp += index; 545 *tmp = crc; 546 } 547 548 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) 549 { 550 u32 *tmp, val; 551 u32 crc = ~(u32)0; 552 unsigned offset = 0; 553 554 if (index == 0) 555 offset = sizeof(u32) * io_ctl->num_pages; 556 557 tmp = page_address(io_ctl->pages[0]); 558 tmp += index; 559 val = *tmp; 560 561 io_ctl_map_page(io_ctl, 0); 562 crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 563 btrfs_crc32c_final(crc, (u8 *)&crc); 564 if (val != crc) { 565 btrfs_err_rl(io_ctl->fs_info, 566 "csum mismatch on free space cache"); 567 io_ctl_unmap_page(io_ctl); 568 return -EIO; 569 } 570 571 return 0; 572 } 573 574 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, 575 void *bitmap) 576 { 577 struct btrfs_free_space_entry *entry; 578 579 if (!io_ctl->cur) 580 return -ENOSPC; 581 582 entry = io_ctl->cur; 583 put_unaligned_le64(offset, &entry->offset); 584 put_unaligned_le64(bytes, &entry->bytes); 585 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : 586 BTRFS_FREE_SPACE_EXTENT; 587 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 588 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 589 590 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 591 return 0; 592 593 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 594 595 /* No more pages to map */ 596 if (io_ctl->index >= io_ctl->num_pages) 597 return 0; 598 599 /* map the next page */ 600 io_ctl_map_page(io_ctl, 1); 601 return 0; 602 } 603 604 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) 605 { 606 if (!io_ctl->cur) 607 return -ENOSPC; 608 609 /* 610 * If we aren't at the start of the current page, unmap this one and 611 * map the next one if there is any left. 612 */ 613 if (io_ctl->cur != io_ctl->orig) { 614 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 615 if (io_ctl->index >= io_ctl->num_pages) 616 return -ENOSPC; 617 io_ctl_map_page(io_ctl, 0); 618 } 619 620 copy_page(io_ctl->cur, bitmap); 621 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 622 if (io_ctl->index < io_ctl->num_pages) 623 io_ctl_map_page(io_ctl, 0); 624 return 0; 625 } 626 627 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) 628 { 629 /* 630 * If we're not on the boundary we know we've modified the page and we 631 * need to crc the page. 632 */ 633 if (io_ctl->cur != io_ctl->orig) 634 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 635 else 636 io_ctl_unmap_page(io_ctl); 637 638 while (io_ctl->index < io_ctl->num_pages) { 639 io_ctl_map_page(io_ctl, 1); 640 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 641 } 642 } 643 644 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, 645 struct btrfs_free_space *entry, u8 *type) 646 { 647 struct btrfs_free_space_entry *e; 648 int ret; 649 650 if (!io_ctl->cur) { 651 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 652 if (ret) 653 return ret; 654 } 655 656 e = io_ctl->cur; 657 entry->offset = get_unaligned_le64(&e->offset); 658 entry->bytes = get_unaligned_le64(&e->bytes); 659 *type = e->type; 660 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 661 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 662 663 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 664 return 0; 665 666 io_ctl_unmap_page(io_ctl); 667 668 return 0; 669 } 670 671 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, 672 struct btrfs_free_space *entry) 673 { 674 int ret; 675 676 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 677 if (ret) 678 return ret; 679 680 copy_page(entry->bitmap, io_ctl->cur); 681 io_ctl_unmap_page(io_ctl); 682 683 return 0; 684 } 685 686 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) 687 { 688 struct btrfs_block_group *block_group = ctl->block_group; 689 const int unit = block_group->fs_info->sectorsize; 690 u64 max_bytes; 691 u64 bitmap_bytes; 692 u64 extent_bytes; 693 u64 size = block_group->length; 694 u64 bytes_per_bg = BITS_PER_BITMAP * unit; 695 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 696 697 max_bitmaps = max_t(u64, max_bitmaps, 1); 698 699 if (ctl->total_bitmaps > max_bitmaps) 700 btrfs_err(block_group->fs_info, 701 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", 702 block_group->start, block_group->length, 703 ctl->total_bitmaps, unit, max_bitmaps, 704 bytes_per_bg); 705 ASSERT(ctl->total_bitmaps <= max_bitmaps); 706 707 /* 708 * We are trying to keep the total amount of memory used per 1GiB of 709 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation 710 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of 711 * bitmaps, we may end up using more memory than this. 712 */ 713 if (size < SZ_1G) 714 max_bytes = MAX_CACHE_BYTES_PER_GIG; 715 else 716 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); 717 718 bitmap_bytes = ctl->total_bitmaps * unit; 719 720 /* 721 * we want the extent entry threshold to always be at most 1/2 the max 722 * bytes we can have, or whatever is less than that. 723 */ 724 extent_bytes = max_bytes - bitmap_bytes; 725 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); 726 727 ctl->extents_thresh = 728 div_u64(extent_bytes, sizeof(struct btrfs_free_space)); 729 } 730 731 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 732 struct btrfs_free_space_ctl *ctl, 733 struct btrfs_path *path, u64 offset) 734 { 735 struct btrfs_fs_info *fs_info = root->fs_info; 736 struct btrfs_free_space_header *header; 737 struct extent_buffer *leaf; 738 struct btrfs_io_ctl io_ctl; 739 struct btrfs_key key; 740 struct btrfs_free_space *e, *n; 741 LIST_HEAD(bitmaps); 742 u64 num_entries; 743 u64 num_bitmaps; 744 u64 generation; 745 u8 type; 746 int ret = 0; 747 748 /* Nothing in the space cache, goodbye */ 749 if (!i_size_read(inode)) 750 return 0; 751 752 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 753 key.type = 0; 754 key.offset = offset; 755 756 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 757 if (ret < 0) 758 return 0; 759 else if (ret > 0) { 760 btrfs_release_path(path); 761 return 0; 762 } 763 764 ret = -1; 765 766 leaf = path->nodes[0]; 767 header = btrfs_item_ptr(leaf, path->slots[0], 768 struct btrfs_free_space_header); 769 num_entries = btrfs_free_space_entries(leaf, header); 770 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 771 generation = btrfs_free_space_generation(leaf, header); 772 btrfs_release_path(path); 773 774 if (!BTRFS_I(inode)->generation) { 775 btrfs_info(fs_info, 776 "the free space cache file (%llu) is invalid, skip it", 777 offset); 778 return 0; 779 } 780 781 if (BTRFS_I(inode)->generation != generation) { 782 btrfs_err(fs_info, 783 "free space inode generation (%llu) did not match free space cache generation (%llu)", 784 BTRFS_I(inode)->generation, generation); 785 return 0; 786 } 787 788 if (!num_entries) 789 return 0; 790 791 ret = io_ctl_init(&io_ctl, inode, 0); 792 if (ret) 793 return ret; 794 795 readahead_cache(inode); 796 797 ret = io_ctl_prepare_pages(&io_ctl, true); 798 if (ret) 799 goto out; 800 801 ret = io_ctl_check_crc(&io_ctl, 0); 802 if (ret) 803 goto free_cache; 804 805 ret = io_ctl_check_generation(&io_ctl, generation); 806 if (ret) 807 goto free_cache; 808 809 while (num_entries) { 810 e = kmem_cache_zalloc(btrfs_free_space_cachep, 811 GFP_NOFS); 812 if (!e) { 813 ret = -ENOMEM; 814 goto free_cache; 815 } 816 817 ret = io_ctl_read_entry(&io_ctl, e, &type); 818 if (ret) { 819 kmem_cache_free(btrfs_free_space_cachep, e); 820 goto free_cache; 821 } 822 823 if (!e->bytes) { 824 ret = -1; 825 kmem_cache_free(btrfs_free_space_cachep, e); 826 goto free_cache; 827 } 828 829 if (type == BTRFS_FREE_SPACE_EXTENT) { 830 spin_lock(&ctl->tree_lock); 831 ret = link_free_space(ctl, e); 832 spin_unlock(&ctl->tree_lock); 833 if (ret) { 834 btrfs_err(fs_info, 835 "Duplicate entries in free space cache, dumping"); 836 kmem_cache_free(btrfs_free_space_cachep, e); 837 goto free_cache; 838 } 839 } else { 840 ASSERT(num_bitmaps); 841 num_bitmaps--; 842 e->bitmap = kmem_cache_zalloc( 843 btrfs_free_space_bitmap_cachep, GFP_NOFS); 844 if (!e->bitmap) { 845 ret = -ENOMEM; 846 kmem_cache_free( 847 btrfs_free_space_cachep, e); 848 goto free_cache; 849 } 850 spin_lock(&ctl->tree_lock); 851 ret = link_free_space(ctl, e); 852 if (ret) { 853 spin_unlock(&ctl->tree_lock); 854 btrfs_err(fs_info, 855 "Duplicate entries in free space cache, dumping"); 856 kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap); 857 kmem_cache_free(btrfs_free_space_cachep, e); 858 goto free_cache; 859 } 860 ctl->total_bitmaps++; 861 recalculate_thresholds(ctl); 862 spin_unlock(&ctl->tree_lock); 863 list_add_tail(&e->list, &bitmaps); 864 } 865 866 num_entries--; 867 } 868 869 io_ctl_unmap_page(&io_ctl); 870 871 /* 872 * We add the bitmaps at the end of the entries in order that 873 * the bitmap entries are added to the cache. 874 */ 875 list_for_each_entry_safe(e, n, &bitmaps, list) { 876 list_del_init(&e->list); 877 ret = io_ctl_read_bitmap(&io_ctl, e); 878 if (ret) 879 goto free_cache; 880 } 881 882 io_ctl_drop_pages(&io_ctl); 883 ret = 1; 884 out: 885 io_ctl_free(&io_ctl); 886 return ret; 887 free_cache: 888 io_ctl_drop_pages(&io_ctl); 889 890 spin_lock(&ctl->tree_lock); 891 __btrfs_remove_free_space_cache(ctl); 892 spin_unlock(&ctl->tree_lock); 893 goto out; 894 } 895 896 static int copy_free_space_cache(struct btrfs_free_space_ctl *ctl) 897 { 898 struct btrfs_free_space *info; 899 struct rb_node *n; 900 int ret = 0; 901 902 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { 903 info = rb_entry(n, struct btrfs_free_space, offset_index); 904 if (!info->bitmap) { 905 const u64 offset = info->offset; 906 const u64 bytes = info->bytes; 907 908 unlink_free_space(ctl, info, true); 909 spin_unlock(&ctl->tree_lock); 910 kmem_cache_free(btrfs_free_space_cachep, info); 911 ret = btrfs_add_free_space(ctl->block_group, offset, bytes); 912 spin_lock(&ctl->tree_lock); 913 } else { 914 u64 offset = info->offset; 915 u64 bytes = ctl->block_group->fs_info->sectorsize; 916 917 ret = search_bitmap(ctl, info, &offset, &bytes, false); 918 if (ret == 0) { 919 bitmap_clear_bits(ctl, info, offset, bytes, true); 920 spin_unlock(&ctl->tree_lock); 921 ret = btrfs_add_free_space(ctl->block_group, offset, 922 bytes); 923 spin_lock(&ctl->tree_lock); 924 } else { 925 free_bitmap(ctl, info); 926 ret = 0; 927 } 928 } 929 cond_resched_lock(&ctl->tree_lock); 930 } 931 return ret; 932 } 933 934 static struct lock_class_key btrfs_free_space_inode_key; 935 936 int load_free_space_cache(struct btrfs_block_group *block_group) 937 { 938 struct btrfs_fs_info *fs_info = block_group->fs_info; 939 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 940 struct btrfs_free_space_ctl tmp_ctl = {}; 941 struct inode *inode; 942 struct btrfs_path *path; 943 int ret = 0; 944 bool matched; 945 u64 used = block_group->used; 946 947 /* 948 * Because we could potentially discard our loaded free space, we want 949 * to load everything into a temporary structure first, and then if it's 950 * valid copy it all into the actual free space ctl. 951 */ 952 btrfs_init_free_space_ctl(block_group, &tmp_ctl); 953 954 /* 955 * If this block group has been marked to be cleared for one reason or 956 * another then we can't trust the on disk cache, so just return. 957 */ 958 spin_lock(&block_group->lock); 959 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 960 spin_unlock(&block_group->lock); 961 return 0; 962 } 963 spin_unlock(&block_group->lock); 964 965 path = btrfs_alloc_path(); 966 if (!path) 967 return 0; 968 path->search_commit_root = true; 969 path->skip_locking = true; 970 971 /* 972 * We must pass a path with search_commit_root set to btrfs_iget in 973 * order to avoid a deadlock when allocating extents for the tree root. 974 * 975 * When we are COWing an extent buffer from the tree root, when looking 976 * for a free extent, at extent-tree.c:find_free_extent(), we can find 977 * block group without its free space cache loaded. When we find one 978 * we must load its space cache which requires reading its free space 979 * cache's inode item from the root tree. If this inode item is located 980 * in the same leaf that we started COWing before, then we end up in 981 * deadlock on the extent buffer (trying to read lock it when we 982 * previously write locked it). 983 * 984 * It's safe to read the inode item using the commit root because 985 * block groups, once loaded, stay in memory forever (until they are 986 * removed) as well as their space caches once loaded. New block groups 987 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 988 * we will never try to read their inode item while the fs is mounted. 989 */ 990 inode = lookup_free_space_inode(block_group, path); 991 if (IS_ERR(inode)) { 992 btrfs_free_path(path); 993 return 0; 994 } 995 996 /* We may have converted the inode and made the cache invalid. */ 997 spin_lock(&block_group->lock); 998 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 999 spin_unlock(&block_group->lock); 1000 btrfs_free_path(path); 1001 goto out; 1002 } 1003 spin_unlock(&block_group->lock); 1004 1005 /* 1006 * Reinitialize the class of struct inode's mapping->invalidate_lock for 1007 * free space inodes to prevent false positives related to locks for normal 1008 * inodes. 1009 */ 1010 lockdep_set_class(&(&inode->i_data)->invalidate_lock, 1011 &btrfs_free_space_inode_key); 1012 1013 ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, 1014 path, block_group->start); 1015 btrfs_free_path(path); 1016 if (ret <= 0) 1017 goto out; 1018 1019 matched = (tmp_ctl.free_space == (block_group->length - used - 1020 block_group->bytes_super)); 1021 1022 if (matched) { 1023 spin_lock(&tmp_ctl.tree_lock); 1024 ret = copy_free_space_cache(&tmp_ctl); 1025 spin_unlock(&tmp_ctl.tree_lock); 1026 /* 1027 * ret == 1 means we successfully loaded the free space cache, 1028 * so we need to re-set it here. 1029 */ 1030 if (ret == 0) 1031 ret = 1; 1032 } else { 1033 /* 1034 * We need to call the _locked variant so we don't try to update 1035 * the discard counters. 1036 */ 1037 spin_lock(&tmp_ctl.tree_lock); 1038 __btrfs_remove_free_space_cache(&tmp_ctl); 1039 spin_unlock(&tmp_ctl.tree_lock); 1040 btrfs_warn(fs_info, 1041 "block group %llu has wrong amount of free space", 1042 block_group->start); 1043 ret = -1; 1044 } 1045 out: 1046 if (ret < 0) { 1047 /* This cache is bogus, make sure it gets cleared */ 1048 spin_lock(&block_group->lock); 1049 block_group->disk_cache_state = BTRFS_DC_CLEAR; 1050 spin_unlock(&block_group->lock); 1051 ret = 0; 1052 1053 btrfs_warn(fs_info, 1054 "failed to load free space cache for block group %llu, rebuilding it now", 1055 block_group->start); 1056 } 1057 1058 spin_lock(&ctl->tree_lock); 1059 btrfs_discard_update_discardable(block_group); 1060 spin_unlock(&ctl->tree_lock); 1061 iput(inode); 1062 return ret; 1063 } 1064 1065 static noinline_for_stack 1066 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, 1067 struct btrfs_block_group *block_group, 1068 int *entries, int *bitmaps, 1069 struct list_head *bitmap_list) 1070 { 1071 int ret; 1072 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1073 struct btrfs_free_cluster *cluster = NULL; 1074 struct btrfs_free_cluster *cluster_locked = NULL; 1075 struct rb_node *node = rb_first(&ctl->free_space_offset); 1076 struct btrfs_trim_range *trim_entry; 1077 1078 /* Get the cluster for this block_group if it exists */ 1079 if (!list_empty(&block_group->cluster_list)) { 1080 cluster = list_first_entry(&block_group->cluster_list, 1081 struct btrfs_free_cluster, block_group_list); 1082 } 1083 1084 if (!node && cluster) { 1085 cluster_locked = cluster; 1086 spin_lock(&cluster_locked->lock); 1087 node = rb_first(&cluster->root); 1088 cluster = NULL; 1089 } 1090 1091 /* Write out the extent entries */ 1092 while (node) { 1093 struct btrfs_free_space *e; 1094 1095 e = rb_entry(node, struct btrfs_free_space, offset_index); 1096 *entries += 1; 1097 1098 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 1099 e->bitmap); 1100 if (ret) 1101 goto fail; 1102 1103 if (e->bitmap) { 1104 list_add_tail(&e->list, bitmap_list); 1105 *bitmaps += 1; 1106 } 1107 node = rb_next(node); 1108 if (!node && cluster) { 1109 node = rb_first(&cluster->root); 1110 cluster_locked = cluster; 1111 spin_lock(&cluster_locked->lock); 1112 cluster = NULL; 1113 } 1114 } 1115 if (cluster_locked) { 1116 spin_unlock(&cluster_locked->lock); 1117 cluster_locked = NULL; 1118 } 1119 1120 /* 1121 * Make sure we don't miss any range that was removed from our rbtree 1122 * because trimming is running. Otherwise after a umount+mount (or crash 1123 * after committing the transaction) we would leak free space and get 1124 * an inconsistent free space cache report from fsck. 1125 */ 1126 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 1127 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 1128 trim_entry->bytes, NULL); 1129 if (ret) 1130 goto fail; 1131 *entries += 1; 1132 } 1133 1134 return 0; 1135 fail: 1136 if (cluster_locked) 1137 spin_unlock(&cluster_locked->lock); 1138 return -ENOSPC; 1139 } 1140 1141 static noinline_for_stack int 1142 update_cache_item(struct btrfs_trans_handle *trans, 1143 struct btrfs_root *root, 1144 struct inode *inode, 1145 struct btrfs_path *path, u64 offset, 1146 int entries, int bitmaps) 1147 { 1148 struct btrfs_key key; 1149 struct btrfs_free_space_header *header; 1150 struct extent_buffer *leaf; 1151 int ret; 1152 1153 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1154 key.type = 0; 1155 key.offset = offset; 1156 1157 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1158 if (ret < 0) { 1159 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1160 EXTENT_DELALLOC, NULL); 1161 return ret; 1162 } 1163 leaf = path->nodes[0]; 1164 if (ret > 0) { 1165 struct btrfs_key found_key; 1166 ASSERT(path->slots[0]); 1167 path->slots[0]--; 1168 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1169 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1170 found_key.offset != offset) { 1171 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 1172 inode->i_size - 1, EXTENT_DELALLOC, 1173 NULL); 1174 btrfs_release_path(path); 1175 return -ENOENT; 1176 } 1177 } 1178 1179 BTRFS_I(inode)->generation = trans->transid; 1180 header = btrfs_item_ptr(leaf, path->slots[0], 1181 struct btrfs_free_space_header); 1182 btrfs_set_free_space_entries(leaf, header, entries); 1183 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1184 btrfs_set_free_space_generation(leaf, header, trans->transid); 1185 btrfs_release_path(path); 1186 1187 return 0; 1188 } 1189 1190 static noinline_for_stack int write_pinned_extent_entries( 1191 struct btrfs_trans_handle *trans, 1192 struct btrfs_block_group *block_group, 1193 struct btrfs_io_ctl *io_ctl, 1194 int *entries) 1195 { 1196 u64 start, extent_start, extent_end, len; 1197 const u64 block_group_end = btrfs_block_group_end(block_group); 1198 struct extent_io_tree *unpin = NULL; 1199 int ret; 1200 1201 /* 1202 * We want to add any pinned extents to our free space cache 1203 * so we don't leak the space 1204 * 1205 * We shouldn't have switched the pinned extents yet so this is the 1206 * right one 1207 */ 1208 unpin = &trans->transaction->pinned_extents; 1209 1210 start = block_group->start; 1211 1212 while (start < block_group_end) { 1213 if (!btrfs_find_first_extent_bit(unpin, start, 1214 &extent_start, &extent_end, 1215 EXTENT_DIRTY, NULL)) 1216 return 0; 1217 1218 /* This pinned extent is out of our range */ 1219 if (extent_start >= block_group_end) 1220 return 0; 1221 1222 extent_start = max(extent_start, start); 1223 extent_end = min(block_group_end, extent_end + 1); 1224 len = extent_end - extent_start; 1225 1226 *entries += 1; 1227 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1228 if (ret) 1229 return -ENOSPC; 1230 1231 start = extent_end; 1232 } 1233 1234 return 0; 1235 } 1236 1237 static noinline_for_stack int 1238 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) 1239 { 1240 struct btrfs_free_space *entry, *next; 1241 int ret; 1242 1243 /* Write out the bitmaps */ 1244 list_for_each_entry_safe(entry, next, bitmap_list, list) { 1245 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1246 if (ret) 1247 return -ENOSPC; 1248 list_del_init(&entry->list); 1249 } 1250 1251 return 0; 1252 } 1253 1254 static int flush_dirty_cache(struct inode *inode) 1255 { 1256 int ret; 1257 1258 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); 1259 if (ret) 1260 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1261 EXTENT_DELALLOC, NULL); 1262 1263 return ret; 1264 } 1265 1266 static void noinline_for_stack 1267 cleanup_bitmap_list(struct list_head *bitmap_list) 1268 { 1269 struct btrfs_free_space *entry, *next; 1270 1271 list_for_each_entry_safe(entry, next, bitmap_list, list) 1272 list_del_init(&entry->list); 1273 } 1274 1275 static void noinline_for_stack 1276 cleanup_write_cache_enospc(struct inode *inode, 1277 struct btrfs_io_ctl *io_ctl, 1278 struct extent_state **cached_state) 1279 { 1280 io_ctl_drop_pages(io_ctl); 1281 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1282 cached_state); 1283 } 1284 1285 static int __btrfs_wait_cache_io(struct btrfs_root *root, 1286 struct btrfs_trans_handle *trans, 1287 struct btrfs_block_group *block_group, 1288 struct btrfs_io_ctl *io_ctl, 1289 struct btrfs_path *path, u64 offset) 1290 { 1291 int ret; 1292 struct inode *inode = io_ctl->inode; 1293 1294 if (!inode) 1295 return 0; 1296 1297 /* Flush the dirty pages in the cache file. */ 1298 ret = flush_dirty_cache(inode); 1299 if (ret) 1300 goto out; 1301 1302 /* Update the cache item to tell everyone this cache file is valid. */ 1303 ret = update_cache_item(trans, root, inode, path, offset, 1304 io_ctl->entries, io_ctl->bitmaps); 1305 out: 1306 if (ret) { 1307 invalidate_inode_pages2(inode->i_mapping); 1308 BTRFS_I(inode)->generation = 0; 1309 if (block_group) 1310 btrfs_debug(root->fs_info, 1311 "failed to write free space cache for block group %llu error %d", 1312 block_group->start, ret); 1313 } 1314 btrfs_update_inode(trans, BTRFS_I(inode)); 1315 1316 if (block_group) { 1317 /* the dirty list is protected by the dirty_bgs_lock */ 1318 spin_lock(&trans->transaction->dirty_bgs_lock); 1319 1320 /* the disk_cache_state is protected by the block group lock */ 1321 spin_lock(&block_group->lock); 1322 1323 /* 1324 * only mark this as written if we didn't get put back on 1325 * the dirty list while waiting for IO. Otherwise our 1326 * cache state won't be right, and we won't get written again 1327 */ 1328 if (!ret && list_empty(&block_group->dirty_list)) 1329 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1330 else if (ret) 1331 block_group->disk_cache_state = BTRFS_DC_ERROR; 1332 1333 spin_unlock(&block_group->lock); 1334 spin_unlock(&trans->transaction->dirty_bgs_lock); 1335 io_ctl->inode = NULL; 1336 iput(inode); 1337 } 1338 1339 return ret; 1340 1341 } 1342 1343 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 1344 struct btrfs_block_group *block_group, 1345 struct btrfs_path *path) 1346 { 1347 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, 1348 block_group, &block_group->io_ctl, 1349 path, block_group->start); 1350 } 1351 1352 /* 1353 * Write out cached info to an inode. 1354 * 1355 * @inode: freespace inode we are writing out 1356 * @ctl: free space cache we are going to write out 1357 * @block_group: block_group for this cache if it belongs to a block_group 1358 * @io_ctl: holds context for the io 1359 * @trans: the trans handle 1360 * 1361 * This function writes out a free space cache struct to disk for quick recovery 1362 * on mount. This will return 0 if it was successful in writing the cache out, 1363 * or an errno if it was not. 1364 */ 1365 static int __btrfs_write_out_cache(struct inode *inode, 1366 struct btrfs_block_group *block_group, 1367 struct btrfs_trans_handle *trans) 1368 { 1369 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1370 struct btrfs_io_ctl *io_ctl = &block_group->io_ctl; 1371 struct extent_state *cached_state = NULL; 1372 LIST_HEAD(bitmap_list); 1373 int entries = 0; 1374 int bitmaps = 0; 1375 int ret; 1376 bool must_iput = false; 1377 int i_size; 1378 1379 if (!i_size_read(inode)) 1380 return -EIO; 1381 1382 WARN_ON(io_ctl->pages); 1383 ret = io_ctl_init(io_ctl, inode, 1); 1384 if (ret) 1385 return ret; 1386 1387 if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) { 1388 down_write(&block_group->data_rwsem); 1389 spin_lock(&block_group->lock); 1390 if (block_group->delalloc_bytes) { 1391 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1392 spin_unlock(&block_group->lock); 1393 up_write(&block_group->data_rwsem); 1394 BTRFS_I(inode)->generation = 0; 1395 ret = 0; 1396 must_iput = true; 1397 goto out; 1398 } 1399 spin_unlock(&block_group->lock); 1400 } 1401 1402 /* Lock all pages first so we can lock the extent safely. */ 1403 ret = io_ctl_prepare_pages(io_ctl, false); 1404 if (ret) 1405 goto out_unlock; 1406 1407 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1408 &cached_state); 1409 1410 io_ctl_set_generation(io_ctl, trans->transid); 1411 1412 mutex_lock(&ctl->cache_writeout_mutex); 1413 /* Write out the extent entries in the free space cache */ 1414 spin_lock(&ctl->tree_lock); 1415 ret = write_cache_extent_entries(io_ctl, block_group, &entries, &bitmaps, 1416 &bitmap_list); 1417 if (ret) 1418 goto out_nospc_locked; 1419 1420 /* 1421 * Some spaces that are freed in the current transaction are pinned, 1422 * they will be added into free space cache after the transaction is 1423 * committed, we shouldn't lose them. 1424 * 1425 * If this changes while we are working we'll get added back to 1426 * the dirty list and redo it. No locking needed 1427 */ 1428 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); 1429 if (ret) 1430 goto out_nospc_locked; 1431 1432 /* 1433 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1434 * locked while doing it because a concurrent trim can be manipulating 1435 * or freeing the bitmap. 1436 */ 1437 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1438 spin_unlock(&ctl->tree_lock); 1439 mutex_unlock(&ctl->cache_writeout_mutex); 1440 if (ret) 1441 goto out_nospc; 1442 1443 /* Zero out the rest of the pages just to make sure */ 1444 io_ctl_zero_remaining_pages(io_ctl); 1445 1446 /* Everything is written out, now we dirty the pages in the file. */ 1447 i_size = i_size_read(inode); 1448 for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { 1449 u64 dirty_start = i * PAGE_SIZE; 1450 u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; 1451 1452 ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), 1453 dirty_start, dirty_len, &cached_state, false); 1454 if (ret < 0) 1455 goto out_nospc; 1456 } 1457 1458 if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) 1459 up_write(&block_group->data_rwsem); 1460 /* 1461 * Release the pages and unlock the extent, we will flush 1462 * them out later 1463 */ 1464 io_ctl_drop_pages(io_ctl); 1465 io_ctl_free(io_ctl); 1466 1467 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1468 &cached_state); 1469 1470 /* 1471 * at this point the pages are under IO and we're happy, 1472 * The caller is responsible for waiting on them and updating 1473 * the cache and the inode 1474 */ 1475 io_ctl->entries = entries; 1476 io_ctl->bitmaps = bitmaps; 1477 1478 ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1); 1479 if (ret) 1480 goto out; 1481 1482 return 0; 1483 1484 out_nospc_locked: 1485 cleanup_bitmap_list(&bitmap_list); 1486 spin_unlock(&ctl->tree_lock); 1487 mutex_unlock(&ctl->cache_writeout_mutex); 1488 1489 out_nospc: 1490 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1491 1492 out_unlock: 1493 if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) 1494 up_write(&block_group->data_rwsem); 1495 1496 out: 1497 io_ctl->inode = NULL; 1498 io_ctl_free(io_ctl); 1499 if (ret) { 1500 invalidate_inode_pages2(inode->i_mapping); 1501 BTRFS_I(inode)->generation = 0; 1502 } 1503 btrfs_update_inode(trans, BTRFS_I(inode)); 1504 if (must_iput) 1505 iput(inode); 1506 return ret; 1507 } 1508 1509 int btrfs_write_out_cache(struct btrfs_trans_handle *trans, 1510 struct btrfs_block_group *block_group, 1511 struct btrfs_path *path) 1512 { 1513 struct btrfs_fs_info *fs_info = trans->fs_info; 1514 struct inode *inode; 1515 int ret = 0; 1516 1517 spin_lock(&block_group->lock); 1518 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1519 spin_unlock(&block_group->lock); 1520 return 0; 1521 } 1522 spin_unlock(&block_group->lock); 1523 1524 inode = lookup_free_space_inode(block_group, path); 1525 if (IS_ERR(inode)) 1526 return 0; 1527 1528 ret = __btrfs_write_out_cache(inode, block_group, trans); 1529 if (ret) { 1530 btrfs_debug(fs_info, 1531 "failed to write free space cache for block group %llu error %d", 1532 block_group->start, ret); 1533 spin_lock(&block_group->lock); 1534 block_group->disk_cache_state = BTRFS_DC_ERROR; 1535 spin_unlock(&block_group->lock); 1536 1537 block_group->io_ctl.inode = NULL; 1538 iput(inode); 1539 } 1540 1541 /* 1542 * if ret == 0 the caller is expected to call btrfs_wait_cache_io 1543 * to wait for IO and put the inode 1544 */ 1545 1546 return ret; 1547 } 1548 1549 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1550 u64 offset) 1551 { 1552 ASSERT(offset >= bitmap_start); 1553 offset -= bitmap_start; 1554 return (unsigned long)(div_u64(offset, unit)); 1555 } 1556 1557 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1558 { 1559 return (unsigned long)(div_u64(bytes, unit)); 1560 } 1561 1562 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1563 u64 offset) 1564 { 1565 u64 bitmap_start; 1566 u64 bytes_per_bitmap; 1567 1568 bytes_per_bitmap = BITS_PER_BITMAP * ctl->block_group->fs_info->sectorsize; 1569 bitmap_start = offset - ctl->block_group->start; 1570 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1571 bitmap_start *= bytes_per_bitmap; 1572 bitmap_start += ctl->block_group->start; 1573 1574 return bitmap_start; 1575 } 1576 1577 static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, 1578 struct btrfs_free_cluster *cluster, 1579 struct btrfs_free_space *new_entry) 1580 { 1581 struct rb_root *root; 1582 struct rb_node **p; 1583 struct rb_node *parent = NULL; 1584 1585 lockdep_assert_held(&ctl->tree_lock); 1586 1587 if (cluster) { 1588 lockdep_assert_held(&cluster->lock); 1589 root = &cluster->root; 1590 } else { 1591 root = &ctl->free_space_offset; 1592 } 1593 1594 p = &root->rb_node; 1595 1596 while (*p) { 1597 struct btrfs_free_space *info; 1598 1599 parent = *p; 1600 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1601 1602 if (new_entry->offset < info->offset) { 1603 p = &(*p)->rb_left; 1604 } else if (new_entry->offset > info->offset) { 1605 p = &(*p)->rb_right; 1606 } else { 1607 /* 1608 * we could have a bitmap entry and an extent entry 1609 * share the same offset. If this is the case, we want 1610 * the extent entry to always be found first if we do a 1611 * linear search through the tree, since we want to have 1612 * the quickest allocation time, and allocating from an 1613 * extent is faster than allocating from a bitmap. So 1614 * if we're inserting a bitmap and we find an entry at 1615 * this offset, we want to go right, or after this entry 1616 * logically. If we are inserting an extent and we've 1617 * found a bitmap, we want to go left, or before 1618 * logically. 1619 */ 1620 if (new_entry->bitmap) { 1621 if (info->bitmap) { 1622 WARN_ON_ONCE(1); 1623 return -EEXIST; 1624 } 1625 p = &(*p)->rb_right; 1626 } else { 1627 if (!info->bitmap) { 1628 WARN_ON_ONCE(1); 1629 return -EEXIST; 1630 } 1631 p = &(*p)->rb_left; 1632 } 1633 } 1634 } 1635 1636 rb_link_node(&new_entry->offset_index, parent, p); 1637 rb_insert_color(&new_entry->offset_index, root); 1638 1639 return 0; 1640 } 1641 1642 /* 1643 * This is a little subtle. We *only* have ->max_extent_size set if we actually 1644 * searched through the bitmap and figured out the largest ->max_extent_size, 1645 * otherwise it's 0. In the case that it's 0 we don't want to tell the 1646 * allocator the wrong thing, we want to use the actual real max_extent_size 1647 * we've found already if it's larger, or we want to use ->bytes. 1648 * 1649 * This matters because find_free_space() will skip entries who's ->bytes is 1650 * less than the required bytes. So if we didn't search down this bitmap, we 1651 * may pick some previous entry that has a smaller ->max_extent_size than we 1652 * have. For example, assume we have two entries, one that has 1653 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set 1654 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will 1655 * call into find_free_space(), and return with max_extent_size == 4K, because 1656 * that first bitmap entry had ->max_extent_size set, but the second one did 1657 * not. If instead we returned 8K we'd come in searching for 8K, and find the 1658 * 8K contiguous range. 1659 * 1660 * Consider the other case, we have 2 8K chunks in that second entry and still 1661 * don't have ->max_extent_size set. We'll return 16K, and the next time the 1662 * allocator comes in it'll fully search our second bitmap, and this time it'll 1663 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the 1664 * right allocation the next loop through. 1665 */ 1666 static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) 1667 { 1668 if (entry->bitmap && entry->max_extent_size) 1669 return entry->max_extent_size; 1670 return entry->bytes; 1671 } 1672 1673 /* 1674 * We want the largest entry to be leftmost, so this is inverted from what you'd 1675 * normally expect. 1676 */ 1677 static bool entry_less(struct rb_node *node, const struct rb_node *parent) 1678 { 1679 const struct btrfs_free_space *entry, *exist; 1680 1681 entry = rb_entry(node, struct btrfs_free_space, bytes_index); 1682 exist = rb_entry(parent, struct btrfs_free_space, bytes_index); 1683 return get_max_extent_size(exist) < get_max_extent_size(entry); 1684 } 1685 1686 /* 1687 * searches the tree for the given offset. 1688 * 1689 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1690 * want a section that has at least bytes size and comes at or after the given 1691 * offset. 1692 */ 1693 static struct btrfs_free_space * 1694 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1695 u64 offset, int bitmap_only, int fuzzy) 1696 { 1697 struct rb_node *n = ctl->free_space_offset.rb_node; 1698 struct btrfs_free_space *entry = NULL, *prev = NULL; 1699 const int unit = ctl->block_group->fs_info->sectorsize; 1700 1701 lockdep_assert_held(&ctl->tree_lock); 1702 1703 /* find entry that is closest to the 'offset' */ 1704 while (n) { 1705 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1706 prev = entry; 1707 1708 if (offset < entry->offset) 1709 n = n->rb_left; 1710 else if (offset > entry->offset) 1711 n = n->rb_right; 1712 else 1713 break; 1714 1715 entry = NULL; 1716 } 1717 1718 if (bitmap_only) { 1719 if (!entry) 1720 return NULL; 1721 if (entry->bitmap) 1722 return entry; 1723 1724 /* 1725 * bitmap entry and extent entry may share same offset, 1726 * in that case, bitmap entry comes after extent entry. 1727 */ 1728 n = rb_next(n); 1729 if (!n) 1730 return NULL; 1731 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1732 if (entry->offset != offset) 1733 return NULL; 1734 1735 WARN_ON(!entry->bitmap); 1736 return entry; 1737 } else if (entry) { 1738 if (entry->bitmap) { 1739 /* 1740 * if previous extent entry covers the offset, 1741 * we should return it instead of the bitmap entry 1742 */ 1743 n = rb_prev(&entry->offset_index); 1744 if (n) { 1745 prev = rb_entry(n, struct btrfs_free_space, 1746 offset_index); 1747 if (!prev->bitmap && 1748 prev->offset + prev->bytes > offset) 1749 entry = prev; 1750 } 1751 } 1752 return entry; 1753 } 1754 1755 if (!prev) 1756 return NULL; 1757 1758 /* find last entry before the 'offset' */ 1759 entry = prev; 1760 if (entry->offset > offset) { 1761 n = rb_prev(&entry->offset_index); 1762 if (n) { 1763 entry = rb_entry(n, struct btrfs_free_space, 1764 offset_index); 1765 ASSERT(entry->offset <= offset); 1766 } else { 1767 if (fuzzy) 1768 return entry; 1769 else 1770 return NULL; 1771 } 1772 } 1773 1774 if (entry->bitmap) { 1775 n = rb_prev(&entry->offset_index); 1776 if (n) { 1777 prev = rb_entry(n, struct btrfs_free_space, 1778 offset_index); 1779 if (!prev->bitmap && 1780 prev->offset + prev->bytes > offset) 1781 return prev; 1782 } 1783 if (entry->offset + BITS_PER_BITMAP * unit > offset) 1784 return entry; 1785 } else if (entry->offset + entry->bytes > offset) 1786 return entry; 1787 1788 if (!fuzzy) 1789 return NULL; 1790 1791 while (1) { 1792 n = rb_next(&entry->offset_index); 1793 if (!n) 1794 return NULL; 1795 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1796 if (entry->bitmap) { 1797 if (entry->offset + BITS_PER_BITMAP * unit > offset) 1798 break; 1799 } else { 1800 if (entry->offset + entry->bytes > offset) 1801 break; 1802 } 1803 } 1804 return entry; 1805 } 1806 1807 static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1808 struct btrfs_free_space *info, 1809 bool update_stat) 1810 { 1811 lockdep_assert_held(&ctl->tree_lock); 1812 1813 rb_erase(&info->offset_index, &ctl->free_space_offset); 1814 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1815 ctl->free_extents--; 1816 1817 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1818 ctl->discardable_extents[BTRFS_STAT_CURR]--; 1819 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; 1820 } 1821 1822 if (update_stat) 1823 ctl->free_space -= info->bytes; 1824 } 1825 1826 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1827 struct btrfs_free_space *info) 1828 { 1829 int ret = 0; 1830 1831 lockdep_assert_held(&ctl->tree_lock); 1832 1833 ASSERT(info->bytes || info->bitmap); 1834 ret = tree_insert_offset(ctl, NULL, info); 1835 if (ret) 1836 return ret; 1837 1838 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1839 1840 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1841 ctl->discardable_extents[BTRFS_STAT_CURR]++; 1842 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 1843 } 1844 1845 ctl->free_space += info->bytes; 1846 ctl->free_extents++; 1847 return ret; 1848 } 1849 1850 static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, 1851 struct btrfs_free_space *info) 1852 { 1853 ASSERT(info->bitmap); 1854 1855 /* 1856 * If our entry is empty it's because we're on a cluster and we don't 1857 * want to re-link it into our ctl bytes index. 1858 */ 1859 if (RB_EMPTY_NODE(&info->bytes_index)) 1860 return; 1861 1862 lockdep_assert_held(&ctl->tree_lock); 1863 1864 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1865 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1866 } 1867 1868 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1869 struct btrfs_free_space *info, 1870 u64 offset, u64 bytes, bool update_stat) 1871 { 1872 const int unit = ctl->block_group->fs_info->sectorsize; 1873 unsigned long start, count, end; 1874 int extent_delta = -1; 1875 1876 start = offset_to_bit(info->offset, unit, offset); 1877 count = bytes_to_bits(bytes, unit); 1878 end = start + count; 1879 ASSERT(end <= BITS_PER_BITMAP); 1880 1881 bitmap_clear(info->bitmap, start, count); 1882 1883 info->bytes -= bytes; 1884 if (info->max_extent_size > unit) 1885 info->max_extent_size = 0; 1886 1887 relink_bitmap_entry(ctl, info); 1888 1889 if (start && test_bit(start - 1, info->bitmap)) 1890 extent_delta++; 1891 1892 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1893 extent_delta++; 1894 1895 info->bitmap_extents += extent_delta; 1896 if (!btrfs_free_space_trimmed(info)) { 1897 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1898 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 1899 } 1900 1901 if (update_stat) 1902 ctl->free_space -= bytes; 1903 } 1904 1905 static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1906 struct btrfs_free_space *info, u64 offset, 1907 u64 bytes) 1908 { 1909 const int unit = ctl->block_group->fs_info->sectorsize; 1910 unsigned long start, count, end; 1911 int extent_delta = 1; 1912 1913 start = offset_to_bit(info->offset, unit, offset); 1914 count = bytes_to_bits(bytes, unit); 1915 end = start + count; 1916 ASSERT(end <= BITS_PER_BITMAP); 1917 1918 bitmap_set(info->bitmap, start, count); 1919 1920 /* 1921 * We set some bytes, we have no idea what the max extent size is 1922 * anymore. 1923 */ 1924 info->max_extent_size = 0; 1925 info->bytes += bytes; 1926 ctl->free_space += bytes; 1927 1928 relink_bitmap_entry(ctl, info); 1929 1930 if (start && test_bit(start - 1, info->bitmap)) 1931 extent_delta--; 1932 1933 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1934 extent_delta--; 1935 1936 info->bitmap_extents += extent_delta; 1937 if (!btrfs_free_space_trimmed(info)) { 1938 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1939 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; 1940 } 1941 } 1942 1943 /* 1944 * If we can not find suitable extent, we will use bytes to record 1945 * the size of the max extent. 1946 */ 1947 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1948 struct btrfs_free_space *bitmap_info, u64 *offset, 1949 u64 *bytes, bool for_alloc) 1950 { 1951 const int unit = ctl->block_group->fs_info->sectorsize; 1952 unsigned long found_bits = 0; 1953 unsigned long max_bits = 0; 1954 unsigned long bits, i; 1955 unsigned long next_zero; 1956 unsigned long extent_bits; 1957 1958 /* 1959 * Skip searching the bitmap if we don't have a contiguous section that 1960 * is large enough for this allocation. 1961 */ 1962 if (for_alloc && 1963 bitmap_info->max_extent_size && 1964 bitmap_info->max_extent_size < *bytes) { 1965 *bytes = bitmap_info->max_extent_size; 1966 return -1; 1967 } 1968 1969 i = offset_to_bit(bitmap_info->offset, unit, 1970 max_t(u64, *offset, bitmap_info->offset)); 1971 bits = bytes_to_bits(*bytes, unit); 1972 1973 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1974 if (for_alloc && bits == 1) { 1975 found_bits = 1; 1976 break; 1977 } 1978 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1979 BITS_PER_BITMAP, i); 1980 extent_bits = next_zero - i; 1981 if (extent_bits >= bits) { 1982 found_bits = extent_bits; 1983 break; 1984 } else if (extent_bits > max_bits) { 1985 max_bits = extent_bits; 1986 } 1987 i = next_zero; 1988 } 1989 1990 if (found_bits) { 1991 *offset = (u64)(i * unit) + bitmap_info->offset; 1992 *bytes = (u64)(found_bits) * unit; 1993 return 0; 1994 } 1995 1996 *bytes = (u64)(max_bits) * unit; 1997 bitmap_info->max_extent_size = *bytes; 1998 relink_bitmap_entry(ctl, bitmap_info); 1999 return -1; 2000 } 2001 2002 /* Cache the size of the max extent in bytes */ 2003 static struct btrfs_free_space * 2004 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2005 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2006 { 2007 struct btrfs_free_space *entry; 2008 struct rb_node *node; 2009 u64 tmp; 2010 u64 align_off; 2011 int ret; 2012 2013 if (!ctl->free_space_offset.rb_node) 2014 return NULL; 2015 again: 2016 if (use_bytes_index) { 2017 node = rb_first_cached(&ctl->free_space_bytes); 2018 } else { 2019 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2020 0, 1); 2021 if (!entry) 2022 return NULL; 2023 node = &entry->offset_index; 2024 } 2025 2026 for (; node; node = rb_next(node)) { 2027 if (use_bytes_index) 2028 entry = rb_entry(node, struct btrfs_free_space, 2029 bytes_index); 2030 else 2031 entry = rb_entry(node, struct btrfs_free_space, 2032 offset_index); 2033 2034 /* 2035 * If we are using the bytes index then all subsequent entries 2036 * in this tree are going to be < bytes, so simply set the max 2037 * extent size and exit the loop. 2038 * 2039 * If we're using the offset index then we need to keep going 2040 * through the rest of the tree. 2041 */ 2042 if (entry->bytes < *bytes) { 2043 *max_extent_size = max(get_max_extent_size(entry), 2044 *max_extent_size); 2045 if (use_bytes_index) 2046 break; 2047 continue; 2048 } 2049 2050 /* make sure the space returned is big enough 2051 * to match our requested alignment 2052 */ 2053 if (*bytes >= align) { 2054 tmp = entry->offset - ctl->block_group->start + align - 1; 2055 tmp = div64_u64(tmp, align); 2056 tmp = tmp * align + ctl->block_group->start; 2057 align_off = tmp - entry->offset; 2058 } else { 2059 align_off = 0; 2060 tmp = entry->offset; 2061 } 2062 2063 /* 2064 * We don't break here if we're using the bytes index because we 2065 * may have another entry that has the correct alignment that is 2066 * the right size, so we don't want to miss that possibility. 2067 * At worst this adds another loop through the logic, but if we 2068 * broke here we could prematurely ENOSPC. 2069 */ 2070 if (entry->bytes < *bytes + align_off) { 2071 *max_extent_size = max(get_max_extent_size(entry), 2072 *max_extent_size); 2073 continue; 2074 } 2075 2076 if (entry->bitmap) { 2077 struct rb_node *old_next = rb_next(node); 2078 u64 size = *bytes; 2079 2080 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2081 if (!ret) { 2082 *offset = tmp; 2083 *bytes = size; 2084 return entry; 2085 } else { 2086 *max_extent_size = 2087 max(get_max_extent_size(entry), 2088 *max_extent_size); 2089 } 2090 2091 /* 2092 * The bitmap may have gotten re-arranged in the space 2093 * index here because the max_extent_size may have been 2094 * updated. Start from the beginning again if this 2095 * happened. 2096 */ 2097 if (use_bytes_index && old_next != rb_next(node)) 2098 goto again; 2099 continue; 2100 } 2101 2102 *offset = tmp; 2103 *bytes = entry->bytes - align_off; 2104 return entry; 2105 } 2106 2107 return NULL; 2108 } 2109 2110 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2111 struct btrfs_free_space *info, u64 offset) 2112 { 2113 info->offset = offset_to_bitmap(ctl, offset); 2114 info->bytes = 0; 2115 info->bitmap_extents = 0; 2116 INIT_LIST_HEAD(&info->list); 2117 link_free_space(ctl, info); 2118 ctl->total_bitmaps++; 2119 recalculate_thresholds(ctl); 2120 } 2121 2122 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2123 struct btrfs_free_space *bitmap_info) 2124 { 2125 /* 2126 * Normally when this is called, the bitmap is completely empty. However, 2127 * if we are blowing up the free space cache for one reason or another 2128 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2129 * we may leave stats on the table. 2130 */ 2131 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2132 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2133 bitmap_info->bitmap_extents; 2134 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2135 2136 } 2137 unlink_free_space(ctl, bitmap_info, true); 2138 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2139 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2140 ctl->total_bitmaps--; 2141 recalculate_thresholds(ctl); 2142 } 2143 2144 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2145 struct btrfs_free_space *bitmap_info, 2146 u64 *offset, u64 *bytes) 2147 { 2148 const int unit = ctl->block_group->fs_info->sectorsize; 2149 u64 end; 2150 u64 search_start, search_bytes; 2151 int ret; 2152 2153 again: 2154 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * unit) - 1; 2155 2156 /* 2157 * We need to search for bits in this bitmap. We could only cover some 2158 * of the extent in this bitmap thanks to how we add space, so we need 2159 * to search for as much as it as we can and clear that amount, and then 2160 * go searching for the next bit. 2161 */ 2162 search_start = *offset; 2163 search_bytes = unit; 2164 search_bytes = min(search_bytes, end - search_start + 1); 2165 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2166 false); 2167 if (ret < 0 || search_start != *offset) 2168 return -EINVAL; 2169 2170 /* We may have found more bits than what we need */ 2171 search_bytes = min(search_bytes, *bytes); 2172 2173 /* Cannot clear past the end of the bitmap */ 2174 search_bytes = min(search_bytes, end - search_start + 1); 2175 2176 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2177 *offset += search_bytes; 2178 *bytes -= search_bytes; 2179 2180 if (*bytes) { 2181 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2182 if (!bitmap_info->bytes) 2183 free_bitmap(ctl, bitmap_info); 2184 2185 /* 2186 * no entry after this bitmap, but we still have bytes to 2187 * remove, so something has gone wrong. 2188 */ 2189 if (!next) 2190 return -EINVAL; 2191 2192 bitmap_info = rb_entry(next, struct btrfs_free_space, 2193 offset_index); 2194 2195 /* 2196 * if the next entry isn't a bitmap we need to return to let the 2197 * extent stuff do its work. 2198 */ 2199 if (!bitmap_info->bitmap) 2200 return -EAGAIN; 2201 2202 /* 2203 * Ok the next item is a bitmap, but it may not actually hold 2204 * the information for the rest of this free space stuff, so 2205 * look for it, and if we don't find it return so we can try 2206 * everything over again. 2207 */ 2208 search_start = *offset; 2209 search_bytes = unit; 2210 ret = search_bitmap(ctl, bitmap_info, &search_start, 2211 &search_bytes, false); 2212 if (ret < 0 || search_start != *offset) 2213 return -EAGAIN; 2214 2215 goto again; 2216 } else if (!bitmap_info->bytes) 2217 free_bitmap(ctl, bitmap_info); 2218 2219 return 0; 2220 } 2221 2222 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2223 struct btrfs_free_space *info, u64 offset, 2224 u64 bytes, enum btrfs_trim_state trim_state) 2225 { 2226 const int unit = ctl->block_group->fs_info->sectorsize; 2227 u64 bytes_to_set = 0; 2228 u64 end; 2229 2230 /* 2231 * This is a tradeoff to make bitmap trim state minimal. We mark the 2232 * whole bitmap untrimmed if at any point we add untrimmed regions. 2233 */ 2234 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2235 if (btrfs_free_space_trimmed(info)) { 2236 ctl->discardable_extents[BTRFS_STAT_CURR] += 2237 info->bitmap_extents; 2238 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2239 } 2240 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2241 } 2242 2243 end = info->offset + (u64)(BITS_PER_BITMAP * unit); 2244 2245 bytes_to_set = min(end - offset, bytes); 2246 2247 btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set); 2248 2249 return bytes_to_set; 2250 2251 } 2252 2253 EXPORT_FOR_TESTS 2254 bool btrfs_use_bitmap(struct btrfs_free_space_ctl *ctl, 2255 struct btrfs_free_space *info) 2256 { 2257 struct btrfs_block_group *block_group = ctl->block_group; 2258 struct btrfs_fs_info *fs_info = block_group->fs_info; 2259 bool forced = false; 2260 2261 #ifdef CONFIG_BTRFS_DEBUG 2262 if (btrfs_should_fragment_free_space(block_group)) 2263 forced = true; 2264 #endif 2265 2266 /* This is a way to reclaim large regions from the bitmaps. */ 2267 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) 2268 return false; 2269 2270 /* 2271 * If we are below the extents threshold then we can add this as an 2272 * extent, and don't have to deal with the bitmap 2273 */ 2274 if (!forced && ctl->free_extents < ctl->extents_thresh) { 2275 /* 2276 * If this block group has some small extents we don't want to 2277 * use up all of our free slots in the cache with them, we want 2278 * to reserve them to larger extents, however if we have plenty 2279 * of cache left then go ahead and add them, no sense in adding 2280 * the overhead of a bitmap if we don't have to. 2281 */ 2282 if (info->bytes <= fs_info->sectorsize * 8) { 2283 if (ctl->free_extents * 3 <= ctl->extents_thresh) 2284 return false; 2285 } else { 2286 return false; 2287 } 2288 } 2289 2290 /* 2291 * The original block groups from mkfs can be really small, like 8 2292 * megabytes, so don't bother with a bitmap for those entries. However 2293 * some block groups can be smaller than what a bitmap would cover but 2294 * are still large enough that they could overflow the 32k memory limit, 2295 * so allow those block groups to still be allowed to have a bitmap 2296 * entry. 2297 */ 2298 if (((BITS_PER_BITMAP * fs_info->sectorsize) >> 1) > block_group->length) 2299 return false; 2300 2301 return true; 2302 } 2303 2304 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 2305 struct btrfs_free_space *info) 2306 { 2307 struct btrfs_free_space *bitmap_info; 2308 struct btrfs_block_group *block_group = ctl->block_group; 2309 bool added = false; 2310 u64 bytes, offset, bytes_added; 2311 enum btrfs_trim_state trim_state; 2312 int ret; 2313 2314 bytes = info->bytes; 2315 offset = info->offset; 2316 trim_state = info->trim_state; 2317 2318 if (btrfs_is_testing(block_group->fs_info)) { 2319 if (!block_group->fs_info->use_bitmap(ctl, info)) 2320 return 0; 2321 } else { 2322 if (!btrfs_use_bitmap(ctl, info)) 2323 return 0; 2324 } 2325 again: 2326 /* 2327 * Since we link bitmaps right into the cluster we need to see if we 2328 * have a cluster here, and if so and it has our bitmap we need to add 2329 * the free space to that bitmap. 2330 */ 2331 if (!list_empty(&block_group->cluster_list)) { 2332 struct btrfs_free_cluster *cluster; 2333 struct rb_node *node; 2334 struct btrfs_free_space *entry; 2335 2336 cluster = list_first_entry(&block_group->cluster_list, 2337 struct btrfs_free_cluster, block_group_list); 2338 spin_lock(&cluster->lock); 2339 node = rb_first(&cluster->root); 2340 if (!node) { 2341 spin_unlock(&cluster->lock); 2342 goto no_cluster_bitmap; 2343 } 2344 2345 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2346 if (!entry->bitmap) { 2347 spin_unlock(&cluster->lock); 2348 goto no_cluster_bitmap; 2349 } 2350 2351 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2352 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2353 bytes, trim_state); 2354 bytes -= bytes_added; 2355 offset += bytes_added; 2356 } 2357 spin_unlock(&cluster->lock); 2358 if (!bytes) { 2359 ret = 1; 2360 goto out; 2361 } 2362 } 2363 2364 no_cluster_bitmap: 2365 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2366 1, 0); 2367 if (!bitmap_info) { 2368 ASSERT(!added); 2369 goto new_bitmap; 2370 } 2371 2372 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2373 trim_state); 2374 bytes -= bytes_added; 2375 offset += bytes_added; 2376 added = false; 2377 2378 if (!bytes) { 2379 ret = 1; 2380 goto out; 2381 } else 2382 goto again; 2383 2384 new_bitmap: 2385 if (info && info->bitmap) { 2386 add_new_bitmap(ctl, info, offset); 2387 added = true; 2388 info = NULL; 2389 goto again; 2390 } else { 2391 spin_unlock(&ctl->tree_lock); 2392 2393 /* no pre-allocated info, allocate a new one */ 2394 if (!info) { 2395 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2396 GFP_NOFS); 2397 if (!info) { 2398 spin_lock(&ctl->tree_lock); 2399 ret = -ENOMEM; 2400 goto out; 2401 } 2402 } 2403 2404 /* allocate the bitmap */ 2405 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2406 GFP_NOFS); 2407 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2408 spin_lock(&ctl->tree_lock); 2409 if (!info->bitmap) { 2410 ret = -ENOMEM; 2411 goto out; 2412 } 2413 goto again; 2414 } 2415 2416 out: 2417 if (info) { 2418 if (info->bitmap) 2419 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2420 info->bitmap); 2421 kmem_cache_free(btrfs_free_space_cachep, info); 2422 } 2423 2424 return ret; 2425 } 2426 2427 /* 2428 * Free space merging rules: 2429 * 1) Merge trimmed areas together 2430 * 2) Let untrimmed areas coalesce with trimmed areas 2431 * 3) Always pull neighboring regions from bitmaps 2432 * 2433 * The above rules are for when we merge free space based on btrfs_trim_state. 2434 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2435 * same reason: to promote larger extent regions which makes life easier for 2436 * find_free_extent(). Rule 2 enables coalescing based on the common path 2437 * being returning free space from btrfs_finish_extent_commit(). So when free 2438 * space is trimmed, it will prevent aggregating trimmed new region and 2439 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2440 * and provide find_free_extent() with the largest extents possible hoping for 2441 * the reuse path. 2442 */ 2443 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2444 struct btrfs_free_space *info, bool update_stat) 2445 { 2446 struct btrfs_free_space *left_info = NULL; 2447 struct btrfs_free_space *right_info; 2448 bool merged = false; 2449 u64 offset = info->offset; 2450 u64 bytes = info->bytes; 2451 const bool is_trimmed = btrfs_free_space_trimmed(info); 2452 struct rb_node *right_prev = NULL; 2453 2454 /* 2455 * first we want to see if there is free space adjacent to the range we 2456 * are adding, if there is remove that struct and add a new one to 2457 * cover the entire range 2458 */ 2459 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2460 if (right_info) 2461 right_prev = rb_prev(&right_info->offset_index); 2462 2463 if (right_prev) 2464 left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); 2465 else if (!right_info) 2466 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2467 2468 /* See try_merge_free_space() comment. */ 2469 if (right_info && !right_info->bitmap && 2470 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2471 unlink_free_space(ctl, right_info, update_stat); 2472 info->bytes += right_info->bytes; 2473 kmem_cache_free(btrfs_free_space_cachep, right_info); 2474 merged = true; 2475 } 2476 2477 /* See try_merge_free_space() comment. */ 2478 if (left_info && !left_info->bitmap && 2479 left_info->offset + left_info->bytes == offset && 2480 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2481 unlink_free_space(ctl, left_info, update_stat); 2482 info->offset = left_info->offset; 2483 info->bytes += left_info->bytes; 2484 kmem_cache_free(btrfs_free_space_cachep, left_info); 2485 merged = true; 2486 } 2487 2488 return merged; 2489 } 2490 2491 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2492 struct btrfs_free_space *info, 2493 bool update_stat) 2494 { 2495 const int unit = ctl->block_group->fs_info->sectorsize; 2496 struct btrfs_free_space *bitmap; 2497 unsigned long i; 2498 unsigned long j; 2499 const u64 end = info->offset + info->bytes; 2500 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2501 u64 bytes; 2502 2503 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2504 if (!bitmap) 2505 return false; 2506 2507 i = offset_to_bit(bitmap->offset, unit, end); 2508 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2509 if (j == i) 2510 return false; 2511 bytes = (j - i) * unit; 2512 info->bytes += bytes; 2513 2514 /* See try_merge_free_space() comment. */ 2515 if (!btrfs_free_space_trimmed(bitmap)) 2516 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2517 2518 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2519 2520 if (!bitmap->bytes) 2521 free_bitmap(ctl, bitmap); 2522 2523 return true; 2524 } 2525 2526 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2527 struct btrfs_free_space *info, 2528 bool update_stat) 2529 { 2530 const int unit = ctl->block_group->fs_info->sectorsize; 2531 struct btrfs_free_space *bitmap; 2532 u64 bitmap_offset; 2533 unsigned long i; 2534 unsigned long j; 2535 unsigned long prev_j; 2536 u64 bytes; 2537 2538 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2539 /* If we're on a boundary, try the previous logical bitmap. */ 2540 if (bitmap_offset == info->offset) { 2541 if (info->offset == 0) 2542 return false; 2543 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2544 } 2545 2546 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2547 if (!bitmap) 2548 return false; 2549 2550 i = offset_to_bit(bitmap->offset, unit, info->offset) - 1; 2551 j = 0; 2552 prev_j = (unsigned long)-1; 2553 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2554 if (j > i) 2555 break; 2556 prev_j = j; 2557 } 2558 if (prev_j == i) 2559 return false; 2560 2561 if (prev_j == (unsigned long)-1) 2562 bytes = (i + 1) * unit; 2563 else 2564 bytes = (i - prev_j) * unit; 2565 2566 info->offset -= bytes; 2567 info->bytes += bytes; 2568 2569 /* See try_merge_free_space() comment. */ 2570 if (!btrfs_free_space_trimmed(bitmap)) 2571 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2572 2573 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2574 2575 if (!bitmap->bytes) 2576 free_bitmap(ctl, bitmap); 2577 2578 return true; 2579 } 2580 2581 /* 2582 * We prefer always to allocate from extent entries, both for clustered and 2583 * non-clustered allocation requests. So when attempting to add a new extent 2584 * entry, try to see if there's adjacent free space in bitmap entries, and if 2585 * there is, migrate that space from the bitmaps to the extent. 2586 * Like this we get better chances of satisfying space allocation requests 2587 * because we attempt to satisfy them based on a single cache entry, and never 2588 * on 2 or more entries - even if the entries represent a contiguous free space 2589 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2590 * ends). 2591 */ 2592 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2593 struct btrfs_free_space *info, 2594 bool update_stat) 2595 { 2596 /* 2597 * Only work with disconnected entries, as we can change their offset, 2598 * and must be extent entries. 2599 */ 2600 ASSERT(!info->bitmap); 2601 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2602 2603 if (ctl->total_bitmaps > 0) { 2604 bool stole_end; 2605 bool stole_front = false; 2606 2607 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2608 if (ctl->total_bitmaps > 0) 2609 stole_front = steal_from_bitmap_to_front(ctl, info, 2610 update_stat); 2611 2612 if (stole_end || stole_front) 2613 try_merge_free_space(ctl, info, update_stat); 2614 } 2615 } 2616 2617 static int __btrfs_add_free_space(struct btrfs_block_group *block_group, 2618 u64 offset, u64 bytes, 2619 enum btrfs_trim_state trim_state) 2620 { 2621 struct btrfs_fs_info *fs_info = block_group->fs_info; 2622 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2623 struct btrfs_free_space *info; 2624 int ret = 0; 2625 u64 filter_bytes = bytes; 2626 2627 ASSERT(!btrfs_is_zoned(fs_info)); 2628 2629 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2630 if (!info) 2631 return -ENOMEM; 2632 2633 info->offset = offset; 2634 info->bytes = bytes; 2635 info->trim_state = trim_state; 2636 RB_CLEAR_NODE(&info->offset_index); 2637 RB_CLEAR_NODE(&info->bytes_index); 2638 2639 spin_lock(&ctl->tree_lock); 2640 2641 if (try_merge_free_space(ctl, info, true)) 2642 goto link; 2643 2644 /* 2645 * There was no extent directly to the left or right of this new 2646 * extent then we know we're going to have to allocate a new extent, so 2647 * before we do that see if we need to drop this into a bitmap 2648 */ 2649 ret = insert_into_bitmap(ctl, info); 2650 if (ret < 0) { 2651 goto out; 2652 } else if (ret) { 2653 ret = 0; 2654 goto out; 2655 } 2656 link: 2657 /* 2658 * Only steal free space from adjacent bitmaps if we're sure we're not 2659 * going to add the new free space to existing bitmap entries - because 2660 * that would mean unnecessary work that would be reverted. Therefore 2661 * attempt to steal space from bitmaps if we're adding an extent entry. 2662 */ 2663 steal_from_bitmap(ctl, info, true); 2664 2665 filter_bytes = max(filter_bytes, info->bytes); 2666 2667 ret = link_free_space(ctl, info); 2668 if (ret) 2669 kmem_cache_free(btrfs_free_space_cachep, info); 2670 out: 2671 btrfs_discard_update_discardable(block_group); 2672 spin_unlock(&ctl->tree_lock); 2673 2674 if (ret) { 2675 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2676 ASSERT(ret != -EEXIST); 2677 } 2678 2679 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2680 btrfs_discard_check_filter(block_group, filter_bytes); 2681 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2682 } 2683 2684 return ret; 2685 } 2686 2687 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2688 u64 bytenr, u64 size, bool used) 2689 { 2690 struct btrfs_space_info *sinfo = block_group->space_info; 2691 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2692 u64 offset = bytenr - block_group->start; 2693 u64 to_free, to_unusable; 2694 int bg_reclaim_threshold = 0; 2695 bool initial; 2696 u64 reclaimable_unusable; 2697 2698 spin_lock(&block_group->lock); 2699 2700 initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); 2701 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2702 if (!initial) 2703 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2704 2705 if (!used) 2706 to_free = size; 2707 else if (initial) 2708 to_free = block_group->zone_capacity; 2709 else if (offset >= block_group->alloc_offset) 2710 to_free = size; 2711 else if (offset + size <= block_group->alloc_offset) 2712 to_free = 0; 2713 else 2714 to_free = offset + size - block_group->alloc_offset; 2715 to_unusable = size - to_free; 2716 2717 spin_lock(&ctl->tree_lock); 2718 ctl->free_space += to_free; 2719 spin_unlock(&ctl->tree_lock); 2720 /* 2721 * If the block group is read-only, we should account freed space into 2722 * bytes_readonly. 2723 */ 2724 if (!block_group->ro) { 2725 block_group->zone_unusable += to_unusable; 2726 WARN_ON(block_group->zone_unusable > block_group->length); 2727 } 2728 if (!used) { 2729 block_group->alloc_offset -= size; 2730 } 2731 2732 reclaimable_unusable = block_group->zone_unusable - 2733 (block_group->length - block_group->zone_capacity); 2734 /* All the region is now unusable. Mark it as unused and reclaim */ 2735 if (block_group->zone_unusable == block_group->length) { 2736 btrfs_mark_bg_unused(block_group); 2737 } else if (bg_reclaim_threshold && 2738 reclaimable_unusable >= 2739 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2740 btrfs_mark_bg_to_reclaim(block_group); 2741 } 2742 2743 spin_unlock(&block_group->lock); 2744 2745 return 0; 2746 } 2747 2748 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2749 u64 bytenr, u64 size) 2750 { 2751 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2752 2753 if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) 2754 return 0; 2755 2756 if (btrfs_is_zoned(block_group->fs_info)) 2757 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2758 true); 2759 2760 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2761 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2762 2763 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2764 } 2765 2766 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2767 u64 bytenr, u64 size) 2768 { 2769 if (btrfs_is_zoned(block_group->fs_info)) 2770 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2771 false); 2772 2773 return btrfs_add_free_space(block_group, bytenr, size); 2774 } 2775 2776 /* 2777 * This is a subtle distinction because when adding free space back in general, 2778 * we want it to be added as untrimmed for async. But in the case where we add 2779 * it on loading of a block group, we want to consider it trimmed. 2780 */ 2781 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2782 u64 bytenr, u64 size) 2783 { 2784 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2785 2786 if (btrfs_is_zoned(block_group->fs_info)) 2787 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2788 true); 2789 2790 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2791 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2792 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2793 2794 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2795 } 2796 2797 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2798 u64 offset, u64 bytes) 2799 { 2800 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2801 struct btrfs_free_space *info; 2802 int ret; 2803 bool re_search = false; 2804 2805 if (btrfs_is_zoned(block_group->fs_info)) { 2806 /* 2807 * This can happen with conventional zones when replaying log. 2808 * Since the allocation info of tree-log nodes are not recorded 2809 * to the extent-tree, calculate_alloc_pointer() failed to 2810 * advance the allocation pointer after last allocated tree log 2811 * node blocks. 2812 * 2813 * This function is called from 2814 * btrfs_pin_extent_for_log_replay() when replaying the log. 2815 * Advance the pointer not to overwrite the tree-log nodes. 2816 */ 2817 if (block_group->start + block_group->alloc_offset < 2818 offset + bytes) { 2819 block_group->alloc_offset = 2820 offset + bytes - block_group->start; 2821 } 2822 return 0; 2823 } 2824 2825 spin_lock(&ctl->tree_lock); 2826 2827 again: 2828 ret = 0; 2829 if (!bytes) 2830 goto out_lock; 2831 2832 info = tree_search_offset(ctl, offset, 0, 0); 2833 if (!info) { 2834 /* 2835 * oops didn't find an extent that matched the space we wanted 2836 * to remove, look for a bitmap instead 2837 */ 2838 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2839 1, 0); 2840 if (!info) { 2841 /* 2842 * If we found a partial bit of our free space in a 2843 * bitmap but then couldn't find the other part this may 2844 * be a problem, so WARN about it. 2845 */ 2846 WARN_ON(re_search); 2847 goto out_lock; 2848 } 2849 } 2850 2851 re_search = false; 2852 if (!info->bitmap) { 2853 unlink_free_space(ctl, info, true); 2854 if (offset == info->offset) { 2855 u64 to_free = min(bytes, info->bytes); 2856 2857 info->bytes -= to_free; 2858 info->offset += to_free; 2859 if (info->bytes) { 2860 ret = link_free_space(ctl, info); 2861 WARN_ON(ret); 2862 } else { 2863 kmem_cache_free(btrfs_free_space_cachep, info); 2864 } 2865 2866 offset += to_free; 2867 bytes -= to_free; 2868 goto again; 2869 } else { 2870 u64 old_end = info->bytes + info->offset; 2871 2872 info->bytes = offset - info->offset; 2873 ret = link_free_space(ctl, info); 2874 WARN_ON(ret); 2875 if (ret) 2876 goto out_lock; 2877 2878 /* Not enough bytes in this entry to satisfy us */ 2879 if (old_end < offset + bytes) { 2880 bytes -= old_end - offset; 2881 offset = old_end; 2882 goto again; 2883 } else if (old_end == offset + bytes) { 2884 /* all done */ 2885 goto out_lock; 2886 } 2887 spin_unlock(&ctl->tree_lock); 2888 2889 ret = __btrfs_add_free_space(block_group, 2890 offset + bytes, 2891 old_end - (offset + bytes), 2892 info->trim_state); 2893 WARN_ON(ret); 2894 return ret; 2895 } 2896 } 2897 2898 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2899 if (ret == -EAGAIN) { 2900 re_search = true; 2901 goto again; 2902 } 2903 out_lock: 2904 btrfs_discard_update_discardable(block_group); 2905 spin_unlock(&ctl->tree_lock); 2906 2907 return ret; 2908 } 2909 2910 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2911 u64 bytes) 2912 { 2913 struct btrfs_fs_info *fs_info = block_group->fs_info; 2914 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2915 struct btrfs_free_space *info; 2916 struct rb_node *n; 2917 int count = 0; 2918 2919 /* 2920 * Zoned btrfs does not use free space tree and cluster. Just print 2921 * out the free space after the allocation offset. 2922 */ 2923 if (btrfs_is_zoned(fs_info)) { 2924 btrfs_info(fs_info, "free space %llu active %d", 2925 block_group->zone_capacity - block_group->alloc_offset, 2926 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2927 &block_group->runtime_flags)); 2928 return; 2929 } 2930 2931 spin_lock(&ctl->tree_lock); 2932 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2933 info = rb_entry(n, struct btrfs_free_space, offset_index); 2934 if (info->bytes >= bytes && !block_group->ro) 2935 count++; 2936 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2937 info->offset, info->bytes, str_yes_no(info->bitmap)); 2938 } 2939 spin_unlock(&ctl->tree_lock); 2940 btrfs_info(fs_info, "block group has cluster?: %s", 2941 str_no_yes(list_empty(&block_group->cluster_list))); 2942 btrfs_info(fs_info, 2943 "%d free space entries at or bigger than %llu bytes", 2944 count, bytes); 2945 } 2946 2947 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2948 struct btrfs_free_space_ctl *ctl) 2949 { 2950 spin_lock_init(&ctl->tree_lock); 2951 ctl->block_group = block_group; 2952 ctl->free_space_bytes = RB_ROOT_CACHED; 2953 INIT_LIST_HEAD(&ctl->trimming_ranges); 2954 mutex_init(&ctl->cache_writeout_mutex); 2955 2956 /* 2957 * we only want to have 32k of ram per block group for keeping 2958 * track of free space, and if we pass 1/2 of that we want to 2959 * start converting things over to using bitmaps 2960 */ 2961 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2962 } 2963 2964 /* 2965 * for a given cluster, put all of its extents back into the free 2966 * space cache. If the block group passed doesn't match the block group 2967 * pointed to by the cluster, someone else raced in and freed the 2968 * cluster already. In that case, we just return without changing anything 2969 */ 2970 static void __btrfs_return_cluster_to_free_space( 2971 struct btrfs_block_group *block_group, 2972 struct btrfs_free_cluster *cluster) 2973 { 2974 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2975 struct rb_node *node; 2976 2977 lockdep_assert_held(&ctl->tree_lock); 2978 2979 spin_lock(&cluster->lock); 2980 if (cluster->block_group != block_group) { 2981 spin_unlock(&cluster->lock); 2982 return; 2983 } 2984 2985 cluster->block_group = NULL; 2986 cluster->window_start = 0; 2987 list_del_init(&cluster->block_group_list); 2988 2989 node = rb_first(&cluster->root); 2990 while (node) { 2991 struct btrfs_free_space *entry; 2992 2993 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2994 node = rb_next(&entry->offset_index); 2995 rb_erase(&entry->offset_index, &cluster->root); 2996 RB_CLEAR_NODE(&entry->offset_index); 2997 2998 if (!entry->bitmap) { 2999 /* Merging treats extents as if they were new */ 3000 if (!btrfs_free_space_trimmed(entry)) { 3001 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3002 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 3003 entry->bytes; 3004 } 3005 3006 try_merge_free_space(ctl, entry, false); 3007 steal_from_bitmap(ctl, entry, false); 3008 3009 /* As we insert directly, update these statistics */ 3010 if (!btrfs_free_space_trimmed(entry)) { 3011 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3012 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3013 entry->bytes; 3014 } 3015 } 3016 tree_insert_offset(ctl, NULL, entry); 3017 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3018 entry_less); 3019 } 3020 cluster->root = RB_ROOT; 3021 spin_unlock(&cluster->lock); 3022 btrfs_put_block_group(block_group); 3023 } 3024 3025 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3026 { 3027 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3028 struct btrfs_free_cluster *cluster; 3029 struct list_head *head; 3030 3031 spin_lock(&ctl->tree_lock); 3032 while ((head = block_group->cluster_list.next) != 3033 &block_group->cluster_list) { 3034 cluster = list_entry(head, struct btrfs_free_cluster, 3035 block_group_list); 3036 3037 WARN_ON(cluster->block_group != block_group); 3038 __btrfs_return_cluster_to_free_space(block_group, cluster); 3039 3040 cond_resched_lock(&ctl->tree_lock); 3041 } 3042 __btrfs_remove_free_space_cache(ctl); 3043 btrfs_discard_update_discardable(block_group); 3044 spin_unlock(&ctl->tree_lock); 3045 3046 } 3047 3048 /* 3049 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 3050 */ 3051 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3052 { 3053 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3054 struct btrfs_free_space *info; 3055 struct rb_node *node; 3056 bool ret = true; 3057 3058 if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && 3059 !test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &block_group->runtime_flags) && 3060 block_group->identity_remap_count == 0) { 3061 return true; 3062 } 3063 3064 spin_lock(&ctl->tree_lock); 3065 node = rb_first(&ctl->free_space_offset); 3066 3067 while (node) { 3068 info = rb_entry(node, struct btrfs_free_space, offset_index); 3069 3070 if (!btrfs_free_space_trimmed(info)) { 3071 ret = false; 3072 break; 3073 } 3074 3075 node = rb_next(node); 3076 } 3077 3078 spin_unlock(&ctl->tree_lock); 3079 return ret; 3080 } 3081 3082 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3083 u64 offset, u64 bytes, u64 empty_size, 3084 u64 *max_extent_size) 3085 { 3086 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3087 struct btrfs_discard_ctl *discard_ctl = 3088 &block_group->fs_info->discard_ctl; 3089 struct btrfs_free_space *entry = NULL; 3090 u64 bytes_search = bytes + empty_size; 3091 u64 ret = 0; 3092 u64 align_gap = 0; 3093 u64 align_gap_len = 0; 3094 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3095 bool use_bytes_index = (offset == block_group->start); 3096 3097 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3098 3099 spin_lock(&ctl->tree_lock); 3100 entry = find_free_space(ctl, &offset, &bytes_search, 3101 block_group->full_stripe_len, max_extent_size, 3102 use_bytes_index); 3103 if (!entry) 3104 goto out; 3105 3106 ret = offset; 3107 if (entry->bitmap) { 3108 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3109 3110 if (!btrfs_free_space_trimmed(entry)) 3111 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3112 3113 if (!entry->bytes) 3114 free_bitmap(ctl, entry); 3115 } else { 3116 unlink_free_space(ctl, entry, true); 3117 align_gap_len = offset - entry->offset; 3118 align_gap = entry->offset; 3119 align_gap_trim_state = entry->trim_state; 3120 3121 if (!btrfs_free_space_trimmed(entry)) 3122 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3123 3124 entry->offset = offset + bytes; 3125 WARN_ON(entry->bytes < bytes + align_gap_len); 3126 3127 entry->bytes -= bytes + align_gap_len; 3128 if (!entry->bytes) 3129 kmem_cache_free(btrfs_free_space_cachep, entry); 3130 else 3131 link_free_space(ctl, entry); 3132 } 3133 out: 3134 btrfs_discard_update_discardable(block_group); 3135 spin_unlock(&ctl->tree_lock); 3136 3137 if (align_gap_len) 3138 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3139 align_gap_trim_state); 3140 return ret; 3141 } 3142 3143 /* 3144 * given a cluster, put all of its extents back into the free space 3145 * cache. If a block group is passed, this function will only free 3146 * a cluster that belongs to the passed block group. 3147 * 3148 * Otherwise, it'll get a reference on the block group pointed to by the 3149 * cluster and remove the cluster from it. 3150 */ 3151 void btrfs_return_cluster_to_free_space( 3152 struct btrfs_block_group *block_group, 3153 struct btrfs_free_cluster *cluster) 3154 { 3155 struct btrfs_free_space_ctl *ctl; 3156 3157 /* first, get a safe pointer to the block group */ 3158 spin_lock(&cluster->lock); 3159 if (!block_group) { 3160 block_group = cluster->block_group; 3161 if (!block_group) { 3162 spin_unlock(&cluster->lock); 3163 return; 3164 } 3165 } else if (cluster->block_group != block_group) { 3166 /* someone else has already freed it don't redo their work */ 3167 spin_unlock(&cluster->lock); 3168 return; 3169 } 3170 btrfs_get_block_group(block_group); 3171 spin_unlock(&cluster->lock); 3172 3173 ctl = block_group->free_space_ctl; 3174 3175 /* now return any extents the cluster had on it */ 3176 spin_lock(&ctl->tree_lock); 3177 __btrfs_return_cluster_to_free_space(block_group, cluster); 3178 spin_unlock(&ctl->tree_lock); 3179 3180 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3181 3182 /* finally drop our ref */ 3183 btrfs_put_block_group(block_group); 3184 } 3185 3186 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3187 struct btrfs_free_cluster *cluster, 3188 struct btrfs_free_space *entry, 3189 u64 bytes, u64 min_start, 3190 u64 *max_extent_size) 3191 { 3192 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3193 int ret2; 3194 u64 search_start = cluster->window_start; 3195 u64 search_bytes = bytes; 3196 u64 ret = 0; 3197 3198 search_start = min_start; 3199 search_bytes = bytes; 3200 3201 ret2 = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3202 if (ret2) { 3203 *max_extent_size = max(get_max_extent_size(entry), 3204 *max_extent_size); 3205 return 0; 3206 } 3207 3208 ret = search_start; 3209 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3210 3211 return ret; 3212 } 3213 3214 /* 3215 * given a cluster, try to allocate 'bytes' from it, returns 0 3216 * if it couldn't find anything suitably large, or a logical disk offset 3217 * if things worked out 3218 */ 3219 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3220 struct btrfs_free_cluster *cluster, u64 bytes, 3221 u64 min_start, u64 *max_extent_size) 3222 { 3223 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3224 struct btrfs_discard_ctl *discard_ctl = 3225 &block_group->fs_info->discard_ctl; 3226 struct btrfs_free_space *entry = NULL; 3227 struct rb_node *node; 3228 u64 ret = 0; 3229 3230 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3231 3232 spin_lock(&cluster->lock); 3233 if (bytes > cluster->max_size) 3234 goto out; 3235 3236 if (cluster->block_group != block_group) 3237 goto out; 3238 3239 node = rb_first(&cluster->root); 3240 if (!node) 3241 goto out; 3242 3243 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3244 while (1) { 3245 if (entry->bytes < bytes) 3246 *max_extent_size = max(get_max_extent_size(entry), 3247 *max_extent_size); 3248 3249 if (entry->bytes < bytes || 3250 (!entry->bitmap && entry->offset < min_start)) { 3251 node = rb_next(&entry->offset_index); 3252 if (!node) 3253 break; 3254 entry = rb_entry(node, struct btrfs_free_space, 3255 offset_index); 3256 continue; 3257 } 3258 3259 if (entry->bitmap) { 3260 ret = btrfs_alloc_from_bitmap(block_group, 3261 cluster, entry, bytes, 3262 cluster->window_start, 3263 max_extent_size); 3264 if (ret == 0) { 3265 node = rb_next(&entry->offset_index); 3266 if (!node) 3267 break; 3268 entry = rb_entry(node, struct btrfs_free_space, 3269 offset_index); 3270 continue; 3271 } 3272 cluster->window_start += bytes; 3273 } else { 3274 ret = entry->offset; 3275 3276 entry->offset += bytes; 3277 entry->bytes -= bytes; 3278 } 3279 3280 break; 3281 } 3282 out: 3283 spin_unlock(&cluster->lock); 3284 3285 if (!ret) 3286 return 0; 3287 3288 spin_lock(&ctl->tree_lock); 3289 3290 if (!btrfs_free_space_trimmed(entry)) 3291 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3292 3293 ctl->free_space -= bytes; 3294 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3295 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3296 3297 spin_lock(&cluster->lock); 3298 if (entry->bytes == 0) { 3299 rb_erase(&entry->offset_index, &cluster->root); 3300 ctl->free_extents--; 3301 if (entry->bitmap) { 3302 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3303 entry->bitmap); 3304 ctl->total_bitmaps--; 3305 recalculate_thresholds(ctl); 3306 } else if (!btrfs_free_space_trimmed(entry)) { 3307 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3308 } 3309 kmem_cache_free(btrfs_free_space_cachep, entry); 3310 } 3311 3312 spin_unlock(&cluster->lock); 3313 spin_unlock(&ctl->tree_lock); 3314 3315 return ret; 3316 } 3317 3318 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3319 struct btrfs_free_space *entry, 3320 struct btrfs_free_cluster *cluster, 3321 u64 offset, u64 bytes, 3322 u64 cont1_bytes, u64 min_bytes) 3323 { 3324 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3325 const int unit = block_group->fs_info->sectorsize; 3326 unsigned long next_zero; 3327 unsigned long i; 3328 unsigned long want_bits; 3329 unsigned long min_bits; 3330 unsigned long found_bits; 3331 unsigned long max_bits = 0; 3332 unsigned long start = 0; 3333 unsigned long total_found = 0; 3334 int ret; 3335 3336 lockdep_assert_held(&ctl->tree_lock); 3337 3338 i = offset_to_bit(entry->offset, unit, 3339 max_t(u64, offset, entry->offset)); 3340 want_bits = bytes_to_bits(bytes, unit); 3341 min_bits = bytes_to_bits(min_bytes, unit); 3342 3343 /* 3344 * Don't bother looking for a cluster in this bitmap if it's heavily 3345 * fragmented. 3346 */ 3347 if (entry->max_extent_size && 3348 entry->max_extent_size < cont1_bytes) 3349 return -ENOSPC; 3350 again: 3351 found_bits = 0; 3352 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3353 next_zero = find_next_zero_bit(entry->bitmap, 3354 BITS_PER_BITMAP, i); 3355 if (next_zero - i >= min_bits) { 3356 found_bits = next_zero - i; 3357 if (found_bits > max_bits) 3358 max_bits = found_bits; 3359 break; 3360 } 3361 if (next_zero - i > max_bits) 3362 max_bits = next_zero - i; 3363 i = next_zero; 3364 } 3365 3366 if (!found_bits) { 3367 entry->max_extent_size = (u64)max_bits * unit; 3368 return -ENOSPC; 3369 } 3370 3371 if (!total_found) { 3372 start = i; 3373 cluster->max_size = 0; 3374 } 3375 3376 total_found += found_bits; 3377 3378 if (cluster->max_size < found_bits * unit) 3379 cluster->max_size = found_bits * unit; 3380 3381 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3382 i = next_zero + 1; 3383 goto again; 3384 } 3385 3386 cluster->window_start = start * unit + entry->offset; 3387 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3388 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3389 3390 /* 3391 * We need to know if we're currently on the normal space index when we 3392 * manipulate the bitmap so that we know we need to remove and re-insert 3393 * it into the space_index tree. Clear the bytes_index node here so the 3394 * bitmap manipulation helpers know not to mess with the space_index 3395 * until this bitmap entry is added back into the normal cache. 3396 */ 3397 RB_CLEAR_NODE(&entry->bytes_index); 3398 3399 ret = tree_insert_offset(ctl, cluster, entry); 3400 ASSERT(!ret); /* -EEXIST; Logic error */ 3401 3402 trace_btrfs_setup_cluster(block_group, cluster, total_found * unit, 1); 3403 return 0; 3404 } 3405 3406 /* 3407 * This searches the block group for just extents to fill the cluster with. 3408 * Try to find a cluster with at least bytes total bytes, at least one 3409 * extent of cont1_bytes, and other clusters of at least min_bytes. 3410 */ 3411 static noinline int 3412 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3413 struct btrfs_free_cluster *cluster, 3414 struct list_head *bitmaps, u64 offset, u64 bytes, 3415 u64 cont1_bytes, u64 min_bytes) 3416 { 3417 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3418 struct btrfs_free_space *first = NULL; 3419 struct btrfs_free_space *entry = NULL; 3420 struct btrfs_free_space *last; 3421 struct rb_node *node; 3422 u64 window_free; 3423 u64 max_extent; 3424 u64 total_size = 0; 3425 3426 lockdep_assert_held(&ctl->tree_lock); 3427 3428 entry = tree_search_offset(ctl, offset, 0, 1); 3429 if (!entry) 3430 return -ENOSPC; 3431 3432 /* 3433 * We don't want bitmaps, so just move along until we find a normal 3434 * extent entry. 3435 */ 3436 while (entry->bitmap || entry->bytes < min_bytes) { 3437 if (entry->bitmap && list_empty(&entry->list)) 3438 list_add_tail(&entry->list, bitmaps); 3439 node = rb_next(&entry->offset_index); 3440 if (!node) 3441 return -ENOSPC; 3442 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3443 } 3444 3445 window_free = entry->bytes; 3446 max_extent = entry->bytes; 3447 first = entry; 3448 last = entry; 3449 3450 for (node = rb_next(&entry->offset_index); node; 3451 node = rb_next(&entry->offset_index)) { 3452 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3453 3454 if (entry->bitmap) { 3455 if (list_empty(&entry->list)) 3456 list_add_tail(&entry->list, bitmaps); 3457 continue; 3458 } 3459 3460 if (entry->bytes < min_bytes) 3461 continue; 3462 3463 last = entry; 3464 window_free += entry->bytes; 3465 if (entry->bytes > max_extent) 3466 max_extent = entry->bytes; 3467 } 3468 3469 if (window_free < bytes || max_extent < cont1_bytes) 3470 return -ENOSPC; 3471 3472 cluster->window_start = first->offset; 3473 3474 node = &first->offset_index; 3475 3476 /* 3477 * now we've found our entries, pull them out of the free space 3478 * cache and put them into the cluster rbtree 3479 */ 3480 do { 3481 int ret; 3482 3483 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3484 node = rb_next(&entry->offset_index); 3485 if (entry->bitmap || entry->bytes < min_bytes) 3486 continue; 3487 3488 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3489 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3490 ret = tree_insert_offset(ctl, cluster, entry); 3491 total_size += entry->bytes; 3492 ASSERT(!ret); /* -EEXIST; Logic error */ 3493 } while (node && entry != last); 3494 3495 cluster->max_size = max_extent; 3496 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3497 return 0; 3498 } 3499 3500 /* 3501 * This specifically looks for bitmaps that may work in the cluster, we assume 3502 * that we have already failed to find extents that will work. 3503 */ 3504 static noinline int 3505 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3506 struct btrfs_free_cluster *cluster, 3507 struct list_head *bitmaps, u64 offset, u64 bytes, 3508 u64 cont1_bytes, u64 min_bytes) 3509 { 3510 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3511 struct btrfs_free_space *entry = NULL; 3512 int ret = -ENOSPC; 3513 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3514 3515 if (ctl->total_bitmaps == 0) 3516 return -ENOSPC; 3517 3518 /* 3519 * The bitmap that covers offset won't be in the list unless offset 3520 * is just its start offset. 3521 */ 3522 if (!list_empty(bitmaps)) 3523 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3524 3525 if (!entry || entry->offset != bitmap_offset) { 3526 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3527 if (entry && list_empty(&entry->list)) 3528 list_add(&entry->list, bitmaps); 3529 } 3530 3531 list_for_each_entry(entry, bitmaps, list) { 3532 if (entry->bytes < bytes) 3533 continue; 3534 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3535 bytes, cont1_bytes, min_bytes); 3536 if (!ret) 3537 return 0; 3538 } 3539 3540 /* 3541 * The bitmaps list has all the bitmaps that record free space 3542 * starting after offset, so no more search is required. 3543 */ 3544 return -ENOSPC; 3545 } 3546 3547 /* 3548 * here we try to find a cluster of blocks in a block group. The goal 3549 * is to find at least bytes+empty_size. 3550 * We might not find them all in one contiguous area. 3551 * 3552 * returns zero and sets up cluster if things worked out, otherwise 3553 * it returns -enospc 3554 */ 3555 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3556 struct btrfs_free_cluster *cluster, 3557 u64 offset, u64 bytes, u64 empty_size) 3558 { 3559 struct btrfs_fs_info *fs_info = block_group->fs_info; 3560 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3561 struct btrfs_free_space *entry, *tmp; 3562 LIST_HEAD(bitmaps); 3563 u64 min_bytes; 3564 u64 cont1_bytes; 3565 int ret; 3566 3567 /* 3568 * Choose the minimum extent size we'll require for this 3569 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3570 * For metadata, allow allocates with smaller extents. For 3571 * data, keep it dense. 3572 */ 3573 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3574 cont1_bytes = bytes + empty_size; 3575 min_bytes = cont1_bytes; 3576 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3577 cont1_bytes = bytes; 3578 min_bytes = fs_info->sectorsize; 3579 } else { 3580 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3581 min_bytes = fs_info->sectorsize; 3582 } 3583 3584 spin_lock(&ctl->tree_lock); 3585 3586 /* 3587 * If we know we don't have enough space to make a cluster don't even 3588 * bother doing all the work to try and find one. 3589 */ 3590 if (ctl->free_space < bytes) { 3591 spin_unlock(&ctl->tree_lock); 3592 return -ENOSPC; 3593 } 3594 3595 spin_lock(&cluster->lock); 3596 3597 /* someone already found a cluster, hooray */ 3598 if (cluster->block_group) { 3599 ret = 0; 3600 goto out; 3601 } 3602 3603 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3604 min_bytes); 3605 3606 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3607 bytes + empty_size, 3608 cont1_bytes, min_bytes); 3609 if (ret) 3610 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3611 offset, bytes + empty_size, 3612 cont1_bytes, min_bytes); 3613 3614 /* Clear our temporary list */ 3615 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3616 list_del_init(&entry->list); 3617 3618 if (!ret) { 3619 btrfs_get_block_group(block_group); 3620 list_add_tail(&cluster->block_group_list, 3621 &block_group->cluster_list); 3622 cluster->block_group = block_group; 3623 } else { 3624 trace_btrfs_failed_cluster_setup(block_group); 3625 } 3626 out: 3627 spin_unlock(&cluster->lock); 3628 spin_unlock(&ctl->tree_lock); 3629 3630 return ret; 3631 } 3632 3633 /* 3634 * simple code to zero out a cluster 3635 */ 3636 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3637 { 3638 spin_lock_init(&cluster->lock); 3639 spin_lock_init(&cluster->refill_lock); 3640 cluster->root = RB_ROOT; 3641 cluster->max_size = 0; 3642 cluster->fragmented = false; 3643 INIT_LIST_HEAD(&cluster->block_group_list); 3644 cluster->block_group = NULL; 3645 } 3646 3647 static int do_trimming(struct btrfs_block_group *block_group, 3648 u64 *total_trimmed, u64 start, u64 bytes, 3649 u64 reserved_start, u64 reserved_bytes, 3650 enum btrfs_trim_state reserved_trim_state, 3651 struct btrfs_trim_range *trim_entry) 3652 { 3653 struct btrfs_space_info *space_info = block_group->space_info; 3654 struct btrfs_fs_info *fs_info = block_group->fs_info; 3655 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3656 int ret; 3657 bool bg_ro; 3658 const u64 end = start + bytes; 3659 const u64 reserved_end = reserved_start + reserved_bytes; 3660 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3661 u64 trimmed = 0; 3662 3663 spin_lock(&space_info->lock); 3664 spin_lock(&block_group->lock); 3665 bg_ro = block_group->ro; 3666 if (!bg_ro) { 3667 block_group->reserved += reserved_bytes; 3668 spin_unlock(&block_group->lock); 3669 space_info->bytes_reserved += reserved_bytes; 3670 } else { 3671 spin_unlock(&block_group->lock); 3672 } 3673 spin_unlock(&space_info->lock); 3674 3675 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed, false); 3676 if (!ret) { 3677 *total_trimmed += trimmed; 3678 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3679 } 3680 3681 mutex_lock(&ctl->cache_writeout_mutex); 3682 if (reserved_start < start) 3683 __btrfs_add_free_space(block_group, reserved_start, 3684 start - reserved_start, 3685 reserved_trim_state); 3686 if (end < reserved_end) 3687 __btrfs_add_free_space(block_group, end, reserved_end - end, 3688 reserved_trim_state); 3689 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3690 list_del(&trim_entry->list); 3691 mutex_unlock(&ctl->cache_writeout_mutex); 3692 3693 if (!bg_ro) { 3694 spin_lock(&space_info->lock); 3695 spin_lock(&block_group->lock); 3696 bg_ro = block_group->ro; 3697 block_group->reserved -= reserved_bytes; 3698 spin_unlock(&block_group->lock); 3699 3700 space_info->bytes_reserved -= reserved_bytes; 3701 if (bg_ro) 3702 space_info->bytes_readonly += reserved_bytes; 3703 spin_unlock(&space_info->lock); 3704 } 3705 3706 return ret; 3707 } 3708 3709 /* 3710 * If @async is set, then we will trim 1 region and return. 3711 */ 3712 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3713 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3714 bool async) 3715 { 3716 struct btrfs_discard_ctl *discard_ctl = 3717 &block_group->fs_info->discard_ctl; 3718 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3719 struct btrfs_free_space *entry; 3720 struct rb_node *node; 3721 int ret = 0; 3722 u64 extent_start; 3723 u64 extent_bytes; 3724 enum btrfs_trim_state extent_trim_state; 3725 u64 bytes; 3726 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3727 3728 while (start < end) { 3729 struct btrfs_trim_range trim_entry; 3730 3731 mutex_lock(&ctl->cache_writeout_mutex); 3732 spin_lock(&ctl->tree_lock); 3733 3734 if (ctl->free_space < minlen) 3735 goto out_unlock; 3736 3737 entry = tree_search_offset(ctl, start, 0, 1); 3738 if (!entry) 3739 goto out_unlock; 3740 3741 /* Skip bitmaps and if async, already trimmed entries */ 3742 while (entry->bitmap || 3743 (async && btrfs_free_space_trimmed(entry))) { 3744 node = rb_next(&entry->offset_index); 3745 if (!node) 3746 goto out_unlock; 3747 entry = rb_entry(node, struct btrfs_free_space, 3748 offset_index); 3749 } 3750 3751 if (entry->offset >= end) 3752 goto out_unlock; 3753 3754 extent_start = entry->offset; 3755 extent_bytes = entry->bytes; 3756 extent_trim_state = entry->trim_state; 3757 if (async) { 3758 start = entry->offset; 3759 bytes = entry->bytes; 3760 if (bytes < minlen) { 3761 spin_unlock(&ctl->tree_lock); 3762 mutex_unlock(&ctl->cache_writeout_mutex); 3763 goto next; 3764 } 3765 unlink_free_space(ctl, entry, true); 3766 /* 3767 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3768 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3769 * X when we come back around. So trim it now. 3770 */ 3771 if (max_discard_size && 3772 bytes >= (max_discard_size + 3773 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3774 bytes = max_discard_size; 3775 extent_bytes = max_discard_size; 3776 entry->offset += max_discard_size; 3777 entry->bytes -= max_discard_size; 3778 link_free_space(ctl, entry); 3779 } else { 3780 kmem_cache_free(btrfs_free_space_cachep, entry); 3781 } 3782 } else { 3783 start = max(start, extent_start); 3784 bytes = min(extent_start + extent_bytes, end) - start; 3785 if (bytes < minlen) { 3786 spin_unlock(&ctl->tree_lock); 3787 mutex_unlock(&ctl->cache_writeout_mutex); 3788 goto next; 3789 } 3790 3791 unlink_free_space(ctl, entry, true); 3792 kmem_cache_free(btrfs_free_space_cachep, entry); 3793 } 3794 3795 spin_unlock(&ctl->tree_lock); 3796 trim_entry.start = extent_start; 3797 trim_entry.bytes = extent_bytes; 3798 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3799 mutex_unlock(&ctl->cache_writeout_mutex); 3800 3801 ret = do_trimming(block_group, total_trimmed, start, bytes, 3802 extent_start, extent_bytes, extent_trim_state, 3803 &trim_entry); 3804 if (ret) { 3805 block_group->discard_cursor = start + bytes; 3806 break; 3807 } 3808 next: 3809 start += bytes; 3810 block_group->discard_cursor = start; 3811 if (async && *total_trimmed) 3812 break; 3813 3814 if (btrfs_trim_interrupted()) { 3815 ret = -ERESTARTSYS; 3816 break; 3817 } 3818 3819 cond_resched(); 3820 } 3821 3822 return ret; 3823 3824 out_unlock: 3825 block_group->discard_cursor = btrfs_block_group_end(block_group); 3826 spin_unlock(&ctl->tree_lock); 3827 mutex_unlock(&ctl->cache_writeout_mutex); 3828 3829 return ret; 3830 } 3831 3832 void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg) 3833 { 3834 struct btrfs_fs_info *fs_info = bg->fs_info; 3835 struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; 3836 int ret = 0; 3837 u64 bytes, trimmed; 3838 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3839 u64 end = btrfs_block_group_end(bg); 3840 3841 if (!test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags)) { 3842 bg->discard_cursor = end; 3843 3844 if (bg->used == 0) { 3845 spin_lock(&fs_info->unused_bgs_lock); 3846 if (!list_empty(&bg->bg_list)) { 3847 list_del_init(&bg->bg_list); 3848 btrfs_put_block_group(bg); 3849 } 3850 spin_unlock(&fs_info->unused_bgs_lock); 3851 3852 btrfs_mark_bg_unused(bg); 3853 } 3854 3855 return; 3856 } 3857 3858 bytes = end - bg->discard_cursor; 3859 3860 if (max_discard_size && 3861 bytes >= (max_discard_size + BTRFS_ASYNC_DISCARD_MIN_FILTER)) 3862 bytes = max_discard_size; 3863 3864 ret = btrfs_discard_extent(fs_info, bg->discard_cursor, bytes, &trimmed, false); 3865 if (ret) 3866 return; 3867 3868 bg->discard_cursor += trimmed; 3869 3870 if (bg->discard_cursor < end) 3871 return; 3872 3873 btrfs_complete_bg_remapping(bg); 3874 } 3875 3876 /* 3877 * If we break out of trimming a bitmap prematurely, we should reset the 3878 * trimming bit. In a rather contrived case, it's possible to race here so 3879 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3880 * 3881 * start = start of bitmap 3882 * end = near end of bitmap 3883 * 3884 * Thread 1: Thread 2: 3885 * trim_bitmaps(start) 3886 * trim_bitmaps(end) 3887 * end_trimming_bitmap() 3888 * reset_trimming_bitmap() 3889 */ 3890 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3891 { 3892 struct btrfs_free_space *entry; 3893 3894 spin_lock(&ctl->tree_lock); 3895 entry = tree_search_offset(ctl, offset, 1, 0); 3896 if (entry) { 3897 if (btrfs_free_space_trimmed(entry)) { 3898 ctl->discardable_extents[BTRFS_STAT_CURR] += 3899 entry->bitmap_extents; 3900 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3901 } 3902 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3903 } 3904 3905 spin_unlock(&ctl->tree_lock); 3906 } 3907 3908 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3909 struct btrfs_free_space *entry) 3910 { 3911 if (btrfs_free_space_trimming_bitmap(entry)) { 3912 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3913 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3914 entry->bitmap_extents; 3915 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3916 } 3917 } 3918 3919 /* 3920 * If @async is set, then we will trim 1 region and return. 3921 */ 3922 static int trim_bitmaps(struct btrfs_block_group *block_group, 3923 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3924 u64 maxlen, bool async) 3925 { 3926 struct btrfs_discard_ctl *discard_ctl = 3927 &block_group->fs_info->discard_ctl; 3928 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3929 struct btrfs_free_space *entry; 3930 int ret = 0; 3931 int ret2; 3932 u64 bytes; 3933 u64 offset = offset_to_bitmap(ctl, start); 3934 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3935 3936 while (offset < end) { 3937 bool next_bitmap = false; 3938 struct btrfs_trim_range trim_entry; 3939 3940 mutex_lock(&ctl->cache_writeout_mutex); 3941 spin_lock(&ctl->tree_lock); 3942 3943 if (ctl->free_space < minlen) { 3944 block_group->discard_cursor = 3945 btrfs_block_group_end(block_group); 3946 spin_unlock(&ctl->tree_lock); 3947 mutex_unlock(&ctl->cache_writeout_mutex); 3948 break; 3949 } 3950 3951 entry = tree_search_offset(ctl, offset, 1, 0); 3952 /* 3953 * Bitmaps are marked trimmed lossily now to prevent constant 3954 * discarding of the same bitmap (the reason why we are bound 3955 * by the filters). So, retrim the block group bitmaps when we 3956 * are preparing to punt to the unused_bgs list. This uses 3957 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED 3958 * which is the only discard index which sets minlen to 0. 3959 */ 3960 if (!entry || (async && minlen && start == offset && 3961 btrfs_free_space_trimmed(entry))) { 3962 spin_unlock(&ctl->tree_lock); 3963 mutex_unlock(&ctl->cache_writeout_mutex); 3964 next_bitmap = true; 3965 goto next; 3966 } 3967 3968 /* 3969 * Async discard bitmap trimming begins at by setting the start 3970 * to be key.objectid and the offset_to_bitmap() aligns to the 3971 * start of the bitmap. This lets us know we are fully 3972 * scanning the bitmap rather than only some portion of it. 3973 */ 3974 if (start == offset) 3975 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; 3976 3977 bytes = minlen; 3978 ret2 = search_bitmap(ctl, entry, &start, &bytes, false); 3979 if (ret2 || start >= end) { 3980 /* 3981 * We lossily consider a bitmap trimmed if we only skip 3982 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. 3983 */ 3984 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) 3985 end_trimming_bitmap(ctl, entry); 3986 else 3987 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3988 spin_unlock(&ctl->tree_lock); 3989 mutex_unlock(&ctl->cache_writeout_mutex); 3990 next_bitmap = true; 3991 goto next; 3992 } 3993 3994 /* 3995 * We already trimmed a region, but are using the locking above 3996 * to reset the trim_state. 3997 */ 3998 if (async && *total_trimmed) { 3999 spin_unlock(&ctl->tree_lock); 4000 mutex_unlock(&ctl->cache_writeout_mutex); 4001 return ret; 4002 } 4003 4004 bytes = min(bytes, end - start); 4005 if (bytes < minlen || (async && maxlen && bytes > maxlen)) { 4006 spin_unlock(&ctl->tree_lock); 4007 mutex_unlock(&ctl->cache_writeout_mutex); 4008 goto next; 4009 } 4010 4011 /* 4012 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 4013 * If X < @minlen, we won't trim X when we come back around. 4014 * So trim it now. We differ here from trimming extents as we 4015 * don't keep individual state per bit. 4016 */ 4017 if (async && 4018 max_discard_size && 4019 bytes > (max_discard_size + minlen)) 4020 bytes = max_discard_size; 4021 4022 bitmap_clear_bits(ctl, entry, start, bytes, true); 4023 if (entry->bytes == 0) 4024 free_bitmap(ctl, entry); 4025 4026 spin_unlock(&ctl->tree_lock); 4027 trim_entry.start = start; 4028 trim_entry.bytes = bytes; 4029 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 4030 mutex_unlock(&ctl->cache_writeout_mutex); 4031 4032 ret = do_trimming(block_group, total_trimmed, start, bytes, 4033 start, bytes, 0, &trim_entry); 4034 if (ret) { 4035 reset_trimming_bitmap(ctl, offset); 4036 block_group->discard_cursor = 4037 btrfs_block_group_end(block_group); 4038 break; 4039 } 4040 next: 4041 if (next_bitmap) { 4042 const int unit = block_group->fs_info->sectorsize; 4043 4044 offset += BITS_PER_BITMAP * unit; 4045 start = offset; 4046 } else { 4047 start += bytes; 4048 } 4049 block_group->discard_cursor = start; 4050 4051 if (btrfs_trim_interrupted()) { 4052 if (start != offset) 4053 reset_trimming_bitmap(ctl, offset); 4054 ret = -ERESTARTSYS; 4055 break; 4056 } 4057 4058 cond_resched(); 4059 } 4060 4061 if (offset >= end) 4062 block_group->discard_cursor = end; 4063 4064 return ret; 4065 } 4066 4067 int btrfs_trim_block_group(struct btrfs_block_group *block_group, 4068 u64 *trimmed, u64 start, u64 end, u64 minlen) 4069 { 4070 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 4071 const int unit = block_group->fs_info->sectorsize; 4072 int ret; 4073 u64 rem = 0; 4074 4075 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 4076 4077 *trimmed = 0; 4078 4079 spin_lock(&block_group->lock); 4080 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4081 spin_unlock(&block_group->lock); 4082 return 0; 4083 } 4084 btrfs_freeze_block_group(block_group); 4085 spin_unlock(&block_group->lock); 4086 4087 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); 4088 if (ret) 4089 goto out; 4090 4091 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); 4092 div64_u64_rem(end, BITS_PER_BITMAP * unit, &rem); 4093 /* If we ended in the middle of a bitmap, reset the trimming flag */ 4094 if (rem) 4095 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); 4096 out: 4097 btrfs_unfreeze_block_group(block_group); 4098 return ret; 4099 } 4100 4101 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, 4102 u64 *trimmed, u64 start, u64 end, u64 minlen, 4103 bool async) 4104 { 4105 int ret; 4106 4107 *trimmed = 0; 4108 4109 spin_lock(&block_group->lock); 4110 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4111 spin_unlock(&block_group->lock); 4112 return 0; 4113 } 4114 btrfs_freeze_block_group(block_group); 4115 spin_unlock(&block_group->lock); 4116 4117 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); 4118 btrfs_unfreeze_block_group(block_group); 4119 4120 return ret; 4121 } 4122 4123 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, 4124 u64 *trimmed, u64 start, u64 end, u64 minlen, 4125 u64 maxlen, bool async) 4126 { 4127 int ret; 4128 4129 *trimmed = 0; 4130 4131 spin_lock(&block_group->lock); 4132 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4133 spin_unlock(&block_group->lock); 4134 return 0; 4135 } 4136 btrfs_freeze_block_group(block_group); 4137 spin_unlock(&block_group->lock); 4138 4139 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, 4140 async); 4141 4142 btrfs_unfreeze_block_group(block_group); 4143 4144 return ret; 4145 } 4146 4147 bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) 4148 { 4149 return btrfs_super_cache_generation(fs_info->super_copy); 4150 } 4151 4152 static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, 4153 struct btrfs_trans_handle *trans) 4154 { 4155 struct btrfs_block_group *block_group; 4156 struct rb_node *node; 4157 4158 btrfs_info(fs_info, "cleaning free space cache v1"); 4159 4160 node = rb_first_cached(&fs_info->block_group_cache_tree); 4161 while (node) { 4162 int ret; 4163 4164 block_group = rb_entry(node, struct btrfs_block_group, cache_node); 4165 ret = btrfs_remove_free_space_inode(trans, NULL, block_group); 4166 if (ret) 4167 return ret; 4168 node = rb_next(node); 4169 } 4170 return 0; 4171 } 4172 4173 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) 4174 { 4175 struct btrfs_trans_handle *trans; 4176 int ret; 4177 4178 /* 4179 * update_super_roots will appropriately set or unset 4180 * super_copy->cache_generation based on SPACE_CACHE and 4181 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a 4182 * transaction commit whether we are enabling space cache v1 and don't 4183 * have any other work to do, or are disabling it and removing free 4184 * space inodes. 4185 */ 4186 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4187 if (IS_ERR(trans)) 4188 return PTR_ERR(trans); 4189 4190 if (!active) { 4191 set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 4192 ret = cleanup_free_space_cache_v1(fs_info, trans); 4193 if (unlikely(ret)) { 4194 btrfs_abort_transaction(trans, ret); 4195 btrfs_end_transaction(trans); 4196 goto out; 4197 } 4198 } 4199 4200 ret = btrfs_commit_transaction(trans); 4201 out: 4202 clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 4203 4204 return ret; 4205 } 4206 4207 int __init btrfs_free_space_init(void) 4208 { 4209 btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); 4210 if (!btrfs_free_space_cachep) 4211 return -ENOMEM; 4212 4213 btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", 4214 PAGE_SIZE, PAGE_SIZE, 4215 0, NULL); 4216 if (!btrfs_free_space_bitmap_cachep) { 4217 kmem_cache_destroy(btrfs_free_space_cachep); 4218 return -ENOMEM; 4219 } 4220 4221 return 0; 4222 } 4223 4224 void __cold btrfs_free_space_exit(void) 4225 { 4226 kmem_cache_destroy(btrfs_free_space_cachep); 4227 kmem_cache_destroy(btrfs_free_space_bitmap_cachep); 4228 } 4229 4230 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4231 /* 4232 * Use this if you need to make a bitmap or extent entry specifically, it 4233 * doesn't do any of the merging that add_free_space does, this acts a lot like 4234 * how the free space cache loading stuff works, so you can get really weird 4235 * configurations. 4236 */ 4237 int test_add_free_space_entry(struct btrfs_block_group *cache, 4238 u64 offset, u64 bytes, bool bitmap) 4239 { 4240 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 4241 struct btrfs_free_space *info = NULL, *bitmap_info; 4242 void *map = NULL; 4243 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; 4244 u64 bytes_added; 4245 int ret; 4246 4247 again: 4248 if (!info) { 4249 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 4250 if (!info) 4251 return -ENOMEM; 4252 } 4253 4254 if (!bitmap) { 4255 spin_lock(&ctl->tree_lock); 4256 info->offset = offset; 4257 info->bytes = bytes; 4258 info->max_extent_size = 0; 4259 ret = link_free_space(ctl, info); 4260 spin_unlock(&ctl->tree_lock); 4261 if (ret) 4262 kmem_cache_free(btrfs_free_space_cachep, info); 4263 return ret; 4264 } 4265 4266 if (!map) { 4267 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); 4268 if (!map) { 4269 kmem_cache_free(btrfs_free_space_cachep, info); 4270 return -ENOMEM; 4271 } 4272 } 4273 4274 spin_lock(&ctl->tree_lock); 4275 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 4276 1, 0); 4277 if (!bitmap_info) { 4278 info->bitmap = map; 4279 map = NULL; 4280 add_new_bitmap(ctl, info, offset); 4281 bitmap_info = info; 4282 info = NULL; 4283 } 4284 4285 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 4286 trim_state); 4287 4288 bytes -= bytes_added; 4289 offset += bytes_added; 4290 spin_unlock(&ctl->tree_lock); 4291 4292 if (bytes) 4293 goto again; 4294 4295 if (info) 4296 kmem_cache_free(btrfs_free_space_cachep, info); 4297 if (map) 4298 kmem_cache_free(btrfs_free_space_bitmap_cachep, map); 4299 return 0; 4300 } 4301 4302 /* 4303 * Checks to see if the given range is in the free space cache. This is really 4304 * just used to check the absence of space, so if there is free space in the 4305 * range at all we will return 1. 4306 */ 4307 int test_check_exists(struct btrfs_block_group *cache, 4308 u64 offset, u64 bytes) 4309 { 4310 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 4311 const int unit = cache->fs_info->sectorsize; 4312 struct btrfs_free_space *info; 4313 int ret = 0; 4314 4315 spin_lock(&ctl->tree_lock); 4316 info = tree_search_offset(ctl, offset, 0, 0); 4317 if (!info) { 4318 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 4319 1, 0); 4320 if (!info) 4321 goto out; 4322 } 4323 4324 have_info: 4325 if (info->bitmap) { 4326 u64 bit_off, bit_bytes; 4327 struct rb_node *n; 4328 struct btrfs_free_space *tmp; 4329 4330 bit_off = offset; 4331 bit_bytes = unit; 4332 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); 4333 if (!ret) { 4334 if (bit_off == offset) { 4335 ret = 1; 4336 goto out; 4337 } else if (bit_off > offset && 4338 offset + bytes > bit_off) { 4339 ret = 1; 4340 goto out; 4341 } 4342 } 4343 4344 n = rb_prev(&info->offset_index); 4345 while (n) { 4346 tmp = rb_entry(n, struct btrfs_free_space, 4347 offset_index); 4348 if (tmp->offset + tmp->bytes < offset) 4349 break; 4350 if (offset + bytes < tmp->offset) { 4351 n = rb_prev(&tmp->offset_index); 4352 continue; 4353 } 4354 info = tmp; 4355 goto have_info; 4356 } 4357 4358 n = rb_next(&info->offset_index); 4359 while (n) { 4360 tmp = rb_entry(n, struct btrfs_free_space, 4361 offset_index); 4362 if (offset + bytes < tmp->offset) 4363 break; 4364 if (tmp->offset + tmp->bytes < offset) { 4365 n = rb_next(&tmp->offset_index); 4366 continue; 4367 } 4368 info = tmp; 4369 goto have_info; 4370 } 4371 4372 ret = 0; 4373 goto out; 4374 } 4375 4376 if (info->offset == offset) { 4377 ret = 1; 4378 goto out; 4379 } 4380 4381 if (offset > info->offset && offset < info->offset + info->bytes) 4382 ret = 1; 4383 out: 4384 spin_unlock(&ctl->tree_lock); 4385 return ret; 4386 } 4387 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 4388