// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat. All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include <linux/string_choices.h>
#include "extent-tree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
#include "inode-item.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "super.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M

static struct kmem_cache *btrfs_free_space_cachep;
static struct kmem_cache *btrfs_free_space_bitmap_cachep;

struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes, bool update_stats);

static void btrfs_crc32c_final(u32 crc, u8 *result)
{
	put_unaligned_le32(~crc, result);
}

static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *node;

	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info, true);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(ctl, info);
		}

		cond_resched_lock(&ctl->tree_lock);
	}
}

static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(location.objectid, root, path);
	btrfs_release_path(path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));

	return inode;
}

struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
		struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct inode *inode = NULL;
	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(&block_group->inode->vfs_inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(fs_info->tree_root, path,
					  block_group->start);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&block_group->lock);
	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
		btrfs_info(fs_info, "Old style space inode found, converting.");
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
			BTRFS_INODE_NODATACOW;
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

	if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
		block_group->inode = BTRFS_I(igrab(inode));
	spin_unlock(&block_group->lock);

	return inode;
}

static int __create_free_space_inode(struct btrfs_root *root,
				     struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     u64 ino, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	/* We inline CRCs for the free disk space cache */
	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
	int ret;

	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memzero_extent_buffer(leaf, (unsigned long)inode_item,
			      sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, flags);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item, offset);
	btrfs_release_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_release_path(path);

	return 0;
}

int create_free_space_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
					 ino, block_group->start);
}

/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
				  struct inode *inode,
				  struct btrfs_block_group *block_group)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!inode)
		inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) != -ENOENT)
			ret = PTR_ERR(inode);
		goto out;
	}
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		btrfs_add_delayed_iput(BTRFS_I(inode));
		goto out;
	}
	clear_nlink(inode);
	/* One for the block groups ref */
	spin_lock(&block_group->lock);
	if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
	} else {
		spin_unlock(&block_group->lock);
	}
	/* One for the lookup ref */
	btrfs_add_delayed_iput(BTRFS_I(inode));

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.type = 0;
	key.offset = block_group->start;
	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
				-1, 1);
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto out;
	}
	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group *block_group,
				    struct inode *vfs_inode)
{
	struct btrfs_truncate_control control = {
		.inode = BTRFS_I(vfs_inode),
		.new_size = 0,
		.ino = btrfs_ino(BTRFS_I(vfs_inode)),
		.min_type = BTRFS_EXTENT_DATA_KEY,
		.clear_extent_range = true,
	};
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	int ret = 0;
	bool locked = false;

	if (block_group) {
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
		locked = true;
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * Now that we've truncated the cache away, it's no longer
		 * set up or written.
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
	}

	btrfs_i_size_write(inode, 0);
	truncate_pagecache(vfs_inode, 0);

	lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);

	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
	 */
	ret = btrfs_truncate_inode_items(trans, root, &control);

	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

	unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
	if (ret)
		goto fail;

	ret = btrfs_update_inode(trans, inode);

fail:
	if (locked)
		mutex_unlock(&trans->transaction->cache_write_mutex);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

static void readahead_cache(struct inode *inode)
{
	struct file_ra_state ra;
	unsigned long last_index;

	file_ra_state_init(&ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}

static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
		       int write)
{
	int num_pages;

	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	/* Make sure we can fit our crcs and generation into the first page */
	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
		return -ENOSPC;

	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
	if (!io_ctl->pages)
		return -ENOMEM;

	io_ctl->num_pages = num_pages;
	io_ctl->fs_info = inode_to_fs_info(inode);
	io_ctl->inode = inode;

	return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
	kfree(io_ctl->pages);
	io_ctl->pages = NULL;
}

static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}

static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
	ASSERT(io_ctl->index < io_ctl->num_pages);
	io_ctl->page = io_ctl->pages[io_ctl->index++];
	io_ctl->cur = page_address(io_ctl->page);
	io_ctl->orig = io_ctl->cur;
	io_ctl->size = PAGE_SIZE;
	if (clear)
		clear_page(io_ctl->cur);
}

static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
		if (io_ctl->pages[i]) {
			btrfs_folio_clear_checked(io_ctl->fs_info,
					page_folio(io_ctl->pages[i]),
					page_offset(io_ctl->pages[i]),
					PAGE_SIZE);
			unlock_page(io_ctl->pages[i]);
			put_page(io_ctl->pages[i]);
		}
	}
}

static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
	struct page *page;
	struct inode *inode = io_ctl->inode;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
		int ret;

		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}

		ret = set_folio_extent_mapped(page_folio(page));
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			io_ctl_drop_pages(io_ctl);
			return ret;
		}

		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_read_folio(NULL, page_folio(page));
			lock_page(page);
			if (page->mapping != inode->i_mapping) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "free space cache page truncated");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
			if (!PageUptodate(page)) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
		}
	}

	for (i = 0; i < io_ctl->num_pages; i++)
		clear_page_dirty_for_io(io_ctl->pages[i]);

	return 0;
}

static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	io_ctl_map_page(io_ctl, 1);

	/*
	 * Skip the csum areas. If we don't check crcs then we just have a
	 * 64bit chunk at the front of the first page.
	 */
	io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	put_unaligned_le64(generation, io_ctl->cur);
	io_ctl->cur += sizeof(u64);
}

static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	u64 cache_gen;

	/*
	 * Skip the crc area. If we don't check crcs then we just have a 64bit
	 * chunk at the front of the first page.
	 */
	io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	cache_gen = get_unaligned_le64(io_ctl->cur);
	if (cache_gen != generation) {
		btrfs_err_rl(io_ctl->fs_info,
			"space cache generation (%llu) does not match inode (%llu)",
			cache_gen, generation);
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}
	io_ctl->cur += sizeof(u64);
	return 0;
}

static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	io_ctl_unmap_page(io_ctl);
	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	*tmp = crc;
}

static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp, val;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	val = *tmp;

	io_ctl_map_page(io_ctl, 0);
	crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	if (val != crc) {
		btrfs_err_rl(io_ctl->fs_info,
			"csum mismatch on free space cache");
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}

	return 0;
}

static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
			    void *bitmap)
{
	struct btrfs_free_space_entry *entry;

	if (!io_ctl->cur)
		return -ENOSPC;

	entry = io_ctl->cur;
	put_unaligned_le64(offset, &entry->offset);
	put_unaligned_le64(bytes, &entry->bytes);
	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
		BTRFS_FREE_SPACE_EXTENT;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_set_crc(io_ctl, io_ctl->index - 1);

	/* No more pages to map */
	if (io_ctl->index >= io_ctl->num_pages)
		return 0;

	/* map the next page */
	io_ctl_map_page(io_ctl, 1);
	return 0;
}

static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
	if (!io_ctl->cur)
		return -ENOSPC;

	/*
	 * If we aren't at the start of the current page, unmap this one and
	 * map the next one if there is any left.
	 */
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

	copy_page(io_ctl->cur, bitmap);
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}

static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
	/*
	 * If we're not on the boundary we know we've modified the page and we
	 * need to crc the page.
	 */
	if (io_ctl->cur != io_ctl->orig)
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	else
		io_ctl_unmap_page(io_ctl);

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	}
}

static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
			     struct btrfs_free_space *entry, u8 *type)
{
	struct btrfs_free_space_entry *e;
	int ret;

	if (!io_ctl->cur) {
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}

	e = io_ctl->cur;
	entry->offset = get_unaligned_le64(&e->offset);
	entry->bytes = get_unaligned_le64(&e->bytes);
	*type = e->type;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_unmap_page(io_ctl);

	return 0;
}

static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space *entry)
{
	int ret;

	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
	if (ret)
		return ret;

	copy_page(entry->bitmap, io_ctl->cur);
	io_ctl_unmap_page(io_ctl);

	return 0;
}

static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group *block_group = ctl->block_group;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->length;
	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	max_bitmaps = max_t(u64, max_bitmaps, 1);

	if (ctl->total_bitmaps > max_bitmaps)
		btrfs_err(block_group->fs_info,
"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
			  block_group->start, block_group->length,
			  ctl->total_bitmaps, ctl->unit, max_bitmaps,
			  bytes_per_bg);
	ASSERT(ctl->total_bitmaps <= max_bitmaps);

	/*
	 * We are trying to keep the total amount of memory used per 1GiB of
	 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation
	 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
	 * bitmaps, we may end up using more memory than this.
	 */
	if (size < SZ_1G)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);

	bitmap_bytes = ctl->total_bitmaps * ctl->unit;

	/*
	 * we want the extent entry threshold to always be at most 1/2 the max
	 * bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);

	ctl->extents_thresh =
		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}

static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_path *path, u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct btrfs_io_ctl io_ctl;
	struct btrfs_key key;
	struct btrfs_free_space *e, *n;
	LIST_HEAD(bitmaps);
	u64 num_entries;
	u64 num_bitmaps;
	u64 generation;
	u8 type;
	int ret = 0;

	/* Nothing in the space cache, goodbye */
	if (!i_size_read(inode))
		return 0;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return 0;
	else if (ret > 0) {
		btrfs_release_path(path);
		return 0;
	}

	ret = -1;

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
	btrfs_release_path(path);

	if (!BTRFS_I(inode)->generation) {
		btrfs_info(fs_info,
			   "the free space cache file (%llu) is invalid, skip it",
			   offset);
		return 0;
	}

	if (BTRFS_I(inode)->generation != generation) {
		btrfs_err(fs_info,
			  "free space inode generation (%llu) did not match free space cache generation (%llu)",
			  BTRFS_I(inode)->generation, generation);
		return 0;
	}

	if (!num_entries)
		return 0;

	ret = io_ctl_init(&io_ctl, inode, 0);
	if (ret)
		return ret;

	readahead_cache(inode);

	ret = io_ctl_prepare_pages(&io_ctl, true);
	if (ret)
		goto out;

	ret = io_ctl_check_crc(&io_ctl, 0);
	if (ret)
		goto free_cache;

	ret = io_ctl_check_generation(&io_ctl, generation);
	if (ret)
		goto free_cache;

	while (num_entries) {
		e = kmem_cache_zalloc(btrfs_free_space_cachep,
				      GFP_NOFS);
		if (!e) {
			ret = -ENOMEM;
			goto free_cache;
		}

		ret = io_ctl_read_entry(&io_ctl, e, &type);
		if (ret) {
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (!e->bytes) {
			ret = -1;
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (type == BTRFS_FREE_SPACE_EXTENT) {
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				btrfs_err(fs_info,
					"Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
		} else {
			ASSERT(num_bitmaps);
			num_bitmaps--;
			e->bitmap = kmem_cache_zalloc(
					btrfs_free_space_bitmap_cachep, GFP_NOFS);
			if (!e->bitmap) {
				ret = -ENOMEM;
				kmem_cache_free(
					btrfs_free_space_cachep, e);
				goto free_cache;
			}
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			if (ret) {
				spin_unlock(&ctl->tree_lock);
				btrfs_err(fs_info,
					"Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap);
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
			ctl->total_bitmaps++;
			recalculate_thresholds(ctl);
			spin_unlock(&ctl->tree_lock);
			list_add_tail(&e->list, &bitmaps);
		}

		num_entries--;
	}

	io_ctl_unmap_page(&io_ctl);

	/*
	 * We add the bitmaps at the end of the entries in order that
	 * the bitmap entries are added to the cache.
	 */
	list_for_each_entry_safe(e, n, &bitmaps, list) {
		list_del_init(&e->list);
		ret = io_ctl_read_bitmap(&io_ctl, e);
		if (ret)
			goto free_cache;
	}

	io_ctl_drop_pages(&io_ctl);
	ret = 1;
out:
	io_ctl_free(&io_ctl);
	return ret;
free_cache:
	io_ctl_drop_pages(&io_ctl);

	spin_lock(&ctl->tree_lock);
	__btrfs_remove_free_space_cache(ctl);
	spin_unlock(&ctl->tree_lock);
	goto out;
}

static int copy_free_space_cache(struct btrfs_block_group *block_group,
				 struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int ret = 0;

	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			const u64 offset = info->offset;
			const u64 bytes = info->bytes;

			unlink_free_space(ctl, info, true);
			spin_unlock(&ctl->tree_lock);
			kmem_cache_free(btrfs_free_space_cachep, info);
			ret = btrfs_add_free_space(block_group, offset, bytes);
			spin_lock(&ctl->tree_lock);
		} else {
			u64 offset = info->offset;
			u64 bytes = ctl->unit;

			ret = search_bitmap(ctl, info, &offset, &bytes, false);
			if (ret == 0) {
				bitmap_clear_bits(ctl, info, offset, bytes, true);
				spin_unlock(&ctl->tree_lock);
				ret = btrfs_add_free_space(block_group, offset,
							   bytes);
				spin_lock(&ctl->tree_lock);
			} else {
				free_bitmap(ctl, info);
				ret = 0;
			}
		}
		cond_resched_lock(&ctl->tree_lock);
	}
	return ret;
}

static struct lock_class_key btrfs_free_space_inode_key;

int load_free_space_cache(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space_ctl tmp_ctl = {};
	struct inode *inode;
	struct btrfs_path *path;
	int ret = 0;
	bool matched;
	u64 used = block_group->used;

	/*
	 * Because we could potentially discard our loaded free space, we want
	 * to load everything into a temporary structure first, and then if it's
	 * valid copy it all into the actual free space ctl.
	 */
	btrfs_init_free_space_ctl(block_group, &tmp_ctl);

	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	path = btrfs_alloc_path();
	if (!path)
		return 0;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * We must pass a path with search_commit_root set to btrfs_iget in
	 * order to avoid a deadlock when allocating extents for the tree root.
	 *
	 * When we are COWing an extent buffer from the tree root, when looking
	 * for a free extent, at extent-tree.c:find_free_extent(), we can find a
	 * block group without its free space cache loaded. When we find one
	 * we must load its space cache which requires reading its free space
	 * cache's inode item from the root tree. If this inode item is located
	 * in the same leaf that we started COWing before, then we end up in
	 * deadlock on the extent buffer (trying to read lock it when we
	 * previously write locked it).
	 *
	 * It's safe to read the inode item using the commit root because
	 * block groups, once loaded, stay in memory forever (until they are
	 * removed) as well as their space caches once loaded. New block groups
	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
	 * we will never try to read their inode item while the fs is mounted.
	 */
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

	/* We may have converted the inode and made the cache invalid. */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
		goto out;
	}
	spin_unlock(&block_group->lock);

	/*
	 * Reinitialize the class of struct inode's mapping->invalidate_lock for
	 * free space inodes to prevent false positives related to locks for normal
	 * inodes.
	 */
	lockdep_set_class(&(&inode->i_data)->invalidate_lock,
			  &btrfs_free_space_inode_key);

	ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
				      path, block_group->start);
	btrfs_free_path(path);
	if (ret <= 0)
		goto out;

	matched = (tmp_ctl.free_space == (block_group->length - used -
					  block_group->bytes_super));

	if (matched) {
		spin_lock(&tmp_ctl.tree_lock);
		ret = copy_free_space_cache(block_group, &tmp_ctl);
		spin_unlock(&tmp_ctl.tree_lock);
		/*
		 * ret == 1 means we successfully loaded the free space cache,
		 * so we need to re-set it here.
		 */
		if (ret == 0)
			ret = 1;
	} else {
		/*
		 * We need to call the _locked variant so we don't try to update
		 * the discard counters.
		 */
		spin_lock(&tmp_ctl.tree_lock);
		__btrfs_remove_free_space_cache(&tmp_ctl);
		spin_unlock(&tmp_ctl.tree_lock);
		btrfs_warn(fs_info,
			   "block group %llu has wrong amount of free space",
			   block_group->start);
		ret = -1;
	}
out:
	if (ret < 0) {
		/* This cache is bogus, make sure it gets cleared */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		ret = 0;

		btrfs_warn(fs_info,
			   "failed to load free space cache for block group %llu, rebuilding it now",
			   block_group->start);
	}

	spin_lock(&ctl->tree_lock);
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
	iput(inode);
	return ret;
}

static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space_ctl *ctl,
			      struct btrfs_block_group *block_group,
			      int *entries, int *bitmaps,
			      struct list_head *bitmap_list)
{
	int ret;
	struct btrfs_free_cluster *cluster = NULL;
	struct btrfs_free_cluster *cluster_locked = NULL;
	struct rb_node *node = rb_first(&ctl->free_space_offset);
	struct btrfs_trim_range *trim_entry;

	/* Get the cluster for this block_group if it exists */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
	}

	if (!node && cluster) {
		cluster_locked = cluster;
		spin_lock(&cluster_locked->lock);
		node = rb_first(&cluster->root);
		cluster = NULL;
	}

	/* Write out the extent entries */
	while (node) {
		struct btrfs_free_space *e;

		e = rb_entry(node, struct btrfs_free_space, offset_index);
		*entries += 1;

		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
				       e->bitmap);
		if (ret)
			goto fail;

		if (e->bitmap) {
			list_add_tail(&e->list, bitmap_list);
			*bitmaps += 1;
		}
		node = rb_next(node);
		if (!node && cluster) {
			node = rb_first(&cluster->root);
			cluster_locked = cluster;
			spin_lock(&cluster_locked->lock);
			cluster = NULL;
		}
	}
	if (cluster_locked) {
		spin_unlock(&cluster_locked->lock);
		cluster_locked = NULL;
	}

	/*
	 * Make sure we don't miss any range that was removed from our rbtree
	 * because trimming is running. Otherwise after a umount+mount (or crash
	 * after committing the transaction) we would leak free space and get
	 * an inconsistent free space cache report from fsck.
	 */
	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
				       trim_entry->bytes, NULL);
		if (ret)
			goto fail;
		*entries += 1;
	}

	return 0;
fail:
	if (cluster_locked)
		spin_unlock(&cluster_locked->lock);
	return -ENOSPC;
}

static noinline_for_stack int
update_cache_item(struct btrfs_trans_handle *trans,
		  struct btrfs_root *root,
		  struct inode *inode,
		  struct btrfs_path *path, u64 offset,
		  int entries, int bitmaps)
{
	struct btrfs_key key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, NULL);
		goto fail;
	}
	leaf = path->nodes[0];
	if (ret > 0) {
		struct btrfs_key found_key;
		ASSERT(path->slots[0]);
		path->slots[0]--;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
		    found_key.offset != offset) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
					 inode->i_size - 1, EXTENT_DELALLOC,
					 NULL);
			btrfs_release_path(path);
			goto fail;
		}
	}

	BTRFS_I(inode)->generation = trans->transid;
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_set_free_space_entries(leaf, header, entries);
	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_release_path(path);

	return 0;

fail:
	return -1;
}

static noinline_for_stack int write_pinned_extent_entries(
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_io_ctl *io_ctl,
			    int *entries)
{
	u64 start, extent_start, extent_end, len;
	struct extent_io_tree *unpin = NULL;
	int ret;

	if (!block_group)
		return 0;

	/*
	 * We want to add any pinned extents to our free space cache
	 * so we don't leak the space
	 *
	 * We shouldn't have switched the pinned extents yet so this is the
	 * right one
	 */
	unpin = &trans->transaction->pinned_extents;

	start = block_group->start;

	while (start < block_group->start + block_group->length) {
		if (!find_first_extent_bit(unpin, start,
					   &extent_start, &extent_end,
					   EXTENT_DIRTY, NULL))
			return 0;

		/* This pinned extent is out of our range */
		if (extent_start >= block_group->start + block_group->length)
			return 0;

		extent_start = max(extent_start, start);
		extent_end = min(block_group->start + block_group->length,
				 extent_end + 1);
		len = extent_end - extent_start;

		*entries += 1;
		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
		if (ret)
			return -ENOSPC;

		start = extent_end;
	}

	return 0;
}

static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;
	int ret;

	/* Write out the bitmaps */
	list_for_each_entry_safe(entry, next, bitmap_list, list) {
		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
		if (ret)
			return -ENOSPC;
		list_del_init(&entry->list);
	}

	return 0;
}

static int flush_dirty_cache(struct inode *inode)
{
	int ret;

	ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
	if (ret)
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, NULL);

	return ret;
}

static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;

	list_for_each_entry_safe(entry, next, bitmap_list, list)
		list_del_init(&entry->list);
}

static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
			   struct btrfs_io_ctl *io_ctl,
			   struct extent_state **cached_state)
{
	io_ctl_drop_pages(io_ctl);
	unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
		      cached_state);
}

static int __btrfs_wait_cache_io(struct btrfs_root *root,
				 struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *block_group,
				 struct btrfs_io_ctl *io_ctl,
				 struct btrfs_path *path, u64 offset)
{
	int ret;
	struct inode *inode = io_ctl->inode;

	if (!inode)
		return 0;

	/* Flush the dirty pages in the cache file. */
	ret = flush_dirty_cache(inode);
	if (ret)
		goto out;

	/* Update the cache item to tell everyone this cache file is valid. */
	ret = update_cache_item(trans, root, inode, path, offset,
				io_ctl->entries, io_ctl->bitmaps);
out:
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
		if (block_group)
			btrfs_debug(root->fs_info,
				    "failed to write free space cache for block group %llu error %d",
				    block_group->start, ret);
	}
	btrfs_update_inode(trans, BTRFS_I(inode));

	if (block_group) {
		/* the dirty list is protected by the dirty_bgs_lock */
		spin_lock(&trans->transaction->dirty_bgs_lock);

		/* the disk_cache_state is protected by the block group lock */
		spin_lock(&block_group->lock);

		/*
		 * only mark this as written if we didn't get put back on
		 * the dirty list while waiting for IO. Otherwise our
		 * cache state won't be right, and we won't get written again
		 */
		if (!ret && list_empty(&block_group->dirty_list))
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		else if (ret)
			block_group->disk_cache_state = BTRFS_DC_ERROR;

		spin_unlock(&block_group->lock);
		spin_unlock(&trans->transaction->dirty_bgs_lock);
		io_ctl->inode = NULL;
		iput(inode);
	}

	return ret;

}

int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
			struct btrfs_block_group *block_group,
			struct btrfs_path *path)
{
	return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
				     block_group, &block_group->io_ctl,
				     path, block_group->start);
}

/*
 * Write out cached info to an inode.
 *
 * @inode:       freespace inode we are writing out
 * @ctl:         free space cache we are going to write out
 * @block_group: block_group for this cache if it belongs to a block_group
 * @io_ctl:      holds context for the io
 * @trans:       the trans handle
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount. This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 */
static int __btrfs_write_out_cache(struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_block_group *block_group,
				   struct btrfs_io_ctl *io_ctl,
				   struct btrfs_trans_handle *trans)
{
	struct extent_state *cached_state = NULL;
	LIST_HEAD(bitmap_list);
	int entries = 0;
	int bitmaps = 0;
	int ret;
	int must_iput = 0;
	int i_size;

	if (!i_size_read(inode))
		return -EIO;

	WARN_ON(io_ctl->pages);
	ret = io_ctl_init(io_ctl, inode, 1);
	if (ret)
		return ret;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
		down_write(&block_group->data_rwsem);
		spin_lock(&block_group->lock);
		if (block_group->delalloc_bytes) {
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
			spin_unlock(&block_group->lock);
			up_write(&block_group->data_rwsem);
			BTRFS_I(inode)->generation = 0;
			ret = 0;
			must_iput = 1;
			goto out;
		}
		spin_unlock(&block_group->lock);
	}

	/* Lock all pages first so we can lock the extent safely. */
	ret = io_ctl_prepare_pages(io_ctl, false);
	if (ret)
		goto out_unlock;

	lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
		    &cached_state);

	io_ctl_set_generation(io_ctl, trans->transid);

	mutex_lock(&ctl->cache_writeout_mutex);
	/* Write out the extent entries in the free space cache */
	spin_lock(&ctl->tree_lock);
	ret = write_cache_extent_entries(io_ctl, ctl,
					 block_group, &entries, &bitmaps,
					 &bitmap_list);
	if (ret)
		goto out_nospc_locked;

	/*
	 * Some spaces that are freed in the current transaction are pinned,
	 * they will be added into free space cache after the transaction is
	 * committed, we shouldn't lose them.
	 *
	 * If this changes while we are working we'll get added back to
	 * the dirty list and redo it. No locking needed
	 */
	ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
	if (ret)
		goto out_nospc_locked;

	/*
	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
	 * locked while doing it because a concurrent trim can be manipulating
	 * or freeing the bitmap.
	 */
	ret = write_bitmap_entries(io_ctl, &bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);
	if (ret)
		goto out_nospc;

	/* Zero out the rest of the pages just to make sure */
	io_ctl_zero_remaining_pages(io_ctl);

	/* Everything is written out, now we dirty the pages in the file. */
	i_size = i_size_read(inode);
	for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) {
		u64 dirty_start = i * PAGE_SIZE;
		u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start;

		ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]),
					dirty_start, dirty_len, &cached_state, false);
		if (ret < 0)
			goto out_nospc;
	}

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);
	/*
	 * Release the pages and unlock the extent, we will flush
	 * them out later
	 */
	io_ctl_drop_pages(io_ctl);
	io_ctl_free(io_ctl);

	unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
		      &cached_state);

	/*
	 * at this point the pages are under IO and we're happy,
	 * The caller is responsible for waiting on them and updating
	 * the cache and the inode
	 */
	io_ctl->entries = entries;
	io_ctl->bitmaps = bitmaps;

	ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1);
	if (ret)
		goto out;

	return 0;

out_nospc_locked:
	cleanup_bitmap_list(&bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

out_nospc:
	cleanup_write_cache_enospc(inode, io_ctl, &cached_state);

out_unlock:
	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);

out:
	io_ctl->inode = NULL;
	io_ctl_free(io_ctl);
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
	}
	btrfs_update_inode(trans, BTRFS_I(inode));
	if (must_iput)
		iput(inode);
	return ret;
}

int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
			  struct btrfs_block_group *block_group,
			  struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct inode *inode;
	int ret = 0;

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode))
		return 0;

	ret = __btrfs_write_out_cache(inode, ctl, block_group,
				      &block_group->io_ctl, trans);
	if (ret) {
		btrfs_debug(fs_info,
			    "failed to write free space cache for block group %llu error %d",
			    block_group->start, ret);
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);

		block_group->io_ctl.inode = NULL;
		iput(inode);
	}

	/*
	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
	 * to wait for IO and put the inode
	 */

	return ret;
}

static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
					  u64 offset)
{
	ASSERT(offset >= bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
	return (unsigned long)(div_u64(bytes, unit));
}

static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += ctl->start;

	return bitmap_start;
}

static int tree_insert_offset(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_cluster *cluster,
			      struct btrfs_free_space *new_entry)
{
	struct rb_root *root;
	struct rb_node **p;
	struct rb_node *parent = NULL;

	lockdep_assert_held(&ctl->tree_lock);

	if (cluster) {
		lockdep_assert_held(&cluster->lock);
		root = &cluster->root;
	} else {
		root = &ctl->free_space_offset;
	}

	p = &root->rb_node;

	while (*p) {
		struct btrfs_free_space *info;

		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (new_entry->offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (new_entry->offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset. If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap. So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically. If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
			if (new_entry->bitmap) {
				if (info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_right;
			} else {
				if (!info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_left;
			}
		}
	}

	rb_link_node(&new_entry->offset_index, parent, p);
	rb_insert_color(&new_entry->offset_index, root);

	return 0;
}

/*
 * This is a little subtle. We *only* have ->max_extent_size set if we actually
 * searched through the bitmap and figured out the largest ->max_extent_size,
 * otherwise it's 0. In the case that it's 0 we don't want to tell the
 * allocator the wrong thing, we want to use the actual real max_extent_size
 * we've found already if it's larger, or we want to use ->bytes.
 *
 * This matters because find_free_space() will skip entries whose ->bytes is
 * less than the required bytes. So if we didn't search down this bitmap, we
 * may pick some previous entry that has a smaller ->max_extent_size than we
 * have. For example, assume we have two entries, one that has
 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
 * call into find_free_space(), and return with max_extent_size == 4K, because
 * that first bitmap entry had ->max_extent_size set, but the second one did
 * not. If instead we returned 8K we'd come in searching for 8K, and find the
 * 8K contiguous range.
 *
 * Consider the other case, we have 2 8K chunks in that second entry and still
 * don't have ->max_extent_size set. We'll return 16K, and the next time the
 * allocator comes in it'll fully search our second bitmap, and this time it'll
 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
 * right allocation the next loop through.
 */
static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
{
	if (entry->bitmap && entry->max_extent_size)
		return entry->max_extent_size;
	return entry->bytes;
}

/*
 * We want the largest entry to be leftmost, so this is inverted from what you'd
 * normally expect.
 */
static bool entry_less(struct rb_node *node, const struct rb_node *parent)
{
	const struct btrfs_free_space *entry, *exist;

	entry = rb_entry(node, struct btrfs_free_space, bytes_index);
	exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
	return get_max_extent_size(exist) < get_max_extent_size(entry);
}

/*
 * searches the tree for the given offset.
 *
 * fuzzy - If this is set, then we are trying to make an allocation, and we just
 * want a section that has at least bytes size and comes at or after the given
 * offset.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_free_space_ctl *ctl,
		   u64 offset, int bitmap_only, int fuzzy)
{
	struct rb_node *n = ctl->free_space_offset.rb_node;
	struct btrfs_free_space *entry = NULL, *prev = NULL;

	lockdep_assert_held(&ctl->tree_lock);

	/* find entry that is closest to the 'offset' */
	while (n) {
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		prev = entry;

		if (offset < entry->offset)
			n = n->rb_left;
		else if (offset > entry->offset)
			n = n->rb_right;
		else
			break;

		entry = NULL;
	}

	if (bitmap_only) {
		if (!entry)
			return NULL;
		if (entry->bitmap)
			return entry;

		/*
		 * bitmap entry and extent entry may share same offset,
		 * in that case, bitmap entry comes after extent entry.
		 */
		n = rb_next(n);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->offset != offset)
			return NULL;

		WARN_ON(!entry->bitmap);
		return entry;
	} else if (entry) {
		if (entry->bitmap) {
			/*
			 * if previous extent entry covers the offset,
			 * we should return it instead of the bitmap entry
			 */
			n = rb_prev(&entry->offset_index);
			if (n) {
				prev = rb_entry(n, struct btrfs_free_space,
						offset_index);
				if (!prev->bitmap &&
				    prev->offset + prev->bytes > offset)
					entry = prev;
			}
		}
		return entry;
	}

	if (!prev)
		return NULL;

	/* find last entry before the 'offset' */
	entry = prev;
	if (entry->offset > offset) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			entry = rb_entry(n, struct btrfs_free_space,
					 offset_index);
			ASSERT(entry->offset <= offset);
		} else {
			if (fuzzy)
				return entry;
			else
				return NULL;
		}
	}

	if (entry->bitmap) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			prev = rb_entry(n, struct btrfs_free_space,
					offset_index);
			if (!prev->bitmap &&
			    prev->offset + prev->bytes > offset)
				return prev;
		}
		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
			return entry;
	} else if (entry->offset + entry->bytes > offset)
		return entry;

	if (!fuzzy)
		return NULL;

	while (1) {
		n = rb_next(&entry->offset_index);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->bitmap) {
			if (entry->offset + BITS_PER_BITMAP *
			    ctl->unit > offset)
				break;
		} else {
			if (entry->offset + entry->bytes > offset)
				break;
		}
	}
	return entry;
}

static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     bool update_stat)
{
	lockdep_assert_held(&ctl->tree_lock);

	rb_erase(&info->offset_index, &ctl->free_space_offset);
	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
	ctl->free_extents--;

	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR]--;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
	}

	if (update_stat)
		ctl->free_space -= info->bytes;
}

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info)
{
	int ret = 0;

	lockdep_assert_held(&ctl->tree_lock);

	ASSERT(info->bytes || info->bitmap);
	ret = tree_insert_offset(ctl, NULL, info);
	if (ret)
		return ret;

	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);

	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR]++;
		ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
	}

	ctl->free_space += info->bytes;
	ctl->free_extents++;
	return ret;
}

static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
				struct btrfs_free_space *info)
{
	ASSERT(info->bitmap);

	/*
	 * If our entry is empty it's because we're on a cluster and we don't
	 * want to re-link it into our ctl bytes index.
	 */
	if (RB_EMPTY_NODE(&info->bytes_index))
		return;

	lockdep_assert_held(&ctl->tree_lock);

	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
}

static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     u64 offset, u64 bytes, bool update_stat)
{
	unsigned long start, count, end;
	int extent_delta = -1;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);

	bitmap_clear(info->bitmap, start, count);

	info->bytes -= bytes;
	if (info->max_extent_size > ctl->unit)
		info->max_extent_size = 0;

	relink_bitmap_entry(ctl, info);

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta++;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta++;

	info->bitmap_extents += extent_delta;
	if (!btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
	}

	if (update_stat)
		ctl->free_space -= bytes;
}

static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
				  struct btrfs_free_space *info, u64 offset,
				  u64 bytes)
{
	unsigned long start, count, end;
	int extent_delta = 1;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);

	bitmap_set(info->bitmap, start, count);

	/*
	 * We set some bytes, we have no idea what the max extent size is
	 * anymore.
	 */
	info->max_extent_size = 0;
	info->bytes += bytes;
	ctl->free_space += bytes;

	relink_bitmap_entry(ctl, info);

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta--;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta--;

	info->bitmap_extents += extent_delta;
	if (!btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
		ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
	}
}

/*
 * If we can not find suitable extent, we will use bytes to record
 * the size of the max extent.
 */
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc)
{
	unsigned long found_bits = 0;
	unsigned long max_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;
	unsigned long extent_bits;

	/*
	 * Skip searching the bitmap if we don't have a contiguous section that
	 * is large enough for this allocation.
1972 */ 1973 if (for_alloc && 1974 bitmap_info->max_extent_size && 1975 bitmap_info->max_extent_size < *bytes) { 1976 *bytes = bitmap_info->max_extent_size; 1977 return -1; 1978 } 1979 1980 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1981 max_t(u64, *offset, bitmap_info->offset)); 1982 bits = bytes_to_bits(*bytes, ctl->unit); 1983 1984 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1985 if (for_alloc && bits == 1) { 1986 found_bits = 1; 1987 break; 1988 } 1989 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1990 BITS_PER_BITMAP, i); 1991 extent_bits = next_zero - i; 1992 if (extent_bits >= bits) { 1993 found_bits = extent_bits; 1994 break; 1995 } else if (extent_bits > max_bits) { 1996 max_bits = extent_bits; 1997 } 1998 i = next_zero; 1999 } 2000 2001 if (found_bits) { 2002 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 2003 *bytes = (u64)(found_bits) * ctl->unit; 2004 return 0; 2005 } 2006 2007 *bytes = (u64)(max_bits) * ctl->unit; 2008 bitmap_info->max_extent_size = *bytes; 2009 relink_bitmap_entry(ctl, bitmap_info); 2010 return -1; 2011 } 2012 2013 /* Cache the size of the max extent in bytes */ 2014 static struct btrfs_free_space * 2015 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2016 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2017 { 2018 struct btrfs_free_space *entry; 2019 struct rb_node *node; 2020 u64 tmp; 2021 u64 align_off; 2022 int ret; 2023 2024 if (!ctl->free_space_offset.rb_node) 2025 goto out; 2026 again: 2027 if (use_bytes_index) { 2028 node = rb_first_cached(&ctl->free_space_bytes); 2029 } else { 2030 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2031 0, 1); 2032 if (!entry) 2033 goto out; 2034 node = &entry->offset_index; 2035 } 2036 2037 for (; node; node = rb_next(node)) { 2038 if (use_bytes_index) 2039 entry = rb_entry(node, struct btrfs_free_space, 2040 bytes_index); 2041 else 2042 entry = rb_entry(node, struct btrfs_free_space, 2043 offset_index); 2044 2045 /* 2046 * If we are using the bytes index then all subsequent entries 2047 * in this tree are going to be < bytes, so simply set the max 2048 * extent size and exit the loop. 2049 * 2050 * If we're using the offset index then we need to keep going 2051 * through the rest of the tree. 2052 */ 2053 if (entry->bytes < *bytes) { 2054 *max_extent_size = max(get_max_extent_size(entry), 2055 *max_extent_size); 2056 if (use_bytes_index) 2057 break; 2058 continue; 2059 } 2060 2061 /* make sure the space returned is big enough 2062 * to match our requested alignment 2063 */ 2064 if (*bytes >= align) { 2065 tmp = entry->offset - ctl->start + align - 1; 2066 tmp = div64_u64(tmp, align); 2067 tmp = tmp * align + ctl->start; 2068 align_off = tmp - entry->offset; 2069 } else { 2070 align_off = 0; 2071 tmp = entry->offset; 2072 } 2073 2074 /* 2075 * We don't break here if we're using the bytes index because we 2076 * may have another entry that has the correct alignment that is 2077 * the right size, so we don't want to miss that possibility. 2078 * At worst this adds another loop through the logic, but if we 2079 * broke here we could prematurely ENOSPC. 
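 * Concretely (illustrative numbers): with ctl->start = 0, align = 64K and
 * entry->offset = 1M + 4K, the rounded up candidate is 1M + 64K, i.e.
 * align_off = 60K, so this entry must hold *bytes + 60K. A same-sized but
 * already aligned entry later in the bytes index would still satisfy the
 * request, hence "continue" rather than "break".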
2080 */ 2081 if (entry->bytes < *bytes + align_off) { 2082 *max_extent_size = max(get_max_extent_size(entry), 2083 *max_extent_size); 2084 continue; 2085 } 2086 2087 if (entry->bitmap) { 2088 struct rb_node *old_next = rb_next(node); 2089 u64 size = *bytes; 2090 2091 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2092 if (!ret) { 2093 *offset = tmp; 2094 *bytes = size; 2095 return entry; 2096 } else { 2097 *max_extent_size = 2098 max(get_max_extent_size(entry), 2099 *max_extent_size); 2100 } 2101 2102 /* 2103 * The bitmap may have gotten re-arranged in the space 2104 * index here because the max_extent_size may have been 2105 * updated. Start from the beginning again if this 2106 * happened. 2107 */ 2108 if (use_bytes_index && old_next != rb_next(node)) 2109 goto again; 2110 continue; 2111 } 2112 2113 *offset = tmp; 2114 *bytes = entry->bytes - align_off; 2115 return entry; 2116 } 2117 out: 2118 return NULL; 2119 } 2120 2121 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2122 struct btrfs_free_space *info, u64 offset) 2123 { 2124 info->offset = offset_to_bitmap(ctl, offset); 2125 info->bytes = 0; 2126 info->bitmap_extents = 0; 2127 INIT_LIST_HEAD(&info->list); 2128 link_free_space(ctl, info); 2129 ctl->total_bitmaps++; 2130 recalculate_thresholds(ctl); 2131 } 2132 2133 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2134 struct btrfs_free_space *bitmap_info) 2135 { 2136 /* 2137 * Normally when this is called, the bitmap is completely empty. However, 2138 * if we are blowing up the free space cache for one reason or another 2139 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2140 * we may leave stats on the table. 2141 */ 2142 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2143 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2144 bitmap_info->bitmap_extents; 2145 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2146 2147 } 2148 unlink_free_space(ctl, bitmap_info, true); 2149 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2150 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2151 ctl->total_bitmaps--; 2152 recalculate_thresholds(ctl); 2153 } 2154 2155 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2156 struct btrfs_free_space *bitmap_info, 2157 u64 *offset, u64 *bytes) 2158 { 2159 u64 end; 2160 u64 search_start, search_bytes; 2161 int ret; 2162 2163 again: 2164 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 2165 2166 /* 2167 * We need to search for bits in this bitmap. We could only cover some 2168 * of the extent in this bitmap thanks to how we add space, so we need 2169 * to search for as much as it as we can and clear that amount, and then 2170 * go searching for the next bit. 
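 * A single removal can therefore visit several entries: when the range
 * spills past this bitmap we advance *offset, shrink *bytes, hop to the
 * next entry via rb_next() and loop, and if that next entry turns out to
 * be an extent entry we return -EAGAIN so the caller restarts via the
 * extent removal path.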
2171 */ 2172 search_start = *offset; 2173 search_bytes = ctl->unit; 2174 search_bytes = min(search_bytes, end - search_start + 1); 2175 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2176 false); 2177 if (ret < 0 || search_start != *offset) 2178 return -EINVAL; 2179 2180 /* We may have found more bits than what we need */ 2181 search_bytes = min(search_bytes, *bytes); 2182 2183 /* Cannot clear past the end of the bitmap */ 2184 search_bytes = min(search_bytes, end - search_start + 1); 2185 2186 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2187 *offset += search_bytes; 2188 *bytes -= search_bytes; 2189 2190 if (*bytes) { 2191 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2192 if (!bitmap_info->bytes) 2193 free_bitmap(ctl, bitmap_info); 2194 2195 /* 2196 * no entry after this bitmap, but we still have bytes to 2197 * remove, so something has gone wrong. 2198 */ 2199 if (!next) 2200 return -EINVAL; 2201 2202 bitmap_info = rb_entry(next, struct btrfs_free_space, 2203 offset_index); 2204 2205 /* 2206 * if the next entry isn't a bitmap we need to return to let the 2207 * extent stuff do its work. 2208 */ 2209 if (!bitmap_info->bitmap) 2210 return -EAGAIN; 2211 2212 /* 2213 * Ok the next item is a bitmap, but it may not actually hold 2214 * the information for the rest of this free space stuff, so 2215 * look for it, and if we don't find it return so we can try 2216 * everything over again. 2217 */ 2218 search_start = *offset; 2219 search_bytes = ctl->unit; 2220 ret = search_bitmap(ctl, bitmap_info, &search_start, 2221 &search_bytes, false); 2222 if (ret < 0 || search_start != *offset) 2223 return -EAGAIN; 2224 2225 goto again; 2226 } else if (!bitmap_info->bytes) 2227 free_bitmap(ctl, bitmap_info); 2228 2229 return 0; 2230 } 2231 2232 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2233 struct btrfs_free_space *info, u64 offset, 2234 u64 bytes, enum btrfs_trim_state trim_state) 2235 { 2236 u64 bytes_to_set = 0; 2237 u64 end; 2238 2239 /* 2240 * This is a tradeoff to make bitmap trim state minimal. We mark the 2241 * whole bitmap untrimmed if at any point we add untrimmed regions. 2242 */ 2243 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2244 if (btrfs_free_space_trimmed(info)) { 2245 ctl->discardable_extents[BTRFS_STAT_CURR] += 2246 info->bitmap_extents; 2247 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2248 } 2249 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2250 } 2251 2252 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2253 2254 bytes_to_set = min(end - offset, bytes); 2255 2256 btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set); 2257 2258 return bytes_to_set; 2259 2260 } 2261 2262 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2263 struct btrfs_free_space *info) 2264 { 2265 struct btrfs_block_group *block_group = ctl->block_group; 2266 struct btrfs_fs_info *fs_info = block_group->fs_info; 2267 bool forced = false; 2268 2269 #ifdef CONFIG_BTRFS_DEBUG 2270 if (btrfs_should_fragment_free_space(block_group)) 2271 forced = true; 2272 #endif 2273 2274 /* This is a way to reclaim large regions from the bitmaps. 
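 * Unless fragmentation is being forced for debugging, frees at or above
 * FORCE_EXTENT_THRESHOLD are kept as extent entries even when the bitmap
 * path would otherwise be preferred, so large contiguous regions stay
 * cheap to search and allocate from.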
*/
2275 	if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2276 		return false;
2277 
2278 	/*
2279 	 * If we are below the extents threshold then we can add this as an
2280 	 * extent, and don't have to deal with the bitmap
2281 	 */
2282 	if (!forced && ctl->free_extents < ctl->extents_thresh) {
2283 		/*
2284 		 * If this block group has some small extents we don't want to
2285 		 * use up all of our free slots in the cache with them, we want
2286 		 * to reserve them for larger extents, however if we have plenty
2287 		 * of cache left then go ahead and add them, no sense in adding
2288 		 * the overhead of a bitmap if we don't have to.
2289 		 */
2290 		if (info->bytes <= fs_info->sectorsize * 8) {
2291 			if (ctl->free_extents * 3 <= ctl->extents_thresh)
2292 				return false;
2293 		} else {
2294 			return false;
2295 		}
2296 	}
2297 
2298 	/*
2299 	 * The original block groups from mkfs can be really small, like 8
2300 	 * megabytes, so don't bother with a bitmap for those entries. However
2301 	 * some block groups can be smaller than what a bitmap would cover but
2302 	 * are still large enough that they could overflow the 32k memory limit,
2303 	 * so allow those block groups to still have a bitmap
2304 	 * entry.
2305 	 */
2306 	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2307 		return false;
2308 
2309 	return true;
2310 }
2311 
2312 static const struct btrfs_free_space_op free_space_op = {
2313 	.use_bitmap = use_bitmap,
2314 };
2315 
2316 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2317 			      struct btrfs_free_space *info)
2318 {
2319 	struct btrfs_free_space *bitmap_info;
2320 	struct btrfs_block_group *block_group = NULL;
2321 	int added = 0;
2322 	u64 bytes, offset, bytes_added;
2323 	enum btrfs_trim_state trim_state;
2324 	int ret;
2325 
2326 	bytes = info->bytes;
2327 	offset = info->offset;
2328 	trim_state = info->trim_state;
2329 
2330 	if (!ctl->op->use_bitmap(ctl, info))
2331 		return 0;
2332 
2333 	if (ctl->op == &free_space_op)
2334 		block_group = ctl->block_group;
2335 again:
2336 	/*
2337 	 * Since we link bitmaps right into the cluster we need to see if we
2338 	 * have a cluster here, and if so and it has our bitmap we need to add
2339 	 * the free space to that bitmap.
2340 */ 2341 if (block_group && !list_empty(&block_group->cluster_list)) { 2342 struct btrfs_free_cluster *cluster; 2343 struct rb_node *node; 2344 struct btrfs_free_space *entry; 2345 2346 cluster = list_entry(block_group->cluster_list.next, 2347 struct btrfs_free_cluster, 2348 block_group_list); 2349 spin_lock(&cluster->lock); 2350 node = rb_first(&cluster->root); 2351 if (!node) { 2352 spin_unlock(&cluster->lock); 2353 goto no_cluster_bitmap; 2354 } 2355 2356 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2357 if (!entry->bitmap) { 2358 spin_unlock(&cluster->lock); 2359 goto no_cluster_bitmap; 2360 } 2361 2362 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2363 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2364 bytes, trim_state); 2365 bytes -= bytes_added; 2366 offset += bytes_added; 2367 } 2368 spin_unlock(&cluster->lock); 2369 if (!bytes) { 2370 ret = 1; 2371 goto out; 2372 } 2373 } 2374 2375 no_cluster_bitmap: 2376 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2377 1, 0); 2378 if (!bitmap_info) { 2379 ASSERT(added == 0); 2380 goto new_bitmap; 2381 } 2382 2383 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2384 trim_state); 2385 bytes -= bytes_added; 2386 offset += bytes_added; 2387 added = 0; 2388 2389 if (!bytes) { 2390 ret = 1; 2391 goto out; 2392 } else 2393 goto again; 2394 2395 new_bitmap: 2396 if (info && info->bitmap) { 2397 add_new_bitmap(ctl, info, offset); 2398 added = 1; 2399 info = NULL; 2400 goto again; 2401 } else { 2402 spin_unlock(&ctl->tree_lock); 2403 2404 /* no pre-allocated info, allocate a new one */ 2405 if (!info) { 2406 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2407 GFP_NOFS); 2408 if (!info) { 2409 spin_lock(&ctl->tree_lock); 2410 ret = -ENOMEM; 2411 goto out; 2412 } 2413 } 2414 2415 /* allocate the bitmap */ 2416 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2417 GFP_NOFS); 2418 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2419 spin_lock(&ctl->tree_lock); 2420 if (!info->bitmap) { 2421 ret = -ENOMEM; 2422 goto out; 2423 } 2424 goto again; 2425 } 2426 2427 out: 2428 if (info) { 2429 if (info->bitmap) 2430 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2431 info->bitmap); 2432 kmem_cache_free(btrfs_free_space_cachep, info); 2433 } 2434 2435 return ret; 2436 } 2437 2438 /* 2439 * Free space merging rules: 2440 * 1) Merge trimmed areas together 2441 * 2) Let untrimmed areas coalesce with trimmed areas 2442 * 3) Always pull neighboring regions from bitmaps 2443 * 2444 * The above rules are for when we merge free space based on btrfs_trim_state. 2445 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2446 * same reason: to promote larger extent regions which makes life easier for 2447 * find_free_extent(). Rule 2 enables coalescing based on the common path 2448 * being returning free space from btrfs_finish_extent_commit(). So when free 2449 * space is trimmed, it will prevent aggregating trimmed new region and 2450 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2451 * and provide find_free_extent() with the largest extents possible hoping for 2452 * the reuse path. 
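 * Example of rule 2 versus its inverse: freeing an untrimmed region right
 * after a trimmed extent merges them into a single (now untrimmed) entry,
 * while freeing an already trimmed region next to an untrimmed extent
 * leaves two entries, because merging would silently promote the
 * untrimmed bytes to trimmed.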
2453 */ 2454 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2455 struct btrfs_free_space *info, bool update_stat) 2456 { 2457 struct btrfs_free_space *left_info = NULL; 2458 struct btrfs_free_space *right_info; 2459 bool merged = false; 2460 u64 offset = info->offset; 2461 u64 bytes = info->bytes; 2462 const bool is_trimmed = btrfs_free_space_trimmed(info); 2463 struct rb_node *right_prev = NULL; 2464 2465 /* 2466 * first we want to see if there is free space adjacent to the range we 2467 * are adding, if there is remove that struct and add a new one to 2468 * cover the entire range 2469 */ 2470 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2471 if (right_info) 2472 right_prev = rb_prev(&right_info->offset_index); 2473 2474 if (right_prev) 2475 left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); 2476 else if (!right_info) 2477 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2478 2479 /* See try_merge_free_space() comment. */ 2480 if (right_info && !right_info->bitmap && 2481 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2482 unlink_free_space(ctl, right_info, update_stat); 2483 info->bytes += right_info->bytes; 2484 kmem_cache_free(btrfs_free_space_cachep, right_info); 2485 merged = true; 2486 } 2487 2488 /* See try_merge_free_space() comment. */ 2489 if (left_info && !left_info->bitmap && 2490 left_info->offset + left_info->bytes == offset && 2491 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2492 unlink_free_space(ctl, left_info, update_stat); 2493 info->offset = left_info->offset; 2494 info->bytes += left_info->bytes; 2495 kmem_cache_free(btrfs_free_space_cachep, left_info); 2496 merged = true; 2497 } 2498 2499 return merged; 2500 } 2501 2502 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2503 struct btrfs_free_space *info, 2504 bool update_stat) 2505 { 2506 struct btrfs_free_space *bitmap; 2507 unsigned long i; 2508 unsigned long j; 2509 const u64 end = info->offset + info->bytes; 2510 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2511 u64 bytes; 2512 2513 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2514 if (!bitmap) 2515 return false; 2516 2517 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2518 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2519 if (j == i) 2520 return false; 2521 bytes = (j - i) * ctl->unit; 2522 info->bytes += bytes; 2523 2524 /* See try_merge_free_space() comment. */ 2525 if (!btrfs_free_space_trimmed(bitmap)) 2526 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2527 2528 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2529 2530 if (!bitmap->bytes) 2531 free_bitmap(ctl, bitmap); 2532 2533 return true; 2534 } 2535 2536 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2537 struct btrfs_free_space *info, 2538 bool update_stat) 2539 { 2540 struct btrfs_free_space *bitmap; 2541 u64 bitmap_offset; 2542 unsigned long i; 2543 unsigned long j; 2544 unsigned long prev_j; 2545 u64 bytes; 2546 2547 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2548 /* If we're on a boundary, try the previous logical bitmap. 
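 * (offset_to_bitmap() only equals info->offset when the extent starts
 * exactly on a bitmap boundary; in that case the bits immediately in
 * front of it belong to the bitmap covering info->offset - 1.)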
*/ 2549 if (bitmap_offset == info->offset) { 2550 if (info->offset == 0) 2551 return false; 2552 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2553 } 2554 2555 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2556 if (!bitmap) 2557 return false; 2558 2559 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2560 j = 0; 2561 prev_j = (unsigned long)-1; 2562 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2563 if (j > i) 2564 break; 2565 prev_j = j; 2566 } 2567 if (prev_j == i) 2568 return false; 2569 2570 if (prev_j == (unsigned long)-1) 2571 bytes = (i + 1) * ctl->unit; 2572 else 2573 bytes = (i - prev_j) * ctl->unit; 2574 2575 info->offset -= bytes; 2576 info->bytes += bytes; 2577 2578 /* See try_merge_free_space() comment. */ 2579 if (!btrfs_free_space_trimmed(bitmap)) 2580 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2581 2582 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2583 2584 if (!bitmap->bytes) 2585 free_bitmap(ctl, bitmap); 2586 2587 return true; 2588 } 2589 2590 /* 2591 * We prefer always to allocate from extent entries, both for clustered and 2592 * non-clustered allocation requests. So when attempting to add a new extent 2593 * entry, try to see if there's adjacent free space in bitmap entries, and if 2594 * there is, migrate that space from the bitmaps to the extent. 2595 * Like this we get better chances of satisfying space allocation requests 2596 * because we attempt to satisfy them based on a single cache entry, and never 2597 * on 2 or more entries - even if the entries represent a contiguous free space 2598 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2599 * ends). 2600 */ 2601 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2602 struct btrfs_free_space *info, 2603 bool update_stat) 2604 { 2605 /* 2606 * Only work with disconnected entries, as we can change their offset, 2607 * and must be extent entries. 
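 * ("Disconnected" because info->offset is the key of the offset rb-tree
 * and may only be rewritten while the entry is unlinked, which the
 * RB_EMPTY_NODE() assertion below enforces.)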
2608 */ 2609 ASSERT(!info->bitmap); 2610 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2611 2612 if (ctl->total_bitmaps > 0) { 2613 bool stole_end; 2614 bool stole_front = false; 2615 2616 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2617 if (ctl->total_bitmaps > 0) 2618 stole_front = steal_from_bitmap_to_front(ctl, info, 2619 update_stat); 2620 2621 if (stole_end || stole_front) 2622 try_merge_free_space(ctl, info, update_stat); 2623 } 2624 } 2625 2626 static int __btrfs_add_free_space(struct btrfs_block_group *block_group, 2627 u64 offset, u64 bytes, 2628 enum btrfs_trim_state trim_state) 2629 { 2630 struct btrfs_fs_info *fs_info = block_group->fs_info; 2631 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2632 struct btrfs_free_space *info; 2633 int ret = 0; 2634 u64 filter_bytes = bytes; 2635 2636 ASSERT(!btrfs_is_zoned(fs_info)); 2637 2638 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2639 if (!info) 2640 return -ENOMEM; 2641 2642 info->offset = offset; 2643 info->bytes = bytes; 2644 info->trim_state = trim_state; 2645 RB_CLEAR_NODE(&info->offset_index); 2646 RB_CLEAR_NODE(&info->bytes_index); 2647 2648 spin_lock(&ctl->tree_lock); 2649 2650 if (try_merge_free_space(ctl, info, true)) 2651 goto link; 2652 2653 /* 2654 * There was no extent directly to the left or right of this new 2655 * extent then we know we're going to have to allocate a new extent, so 2656 * before we do that see if we need to drop this into a bitmap 2657 */ 2658 ret = insert_into_bitmap(ctl, info); 2659 if (ret < 0) { 2660 goto out; 2661 } else if (ret) { 2662 ret = 0; 2663 goto out; 2664 } 2665 link: 2666 /* 2667 * Only steal free space from adjacent bitmaps if we're sure we're not 2668 * going to add the new free space to existing bitmap entries - because 2669 * that would mean unnecessary work that would be reverted. Therefore 2670 * attempt to steal space from bitmaps if we're adding an extent entry. 
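 * So the overall order in this function is: try_merge_free_space() first,
 * then insert_into_bitmap() (which may consume @info entirely), and only
 * when we fall through to link an extent entry do we steal adjacent
 * bitmap bits before link_free_space().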
2671 */ 2672 steal_from_bitmap(ctl, info, true); 2673 2674 filter_bytes = max(filter_bytes, info->bytes); 2675 2676 ret = link_free_space(ctl, info); 2677 if (ret) 2678 kmem_cache_free(btrfs_free_space_cachep, info); 2679 out: 2680 btrfs_discard_update_discardable(block_group); 2681 spin_unlock(&ctl->tree_lock); 2682 2683 if (ret) { 2684 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2685 ASSERT(ret != -EEXIST); 2686 } 2687 2688 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2689 btrfs_discard_check_filter(block_group, filter_bytes); 2690 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2691 } 2692 2693 return ret; 2694 } 2695 2696 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2697 u64 bytenr, u64 size, bool used) 2698 { 2699 struct btrfs_space_info *sinfo = block_group->space_info; 2700 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2701 u64 offset = bytenr - block_group->start; 2702 u64 to_free, to_unusable; 2703 int bg_reclaim_threshold = 0; 2704 bool initial; 2705 u64 reclaimable_unusable; 2706 2707 spin_lock(&block_group->lock); 2708 2709 initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); 2710 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2711 if (!initial) 2712 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2713 2714 if (!used) 2715 to_free = size; 2716 else if (initial) 2717 to_free = block_group->zone_capacity; 2718 else if (offset >= block_group->alloc_offset) 2719 to_free = size; 2720 else if (offset + size <= block_group->alloc_offset) 2721 to_free = 0; 2722 else 2723 to_free = offset + size - block_group->alloc_offset; 2724 to_unusable = size - to_free; 2725 2726 spin_lock(&ctl->tree_lock); 2727 ctl->free_space += to_free; 2728 spin_unlock(&ctl->tree_lock); 2729 /* 2730 * If the block group is read-only, we should account freed space into 2731 * bytes_readonly. 2732 */ 2733 if (!block_group->ro) { 2734 block_group->zone_unusable += to_unusable; 2735 WARN_ON(block_group->zone_unusable > block_group->length); 2736 } 2737 if (!used) { 2738 block_group->alloc_offset -= size; 2739 } 2740 2741 reclaimable_unusable = block_group->zone_unusable - 2742 (block_group->length - block_group->zone_capacity); 2743 /* All the region is now unusable. 
Mark it as unused and reclaim */ 2744 if (block_group->zone_unusable == block_group->length) { 2745 btrfs_mark_bg_unused(block_group); 2746 } else if (bg_reclaim_threshold && 2747 reclaimable_unusable >= 2748 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2749 btrfs_mark_bg_to_reclaim(block_group); 2750 } 2751 2752 spin_unlock(&block_group->lock); 2753 2754 return 0; 2755 } 2756 2757 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2758 u64 bytenr, u64 size) 2759 { 2760 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2761 2762 if (btrfs_is_zoned(block_group->fs_info)) 2763 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2764 true); 2765 2766 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2767 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2768 2769 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2770 } 2771 2772 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2773 u64 bytenr, u64 size) 2774 { 2775 if (btrfs_is_zoned(block_group->fs_info)) 2776 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2777 false); 2778 2779 return btrfs_add_free_space(block_group, bytenr, size); 2780 } 2781 2782 /* 2783 * This is a subtle distinction because when adding free space back in general, 2784 * we want it to be added as untrimmed for async. But in the case where we add 2785 * it on loading of a block group, we want to consider it trimmed. 2786 */ 2787 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2788 u64 bytenr, u64 size) 2789 { 2790 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2791 2792 if (btrfs_is_zoned(block_group->fs_info)) 2793 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2794 true); 2795 2796 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2797 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2798 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2799 2800 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2801 } 2802 2803 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2804 u64 offset, u64 bytes) 2805 { 2806 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2807 struct btrfs_free_space *info; 2808 int ret; 2809 bool re_search = false; 2810 2811 if (btrfs_is_zoned(block_group->fs_info)) { 2812 /* 2813 * This can happen with conventional zones when replaying log. 2814 * Since the allocation info of tree-log nodes are not recorded 2815 * to the extent-tree, calculate_alloc_pointer() failed to 2816 * advance the allocation pointer after last allocated tree log 2817 * node blocks. 2818 * 2819 * This function is called from 2820 * btrfs_pin_extent_for_log_replay() when replaying the log. 2821 * Advance the pointer not to overwrite the tree-log nodes. 
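 * For example, if alloc_offset currently points below the replayed range,
 * it is bumped to offset + bytes - block_group->start so that later
 * sequential allocations land past the tree-log blocks that were just
 * pinned.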
2822 */ 2823 if (block_group->start + block_group->alloc_offset < 2824 offset + bytes) { 2825 block_group->alloc_offset = 2826 offset + bytes - block_group->start; 2827 } 2828 return 0; 2829 } 2830 2831 spin_lock(&ctl->tree_lock); 2832 2833 again: 2834 ret = 0; 2835 if (!bytes) 2836 goto out_lock; 2837 2838 info = tree_search_offset(ctl, offset, 0, 0); 2839 if (!info) { 2840 /* 2841 * oops didn't find an extent that matched the space we wanted 2842 * to remove, look for a bitmap instead 2843 */ 2844 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2845 1, 0); 2846 if (!info) { 2847 /* 2848 * If we found a partial bit of our free space in a 2849 * bitmap but then couldn't find the other part this may 2850 * be a problem, so WARN about it. 2851 */ 2852 WARN_ON(re_search); 2853 goto out_lock; 2854 } 2855 } 2856 2857 re_search = false; 2858 if (!info->bitmap) { 2859 unlink_free_space(ctl, info, true); 2860 if (offset == info->offset) { 2861 u64 to_free = min(bytes, info->bytes); 2862 2863 info->bytes -= to_free; 2864 info->offset += to_free; 2865 if (info->bytes) { 2866 ret = link_free_space(ctl, info); 2867 WARN_ON(ret); 2868 } else { 2869 kmem_cache_free(btrfs_free_space_cachep, info); 2870 } 2871 2872 offset += to_free; 2873 bytes -= to_free; 2874 goto again; 2875 } else { 2876 u64 old_end = info->bytes + info->offset; 2877 2878 info->bytes = offset - info->offset; 2879 ret = link_free_space(ctl, info); 2880 WARN_ON(ret); 2881 if (ret) 2882 goto out_lock; 2883 2884 /* Not enough bytes in this entry to satisfy us */ 2885 if (old_end < offset + bytes) { 2886 bytes -= old_end - offset; 2887 offset = old_end; 2888 goto again; 2889 } else if (old_end == offset + bytes) { 2890 /* all done */ 2891 goto out_lock; 2892 } 2893 spin_unlock(&ctl->tree_lock); 2894 2895 ret = __btrfs_add_free_space(block_group, 2896 offset + bytes, 2897 old_end - (offset + bytes), 2898 info->trim_state); 2899 WARN_ON(ret); 2900 goto out; 2901 } 2902 } 2903 2904 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2905 if (ret == -EAGAIN) { 2906 re_search = true; 2907 goto again; 2908 } 2909 out_lock: 2910 btrfs_discard_update_discardable(block_group); 2911 spin_unlock(&ctl->tree_lock); 2912 out: 2913 return ret; 2914 } 2915 2916 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2917 u64 bytes) 2918 { 2919 struct btrfs_fs_info *fs_info = block_group->fs_info; 2920 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2921 struct btrfs_free_space *info; 2922 struct rb_node *n; 2923 int count = 0; 2924 2925 /* 2926 * Zoned btrfs does not use free space tree and cluster. Just print 2927 * out the free space after the allocation offset. 
2928 */ 2929 if (btrfs_is_zoned(fs_info)) { 2930 btrfs_info(fs_info, "free space %llu active %d", 2931 block_group->zone_capacity - block_group->alloc_offset, 2932 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2933 &block_group->runtime_flags)); 2934 return; 2935 } 2936 2937 spin_lock(&ctl->tree_lock); 2938 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2939 info = rb_entry(n, struct btrfs_free_space, offset_index); 2940 if (info->bytes >= bytes && !block_group->ro) 2941 count++; 2942 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2943 info->offset, info->bytes, str_yes_no(info->bitmap)); 2944 } 2945 spin_unlock(&ctl->tree_lock); 2946 btrfs_info(fs_info, "block group has cluster?: %s", 2947 str_no_yes(list_empty(&block_group->cluster_list))); 2948 btrfs_info(fs_info, 2949 "%d free space entries at or bigger than %llu bytes", 2950 count, bytes); 2951 } 2952 2953 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2954 struct btrfs_free_space_ctl *ctl) 2955 { 2956 struct btrfs_fs_info *fs_info = block_group->fs_info; 2957 2958 spin_lock_init(&ctl->tree_lock); 2959 ctl->unit = fs_info->sectorsize; 2960 ctl->start = block_group->start; 2961 ctl->block_group = block_group; 2962 ctl->op = &free_space_op; 2963 ctl->free_space_bytes = RB_ROOT_CACHED; 2964 INIT_LIST_HEAD(&ctl->trimming_ranges); 2965 mutex_init(&ctl->cache_writeout_mutex); 2966 2967 /* 2968 * we only want to have 32k of ram per block group for keeping 2969 * track of free space, and if we pass 1/2 of that we want to 2970 * start converting things over to using bitmaps 2971 */ 2972 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2973 } 2974 2975 /* 2976 * for a given cluster, put all of its extents back into the free 2977 * space cache. If the block group passed doesn't match the block group 2978 * pointed to by the cluster, someone else raced in and freed the 2979 * cluster already. 
In that case, we just return without changing anything 2980 */ 2981 static void __btrfs_return_cluster_to_free_space( 2982 struct btrfs_block_group *block_group, 2983 struct btrfs_free_cluster *cluster) 2984 { 2985 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2986 struct rb_node *node; 2987 2988 lockdep_assert_held(&ctl->tree_lock); 2989 2990 spin_lock(&cluster->lock); 2991 if (cluster->block_group != block_group) { 2992 spin_unlock(&cluster->lock); 2993 return; 2994 } 2995 2996 cluster->block_group = NULL; 2997 cluster->window_start = 0; 2998 list_del_init(&cluster->block_group_list); 2999 3000 node = rb_first(&cluster->root); 3001 while (node) { 3002 struct btrfs_free_space *entry; 3003 3004 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3005 node = rb_next(&entry->offset_index); 3006 rb_erase(&entry->offset_index, &cluster->root); 3007 RB_CLEAR_NODE(&entry->offset_index); 3008 3009 if (!entry->bitmap) { 3010 /* Merging treats extents as if they were new */ 3011 if (!btrfs_free_space_trimmed(entry)) { 3012 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3013 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 3014 entry->bytes; 3015 } 3016 3017 try_merge_free_space(ctl, entry, false); 3018 steal_from_bitmap(ctl, entry, false); 3019 3020 /* As we insert directly, update these statistics */ 3021 if (!btrfs_free_space_trimmed(entry)) { 3022 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3023 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3024 entry->bytes; 3025 } 3026 } 3027 tree_insert_offset(ctl, NULL, entry); 3028 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3029 entry_less); 3030 } 3031 cluster->root = RB_ROOT; 3032 spin_unlock(&cluster->lock); 3033 btrfs_put_block_group(block_group); 3034 } 3035 3036 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3037 { 3038 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3039 struct btrfs_free_cluster *cluster; 3040 struct list_head *head; 3041 3042 spin_lock(&ctl->tree_lock); 3043 while ((head = block_group->cluster_list.next) != 3044 &block_group->cluster_list) { 3045 cluster = list_entry(head, struct btrfs_free_cluster, 3046 block_group_list); 3047 3048 WARN_ON(cluster->block_group != block_group); 3049 __btrfs_return_cluster_to_free_space(block_group, cluster); 3050 3051 cond_resched_lock(&ctl->tree_lock); 3052 } 3053 __btrfs_remove_free_space_cache(ctl); 3054 btrfs_discard_update_discardable(block_group); 3055 spin_unlock(&ctl->tree_lock); 3056 3057 } 3058 3059 /* 3060 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 
3061 */ 3062 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3063 { 3064 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3065 struct btrfs_free_space *info; 3066 struct rb_node *node; 3067 bool ret = true; 3068 3069 spin_lock(&ctl->tree_lock); 3070 node = rb_first(&ctl->free_space_offset); 3071 3072 while (node) { 3073 info = rb_entry(node, struct btrfs_free_space, offset_index); 3074 3075 if (!btrfs_free_space_trimmed(info)) { 3076 ret = false; 3077 break; 3078 } 3079 3080 node = rb_next(node); 3081 } 3082 3083 spin_unlock(&ctl->tree_lock); 3084 return ret; 3085 } 3086 3087 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3088 u64 offset, u64 bytes, u64 empty_size, 3089 u64 *max_extent_size) 3090 { 3091 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3092 struct btrfs_discard_ctl *discard_ctl = 3093 &block_group->fs_info->discard_ctl; 3094 struct btrfs_free_space *entry = NULL; 3095 u64 bytes_search = bytes + empty_size; 3096 u64 ret = 0; 3097 u64 align_gap = 0; 3098 u64 align_gap_len = 0; 3099 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3100 bool use_bytes_index = (offset == block_group->start); 3101 3102 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3103 3104 spin_lock(&ctl->tree_lock); 3105 entry = find_free_space(ctl, &offset, &bytes_search, 3106 block_group->full_stripe_len, max_extent_size, 3107 use_bytes_index); 3108 if (!entry) 3109 goto out; 3110 3111 ret = offset; 3112 if (entry->bitmap) { 3113 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3114 3115 if (!btrfs_free_space_trimmed(entry)) 3116 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3117 3118 if (!entry->bytes) 3119 free_bitmap(ctl, entry); 3120 } else { 3121 unlink_free_space(ctl, entry, true); 3122 align_gap_len = offset - entry->offset; 3123 align_gap = entry->offset; 3124 align_gap_trim_state = entry->trim_state; 3125 3126 if (!btrfs_free_space_trimmed(entry)) 3127 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3128 3129 entry->offset = offset + bytes; 3130 WARN_ON(entry->bytes < bytes + align_gap_len); 3131 3132 entry->bytes -= bytes + align_gap_len; 3133 if (!entry->bytes) 3134 kmem_cache_free(btrfs_free_space_cachep, entry); 3135 else 3136 link_free_space(ctl, entry); 3137 } 3138 out: 3139 btrfs_discard_update_discardable(block_group); 3140 spin_unlock(&ctl->tree_lock); 3141 3142 if (align_gap_len) 3143 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3144 align_gap_trim_state); 3145 return ret; 3146 } 3147 3148 /* 3149 * given a cluster, put all of its extents back into the free space 3150 * cache. If a block group is passed, this function will only free 3151 * a cluster that belongs to the passed block group. 3152 * 3153 * Otherwise, it'll get a reference on the block group pointed to by the 3154 * cluster and remove the cluster from it. 
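 * cluster->lock is taken first only to resolve and pin that block group
 * and is dropped again; the actual give-back then runs under
 * ctl->tree_lock (which __btrfs_return_cluster_to_free_space() asserts),
 * re-taking cluster->lock nested inside it.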
3155 */ 3156 void btrfs_return_cluster_to_free_space( 3157 struct btrfs_block_group *block_group, 3158 struct btrfs_free_cluster *cluster) 3159 { 3160 struct btrfs_free_space_ctl *ctl; 3161 3162 /* first, get a safe pointer to the block group */ 3163 spin_lock(&cluster->lock); 3164 if (!block_group) { 3165 block_group = cluster->block_group; 3166 if (!block_group) { 3167 spin_unlock(&cluster->lock); 3168 return; 3169 } 3170 } else if (cluster->block_group != block_group) { 3171 /* someone else has already freed it don't redo their work */ 3172 spin_unlock(&cluster->lock); 3173 return; 3174 } 3175 btrfs_get_block_group(block_group); 3176 spin_unlock(&cluster->lock); 3177 3178 ctl = block_group->free_space_ctl; 3179 3180 /* now return any extents the cluster had on it */ 3181 spin_lock(&ctl->tree_lock); 3182 __btrfs_return_cluster_to_free_space(block_group, cluster); 3183 spin_unlock(&ctl->tree_lock); 3184 3185 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3186 3187 /* finally drop our ref */ 3188 btrfs_put_block_group(block_group); 3189 } 3190 3191 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3192 struct btrfs_free_cluster *cluster, 3193 struct btrfs_free_space *entry, 3194 u64 bytes, u64 min_start, 3195 u64 *max_extent_size) 3196 { 3197 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3198 int err; 3199 u64 search_start = cluster->window_start; 3200 u64 search_bytes = bytes; 3201 u64 ret = 0; 3202 3203 search_start = min_start; 3204 search_bytes = bytes; 3205 3206 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3207 if (err) { 3208 *max_extent_size = max(get_max_extent_size(entry), 3209 *max_extent_size); 3210 return 0; 3211 } 3212 3213 ret = search_start; 3214 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3215 3216 return ret; 3217 } 3218 3219 /* 3220 * given a cluster, try to allocate 'bytes' from it, returns 0 3221 * if it couldn't find anything suitably large, or a logical disk offset 3222 * if things worked out 3223 */ 3224 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3225 struct btrfs_free_cluster *cluster, u64 bytes, 3226 u64 min_start, u64 *max_extent_size) 3227 { 3228 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3229 struct btrfs_discard_ctl *discard_ctl = 3230 &block_group->fs_info->discard_ctl; 3231 struct btrfs_free_space *entry = NULL; 3232 struct rb_node *node; 3233 u64 ret = 0; 3234 3235 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3236 3237 spin_lock(&cluster->lock); 3238 if (bytes > cluster->max_size) 3239 goto out; 3240 3241 if (cluster->block_group != block_group) 3242 goto out; 3243 3244 node = rb_first(&cluster->root); 3245 if (!node) 3246 goto out; 3247 3248 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3249 while (1) { 3250 if (entry->bytes < bytes) 3251 *max_extent_size = max(get_max_extent_size(entry), 3252 *max_extent_size); 3253 3254 if (entry->bytes < bytes || 3255 (!entry->bitmap && entry->offset < min_start)) { 3256 node = rb_next(&entry->offset_index); 3257 if (!node) 3258 break; 3259 entry = rb_entry(node, struct btrfs_free_space, 3260 offset_index); 3261 continue; 3262 } 3263 3264 if (entry->bitmap) { 3265 ret = btrfs_alloc_from_bitmap(block_group, 3266 cluster, entry, bytes, 3267 cluster->window_start, 3268 max_extent_size); 3269 if (ret == 0) { 3270 node = rb_next(&entry->offset_index); 3271 if (!node) 3272 break; 3273 entry = rb_entry(node, struct btrfs_free_space, 3274 offset_index); 3275 
continue; 3276 } 3277 cluster->window_start += bytes; 3278 } else { 3279 ret = entry->offset; 3280 3281 entry->offset += bytes; 3282 entry->bytes -= bytes; 3283 } 3284 3285 break; 3286 } 3287 out: 3288 spin_unlock(&cluster->lock); 3289 3290 if (!ret) 3291 return 0; 3292 3293 spin_lock(&ctl->tree_lock); 3294 3295 if (!btrfs_free_space_trimmed(entry)) 3296 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3297 3298 ctl->free_space -= bytes; 3299 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3300 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3301 3302 spin_lock(&cluster->lock); 3303 if (entry->bytes == 0) { 3304 rb_erase(&entry->offset_index, &cluster->root); 3305 ctl->free_extents--; 3306 if (entry->bitmap) { 3307 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3308 entry->bitmap); 3309 ctl->total_bitmaps--; 3310 recalculate_thresholds(ctl); 3311 } else if (!btrfs_free_space_trimmed(entry)) { 3312 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3313 } 3314 kmem_cache_free(btrfs_free_space_cachep, entry); 3315 } 3316 3317 spin_unlock(&cluster->lock); 3318 spin_unlock(&ctl->tree_lock); 3319 3320 return ret; 3321 } 3322 3323 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3324 struct btrfs_free_space *entry, 3325 struct btrfs_free_cluster *cluster, 3326 u64 offset, u64 bytes, 3327 u64 cont1_bytes, u64 min_bytes) 3328 { 3329 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3330 unsigned long next_zero; 3331 unsigned long i; 3332 unsigned long want_bits; 3333 unsigned long min_bits; 3334 unsigned long found_bits; 3335 unsigned long max_bits = 0; 3336 unsigned long start = 0; 3337 unsigned long total_found = 0; 3338 int ret; 3339 3340 lockdep_assert_held(&ctl->tree_lock); 3341 3342 i = offset_to_bit(entry->offset, ctl->unit, 3343 max_t(u64, offset, entry->offset)); 3344 want_bits = bytes_to_bits(bytes, ctl->unit); 3345 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3346 3347 /* 3348 * Don't bother looking for a cluster in this bitmap if it's heavily 3349 * fragmented. 3350 */ 3351 if (entry->max_extent_size && 3352 entry->max_extent_size < cont1_bytes) 3353 return -ENOSPC; 3354 again: 3355 found_bits = 0; 3356 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3357 next_zero = find_next_zero_bit(entry->bitmap, 3358 BITS_PER_BITMAP, i); 3359 if (next_zero - i >= min_bits) { 3360 found_bits = next_zero - i; 3361 if (found_bits > max_bits) 3362 max_bits = found_bits; 3363 break; 3364 } 3365 if (next_zero - i > max_bits) 3366 max_bits = next_zero - i; 3367 i = next_zero; 3368 } 3369 3370 if (!found_bits) { 3371 entry->max_extent_size = (u64)max_bits * ctl->unit; 3372 return -ENOSPC; 3373 } 3374 3375 if (!total_found) { 3376 start = i; 3377 cluster->max_size = 0; 3378 } 3379 3380 total_found += found_bits; 3381 3382 if (cluster->max_size < found_bits * ctl->unit) 3383 cluster->max_size = found_bits * ctl->unit; 3384 3385 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3386 i = next_zero + 1; 3387 goto again; 3388 } 3389 3390 cluster->window_start = start * ctl->unit + entry->offset; 3391 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3392 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3393 3394 /* 3395 * We need to know if we're currently on the normal space index when we 3396 * manipulate the bitmap so that we know we need to remove and re-insert 3397 * it into the space_index tree. 
Clear the bytes_index node here so the 3398 * bitmap manipulation helpers know not to mess with the space_index 3399 * until this bitmap entry is added back into the normal cache. 3400 */ 3401 RB_CLEAR_NODE(&entry->bytes_index); 3402 3403 ret = tree_insert_offset(ctl, cluster, entry); 3404 ASSERT(!ret); /* -EEXIST; Logic error */ 3405 3406 trace_btrfs_setup_cluster(block_group, cluster, 3407 total_found * ctl->unit, 1); 3408 return 0; 3409 } 3410 3411 /* 3412 * This searches the block group for just extents to fill the cluster with. 3413 * Try to find a cluster with at least bytes total bytes, at least one 3414 * extent of cont1_bytes, and other clusters of at least min_bytes. 3415 */ 3416 static noinline int 3417 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3418 struct btrfs_free_cluster *cluster, 3419 struct list_head *bitmaps, u64 offset, u64 bytes, 3420 u64 cont1_bytes, u64 min_bytes) 3421 { 3422 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3423 struct btrfs_free_space *first = NULL; 3424 struct btrfs_free_space *entry = NULL; 3425 struct btrfs_free_space *last; 3426 struct rb_node *node; 3427 u64 window_free; 3428 u64 max_extent; 3429 u64 total_size = 0; 3430 3431 lockdep_assert_held(&ctl->tree_lock); 3432 3433 entry = tree_search_offset(ctl, offset, 0, 1); 3434 if (!entry) 3435 return -ENOSPC; 3436 3437 /* 3438 * We don't want bitmaps, so just move along until we find a normal 3439 * extent entry. 3440 */ 3441 while (entry->bitmap || entry->bytes < min_bytes) { 3442 if (entry->bitmap && list_empty(&entry->list)) 3443 list_add_tail(&entry->list, bitmaps); 3444 node = rb_next(&entry->offset_index); 3445 if (!node) 3446 return -ENOSPC; 3447 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3448 } 3449 3450 window_free = entry->bytes; 3451 max_extent = entry->bytes; 3452 first = entry; 3453 last = entry; 3454 3455 for (node = rb_next(&entry->offset_index); node; 3456 node = rb_next(&entry->offset_index)) { 3457 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3458 3459 if (entry->bitmap) { 3460 if (list_empty(&entry->list)) 3461 list_add_tail(&entry->list, bitmaps); 3462 continue; 3463 } 3464 3465 if (entry->bytes < min_bytes) 3466 continue; 3467 3468 last = entry; 3469 window_free += entry->bytes; 3470 if (entry->bytes > max_extent) 3471 max_extent = entry->bytes; 3472 } 3473 3474 if (window_free < bytes || max_extent < cont1_bytes) 3475 return -ENOSPC; 3476 3477 cluster->window_start = first->offset; 3478 3479 node = &first->offset_index; 3480 3481 /* 3482 * now we've found our entries, pull them out of the free space 3483 * cache and put them into the cluster rbtree 3484 */ 3485 do { 3486 int ret; 3487 3488 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3489 node = rb_next(&entry->offset_index); 3490 if (entry->bitmap || entry->bytes < min_bytes) 3491 continue; 3492 3493 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3494 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3495 ret = tree_insert_offset(ctl, cluster, entry); 3496 total_size += entry->bytes; 3497 ASSERT(!ret); /* -EEXIST; Logic error */ 3498 } while (node && entry != last); 3499 3500 cluster->max_size = max_extent; 3501 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3502 return 0; 3503 } 3504 3505 /* 3506 * This specifically looks for bitmaps that may work in the cluster, we assume 3507 * that we have already failed to find extents that will work. 
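 * The @bitmaps list was filled by setup_cluster_no_bitmap() as it skipped
 * over bitmap entries at or after @offset, so apart from possibly looking
 * up the bitmap that covers @offset itself (below), no extra tree search
 * is needed.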
3508 */ 3509 static noinline int 3510 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3511 struct btrfs_free_cluster *cluster, 3512 struct list_head *bitmaps, u64 offset, u64 bytes, 3513 u64 cont1_bytes, u64 min_bytes) 3514 { 3515 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3516 struct btrfs_free_space *entry = NULL; 3517 int ret = -ENOSPC; 3518 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3519 3520 if (ctl->total_bitmaps == 0) 3521 return -ENOSPC; 3522 3523 /* 3524 * The bitmap that covers offset won't be in the list unless offset 3525 * is just its start offset. 3526 */ 3527 if (!list_empty(bitmaps)) 3528 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3529 3530 if (!entry || entry->offset != bitmap_offset) { 3531 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3532 if (entry && list_empty(&entry->list)) 3533 list_add(&entry->list, bitmaps); 3534 } 3535 3536 list_for_each_entry(entry, bitmaps, list) { 3537 if (entry->bytes < bytes) 3538 continue; 3539 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3540 bytes, cont1_bytes, min_bytes); 3541 if (!ret) 3542 return 0; 3543 } 3544 3545 /* 3546 * The bitmaps list has all the bitmaps that record free space 3547 * starting after offset, so no more search is required. 3548 */ 3549 return -ENOSPC; 3550 } 3551 3552 /* 3553 * here we try to find a cluster of blocks in a block group. The goal 3554 * is to find at least bytes+empty_size. 3555 * We might not find them all in one contiguous area. 3556 * 3557 * returns zero and sets up cluster if things worked out, otherwise 3558 * it returns -enospc 3559 */ 3560 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3561 struct btrfs_free_cluster *cluster, 3562 u64 offset, u64 bytes, u64 empty_size) 3563 { 3564 struct btrfs_fs_info *fs_info = block_group->fs_info; 3565 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3566 struct btrfs_free_space *entry, *tmp; 3567 LIST_HEAD(bitmaps); 3568 u64 min_bytes; 3569 u64 cont1_bytes; 3570 int ret; 3571 3572 /* 3573 * Choose the minimum extent size we'll require for this 3574 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3575 * For metadata, allow allocates with smaller extents. For 3576 * data, keep it dense. 3577 */ 3578 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3579 cont1_bytes = bytes + empty_size; 3580 min_bytes = cont1_bytes; 3581 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3582 cont1_bytes = bytes; 3583 min_bytes = fs_info->sectorsize; 3584 } else { 3585 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3586 min_bytes = fs_info->sectorsize; 3587 } 3588 3589 spin_lock(&ctl->tree_lock); 3590 3591 /* 3592 * If we know we don't have enough space to make a cluster don't even 3593 * bother doing all the work to try and find one. 
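 * For reference, the sizing above means: SSD_SPREAD wants the whole
 * bytes + empty_size contiguous, metadata wants at least @bytes in a
 * single extent, and data accepts extents of a quarter of the total,
 * e.g. (illustrative) bytes = 1M with empty_size = 3M gives
 * cont1_bytes = 1M and min_bytes = sectorsize.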
3594 */ 3595 if (ctl->free_space < bytes) { 3596 spin_unlock(&ctl->tree_lock); 3597 return -ENOSPC; 3598 } 3599 3600 spin_lock(&cluster->lock); 3601 3602 /* someone already found a cluster, hooray */ 3603 if (cluster->block_group) { 3604 ret = 0; 3605 goto out; 3606 } 3607 3608 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3609 min_bytes); 3610 3611 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3612 bytes + empty_size, 3613 cont1_bytes, min_bytes); 3614 if (ret) 3615 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3616 offset, bytes + empty_size, 3617 cont1_bytes, min_bytes); 3618 3619 /* Clear our temporary list */ 3620 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3621 list_del_init(&entry->list); 3622 3623 if (!ret) { 3624 btrfs_get_block_group(block_group); 3625 list_add_tail(&cluster->block_group_list, 3626 &block_group->cluster_list); 3627 cluster->block_group = block_group; 3628 } else { 3629 trace_btrfs_failed_cluster_setup(block_group); 3630 } 3631 out: 3632 spin_unlock(&cluster->lock); 3633 spin_unlock(&ctl->tree_lock); 3634 3635 return ret; 3636 } 3637 3638 /* 3639 * simple code to zero out a cluster 3640 */ 3641 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3642 { 3643 spin_lock_init(&cluster->lock); 3644 spin_lock_init(&cluster->refill_lock); 3645 cluster->root = RB_ROOT; 3646 cluster->max_size = 0; 3647 cluster->fragmented = false; 3648 INIT_LIST_HEAD(&cluster->block_group_list); 3649 cluster->block_group = NULL; 3650 } 3651 3652 static int do_trimming(struct btrfs_block_group *block_group, 3653 u64 *total_trimmed, u64 start, u64 bytes, 3654 u64 reserved_start, u64 reserved_bytes, 3655 enum btrfs_trim_state reserved_trim_state, 3656 struct btrfs_trim_range *trim_entry) 3657 { 3658 struct btrfs_space_info *space_info = block_group->space_info; 3659 struct btrfs_fs_info *fs_info = block_group->fs_info; 3660 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3661 int ret; 3662 int update = 0; 3663 const u64 end = start + bytes; 3664 const u64 reserved_end = reserved_start + reserved_bytes; 3665 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3666 u64 trimmed = 0; 3667 3668 spin_lock(&space_info->lock); 3669 spin_lock(&block_group->lock); 3670 if (!block_group->ro) { 3671 block_group->reserved += reserved_bytes; 3672 space_info->bytes_reserved += reserved_bytes; 3673 update = 1; 3674 } 3675 spin_unlock(&block_group->lock); 3676 spin_unlock(&space_info->lock); 3677 3678 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3679 if (!ret) { 3680 *total_trimmed += trimmed; 3681 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3682 } 3683 3684 mutex_lock(&ctl->cache_writeout_mutex); 3685 if (reserved_start < start) 3686 __btrfs_add_free_space(block_group, reserved_start, 3687 start - reserved_start, 3688 reserved_trim_state); 3689 if (end < reserved_end) 3690 __btrfs_add_free_space(block_group, end, reserved_end - end, 3691 reserved_trim_state); 3692 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3693 list_del(&trim_entry->list); 3694 mutex_unlock(&ctl->cache_writeout_mutex); 3695 3696 if (update) { 3697 spin_lock(&space_info->lock); 3698 spin_lock(&block_group->lock); 3699 if (block_group->ro) 3700 space_info->bytes_readonly += reserved_bytes; 3701 block_group->reserved -= reserved_bytes; 3702 space_info->bytes_reserved -= reserved_bytes; 3703 spin_unlock(&block_group->lock); 3704 spin_unlock(&space_info->lock); 3705 } 3706 3707 return ret; 3708 } 3709 3710 /* 
3711 * If @async is set, then we will trim 1 region and return. 3712 */ 3713 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3714 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3715 bool async) 3716 { 3717 struct btrfs_discard_ctl *discard_ctl = 3718 &block_group->fs_info->discard_ctl; 3719 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3720 struct btrfs_free_space *entry; 3721 struct rb_node *node; 3722 int ret = 0; 3723 u64 extent_start; 3724 u64 extent_bytes; 3725 enum btrfs_trim_state extent_trim_state; 3726 u64 bytes; 3727 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3728 3729 while (start < end) { 3730 struct btrfs_trim_range trim_entry; 3731 3732 mutex_lock(&ctl->cache_writeout_mutex); 3733 spin_lock(&ctl->tree_lock); 3734 3735 if (ctl->free_space < minlen) 3736 goto out_unlock; 3737 3738 entry = tree_search_offset(ctl, start, 0, 1); 3739 if (!entry) 3740 goto out_unlock; 3741 3742 /* Skip bitmaps and if async, already trimmed entries */ 3743 while (entry->bitmap || 3744 (async && btrfs_free_space_trimmed(entry))) { 3745 node = rb_next(&entry->offset_index); 3746 if (!node) 3747 goto out_unlock; 3748 entry = rb_entry(node, struct btrfs_free_space, 3749 offset_index); 3750 } 3751 3752 if (entry->offset >= end) 3753 goto out_unlock; 3754 3755 extent_start = entry->offset; 3756 extent_bytes = entry->bytes; 3757 extent_trim_state = entry->trim_state; 3758 if (async) { 3759 start = entry->offset; 3760 bytes = entry->bytes; 3761 if (bytes < minlen) { 3762 spin_unlock(&ctl->tree_lock); 3763 mutex_unlock(&ctl->cache_writeout_mutex); 3764 goto next; 3765 } 3766 unlink_free_space(ctl, entry, true); 3767 /* 3768 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3769 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3770 * X when we come back around. So trim it now. 
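 * Illustrative example (made-up values, not the real constants): with
 * max_discard_size = 64M and a minimum filter of 256K, a 64M + 128K
 * extent is discarded in one go, whereas a 65M extent is split: 64M is
 * trimmed now and the 1M remainder is re-linked for a later pass.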
3771 */ 3772 if (max_discard_size && 3773 bytes >= (max_discard_size + 3774 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3775 bytes = max_discard_size; 3776 extent_bytes = max_discard_size; 3777 entry->offset += max_discard_size; 3778 entry->bytes -= max_discard_size; 3779 link_free_space(ctl, entry); 3780 } else { 3781 kmem_cache_free(btrfs_free_space_cachep, entry); 3782 } 3783 } else { 3784 start = max(start, extent_start); 3785 bytes = min(extent_start + extent_bytes, end) - start; 3786 if (bytes < minlen) { 3787 spin_unlock(&ctl->tree_lock); 3788 mutex_unlock(&ctl->cache_writeout_mutex); 3789 goto next; 3790 } 3791 3792 unlink_free_space(ctl, entry, true); 3793 kmem_cache_free(btrfs_free_space_cachep, entry); 3794 } 3795 3796 spin_unlock(&ctl->tree_lock); 3797 trim_entry.start = extent_start; 3798 trim_entry.bytes = extent_bytes; 3799 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3800 mutex_unlock(&ctl->cache_writeout_mutex); 3801 3802 ret = do_trimming(block_group, total_trimmed, start, bytes, 3803 extent_start, extent_bytes, extent_trim_state, 3804 &trim_entry); 3805 if (ret) { 3806 block_group->discard_cursor = start + bytes; 3807 break; 3808 } 3809 next: 3810 start += bytes; 3811 block_group->discard_cursor = start; 3812 if (async && *total_trimmed) 3813 break; 3814 3815 if (btrfs_trim_interrupted()) { 3816 ret = -ERESTARTSYS; 3817 break; 3818 } 3819 3820 cond_resched(); 3821 } 3822 3823 return ret; 3824 3825 out_unlock: 3826 block_group->discard_cursor = btrfs_block_group_end(block_group); 3827 spin_unlock(&ctl->tree_lock); 3828 mutex_unlock(&ctl->cache_writeout_mutex); 3829 3830 return ret; 3831 } 3832 3833 /* 3834 * If we break out of trimming a bitmap prematurely, we should reset the 3835 * trimming bit. In a rather contrieved case, it's possible to race here so 3836 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3837 * 3838 * start = start of bitmap 3839 * end = near end of bitmap 3840 * 3841 * Thread 1: Thread 2: 3842 * trim_bitmaps(start) 3843 * trim_bitmaps(end) 3844 * end_trimming_bitmap() 3845 * reset_trimming_bitmap() 3846 */ 3847 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3848 { 3849 struct btrfs_free_space *entry; 3850 3851 spin_lock(&ctl->tree_lock); 3852 entry = tree_search_offset(ctl, offset, 1, 0); 3853 if (entry) { 3854 if (btrfs_free_space_trimmed(entry)) { 3855 ctl->discardable_extents[BTRFS_STAT_CURR] += 3856 entry->bitmap_extents; 3857 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3858 } 3859 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3860 } 3861 3862 spin_unlock(&ctl->tree_lock); 3863 } 3864 3865 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3866 struct btrfs_free_space *entry) 3867 { 3868 if (btrfs_free_space_trimming_bitmap(entry)) { 3869 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3870 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3871 entry->bitmap_extents; 3872 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3873 } 3874 } 3875 3876 /* 3877 * If @async is set, then we will trim 1 region and return. 

/*
 * If @async is set, then we will trim 1 region and return.
 */
static int trim_bitmaps(struct btrfs_block_group *block_group,
			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			u64 maxlen, bool async)
{
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	int ret = 0;
	int ret2;
	u64 bytes;
	u64 offset = offset_to_bitmap(ctl, start);
	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

	while (offset < end) {
		bool next_bitmap = false;
		struct btrfs_trim_range trim_entry;

		mutex_lock(&ctl->cache_writeout_mutex);
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen) {
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			break;
		}

		entry = tree_search_offset(ctl, offset, 1, 0);
		/*
		 * Bitmaps are marked trimmed lossily now to prevent constant
		 * discarding of the same bitmap (the reason why we are bound
		 * by the filters). So, retrim the block group bitmaps when we
		 * are preparing to punt to the unused_bgs list. This uses
		 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED,
		 * which is the only discard index which sets minlen to 0.
		 */
		if (!entry || (async && minlen && start == offset &&
			       btrfs_free_space_trimmed(entry))) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * Async discard bitmap trimming begins by setting the start
		 * to be key.objectid and the offset_to_bitmap() aligns to the
		 * start of the bitmap. This lets us know we are fully
		 * scanning the bitmap rather than only some portion of it.
		 */
		if (start == offset)
			entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;

		bytes = minlen;
		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
		if (ret2 || start >= end) {
			/*
			 * We lossily consider a bitmap trimmed if we only skip
			 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
			 */
			if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
				end_trimming_bitmap(ctl, entry);
			else
				entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * We already trimmed a region, but are using the locking above
		 * to reset the trim_state.
		 */
		if (async && *total_trimmed) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto out;
		}

		bytes = min(bytes, end - start);
		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto next;
		}

		/*
		 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
		 * If X < @minlen, we won't trim X when we come back around.
		 * So trim it now. We differ here from trimming extents as we
		 * don't keep individual state per bit.
		 */
		if (async &&
		    max_discard_size &&
		    bytes > (max_discard_size + minlen))
			bytes = max_discard_size;

		bitmap_clear_bits(ctl, entry, start, bytes, true);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = start;
		trim_entry.bytes = bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  start, bytes, 0, &trim_entry);
		if (ret) {
			reset_trimming_bitmap(ctl, offset);
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			break;
		}
next:
		if (next_bitmap) {
			offset += BITS_PER_BITMAP * ctl->unit;
			start = offset;
		} else {
			start += bytes;
		}
		block_group->discard_cursor = start;

		if (btrfs_trim_interrupted()) {
			if (start != offset)
				reset_trimming_bitmap(ctl, offset);
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	if (offset >= end)
		block_group->discard_cursor = end;

out:
	return ret;
}
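
/*
 * Illustrative note on the bitmap stride used above (assuming 4K pages and a
 * 4K sectorsize, i.e. ctl->unit == 4K; other configurations scale the same
 * way):
 *
 *	BITS_PER_BITMAP = PAGE_SIZE * 8 = 32768 bits
 *	bitmap coverage = 32768 * 4K   = 128M
 *
 * offset_to_bitmap(ctl, start) rounds @start down to the start of its bitmap
 * (relative to the block group start), and "offset += BITS_PER_BITMAP *
 * ctl->unit" advances the cursor one whole bitmap at a time.
 */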

int btrfs_trim_block_group(struct btrfs_block_group *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int ret;
	u64 rem = 0;

	ASSERT(!btrfs_is_zoned(block_group->fs_info));

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
	if (ret)
		goto out;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
	/* If we ended in the middle of a bitmap, reset the trimming flag */
	if (rem)
		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
	btrfs_unfreeze_block_group(block_group);
	return ret;
}

int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
	btrfs_unfreeze_block_group(block_group);

	return ret;
}

int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   u64 maxlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
			   async);

	btrfs_unfreeze_block_group(block_group);

	return ret;
}
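
/*
 * Rough usage sketch for the three entry points above (hypothetical caller,
 * shown only for illustration): a synchronous fstrim-style pass trims a whole
 * block group in one call, while an async discard pass uses the split
 * extent/bitmap variants with size filters, along the lines of:
 *
 *	u64 trimmed;
 *
 *	// synchronous, whole block group
 *	ret = btrfs_trim_block_group(bg, &trimmed, bg->start,
 *				     btrfs_block_group_end(bg), minlen);
 *
 *	// async discard style, filtered passes
 *	ret = btrfs_trim_block_group_extents(bg, &trimmed, start, end,
 *					     minlen, true);
 *	ret = btrfs_trim_block_group_bitmaps(bg, &trimmed, start, end,
 *					     minlen, maxlen, true);
 */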

bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}

static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first_cached(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}

int __init btrfs_free_space_init(void)
{
	btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0);
	if (!btrfs_free_space_cachep)
		return -ENOMEM;

	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
							PAGE_SIZE, PAGE_SIZE,
							0, NULL);
	if (!btrfs_free_space_bitmap_cachep) {
		kmem_cache_destroy(btrfs_free_space_cachep);
		return -ENOMEM;
	}

	return 0;
}

void __cold btrfs_free_space_exit(void)
{
	kmem_cache_destroy(btrfs_free_space_cachep);
	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}
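
/*
 * Illustrative pairing of the init/exit helpers above (hypothetical caller,
 * not actual btrfs module-init code):
 *
 *	ret = btrfs_free_space_init();
 *	if (ret)
 *		return ret;
 *	...
 *	btrfs_free_space_exit();
 *
 * The bitmap cache is sized and aligned to PAGE_SIZE because each bitmap
 * entry holds exactly one page worth of bits (BITS_PER_BITMAP).
 */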

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Use this if you need to make a bitmap or extent entry specifically. It
 * doesn't do any of the merging that add_free_space does, and it acts a lot
 * like how the free space cache loading works, so you can get really weird
 * configurations.
 */
int test_add_free_space_entry(struct btrfs_block_group *cache,
			      u64 offset, u64 bytes, bool bitmap)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info = NULL, *bitmap_info;
	void *map = NULL;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
	u64 bytes_added;
	int ret;

again:
	if (!info) {
		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
		if (!info)
			return -ENOMEM;
	}

	if (!bitmap) {
		spin_lock(&ctl->tree_lock);
		info->offset = offset;
		info->bytes = bytes;
		info->max_extent_size = 0;
		ret = link_free_space(ctl, info);
		spin_unlock(&ctl->tree_lock);
		if (ret)
			kmem_cache_free(btrfs_free_space_cachep, info);
		return ret;
	}

	if (!map) {
		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
		if (!map) {
			kmem_cache_free(btrfs_free_space_cachep, info);
			return -ENOMEM;
		}
	}

	spin_lock(&ctl->tree_lock);
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		info->bitmap = map;
		map = NULL;
		add_new_bitmap(ctl, info, offset);
		bitmap_info = info;
		info = NULL;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);

	bytes -= bytes_added;
	offset += bytes_added;
	spin_unlock(&ctl->tree_lock);

	if (bytes)
		goto again;

	if (info)
		kmem_cache_free(btrfs_free_space_cachep, info);
	if (map)
		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
	return 0;
}
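
/*
 * Illustrative sketch of how the sanity tests might combine the helpers in
 * this block (hypothetical offsets and sizes, shown only as an example):
 *
 *	// seed an extent entry and a bit in a bitmap entry
 *	ret = test_add_free_space_entry(cache, SZ_1M, SZ_4K, false);
 *	ret = test_add_free_space_entry(cache, SZ_128M, SZ_4K, true);
 *
 *	// test_check_exists() (below) returns 1 if any free space
 *	// overlaps the queried range
 *	if (!test_check_exists(cache, SZ_1M, SZ_4K))
 *		return -ENOENT;
 */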

/*
 * Checks to see if the given range is in the free space cache. This is really
 * just used to check the absence of space, so if there is free space in the
 * range at all we will return 1.
 */
int test_check_exists(struct btrfs_block_group *cache,
		      u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;

	spin_lock(&ctl->tree_lock);
	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info)
			goto out;
	}

have_info:
	if (info->bitmap) {
		u64 bit_off, bit_bytes;
		struct rb_node *n;
		struct btrfs_free_space *tmp;

		bit_off = offset;
		bit_bytes = ctl->unit;
		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
		if (!ret) {
			if (bit_off == offset) {
				ret = 1;
				goto out;
			} else if (bit_off > offset &&
				   offset + bytes > bit_off) {
				ret = 1;
				goto out;
			}
		}

		n = rb_prev(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (tmp->offset + tmp->bytes < offset)
				break;
			if (offset + bytes < tmp->offset) {
				n = rb_prev(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		n = rb_next(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (offset + bytes < tmp->offset)
				break;
			if (tmp->offset + tmp->bytes < offset) {
				n = rb_next(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		ret = 0;
		goto out;
	}

	if (info->offset == offset) {
		ret = 1;
		goto out;
	}

	if (offset > info->offset && offset < info->offset + info->bytes)
		ret = 1;
out:
	spin_unlock(&ctl->tree_lock);
	return ret;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */