1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Red Hat. All rights reserved. 4 */ 5 6 #include <linux/pagemap.h> 7 #include <linux/sched.h> 8 #include <linux/sched/signal.h> 9 #include <linux/slab.h> 10 #include <linux/math64.h> 11 #include <linux/ratelimit.h> 12 #include <linux/error-injection.h> 13 #include <linux/sched/mm.h> 14 #include <linux/string_choices.h> 15 #include "ctree.h" 16 #include "fs.h" 17 #include "messages.h" 18 #include "misc.h" 19 #include "free-space-cache.h" 20 #include "transaction.h" 21 #include "disk-io.h" 22 #include "extent_io.h" 23 #include "space-info.h" 24 #include "block-group.h" 25 #include "discard.h" 26 #include "subpage.h" 27 #include "inode-item.h" 28 #include "accessors.h" 29 #include "file-item.h" 30 #include "file.h" 31 #include "super.h" 32 33 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 34 #define MAX_CACHE_BYTES_PER_GIG SZ_64K 35 #define FORCE_EXTENT_THRESHOLD SZ_1M 36 37 static struct kmem_cache *btrfs_free_space_cachep; 38 static struct kmem_cache *btrfs_free_space_bitmap_cachep; 39 40 struct btrfs_trim_range { 41 u64 start; 42 u64 bytes; 43 struct list_head list; 44 }; 45 46 static int link_free_space(struct btrfs_free_space_ctl *ctl, 47 struct btrfs_free_space *info); 48 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 49 struct btrfs_free_space *info, bool update_stat); 50 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 51 struct btrfs_free_space *bitmap_info, u64 *offset, 52 u64 *bytes, bool for_alloc); 53 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 54 struct btrfs_free_space *bitmap_info); 55 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 56 struct btrfs_free_space *info, u64 offset, 57 u64 bytes, bool update_stats); 58 59 static void btrfs_crc32c_final(u32 crc, u8 *result) 60 { 61 put_unaligned_le32(~crc, result); 62 } 63 64 static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 65 { 66 struct btrfs_free_space *info; 67 struct rb_node *node; 68 69 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 70 info = rb_entry(node, struct btrfs_free_space, offset_index); 71 if (!info->bitmap) { 72 unlink_free_space(ctl, info, true); 73 kmem_cache_free(btrfs_free_space_cachep, info); 74 } else { 75 free_bitmap(ctl, info); 76 } 77 78 cond_resched_lock(&ctl->tree_lock); 79 } 80 } 81 82 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 83 struct btrfs_path *path, 84 u64 offset) 85 { 86 struct btrfs_key key; 87 struct btrfs_key location; 88 struct btrfs_disk_key disk_key; 89 struct btrfs_free_space_header *header; 90 struct extent_buffer *leaf; 91 struct inode *inode = NULL; 92 unsigned nofs_flag; 93 int ret; 94 95 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 96 key.offset = offset; 97 key.type = 0; 98 99 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 100 if (ret < 0) 101 return ERR_PTR(ret); 102 if (ret > 0) { 103 btrfs_release_path(path); 104 return ERR_PTR(-ENOENT); 105 } 106 107 leaf = path->nodes[0]; 108 header = btrfs_item_ptr(leaf, path->slots[0], 109 struct btrfs_free_space_header); 110 btrfs_free_space_key(leaf, header, &disk_key); 111 btrfs_disk_key_to_cpu(&location, &disk_key); 112 btrfs_release_path(path); 113 114 /* 115 * We are often under a trans handle at this point, so we need to make 116 * sure NOFS is set to keep us from deadlocking. 117 */ 118 nofs_flag = memalloc_nofs_save(); 119 inode = btrfs_iget_path(location.objectid, root, path); 120 btrfs_release_path(path); 121 memalloc_nofs_restore(nofs_flag); 122 if (IS_ERR(inode)) 123 return inode; 124 125 mapping_set_gfp_mask(inode->i_mapping, 126 mapping_gfp_constraint(inode->i_mapping, 127 ~(__GFP_FS | __GFP_HIGHMEM))); 128 129 return inode; 130 } 131 132 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, 133 struct btrfs_path *path) 134 { 135 struct btrfs_fs_info *fs_info = block_group->fs_info; 136 struct inode *inode = NULL; 137 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 138 139 spin_lock(&block_group->lock); 140 if (block_group->inode) 141 inode = igrab(&block_group->inode->vfs_inode); 142 spin_unlock(&block_group->lock); 143 if (inode) 144 return inode; 145 146 inode = __lookup_free_space_inode(fs_info->tree_root, path, 147 block_group->start); 148 if (IS_ERR(inode)) 149 return inode; 150 151 spin_lock(&block_group->lock); 152 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 153 btrfs_info(fs_info, "Old style space inode found, converting."); 154 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 155 BTRFS_INODE_NODATACOW; 156 block_group->disk_cache_state = BTRFS_DC_CLEAR; 157 } 158 159 if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) 160 block_group->inode = BTRFS_I(igrab(inode)); 161 spin_unlock(&block_group->lock); 162 163 return inode; 164 } 165 166 static int __create_free_space_inode(struct btrfs_root *root, 167 struct btrfs_trans_handle *trans, 168 struct btrfs_path *path, 169 u64 ino, u64 offset) 170 { 171 struct btrfs_key key; 172 struct btrfs_disk_key disk_key; 173 struct btrfs_free_space_header *header; 174 struct btrfs_inode_item *inode_item; 175 struct extent_buffer *leaf; 176 /* We inline CRCs for the free disk space cache */ 177 const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | 178 BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 179 int ret; 180 181 ret = btrfs_insert_empty_inode(trans, root, path, ino); 182 if (ret) 183 return ret; 184 185 leaf = path->nodes[0]; 186 inode_item = btrfs_item_ptr(leaf, path->slots[0], 187 struct btrfs_inode_item); 188 btrfs_item_key(leaf, &disk_key, path->slots[0]); 189 memzero_extent_buffer(leaf, (unsigned long)inode_item, 190 sizeof(*inode_item)); 191 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 192 btrfs_set_inode_size(leaf, inode_item, 0); 193 btrfs_set_inode_nbytes(leaf, inode_item, 0); 194 btrfs_set_inode_uid(leaf, inode_item, 0); 195 btrfs_set_inode_gid(leaf, inode_item, 0); 196 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 197 btrfs_set_inode_flags(leaf, inode_item, flags); 198 btrfs_set_inode_nlink(leaf, inode_item, 1); 199 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 200 btrfs_set_inode_block_group(leaf, inode_item, offset); 201 btrfs_mark_buffer_dirty(trans, leaf); 202 btrfs_release_path(path); 203 204 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 205 key.offset = offset; 206 key.type = 0; 207 ret = btrfs_insert_empty_item(trans, root, path, &key, 208 sizeof(struct btrfs_free_space_header)); 209 if (ret < 0) { 210 btrfs_release_path(path); 211 return ret; 212 } 213 214 leaf = path->nodes[0]; 215 header = btrfs_item_ptr(leaf, path->slots[0], 216 struct btrfs_free_space_header); 217 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); 218 btrfs_set_free_space_key(leaf, header, &disk_key); 219 btrfs_mark_buffer_dirty(trans, leaf); 220 btrfs_release_path(path); 221 222 return 0; 223 } 224 225 int create_free_space_inode(struct btrfs_trans_handle *trans, 226 struct btrfs_block_group *block_group, 227 struct btrfs_path *path) 228 { 229 int ret; 230 u64 ino; 231 232 ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); 233 if (ret < 0) 234 return ret; 235 236 return __create_free_space_inode(trans->fs_info->tree_root, trans, path, 237 ino, block_group->start); 238 } 239 240 /* 241 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode 242 * handles lookup, otherwise it takes ownership and iputs the inode. 243 * Don't reuse an inode pointer after passing it into this function. 244 */ 245 int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, 246 struct inode *inode, 247 struct btrfs_block_group *block_group) 248 { 249 struct btrfs_path *path; 250 struct btrfs_key key; 251 int ret = 0; 252 253 path = btrfs_alloc_path(); 254 if (!path) 255 return -ENOMEM; 256 257 if (!inode) 258 inode = lookup_free_space_inode(block_group, path); 259 if (IS_ERR(inode)) { 260 if (PTR_ERR(inode) != -ENOENT) 261 ret = PTR_ERR(inode); 262 goto out; 263 } 264 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 265 if (ret) { 266 btrfs_add_delayed_iput(BTRFS_I(inode)); 267 goto out; 268 } 269 clear_nlink(inode); 270 /* One for the block groups ref */ 271 spin_lock(&block_group->lock); 272 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { 273 block_group->inode = NULL; 274 spin_unlock(&block_group->lock); 275 iput(inode); 276 } else { 277 spin_unlock(&block_group->lock); 278 } 279 /* One for the lookup ref */ 280 btrfs_add_delayed_iput(BTRFS_I(inode)); 281 282 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 283 key.type = 0; 284 key.offset = block_group->start; 285 ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, 286 -1, 1); 287 if (ret) { 288 if (ret > 0) 289 ret = 0; 290 goto out; 291 } 292 ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); 293 out: 294 btrfs_free_path(path); 295 return ret; 296 } 297 298 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, 299 struct btrfs_block_group *block_group, 300 struct inode *vfs_inode) 301 { 302 struct btrfs_truncate_control control = { 303 .inode = BTRFS_I(vfs_inode), 304 .new_size = 0, 305 .ino = btrfs_ino(BTRFS_I(vfs_inode)), 306 .min_type = BTRFS_EXTENT_DATA_KEY, 307 .clear_extent_range = true, 308 }; 309 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 310 struct btrfs_root *root = inode->root; 311 struct extent_state *cached_state = NULL; 312 int ret = 0; 313 bool locked = false; 314 315 if (block_group) { 316 struct btrfs_path *path = btrfs_alloc_path(); 317 318 if (!path) { 319 ret = -ENOMEM; 320 goto fail; 321 } 322 locked = true; 323 mutex_lock(&trans->transaction->cache_write_mutex); 324 if (!list_empty(&block_group->io_list)) { 325 list_del_init(&block_group->io_list); 326 327 btrfs_wait_cache_io(trans, block_group, path); 328 btrfs_put_block_group(block_group); 329 } 330 331 /* 332 * now that we've truncated the cache away, its no longer 333 * setup or written 334 */ 335 spin_lock(&block_group->lock); 336 block_group->disk_cache_state = BTRFS_DC_CLEAR; 337 spin_unlock(&block_group->lock); 338 btrfs_free_path(path); 339 } 340 341 btrfs_i_size_write(inode, 0); 342 truncate_pagecache(vfs_inode, 0); 343 344 lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 345 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); 346 347 /* 348 * We skip the throttling logic for free space cache inodes, so we don't 349 * need to check for -EAGAIN. 350 */ 351 ret = btrfs_truncate_inode_items(trans, root, &control); 352 353 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); 354 btrfs_inode_safe_disk_i_size_write(inode, control.last_size); 355 356 unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 357 if (ret) 358 goto fail; 359 360 ret = btrfs_update_inode(trans, inode); 361 362 fail: 363 if (locked) 364 mutex_unlock(&trans->transaction->cache_write_mutex); 365 if (ret) 366 btrfs_abort_transaction(trans, ret); 367 368 return ret; 369 } 370 371 static void readahead_cache(struct inode *inode) 372 { 373 struct file_ra_state ra; 374 unsigned long last_index; 375 376 file_ra_state_init(&ra, inode->i_mapping); 377 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 378 379 page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); 380 } 381 382 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, 383 int write) 384 { 385 int num_pages; 386 387 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 388 389 /* Make sure we can fit our crcs and generation into the first page */ 390 if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) 391 return -ENOSPC; 392 393 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); 394 395 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); 396 if (!io_ctl->pages) 397 return -ENOMEM; 398 399 io_ctl->num_pages = num_pages; 400 io_ctl->fs_info = inode_to_fs_info(inode); 401 io_ctl->inode = inode; 402 403 return 0; 404 } 405 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); 406 407 static void io_ctl_free(struct btrfs_io_ctl *io_ctl) 408 { 409 kfree(io_ctl->pages); 410 io_ctl->pages = NULL; 411 } 412 413 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) 414 { 415 if (io_ctl->cur) { 416 io_ctl->cur = NULL; 417 io_ctl->orig = NULL; 418 } 419 } 420 421 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) 422 { 423 ASSERT(io_ctl->index < io_ctl->num_pages); 424 io_ctl->page = io_ctl->pages[io_ctl->index++]; 425 io_ctl->cur = page_address(io_ctl->page); 426 io_ctl->orig = io_ctl->cur; 427 io_ctl->size = PAGE_SIZE; 428 if (clear) 429 clear_page(io_ctl->cur); 430 } 431 432 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) 433 { 434 int i; 435 436 io_ctl_unmap_page(io_ctl); 437 438 for (i = 0; i < io_ctl->num_pages; i++) { 439 if (io_ctl->pages[i]) { 440 btrfs_folio_clear_checked(io_ctl->fs_info, 441 page_folio(io_ctl->pages[i]), 442 page_offset(io_ctl->pages[i]), 443 PAGE_SIZE); 444 unlock_page(io_ctl->pages[i]); 445 put_page(io_ctl->pages[i]); 446 } 447 } 448 } 449 450 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) 451 { 452 struct page *page; 453 struct inode *inode = io_ctl->inode; 454 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 455 int i; 456 457 for (i = 0; i < io_ctl->num_pages; i++) { 458 int ret; 459 460 page = find_or_create_page(inode->i_mapping, i, mask); 461 if (!page) { 462 io_ctl_drop_pages(io_ctl); 463 return -ENOMEM; 464 } 465 466 ret = set_page_extent_mapped(page); 467 if (ret < 0) { 468 unlock_page(page); 469 put_page(page); 470 io_ctl_drop_pages(io_ctl); 471 return ret; 472 } 473 474 io_ctl->pages[i] = page; 475 if (uptodate && !PageUptodate(page)) { 476 btrfs_read_folio(NULL, page_folio(page)); 477 lock_page(page); 478 if (page->mapping != inode->i_mapping) { 479 btrfs_err(BTRFS_I(inode)->root->fs_info, 480 "free space cache page truncated"); 481 io_ctl_drop_pages(io_ctl); 482 return -EIO; 483 } 484 if (!PageUptodate(page)) { 485 btrfs_err(BTRFS_I(inode)->root->fs_info, 486 "error reading free space cache"); 487 io_ctl_drop_pages(io_ctl); 488 return -EIO; 489 } 490 } 491 } 492 493 for (i = 0; i < io_ctl->num_pages; i++) 494 clear_page_dirty_for_io(io_ctl->pages[i]); 495 496 return 0; 497 } 498 499 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 500 { 501 io_ctl_map_page(io_ctl, 1); 502 503 /* 504 * Skip the csum areas. If we don't check crcs then we just have a 505 * 64bit chunk at the front of the first page. 506 */ 507 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 508 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 509 510 put_unaligned_le64(generation, io_ctl->cur); 511 io_ctl->cur += sizeof(u64); 512 } 513 514 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 515 { 516 u64 cache_gen; 517 518 /* 519 * Skip the crc area. If we don't check crcs then we just have a 64bit 520 * chunk at the front of the first page. 521 */ 522 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 523 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 524 525 cache_gen = get_unaligned_le64(io_ctl->cur); 526 if (cache_gen != generation) { 527 btrfs_err_rl(io_ctl->fs_info, 528 "space cache generation (%llu) does not match inode (%llu)", 529 cache_gen, generation); 530 io_ctl_unmap_page(io_ctl); 531 return -EIO; 532 } 533 io_ctl->cur += sizeof(u64); 534 return 0; 535 } 536 537 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) 538 { 539 u32 *tmp; 540 u32 crc = ~(u32)0; 541 unsigned offset = 0; 542 543 if (index == 0) 544 offset = sizeof(u32) * io_ctl->num_pages; 545 546 crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 547 btrfs_crc32c_final(crc, (u8 *)&crc); 548 io_ctl_unmap_page(io_ctl); 549 tmp = page_address(io_ctl->pages[0]); 550 tmp += index; 551 *tmp = crc; 552 } 553 554 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) 555 { 556 u32 *tmp, val; 557 u32 crc = ~(u32)0; 558 unsigned offset = 0; 559 560 if (index == 0) 561 offset = sizeof(u32) * io_ctl->num_pages; 562 563 tmp = page_address(io_ctl->pages[0]); 564 tmp += index; 565 val = *tmp; 566 567 io_ctl_map_page(io_ctl, 0); 568 crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 569 btrfs_crc32c_final(crc, (u8 *)&crc); 570 if (val != crc) { 571 btrfs_err_rl(io_ctl->fs_info, 572 "csum mismatch on free space cache"); 573 io_ctl_unmap_page(io_ctl); 574 return -EIO; 575 } 576 577 return 0; 578 } 579 580 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, 581 void *bitmap) 582 { 583 struct btrfs_free_space_entry *entry; 584 585 if (!io_ctl->cur) 586 return -ENOSPC; 587 588 entry = io_ctl->cur; 589 put_unaligned_le64(offset, &entry->offset); 590 put_unaligned_le64(bytes, &entry->bytes); 591 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : 592 BTRFS_FREE_SPACE_EXTENT; 593 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 594 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 595 596 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 597 return 0; 598 599 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 600 601 /* No more pages to map */ 602 if (io_ctl->index >= io_ctl->num_pages) 603 return 0; 604 605 /* map the next page */ 606 io_ctl_map_page(io_ctl, 1); 607 return 0; 608 } 609 610 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) 611 { 612 if (!io_ctl->cur) 613 return -ENOSPC; 614 615 /* 616 * If we aren't at the start of the current page, unmap this one and 617 * map the next one if there is any left. 618 */ 619 if (io_ctl->cur != io_ctl->orig) { 620 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 621 if (io_ctl->index >= io_ctl->num_pages) 622 return -ENOSPC; 623 io_ctl_map_page(io_ctl, 0); 624 } 625 626 copy_page(io_ctl->cur, bitmap); 627 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 628 if (io_ctl->index < io_ctl->num_pages) 629 io_ctl_map_page(io_ctl, 0); 630 return 0; 631 } 632 633 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) 634 { 635 /* 636 * If we're not on the boundary we know we've modified the page and we 637 * need to crc the page. 638 */ 639 if (io_ctl->cur != io_ctl->orig) 640 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 641 else 642 io_ctl_unmap_page(io_ctl); 643 644 while (io_ctl->index < io_ctl->num_pages) { 645 io_ctl_map_page(io_ctl, 1); 646 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 647 } 648 } 649 650 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, 651 struct btrfs_free_space *entry, u8 *type) 652 { 653 struct btrfs_free_space_entry *e; 654 int ret; 655 656 if (!io_ctl->cur) { 657 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 658 if (ret) 659 return ret; 660 } 661 662 e = io_ctl->cur; 663 entry->offset = get_unaligned_le64(&e->offset); 664 entry->bytes = get_unaligned_le64(&e->bytes); 665 *type = e->type; 666 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 667 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 668 669 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 670 return 0; 671 672 io_ctl_unmap_page(io_ctl); 673 674 return 0; 675 } 676 677 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, 678 struct btrfs_free_space *entry) 679 { 680 int ret; 681 682 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 683 if (ret) 684 return ret; 685 686 copy_page(entry->bitmap, io_ctl->cur); 687 io_ctl_unmap_page(io_ctl); 688 689 return 0; 690 } 691 692 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) 693 { 694 struct btrfs_block_group *block_group = ctl->block_group; 695 u64 max_bytes; 696 u64 bitmap_bytes; 697 u64 extent_bytes; 698 u64 size = block_group->length; 699 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 700 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 701 702 max_bitmaps = max_t(u64, max_bitmaps, 1); 703 704 if (ctl->total_bitmaps > max_bitmaps) 705 btrfs_err(block_group->fs_info, 706 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", 707 block_group->start, block_group->length, 708 ctl->total_bitmaps, ctl->unit, max_bitmaps, 709 bytes_per_bg); 710 ASSERT(ctl->total_bitmaps <= max_bitmaps); 711 712 /* 713 * We are trying to keep the total amount of memory used per 1GiB of 714 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation 715 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of 716 * bitmaps, we may end up using more memory than this. 717 */ 718 if (size < SZ_1G) 719 max_bytes = MAX_CACHE_BYTES_PER_GIG; 720 else 721 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); 722 723 bitmap_bytes = ctl->total_bitmaps * ctl->unit; 724 725 /* 726 * we want the extent entry threshold to always be at most 1/2 the max 727 * bytes we can have, or whatever is less than that. 728 */ 729 extent_bytes = max_bytes - bitmap_bytes; 730 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); 731 732 ctl->extents_thresh = 733 div_u64(extent_bytes, sizeof(struct btrfs_free_space)); 734 } 735 736 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 737 struct btrfs_free_space_ctl *ctl, 738 struct btrfs_path *path, u64 offset) 739 { 740 struct btrfs_fs_info *fs_info = root->fs_info; 741 struct btrfs_free_space_header *header; 742 struct extent_buffer *leaf; 743 struct btrfs_io_ctl io_ctl; 744 struct btrfs_key key; 745 struct btrfs_free_space *e, *n; 746 LIST_HEAD(bitmaps); 747 u64 num_entries; 748 u64 num_bitmaps; 749 u64 generation; 750 u8 type; 751 int ret = 0; 752 753 /* Nothing in the space cache, goodbye */ 754 if (!i_size_read(inode)) 755 return 0; 756 757 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 758 key.offset = offset; 759 key.type = 0; 760 761 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 762 if (ret < 0) 763 return 0; 764 else if (ret > 0) { 765 btrfs_release_path(path); 766 return 0; 767 } 768 769 ret = -1; 770 771 leaf = path->nodes[0]; 772 header = btrfs_item_ptr(leaf, path->slots[0], 773 struct btrfs_free_space_header); 774 num_entries = btrfs_free_space_entries(leaf, header); 775 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 776 generation = btrfs_free_space_generation(leaf, header); 777 btrfs_release_path(path); 778 779 if (!BTRFS_I(inode)->generation) { 780 btrfs_info(fs_info, 781 "the free space cache file (%llu) is invalid, skip it", 782 offset); 783 return 0; 784 } 785 786 if (BTRFS_I(inode)->generation != generation) { 787 btrfs_err(fs_info, 788 "free space inode generation (%llu) did not match free space cache generation (%llu)", 789 BTRFS_I(inode)->generation, generation); 790 return 0; 791 } 792 793 if (!num_entries) 794 return 0; 795 796 ret = io_ctl_init(&io_ctl, inode, 0); 797 if (ret) 798 return ret; 799 800 readahead_cache(inode); 801 802 ret = io_ctl_prepare_pages(&io_ctl, true); 803 if (ret) 804 goto out; 805 806 ret = io_ctl_check_crc(&io_ctl, 0); 807 if (ret) 808 goto free_cache; 809 810 ret = io_ctl_check_generation(&io_ctl, generation); 811 if (ret) 812 goto free_cache; 813 814 while (num_entries) { 815 e = kmem_cache_zalloc(btrfs_free_space_cachep, 816 GFP_NOFS); 817 if (!e) { 818 ret = -ENOMEM; 819 goto free_cache; 820 } 821 822 ret = io_ctl_read_entry(&io_ctl, e, &type); 823 if (ret) { 824 kmem_cache_free(btrfs_free_space_cachep, e); 825 goto free_cache; 826 } 827 828 if (!e->bytes) { 829 ret = -1; 830 kmem_cache_free(btrfs_free_space_cachep, e); 831 goto free_cache; 832 } 833 834 if (type == BTRFS_FREE_SPACE_EXTENT) { 835 spin_lock(&ctl->tree_lock); 836 ret = link_free_space(ctl, e); 837 spin_unlock(&ctl->tree_lock); 838 if (ret) { 839 btrfs_err(fs_info, 840 "Duplicate entries in free space cache, dumping"); 841 kmem_cache_free(btrfs_free_space_cachep, e); 842 goto free_cache; 843 } 844 } else { 845 ASSERT(num_bitmaps); 846 num_bitmaps--; 847 e->bitmap = kmem_cache_zalloc( 848 btrfs_free_space_bitmap_cachep, GFP_NOFS); 849 if (!e->bitmap) { 850 ret = -ENOMEM; 851 kmem_cache_free( 852 btrfs_free_space_cachep, e); 853 goto free_cache; 854 } 855 spin_lock(&ctl->tree_lock); 856 ret = link_free_space(ctl, e); 857 if (ret) { 858 spin_unlock(&ctl->tree_lock); 859 btrfs_err(fs_info, 860 "Duplicate entries in free space cache, dumping"); 861 kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap); 862 kmem_cache_free(btrfs_free_space_cachep, e); 863 goto free_cache; 864 } 865 ctl->total_bitmaps++; 866 recalculate_thresholds(ctl); 867 spin_unlock(&ctl->tree_lock); 868 list_add_tail(&e->list, &bitmaps); 869 } 870 871 num_entries--; 872 } 873 874 io_ctl_unmap_page(&io_ctl); 875 876 /* 877 * We add the bitmaps at the end of the entries in order that 878 * the bitmap entries are added to the cache. 879 */ 880 list_for_each_entry_safe(e, n, &bitmaps, list) { 881 list_del_init(&e->list); 882 ret = io_ctl_read_bitmap(&io_ctl, e); 883 if (ret) 884 goto free_cache; 885 } 886 887 io_ctl_drop_pages(&io_ctl); 888 ret = 1; 889 out: 890 io_ctl_free(&io_ctl); 891 return ret; 892 free_cache: 893 io_ctl_drop_pages(&io_ctl); 894 895 spin_lock(&ctl->tree_lock); 896 __btrfs_remove_free_space_cache(ctl); 897 spin_unlock(&ctl->tree_lock); 898 goto out; 899 } 900 901 static int copy_free_space_cache(struct btrfs_block_group *block_group, 902 struct btrfs_free_space_ctl *ctl) 903 { 904 struct btrfs_free_space *info; 905 struct rb_node *n; 906 int ret = 0; 907 908 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { 909 info = rb_entry(n, struct btrfs_free_space, offset_index); 910 if (!info->bitmap) { 911 const u64 offset = info->offset; 912 const u64 bytes = info->bytes; 913 914 unlink_free_space(ctl, info, true); 915 spin_unlock(&ctl->tree_lock); 916 kmem_cache_free(btrfs_free_space_cachep, info); 917 ret = btrfs_add_free_space(block_group, offset, bytes); 918 spin_lock(&ctl->tree_lock); 919 } else { 920 u64 offset = info->offset; 921 u64 bytes = ctl->unit; 922 923 ret = search_bitmap(ctl, info, &offset, &bytes, false); 924 if (ret == 0) { 925 bitmap_clear_bits(ctl, info, offset, bytes, true); 926 spin_unlock(&ctl->tree_lock); 927 ret = btrfs_add_free_space(block_group, offset, 928 bytes); 929 spin_lock(&ctl->tree_lock); 930 } else { 931 free_bitmap(ctl, info); 932 ret = 0; 933 } 934 } 935 cond_resched_lock(&ctl->tree_lock); 936 } 937 return ret; 938 } 939 940 static struct lock_class_key btrfs_free_space_inode_key; 941 942 int load_free_space_cache(struct btrfs_block_group *block_group) 943 { 944 struct btrfs_fs_info *fs_info = block_group->fs_info; 945 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 946 struct btrfs_free_space_ctl tmp_ctl = {}; 947 struct inode *inode; 948 struct btrfs_path *path; 949 int ret = 0; 950 bool matched; 951 u64 used = block_group->used; 952 953 /* 954 * Because we could potentially discard our loaded free space, we want 955 * to load everything into a temporary structure first, and then if it's 956 * valid copy it all into the actual free space ctl. 957 */ 958 btrfs_init_free_space_ctl(block_group, &tmp_ctl); 959 960 /* 961 * If this block group has been marked to be cleared for one reason or 962 * another then we can't trust the on disk cache, so just return. 963 */ 964 spin_lock(&block_group->lock); 965 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 966 spin_unlock(&block_group->lock); 967 return 0; 968 } 969 spin_unlock(&block_group->lock); 970 971 path = btrfs_alloc_path(); 972 if (!path) 973 return 0; 974 path->search_commit_root = 1; 975 path->skip_locking = 1; 976 977 /* 978 * We must pass a path with search_commit_root set to btrfs_iget in 979 * order to avoid a deadlock when allocating extents for the tree root. 980 * 981 * When we are COWing an extent buffer from the tree root, when looking 982 * for a free extent, at extent-tree.c:find_free_extent(), we can find 983 * block group without its free space cache loaded. When we find one 984 * we must load its space cache which requires reading its free space 985 * cache's inode item from the root tree. If this inode item is located 986 * in the same leaf that we started COWing before, then we end up in 987 * deadlock on the extent buffer (trying to read lock it when we 988 * previously write locked it). 989 * 990 * It's safe to read the inode item using the commit root because 991 * block groups, once loaded, stay in memory forever (until they are 992 * removed) as well as their space caches once loaded. New block groups 993 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 994 * we will never try to read their inode item while the fs is mounted. 995 */ 996 inode = lookup_free_space_inode(block_group, path); 997 if (IS_ERR(inode)) { 998 btrfs_free_path(path); 999 return 0; 1000 } 1001 1002 /* We may have converted the inode and made the cache invalid. */ 1003 spin_lock(&block_group->lock); 1004 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 1005 spin_unlock(&block_group->lock); 1006 btrfs_free_path(path); 1007 goto out; 1008 } 1009 spin_unlock(&block_group->lock); 1010 1011 /* 1012 * Reinitialize the class of struct inode's mapping->invalidate_lock for 1013 * free space inodes to prevent false positives related to locks for normal 1014 * inodes. 1015 */ 1016 lockdep_set_class(&(&inode->i_data)->invalidate_lock, 1017 &btrfs_free_space_inode_key); 1018 1019 ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, 1020 path, block_group->start); 1021 btrfs_free_path(path); 1022 if (ret <= 0) 1023 goto out; 1024 1025 matched = (tmp_ctl.free_space == (block_group->length - used - 1026 block_group->bytes_super)); 1027 1028 if (matched) { 1029 spin_lock(&tmp_ctl.tree_lock); 1030 ret = copy_free_space_cache(block_group, &tmp_ctl); 1031 spin_unlock(&tmp_ctl.tree_lock); 1032 /* 1033 * ret == 1 means we successfully loaded the free space cache, 1034 * so we need to re-set it here. 1035 */ 1036 if (ret == 0) 1037 ret = 1; 1038 } else { 1039 /* 1040 * We need to call the _locked variant so we don't try to update 1041 * the discard counters. 1042 */ 1043 spin_lock(&tmp_ctl.tree_lock); 1044 __btrfs_remove_free_space_cache(&tmp_ctl); 1045 spin_unlock(&tmp_ctl.tree_lock); 1046 btrfs_warn(fs_info, 1047 "block group %llu has wrong amount of free space", 1048 block_group->start); 1049 ret = -1; 1050 } 1051 out: 1052 if (ret < 0) { 1053 /* This cache is bogus, make sure it gets cleared */ 1054 spin_lock(&block_group->lock); 1055 block_group->disk_cache_state = BTRFS_DC_CLEAR; 1056 spin_unlock(&block_group->lock); 1057 ret = 0; 1058 1059 btrfs_warn(fs_info, 1060 "failed to load free space cache for block group %llu, rebuilding it now", 1061 block_group->start); 1062 } 1063 1064 spin_lock(&ctl->tree_lock); 1065 btrfs_discard_update_discardable(block_group); 1066 spin_unlock(&ctl->tree_lock); 1067 iput(inode); 1068 return ret; 1069 } 1070 1071 static noinline_for_stack 1072 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, 1073 struct btrfs_free_space_ctl *ctl, 1074 struct btrfs_block_group *block_group, 1075 int *entries, int *bitmaps, 1076 struct list_head *bitmap_list) 1077 { 1078 int ret; 1079 struct btrfs_free_cluster *cluster = NULL; 1080 struct btrfs_free_cluster *cluster_locked = NULL; 1081 struct rb_node *node = rb_first(&ctl->free_space_offset); 1082 struct btrfs_trim_range *trim_entry; 1083 1084 /* Get the cluster for this block_group if it exists */ 1085 if (block_group && !list_empty(&block_group->cluster_list)) { 1086 cluster = list_entry(block_group->cluster_list.next, 1087 struct btrfs_free_cluster, 1088 block_group_list); 1089 } 1090 1091 if (!node && cluster) { 1092 cluster_locked = cluster; 1093 spin_lock(&cluster_locked->lock); 1094 node = rb_first(&cluster->root); 1095 cluster = NULL; 1096 } 1097 1098 /* Write out the extent entries */ 1099 while (node) { 1100 struct btrfs_free_space *e; 1101 1102 e = rb_entry(node, struct btrfs_free_space, offset_index); 1103 *entries += 1; 1104 1105 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 1106 e->bitmap); 1107 if (ret) 1108 goto fail; 1109 1110 if (e->bitmap) { 1111 list_add_tail(&e->list, bitmap_list); 1112 *bitmaps += 1; 1113 } 1114 node = rb_next(node); 1115 if (!node && cluster) { 1116 node = rb_first(&cluster->root); 1117 cluster_locked = cluster; 1118 spin_lock(&cluster_locked->lock); 1119 cluster = NULL; 1120 } 1121 } 1122 if (cluster_locked) { 1123 spin_unlock(&cluster_locked->lock); 1124 cluster_locked = NULL; 1125 } 1126 1127 /* 1128 * Make sure we don't miss any range that was removed from our rbtree 1129 * because trimming is running. Otherwise after a umount+mount (or crash 1130 * after committing the transaction) we would leak free space and get 1131 * an inconsistent free space cache report from fsck. 1132 */ 1133 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 1134 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 1135 trim_entry->bytes, NULL); 1136 if (ret) 1137 goto fail; 1138 *entries += 1; 1139 } 1140 1141 return 0; 1142 fail: 1143 if (cluster_locked) 1144 spin_unlock(&cluster_locked->lock); 1145 return -ENOSPC; 1146 } 1147 1148 static noinline_for_stack int 1149 update_cache_item(struct btrfs_trans_handle *trans, 1150 struct btrfs_root *root, 1151 struct inode *inode, 1152 struct btrfs_path *path, u64 offset, 1153 int entries, int bitmaps) 1154 { 1155 struct btrfs_key key; 1156 struct btrfs_free_space_header *header; 1157 struct extent_buffer *leaf; 1158 int ret; 1159 1160 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1161 key.offset = offset; 1162 key.type = 0; 1163 1164 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1165 if (ret < 0) { 1166 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1167 EXTENT_DELALLOC, NULL); 1168 goto fail; 1169 } 1170 leaf = path->nodes[0]; 1171 if (ret > 0) { 1172 struct btrfs_key found_key; 1173 ASSERT(path->slots[0]); 1174 path->slots[0]--; 1175 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1176 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1177 found_key.offset != offset) { 1178 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 1179 inode->i_size - 1, EXTENT_DELALLOC, 1180 NULL); 1181 btrfs_release_path(path); 1182 goto fail; 1183 } 1184 } 1185 1186 BTRFS_I(inode)->generation = trans->transid; 1187 header = btrfs_item_ptr(leaf, path->slots[0], 1188 struct btrfs_free_space_header); 1189 btrfs_set_free_space_entries(leaf, header, entries); 1190 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1191 btrfs_set_free_space_generation(leaf, header, trans->transid); 1192 btrfs_mark_buffer_dirty(trans, leaf); 1193 btrfs_release_path(path); 1194 1195 return 0; 1196 1197 fail: 1198 return -1; 1199 } 1200 1201 static noinline_for_stack int write_pinned_extent_entries( 1202 struct btrfs_trans_handle *trans, 1203 struct btrfs_block_group *block_group, 1204 struct btrfs_io_ctl *io_ctl, 1205 int *entries) 1206 { 1207 u64 start, extent_start, extent_end, len; 1208 struct extent_io_tree *unpin = NULL; 1209 int ret; 1210 1211 if (!block_group) 1212 return 0; 1213 1214 /* 1215 * We want to add any pinned extents to our free space cache 1216 * so we don't leak the space 1217 * 1218 * We shouldn't have switched the pinned extents yet so this is the 1219 * right one 1220 */ 1221 unpin = &trans->transaction->pinned_extents; 1222 1223 start = block_group->start; 1224 1225 while (start < block_group->start + block_group->length) { 1226 if (!find_first_extent_bit(unpin, start, 1227 &extent_start, &extent_end, 1228 EXTENT_DIRTY, NULL)) 1229 return 0; 1230 1231 /* This pinned extent is out of our range */ 1232 if (extent_start >= block_group->start + block_group->length) 1233 return 0; 1234 1235 extent_start = max(extent_start, start); 1236 extent_end = min(block_group->start + block_group->length, 1237 extent_end + 1); 1238 len = extent_end - extent_start; 1239 1240 *entries += 1; 1241 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1242 if (ret) 1243 return -ENOSPC; 1244 1245 start = extent_end; 1246 } 1247 1248 return 0; 1249 } 1250 1251 static noinline_for_stack int 1252 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) 1253 { 1254 struct btrfs_free_space *entry, *next; 1255 int ret; 1256 1257 /* Write out the bitmaps */ 1258 list_for_each_entry_safe(entry, next, bitmap_list, list) { 1259 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1260 if (ret) 1261 return -ENOSPC; 1262 list_del_init(&entry->list); 1263 } 1264 1265 return 0; 1266 } 1267 1268 static int flush_dirty_cache(struct inode *inode) 1269 { 1270 int ret; 1271 1272 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); 1273 if (ret) 1274 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1275 EXTENT_DELALLOC, NULL); 1276 1277 return ret; 1278 } 1279 1280 static void noinline_for_stack 1281 cleanup_bitmap_list(struct list_head *bitmap_list) 1282 { 1283 struct btrfs_free_space *entry, *next; 1284 1285 list_for_each_entry_safe(entry, next, bitmap_list, list) 1286 list_del_init(&entry->list); 1287 } 1288 1289 static void noinline_for_stack 1290 cleanup_write_cache_enospc(struct inode *inode, 1291 struct btrfs_io_ctl *io_ctl, 1292 struct extent_state **cached_state) 1293 { 1294 io_ctl_drop_pages(io_ctl); 1295 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1296 cached_state); 1297 } 1298 1299 static int __btrfs_wait_cache_io(struct btrfs_root *root, 1300 struct btrfs_trans_handle *trans, 1301 struct btrfs_block_group *block_group, 1302 struct btrfs_io_ctl *io_ctl, 1303 struct btrfs_path *path, u64 offset) 1304 { 1305 int ret; 1306 struct inode *inode = io_ctl->inode; 1307 1308 if (!inode) 1309 return 0; 1310 1311 /* Flush the dirty pages in the cache file. */ 1312 ret = flush_dirty_cache(inode); 1313 if (ret) 1314 goto out; 1315 1316 /* Update the cache item to tell everyone this cache file is valid. */ 1317 ret = update_cache_item(trans, root, inode, path, offset, 1318 io_ctl->entries, io_ctl->bitmaps); 1319 out: 1320 if (ret) { 1321 invalidate_inode_pages2(inode->i_mapping); 1322 BTRFS_I(inode)->generation = 0; 1323 if (block_group) 1324 btrfs_debug(root->fs_info, 1325 "failed to write free space cache for block group %llu error %d", 1326 block_group->start, ret); 1327 } 1328 btrfs_update_inode(trans, BTRFS_I(inode)); 1329 1330 if (block_group) { 1331 /* the dirty list is protected by the dirty_bgs_lock */ 1332 spin_lock(&trans->transaction->dirty_bgs_lock); 1333 1334 /* the disk_cache_state is protected by the block group lock */ 1335 spin_lock(&block_group->lock); 1336 1337 /* 1338 * only mark this as written if we didn't get put back on 1339 * the dirty list while waiting for IO. Otherwise our 1340 * cache state won't be right, and we won't get written again 1341 */ 1342 if (!ret && list_empty(&block_group->dirty_list)) 1343 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1344 else if (ret) 1345 block_group->disk_cache_state = BTRFS_DC_ERROR; 1346 1347 spin_unlock(&block_group->lock); 1348 spin_unlock(&trans->transaction->dirty_bgs_lock); 1349 io_ctl->inode = NULL; 1350 iput(inode); 1351 } 1352 1353 return ret; 1354 1355 } 1356 1357 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 1358 struct btrfs_block_group *block_group, 1359 struct btrfs_path *path) 1360 { 1361 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, 1362 block_group, &block_group->io_ctl, 1363 path, block_group->start); 1364 } 1365 1366 /* 1367 * Write out cached info to an inode. 1368 * 1369 * @inode: freespace inode we are writing out 1370 * @ctl: free space cache we are going to write out 1371 * @block_group: block_group for this cache if it belongs to a block_group 1372 * @io_ctl: holds context for the io 1373 * @trans: the trans handle 1374 * 1375 * This function writes out a free space cache struct to disk for quick recovery 1376 * on mount. This will return 0 if it was successful in writing the cache out, 1377 * or an errno if it was not. 1378 */ 1379 static int __btrfs_write_out_cache(struct inode *inode, 1380 struct btrfs_free_space_ctl *ctl, 1381 struct btrfs_block_group *block_group, 1382 struct btrfs_io_ctl *io_ctl, 1383 struct btrfs_trans_handle *trans) 1384 { 1385 struct extent_state *cached_state = NULL; 1386 LIST_HEAD(bitmap_list); 1387 int entries = 0; 1388 int bitmaps = 0; 1389 int ret; 1390 int must_iput = 0; 1391 int i_size; 1392 1393 if (!i_size_read(inode)) 1394 return -EIO; 1395 1396 WARN_ON(io_ctl->pages); 1397 ret = io_ctl_init(io_ctl, inode, 1); 1398 if (ret) 1399 return ret; 1400 1401 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { 1402 down_write(&block_group->data_rwsem); 1403 spin_lock(&block_group->lock); 1404 if (block_group->delalloc_bytes) { 1405 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1406 spin_unlock(&block_group->lock); 1407 up_write(&block_group->data_rwsem); 1408 BTRFS_I(inode)->generation = 0; 1409 ret = 0; 1410 must_iput = 1; 1411 goto out; 1412 } 1413 spin_unlock(&block_group->lock); 1414 } 1415 1416 /* Lock all pages first so we can lock the extent safely. */ 1417 ret = io_ctl_prepare_pages(io_ctl, false); 1418 if (ret) 1419 goto out_unlock; 1420 1421 lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1422 &cached_state); 1423 1424 io_ctl_set_generation(io_ctl, trans->transid); 1425 1426 mutex_lock(&ctl->cache_writeout_mutex); 1427 /* Write out the extent entries in the free space cache */ 1428 spin_lock(&ctl->tree_lock); 1429 ret = write_cache_extent_entries(io_ctl, ctl, 1430 block_group, &entries, &bitmaps, 1431 &bitmap_list); 1432 if (ret) 1433 goto out_nospc_locked; 1434 1435 /* 1436 * Some spaces that are freed in the current transaction are pinned, 1437 * they will be added into free space cache after the transaction is 1438 * committed, we shouldn't lose them. 1439 * 1440 * If this changes while we are working we'll get added back to 1441 * the dirty list and redo it. No locking needed 1442 */ 1443 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); 1444 if (ret) 1445 goto out_nospc_locked; 1446 1447 /* 1448 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1449 * locked while doing it because a concurrent trim can be manipulating 1450 * or freeing the bitmap. 1451 */ 1452 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1453 spin_unlock(&ctl->tree_lock); 1454 mutex_unlock(&ctl->cache_writeout_mutex); 1455 if (ret) 1456 goto out_nospc; 1457 1458 /* Zero out the rest of the pages just to make sure */ 1459 io_ctl_zero_remaining_pages(io_ctl); 1460 1461 /* Everything is written out, now we dirty the pages in the file. */ 1462 i_size = i_size_read(inode); 1463 for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { 1464 u64 dirty_start = i * PAGE_SIZE; 1465 u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; 1466 1467 ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), 1468 dirty_start, dirty_len, &cached_state, false); 1469 if (ret < 0) 1470 goto out_nospc; 1471 } 1472 1473 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1474 up_write(&block_group->data_rwsem); 1475 /* 1476 * Release the pages and unlock the extent, we will flush 1477 * them out later 1478 */ 1479 io_ctl_drop_pages(io_ctl); 1480 io_ctl_free(io_ctl); 1481 1482 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1483 &cached_state); 1484 1485 /* 1486 * at this point the pages are under IO and we're happy, 1487 * The caller is responsible for waiting on them and updating 1488 * the cache and the inode 1489 */ 1490 io_ctl->entries = entries; 1491 io_ctl->bitmaps = bitmaps; 1492 1493 ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1); 1494 if (ret) 1495 goto out; 1496 1497 return 0; 1498 1499 out_nospc_locked: 1500 cleanup_bitmap_list(&bitmap_list); 1501 spin_unlock(&ctl->tree_lock); 1502 mutex_unlock(&ctl->cache_writeout_mutex); 1503 1504 out_nospc: 1505 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1506 1507 out_unlock: 1508 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1509 up_write(&block_group->data_rwsem); 1510 1511 out: 1512 io_ctl->inode = NULL; 1513 io_ctl_free(io_ctl); 1514 if (ret) { 1515 invalidate_inode_pages2(inode->i_mapping); 1516 BTRFS_I(inode)->generation = 0; 1517 } 1518 btrfs_update_inode(trans, BTRFS_I(inode)); 1519 if (must_iput) 1520 iput(inode); 1521 return ret; 1522 } 1523 1524 int btrfs_write_out_cache(struct btrfs_trans_handle *trans, 1525 struct btrfs_block_group *block_group, 1526 struct btrfs_path *path) 1527 { 1528 struct btrfs_fs_info *fs_info = trans->fs_info; 1529 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1530 struct inode *inode; 1531 int ret = 0; 1532 1533 spin_lock(&block_group->lock); 1534 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1535 spin_unlock(&block_group->lock); 1536 return 0; 1537 } 1538 spin_unlock(&block_group->lock); 1539 1540 inode = lookup_free_space_inode(block_group, path); 1541 if (IS_ERR(inode)) 1542 return 0; 1543 1544 ret = __btrfs_write_out_cache(inode, ctl, block_group, 1545 &block_group->io_ctl, trans); 1546 if (ret) { 1547 btrfs_debug(fs_info, 1548 "failed to write free space cache for block group %llu error %d", 1549 block_group->start, ret); 1550 spin_lock(&block_group->lock); 1551 block_group->disk_cache_state = BTRFS_DC_ERROR; 1552 spin_unlock(&block_group->lock); 1553 1554 block_group->io_ctl.inode = NULL; 1555 iput(inode); 1556 } 1557 1558 /* 1559 * if ret == 0 the caller is expected to call btrfs_wait_cache_io 1560 * to wait for IO and put the inode 1561 */ 1562 1563 return ret; 1564 } 1565 1566 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1567 u64 offset) 1568 { 1569 ASSERT(offset >= bitmap_start); 1570 offset -= bitmap_start; 1571 return (unsigned long)(div_u64(offset, unit)); 1572 } 1573 1574 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1575 { 1576 return (unsigned long)(div_u64(bytes, unit)); 1577 } 1578 1579 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1580 u64 offset) 1581 { 1582 u64 bitmap_start; 1583 u64 bytes_per_bitmap; 1584 1585 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1586 bitmap_start = offset - ctl->start; 1587 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1588 bitmap_start *= bytes_per_bitmap; 1589 bitmap_start += ctl->start; 1590 1591 return bitmap_start; 1592 } 1593 1594 static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, 1595 struct btrfs_free_cluster *cluster, 1596 struct btrfs_free_space *new_entry) 1597 { 1598 struct rb_root *root; 1599 struct rb_node **p; 1600 struct rb_node *parent = NULL; 1601 1602 lockdep_assert_held(&ctl->tree_lock); 1603 1604 if (cluster) { 1605 lockdep_assert_held(&cluster->lock); 1606 root = &cluster->root; 1607 } else { 1608 root = &ctl->free_space_offset; 1609 } 1610 1611 p = &root->rb_node; 1612 1613 while (*p) { 1614 struct btrfs_free_space *info; 1615 1616 parent = *p; 1617 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1618 1619 if (new_entry->offset < info->offset) { 1620 p = &(*p)->rb_left; 1621 } else if (new_entry->offset > info->offset) { 1622 p = &(*p)->rb_right; 1623 } else { 1624 /* 1625 * we could have a bitmap entry and an extent entry 1626 * share the same offset. If this is the case, we want 1627 * the extent entry to always be found first if we do a 1628 * linear search through the tree, since we want to have 1629 * the quickest allocation time, and allocating from an 1630 * extent is faster than allocating from a bitmap. So 1631 * if we're inserting a bitmap and we find an entry at 1632 * this offset, we want to go right, or after this entry 1633 * logically. If we are inserting an extent and we've 1634 * found a bitmap, we want to go left, or before 1635 * logically. 1636 */ 1637 if (new_entry->bitmap) { 1638 if (info->bitmap) { 1639 WARN_ON_ONCE(1); 1640 return -EEXIST; 1641 } 1642 p = &(*p)->rb_right; 1643 } else { 1644 if (!info->bitmap) { 1645 WARN_ON_ONCE(1); 1646 return -EEXIST; 1647 } 1648 p = &(*p)->rb_left; 1649 } 1650 } 1651 } 1652 1653 rb_link_node(&new_entry->offset_index, parent, p); 1654 rb_insert_color(&new_entry->offset_index, root); 1655 1656 return 0; 1657 } 1658 1659 /* 1660 * This is a little subtle. We *only* have ->max_extent_size set if we actually 1661 * searched through the bitmap and figured out the largest ->max_extent_size, 1662 * otherwise it's 0. In the case that it's 0 we don't want to tell the 1663 * allocator the wrong thing, we want to use the actual real max_extent_size 1664 * we've found already if it's larger, or we want to use ->bytes. 1665 * 1666 * This matters because find_free_space() will skip entries who's ->bytes is 1667 * less than the required bytes. So if we didn't search down this bitmap, we 1668 * may pick some previous entry that has a smaller ->max_extent_size than we 1669 * have. For example, assume we have two entries, one that has 1670 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set 1671 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will 1672 * call into find_free_space(), and return with max_extent_size == 4K, because 1673 * that first bitmap entry had ->max_extent_size set, but the second one did 1674 * not. If instead we returned 8K we'd come in searching for 8K, and find the 1675 * 8K contiguous range. 1676 * 1677 * Consider the other case, we have 2 8K chunks in that second entry and still 1678 * don't have ->max_extent_size set. We'll return 16K, and the next time the 1679 * allocator comes in it'll fully search our second bitmap, and this time it'll 1680 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the 1681 * right allocation the next loop through. 1682 */ 1683 static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) 1684 { 1685 if (entry->bitmap && entry->max_extent_size) 1686 return entry->max_extent_size; 1687 return entry->bytes; 1688 } 1689 1690 /* 1691 * We want the largest entry to be leftmost, so this is inverted from what you'd 1692 * normally expect. 1693 */ 1694 static bool entry_less(struct rb_node *node, const struct rb_node *parent) 1695 { 1696 const struct btrfs_free_space *entry, *exist; 1697 1698 entry = rb_entry(node, struct btrfs_free_space, bytes_index); 1699 exist = rb_entry(parent, struct btrfs_free_space, bytes_index); 1700 return get_max_extent_size(exist) < get_max_extent_size(entry); 1701 } 1702 1703 /* 1704 * searches the tree for the given offset. 1705 * 1706 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1707 * want a section that has at least bytes size and comes at or after the given 1708 * offset. 1709 */ 1710 static struct btrfs_free_space * 1711 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1712 u64 offset, int bitmap_only, int fuzzy) 1713 { 1714 struct rb_node *n = ctl->free_space_offset.rb_node; 1715 struct btrfs_free_space *entry = NULL, *prev = NULL; 1716 1717 lockdep_assert_held(&ctl->tree_lock); 1718 1719 /* find entry that is closest to the 'offset' */ 1720 while (n) { 1721 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1722 prev = entry; 1723 1724 if (offset < entry->offset) 1725 n = n->rb_left; 1726 else if (offset > entry->offset) 1727 n = n->rb_right; 1728 else 1729 break; 1730 1731 entry = NULL; 1732 } 1733 1734 if (bitmap_only) { 1735 if (!entry) 1736 return NULL; 1737 if (entry->bitmap) 1738 return entry; 1739 1740 /* 1741 * bitmap entry and extent entry may share same offset, 1742 * in that case, bitmap entry comes after extent entry. 1743 */ 1744 n = rb_next(n); 1745 if (!n) 1746 return NULL; 1747 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1748 if (entry->offset != offset) 1749 return NULL; 1750 1751 WARN_ON(!entry->bitmap); 1752 return entry; 1753 } else if (entry) { 1754 if (entry->bitmap) { 1755 /* 1756 * if previous extent entry covers the offset, 1757 * we should return it instead of the bitmap entry 1758 */ 1759 n = rb_prev(&entry->offset_index); 1760 if (n) { 1761 prev = rb_entry(n, struct btrfs_free_space, 1762 offset_index); 1763 if (!prev->bitmap && 1764 prev->offset + prev->bytes > offset) 1765 entry = prev; 1766 } 1767 } 1768 return entry; 1769 } 1770 1771 if (!prev) 1772 return NULL; 1773 1774 /* find last entry before the 'offset' */ 1775 entry = prev; 1776 if (entry->offset > offset) { 1777 n = rb_prev(&entry->offset_index); 1778 if (n) { 1779 entry = rb_entry(n, struct btrfs_free_space, 1780 offset_index); 1781 ASSERT(entry->offset <= offset); 1782 } else { 1783 if (fuzzy) 1784 return entry; 1785 else 1786 return NULL; 1787 } 1788 } 1789 1790 if (entry->bitmap) { 1791 n = rb_prev(&entry->offset_index); 1792 if (n) { 1793 prev = rb_entry(n, struct btrfs_free_space, 1794 offset_index); 1795 if (!prev->bitmap && 1796 prev->offset + prev->bytes > offset) 1797 return prev; 1798 } 1799 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1800 return entry; 1801 } else if (entry->offset + entry->bytes > offset) 1802 return entry; 1803 1804 if (!fuzzy) 1805 return NULL; 1806 1807 while (1) { 1808 n = rb_next(&entry->offset_index); 1809 if (!n) 1810 return NULL; 1811 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1812 if (entry->bitmap) { 1813 if (entry->offset + BITS_PER_BITMAP * 1814 ctl->unit > offset) 1815 break; 1816 } else { 1817 if (entry->offset + entry->bytes > offset) 1818 break; 1819 } 1820 } 1821 return entry; 1822 } 1823 1824 static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1825 struct btrfs_free_space *info, 1826 bool update_stat) 1827 { 1828 lockdep_assert_held(&ctl->tree_lock); 1829 1830 rb_erase(&info->offset_index, &ctl->free_space_offset); 1831 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1832 ctl->free_extents--; 1833 1834 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1835 ctl->discardable_extents[BTRFS_STAT_CURR]--; 1836 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; 1837 } 1838 1839 if (update_stat) 1840 ctl->free_space -= info->bytes; 1841 } 1842 1843 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1844 struct btrfs_free_space *info) 1845 { 1846 int ret = 0; 1847 1848 lockdep_assert_held(&ctl->tree_lock); 1849 1850 ASSERT(info->bytes || info->bitmap); 1851 ret = tree_insert_offset(ctl, NULL, info); 1852 if (ret) 1853 return ret; 1854 1855 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1856 1857 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1858 ctl->discardable_extents[BTRFS_STAT_CURR]++; 1859 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 1860 } 1861 1862 ctl->free_space += info->bytes; 1863 ctl->free_extents++; 1864 return ret; 1865 } 1866 1867 static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, 1868 struct btrfs_free_space *info) 1869 { 1870 ASSERT(info->bitmap); 1871 1872 /* 1873 * If our entry is empty it's because we're on a cluster and we don't 1874 * want to re-link it into our ctl bytes index. 1875 */ 1876 if (RB_EMPTY_NODE(&info->bytes_index)) 1877 return; 1878 1879 lockdep_assert_held(&ctl->tree_lock); 1880 1881 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1882 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1883 } 1884 1885 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1886 struct btrfs_free_space *info, 1887 u64 offset, u64 bytes, bool update_stat) 1888 { 1889 unsigned long start, count, end; 1890 int extent_delta = -1; 1891 1892 start = offset_to_bit(info->offset, ctl->unit, offset); 1893 count = bytes_to_bits(bytes, ctl->unit); 1894 end = start + count; 1895 ASSERT(end <= BITS_PER_BITMAP); 1896 1897 bitmap_clear(info->bitmap, start, count); 1898 1899 info->bytes -= bytes; 1900 if (info->max_extent_size > ctl->unit) 1901 info->max_extent_size = 0; 1902 1903 relink_bitmap_entry(ctl, info); 1904 1905 if (start && test_bit(start - 1, info->bitmap)) 1906 extent_delta++; 1907 1908 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1909 extent_delta++; 1910 1911 info->bitmap_extents += extent_delta; 1912 if (!btrfs_free_space_trimmed(info)) { 1913 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1914 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 1915 } 1916 1917 if (update_stat) 1918 ctl->free_space -= bytes; 1919 } 1920 1921 static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1922 struct btrfs_free_space *info, u64 offset, 1923 u64 bytes) 1924 { 1925 unsigned long start, count, end; 1926 int extent_delta = 1; 1927 1928 start = offset_to_bit(info->offset, ctl->unit, offset); 1929 count = bytes_to_bits(bytes, ctl->unit); 1930 end = start + count; 1931 ASSERT(end <= BITS_PER_BITMAP); 1932 1933 bitmap_set(info->bitmap, start, count); 1934 1935 /* 1936 * We set some bytes, we have no idea what the max extent size is 1937 * anymore. 1938 */ 1939 info->max_extent_size = 0; 1940 info->bytes += bytes; 1941 ctl->free_space += bytes; 1942 1943 relink_bitmap_entry(ctl, info); 1944 1945 if (start && test_bit(start - 1, info->bitmap)) 1946 extent_delta--; 1947 1948 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1949 extent_delta--; 1950 1951 info->bitmap_extents += extent_delta; 1952 if (!btrfs_free_space_trimmed(info)) { 1953 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1954 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; 1955 } 1956 } 1957 1958 /* 1959 * If we can not find suitable extent, we will use bytes to record 1960 * the size of the max extent. 1961 */ 1962 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1963 struct btrfs_free_space *bitmap_info, u64 *offset, 1964 u64 *bytes, bool for_alloc) 1965 { 1966 unsigned long found_bits = 0; 1967 unsigned long max_bits = 0; 1968 unsigned long bits, i; 1969 unsigned long next_zero; 1970 unsigned long extent_bits; 1971 1972 /* 1973 * Skip searching the bitmap if we don't have a contiguous section that 1974 * is large enough for this allocation. 1975 */ 1976 if (for_alloc && 1977 bitmap_info->max_extent_size && 1978 bitmap_info->max_extent_size < *bytes) { 1979 *bytes = bitmap_info->max_extent_size; 1980 return -1; 1981 } 1982 1983 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1984 max_t(u64, *offset, bitmap_info->offset)); 1985 bits = bytes_to_bits(*bytes, ctl->unit); 1986 1987 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1988 if (for_alloc && bits == 1) { 1989 found_bits = 1; 1990 break; 1991 } 1992 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1993 BITS_PER_BITMAP, i); 1994 extent_bits = next_zero - i; 1995 if (extent_bits >= bits) { 1996 found_bits = extent_bits; 1997 break; 1998 } else if (extent_bits > max_bits) { 1999 max_bits = extent_bits; 2000 } 2001 i = next_zero; 2002 } 2003 2004 if (found_bits) { 2005 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 2006 *bytes = (u64)(found_bits) * ctl->unit; 2007 return 0; 2008 } 2009 2010 *bytes = (u64)(max_bits) * ctl->unit; 2011 bitmap_info->max_extent_size = *bytes; 2012 relink_bitmap_entry(ctl, bitmap_info); 2013 return -1; 2014 } 2015 2016 /* Cache the size of the max extent in bytes */ 2017 static struct btrfs_free_space * 2018 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2019 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2020 { 2021 struct btrfs_free_space *entry; 2022 struct rb_node *node; 2023 u64 tmp; 2024 u64 align_off; 2025 int ret; 2026 2027 if (!ctl->free_space_offset.rb_node) 2028 goto out; 2029 again: 2030 if (use_bytes_index) { 2031 node = rb_first_cached(&ctl->free_space_bytes); 2032 } else { 2033 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2034 0, 1); 2035 if (!entry) 2036 goto out; 2037 node = &entry->offset_index; 2038 } 2039 2040 for (; node; node = rb_next(node)) { 2041 if (use_bytes_index) 2042 entry = rb_entry(node, struct btrfs_free_space, 2043 bytes_index); 2044 else 2045 entry = rb_entry(node, struct btrfs_free_space, 2046 offset_index); 2047 2048 /* 2049 * If we are using the bytes index then all subsequent entries 2050 * in this tree are going to be < bytes, so simply set the max 2051 * extent size and exit the loop. 2052 * 2053 * If we're using the offset index then we need to keep going 2054 * through the rest of the tree. 2055 */ 2056 if (entry->bytes < *bytes) { 2057 *max_extent_size = max(get_max_extent_size(entry), 2058 *max_extent_size); 2059 if (use_bytes_index) 2060 break; 2061 continue; 2062 } 2063 2064 /* make sure the space returned is big enough 2065 * to match our requested alignment 2066 */ 2067 if (*bytes >= align) { 2068 tmp = entry->offset - ctl->start + align - 1; 2069 tmp = div64_u64(tmp, align); 2070 tmp = tmp * align + ctl->start; 2071 align_off = tmp - entry->offset; 2072 } else { 2073 align_off = 0; 2074 tmp = entry->offset; 2075 } 2076 2077 /* 2078 * We don't break here if we're using the bytes index because we 2079 * may have another entry that has the correct alignment that is 2080 * the right size, so we don't want to miss that possibility. 2081 * At worst this adds another loop through the logic, but if we 2082 * broke here we could prematurely ENOSPC. 2083 */ 2084 if (entry->bytes < *bytes + align_off) { 2085 *max_extent_size = max(get_max_extent_size(entry), 2086 *max_extent_size); 2087 continue; 2088 } 2089 2090 if (entry->bitmap) { 2091 struct rb_node *old_next = rb_next(node); 2092 u64 size = *bytes; 2093 2094 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2095 if (!ret) { 2096 *offset = tmp; 2097 *bytes = size; 2098 return entry; 2099 } else { 2100 *max_extent_size = 2101 max(get_max_extent_size(entry), 2102 *max_extent_size); 2103 } 2104 2105 /* 2106 * The bitmap may have gotten re-arranged in the space 2107 * index here because the max_extent_size may have been 2108 * updated. Start from the beginning again if this 2109 * happened. 2110 */ 2111 if (use_bytes_index && old_next != rb_next(node)) 2112 goto again; 2113 continue; 2114 } 2115 2116 *offset = tmp; 2117 *bytes = entry->bytes - align_off; 2118 return entry; 2119 } 2120 out: 2121 return NULL; 2122 } 2123 2124 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2125 struct btrfs_free_space *info, u64 offset) 2126 { 2127 info->offset = offset_to_bitmap(ctl, offset); 2128 info->bytes = 0; 2129 info->bitmap_extents = 0; 2130 INIT_LIST_HEAD(&info->list); 2131 link_free_space(ctl, info); 2132 ctl->total_bitmaps++; 2133 recalculate_thresholds(ctl); 2134 } 2135 2136 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2137 struct btrfs_free_space *bitmap_info) 2138 { 2139 /* 2140 * Normally when this is called, the bitmap is completely empty. However, 2141 * if we are blowing up the free space cache for one reason or another 2142 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2143 * we may leave stats on the table. 2144 */ 2145 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2146 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2147 bitmap_info->bitmap_extents; 2148 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2149 2150 } 2151 unlink_free_space(ctl, bitmap_info, true); 2152 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2153 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2154 ctl->total_bitmaps--; 2155 recalculate_thresholds(ctl); 2156 } 2157 2158 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2159 struct btrfs_free_space *bitmap_info, 2160 u64 *offset, u64 *bytes) 2161 { 2162 u64 end; 2163 u64 search_start, search_bytes; 2164 int ret; 2165 2166 again: 2167 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 2168 2169 /* 2170 * We need to search for bits in this bitmap. We could only cover some 2171 * of the extent in this bitmap thanks to how we add space, so we need 2172 * to search for as much as it as we can and clear that amount, and then 2173 * go searching for the next bit. 2174 */ 2175 search_start = *offset; 2176 search_bytes = ctl->unit; 2177 search_bytes = min(search_bytes, end - search_start + 1); 2178 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2179 false); 2180 if (ret < 0 || search_start != *offset) 2181 return -EINVAL; 2182 2183 /* We may have found more bits than what we need */ 2184 search_bytes = min(search_bytes, *bytes); 2185 2186 /* Cannot clear past the end of the bitmap */ 2187 search_bytes = min(search_bytes, end - search_start + 1); 2188 2189 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2190 *offset += search_bytes; 2191 *bytes -= search_bytes; 2192 2193 if (*bytes) { 2194 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2195 if (!bitmap_info->bytes) 2196 free_bitmap(ctl, bitmap_info); 2197 2198 /* 2199 * no entry after this bitmap, but we still have bytes to 2200 * remove, so something has gone wrong. 2201 */ 2202 if (!next) 2203 return -EINVAL; 2204 2205 bitmap_info = rb_entry(next, struct btrfs_free_space, 2206 offset_index); 2207 2208 /* 2209 * if the next entry isn't a bitmap we need to return to let the 2210 * extent stuff do its work. 2211 */ 2212 if (!bitmap_info->bitmap) 2213 return -EAGAIN; 2214 2215 /* 2216 * Ok the next item is a bitmap, but it may not actually hold 2217 * the information for the rest of this free space stuff, so 2218 * look for it, and if we don't find it return so we can try 2219 * everything over again. 2220 */ 2221 search_start = *offset; 2222 search_bytes = ctl->unit; 2223 ret = search_bitmap(ctl, bitmap_info, &search_start, 2224 &search_bytes, false); 2225 if (ret < 0 || search_start != *offset) 2226 return -EAGAIN; 2227 2228 goto again; 2229 } else if (!bitmap_info->bytes) 2230 free_bitmap(ctl, bitmap_info); 2231 2232 return 0; 2233 } 2234 2235 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2236 struct btrfs_free_space *info, u64 offset, 2237 u64 bytes, enum btrfs_trim_state trim_state) 2238 { 2239 u64 bytes_to_set = 0; 2240 u64 end; 2241 2242 /* 2243 * This is a tradeoff to make bitmap trim state minimal. We mark the 2244 * whole bitmap untrimmed if at any point we add untrimmed regions. 2245 */ 2246 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2247 if (btrfs_free_space_trimmed(info)) { 2248 ctl->discardable_extents[BTRFS_STAT_CURR] += 2249 info->bitmap_extents; 2250 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2251 } 2252 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2253 } 2254 2255 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2256 2257 bytes_to_set = min(end - offset, bytes); 2258 2259 btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set); 2260 2261 return bytes_to_set; 2262 2263 } 2264 2265 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2266 struct btrfs_free_space *info) 2267 { 2268 struct btrfs_block_group *block_group = ctl->block_group; 2269 struct btrfs_fs_info *fs_info = block_group->fs_info; 2270 bool forced = false; 2271 2272 #ifdef CONFIG_BTRFS_DEBUG 2273 if (btrfs_should_fragment_free_space(block_group)) 2274 forced = true; 2275 #endif 2276 2277 /* This is a way to reclaim large regions from the bitmaps. */ 2278 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) 2279 return false; 2280 2281 /* 2282 * If we are below the extents threshold then we can add this as an 2283 * extent, and don't have to deal with the bitmap 2284 */ 2285 if (!forced && ctl->free_extents < ctl->extents_thresh) { 2286 /* 2287 * If this block group has some small extents we don't want to 2288 * use up all of our free slots in the cache with them, we want 2289 * to reserve them to larger extents, however if we have plenty 2290 * of cache left then go ahead an dadd them, no sense in adding 2291 * the overhead of a bitmap if we don't have to. 2292 */ 2293 if (info->bytes <= fs_info->sectorsize * 8) { 2294 if (ctl->free_extents * 3 <= ctl->extents_thresh) 2295 return false; 2296 } else { 2297 return false; 2298 } 2299 } 2300 2301 /* 2302 * The original block groups from mkfs can be really small, like 8 2303 * megabytes, so don't bother with a bitmap for those entries. However 2304 * some block groups can be smaller than what a bitmap would cover but 2305 * are still large enough that they could overflow the 32k memory limit, 2306 * so allow those block groups to still be allowed to have a bitmap 2307 * entry. 2308 */ 2309 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) 2310 return false; 2311 2312 return true; 2313 } 2314 2315 static const struct btrfs_free_space_op free_space_op = { 2316 .use_bitmap = use_bitmap, 2317 }; 2318 2319 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 2320 struct btrfs_free_space *info) 2321 { 2322 struct btrfs_free_space *bitmap_info; 2323 struct btrfs_block_group *block_group = NULL; 2324 int added = 0; 2325 u64 bytes, offset, bytes_added; 2326 enum btrfs_trim_state trim_state; 2327 int ret; 2328 2329 bytes = info->bytes; 2330 offset = info->offset; 2331 trim_state = info->trim_state; 2332 2333 if (!ctl->op->use_bitmap(ctl, info)) 2334 return 0; 2335 2336 if (ctl->op == &free_space_op) 2337 block_group = ctl->block_group; 2338 again: 2339 /* 2340 * Since we link bitmaps right into the cluster we need to see if we 2341 * have a cluster here, and if so and it has our bitmap we need to add 2342 * the free space to that bitmap. 2343 */ 2344 if (block_group && !list_empty(&block_group->cluster_list)) { 2345 struct btrfs_free_cluster *cluster; 2346 struct rb_node *node; 2347 struct btrfs_free_space *entry; 2348 2349 cluster = list_entry(block_group->cluster_list.next, 2350 struct btrfs_free_cluster, 2351 block_group_list); 2352 spin_lock(&cluster->lock); 2353 node = rb_first(&cluster->root); 2354 if (!node) { 2355 spin_unlock(&cluster->lock); 2356 goto no_cluster_bitmap; 2357 } 2358 2359 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2360 if (!entry->bitmap) { 2361 spin_unlock(&cluster->lock); 2362 goto no_cluster_bitmap; 2363 } 2364 2365 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2366 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2367 bytes, trim_state); 2368 bytes -= bytes_added; 2369 offset += bytes_added; 2370 } 2371 spin_unlock(&cluster->lock); 2372 if (!bytes) { 2373 ret = 1; 2374 goto out; 2375 } 2376 } 2377 2378 no_cluster_bitmap: 2379 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2380 1, 0); 2381 if (!bitmap_info) { 2382 ASSERT(added == 0); 2383 goto new_bitmap; 2384 } 2385 2386 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2387 trim_state); 2388 bytes -= bytes_added; 2389 offset += bytes_added; 2390 added = 0; 2391 2392 if (!bytes) { 2393 ret = 1; 2394 goto out; 2395 } else 2396 goto again; 2397 2398 new_bitmap: 2399 if (info && info->bitmap) { 2400 add_new_bitmap(ctl, info, offset); 2401 added = 1; 2402 info = NULL; 2403 goto again; 2404 } else { 2405 spin_unlock(&ctl->tree_lock); 2406 2407 /* no pre-allocated info, allocate a new one */ 2408 if (!info) { 2409 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2410 GFP_NOFS); 2411 if (!info) { 2412 spin_lock(&ctl->tree_lock); 2413 ret = -ENOMEM; 2414 goto out; 2415 } 2416 } 2417 2418 /* allocate the bitmap */ 2419 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2420 GFP_NOFS); 2421 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2422 spin_lock(&ctl->tree_lock); 2423 if (!info->bitmap) { 2424 ret = -ENOMEM; 2425 goto out; 2426 } 2427 goto again; 2428 } 2429 2430 out: 2431 if (info) { 2432 if (info->bitmap) 2433 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2434 info->bitmap); 2435 kmem_cache_free(btrfs_free_space_cachep, info); 2436 } 2437 2438 return ret; 2439 } 2440 2441 /* 2442 * Free space merging rules: 2443 * 1) Merge trimmed areas together 2444 * 2) Let untrimmed areas coalesce with trimmed areas 2445 * 3) Always pull neighboring regions from bitmaps 2446 * 2447 * The above rules are for when we merge free space based on btrfs_trim_state. 2448 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2449 * same reason: to promote larger extent regions which makes life easier for 2450 * find_free_extent(). Rule 2 enables coalescing based on the common path 2451 * being returning free space from btrfs_finish_extent_commit(). So when free 2452 * space is trimmed, it will prevent aggregating trimmed new region and 2453 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2454 * and provide find_free_extent() with the largest extents possible hoping for 2455 * the reuse path. 2456 */ 2457 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2458 struct btrfs_free_space *info, bool update_stat) 2459 { 2460 struct btrfs_free_space *left_info = NULL; 2461 struct btrfs_free_space *right_info; 2462 bool merged = false; 2463 u64 offset = info->offset; 2464 u64 bytes = info->bytes; 2465 const bool is_trimmed = btrfs_free_space_trimmed(info); 2466 struct rb_node *right_prev = NULL; 2467 2468 /* 2469 * first we want to see if there is free space adjacent to the range we 2470 * are adding, if there is remove that struct and add a new one to 2471 * cover the entire range 2472 */ 2473 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2474 if (right_info) 2475 right_prev = rb_prev(&right_info->offset_index); 2476 2477 if (right_prev) 2478 left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); 2479 else if (!right_info) 2480 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2481 2482 /* See try_merge_free_space() comment. */ 2483 if (right_info && !right_info->bitmap && 2484 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2485 unlink_free_space(ctl, right_info, update_stat); 2486 info->bytes += right_info->bytes; 2487 kmem_cache_free(btrfs_free_space_cachep, right_info); 2488 merged = true; 2489 } 2490 2491 /* See try_merge_free_space() comment. */ 2492 if (left_info && !left_info->bitmap && 2493 left_info->offset + left_info->bytes == offset && 2494 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2495 unlink_free_space(ctl, left_info, update_stat); 2496 info->offset = left_info->offset; 2497 info->bytes += left_info->bytes; 2498 kmem_cache_free(btrfs_free_space_cachep, left_info); 2499 merged = true; 2500 } 2501 2502 return merged; 2503 } 2504 2505 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2506 struct btrfs_free_space *info, 2507 bool update_stat) 2508 { 2509 struct btrfs_free_space *bitmap; 2510 unsigned long i; 2511 unsigned long j; 2512 const u64 end = info->offset + info->bytes; 2513 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2514 u64 bytes; 2515 2516 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2517 if (!bitmap) 2518 return false; 2519 2520 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2521 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2522 if (j == i) 2523 return false; 2524 bytes = (j - i) * ctl->unit; 2525 info->bytes += bytes; 2526 2527 /* See try_merge_free_space() comment. */ 2528 if (!btrfs_free_space_trimmed(bitmap)) 2529 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2530 2531 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2532 2533 if (!bitmap->bytes) 2534 free_bitmap(ctl, bitmap); 2535 2536 return true; 2537 } 2538 2539 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2540 struct btrfs_free_space *info, 2541 bool update_stat) 2542 { 2543 struct btrfs_free_space *bitmap; 2544 u64 bitmap_offset; 2545 unsigned long i; 2546 unsigned long j; 2547 unsigned long prev_j; 2548 u64 bytes; 2549 2550 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2551 /* If we're on a boundary, try the previous logical bitmap. */ 2552 if (bitmap_offset == info->offset) { 2553 if (info->offset == 0) 2554 return false; 2555 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2556 } 2557 2558 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2559 if (!bitmap) 2560 return false; 2561 2562 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2563 j = 0; 2564 prev_j = (unsigned long)-1; 2565 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2566 if (j > i) 2567 break; 2568 prev_j = j; 2569 } 2570 if (prev_j == i) 2571 return false; 2572 2573 if (prev_j == (unsigned long)-1) 2574 bytes = (i + 1) * ctl->unit; 2575 else 2576 bytes = (i - prev_j) * ctl->unit; 2577 2578 info->offset -= bytes; 2579 info->bytes += bytes; 2580 2581 /* See try_merge_free_space() comment. */ 2582 if (!btrfs_free_space_trimmed(bitmap)) 2583 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2584 2585 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2586 2587 if (!bitmap->bytes) 2588 free_bitmap(ctl, bitmap); 2589 2590 return true; 2591 } 2592 2593 /* 2594 * We prefer always to allocate from extent entries, both for clustered and 2595 * non-clustered allocation requests. So when attempting to add a new extent 2596 * entry, try to see if there's adjacent free space in bitmap entries, and if 2597 * there is, migrate that space from the bitmaps to the extent. 2598 * Like this we get better chances of satisfying space allocation requests 2599 * because we attempt to satisfy them based on a single cache entry, and never 2600 * on 2 or more entries - even if the entries represent a contiguous free space 2601 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2602 * ends). 2603 */ 2604 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2605 struct btrfs_free_space *info, 2606 bool update_stat) 2607 { 2608 /* 2609 * Only work with disconnected entries, as we can change their offset, 2610 * and must be extent entries. 2611 */ 2612 ASSERT(!info->bitmap); 2613 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2614 2615 if (ctl->total_bitmaps > 0) { 2616 bool stole_end; 2617 bool stole_front = false; 2618 2619 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2620 if (ctl->total_bitmaps > 0) 2621 stole_front = steal_from_bitmap_to_front(ctl, info, 2622 update_stat); 2623 2624 if (stole_end || stole_front) 2625 try_merge_free_space(ctl, info, update_stat); 2626 } 2627 } 2628 2629 static int __btrfs_add_free_space(struct btrfs_block_group *block_group, 2630 u64 offset, u64 bytes, 2631 enum btrfs_trim_state trim_state) 2632 { 2633 struct btrfs_fs_info *fs_info = block_group->fs_info; 2634 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2635 struct btrfs_free_space *info; 2636 int ret = 0; 2637 u64 filter_bytes = bytes; 2638 2639 ASSERT(!btrfs_is_zoned(fs_info)); 2640 2641 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2642 if (!info) 2643 return -ENOMEM; 2644 2645 info->offset = offset; 2646 info->bytes = bytes; 2647 info->trim_state = trim_state; 2648 RB_CLEAR_NODE(&info->offset_index); 2649 RB_CLEAR_NODE(&info->bytes_index); 2650 2651 spin_lock(&ctl->tree_lock); 2652 2653 if (try_merge_free_space(ctl, info, true)) 2654 goto link; 2655 2656 /* 2657 * There was no extent directly to the left or right of this new 2658 * extent then we know we're going to have to allocate a new extent, so 2659 * before we do that see if we need to drop this into a bitmap 2660 */ 2661 ret = insert_into_bitmap(ctl, info); 2662 if (ret < 0) { 2663 goto out; 2664 } else if (ret) { 2665 ret = 0; 2666 goto out; 2667 } 2668 link: 2669 /* 2670 * Only steal free space from adjacent bitmaps if we're sure we're not 2671 * going to add the new free space to existing bitmap entries - because 2672 * that would mean unnecessary work that would be reverted. Therefore 2673 * attempt to steal space from bitmaps if we're adding an extent entry. 2674 */ 2675 steal_from_bitmap(ctl, info, true); 2676 2677 filter_bytes = max(filter_bytes, info->bytes); 2678 2679 ret = link_free_space(ctl, info); 2680 if (ret) 2681 kmem_cache_free(btrfs_free_space_cachep, info); 2682 out: 2683 btrfs_discard_update_discardable(block_group); 2684 spin_unlock(&ctl->tree_lock); 2685 2686 if (ret) { 2687 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2688 ASSERT(ret != -EEXIST); 2689 } 2690 2691 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2692 btrfs_discard_check_filter(block_group, filter_bytes); 2693 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2694 } 2695 2696 return ret; 2697 } 2698 2699 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2700 u64 bytenr, u64 size, bool used) 2701 { 2702 struct btrfs_space_info *sinfo = block_group->space_info; 2703 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2704 u64 offset = bytenr - block_group->start; 2705 u64 to_free, to_unusable; 2706 int bg_reclaim_threshold = 0; 2707 bool initial; 2708 u64 reclaimable_unusable; 2709 2710 spin_lock(&block_group->lock); 2711 2712 initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); 2713 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2714 if (!initial) 2715 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2716 2717 if (!used) 2718 to_free = size; 2719 else if (initial) 2720 to_free = block_group->zone_capacity; 2721 else if (offset >= block_group->alloc_offset) 2722 to_free = size; 2723 else if (offset + size <= block_group->alloc_offset) 2724 to_free = 0; 2725 else 2726 to_free = offset + size - block_group->alloc_offset; 2727 to_unusable = size - to_free; 2728 2729 spin_lock(&ctl->tree_lock); 2730 ctl->free_space += to_free; 2731 spin_unlock(&ctl->tree_lock); 2732 /* 2733 * If the block group is read-only, we should account freed space into 2734 * bytes_readonly. 2735 */ 2736 if (!block_group->ro) { 2737 block_group->zone_unusable += to_unusable; 2738 WARN_ON(block_group->zone_unusable > block_group->length); 2739 } 2740 if (!used) { 2741 block_group->alloc_offset -= size; 2742 } 2743 2744 reclaimable_unusable = block_group->zone_unusable - 2745 (block_group->length - block_group->zone_capacity); 2746 /* All the region is now unusable. Mark it as unused and reclaim */ 2747 if (block_group->zone_unusable == block_group->length) { 2748 btrfs_mark_bg_unused(block_group); 2749 } else if (bg_reclaim_threshold && 2750 reclaimable_unusable >= 2751 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2752 btrfs_mark_bg_to_reclaim(block_group); 2753 } 2754 2755 spin_unlock(&block_group->lock); 2756 2757 return 0; 2758 } 2759 2760 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2761 u64 bytenr, u64 size) 2762 { 2763 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2764 2765 if (btrfs_is_zoned(block_group->fs_info)) 2766 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2767 true); 2768 2769 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2770 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2771 2772 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2773 } 2774 2775 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2776 u64 bytenr, u64 size) 2777 { 2778 if (btrfs_is_zoned(block_group->fs_info)) 2779 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2780 false); 2781 2782 return btrfs_add_free_space(block_group, bytenr, size); 2783 } 2784 2785 /* 2786 * This is a subtle distinction because when adding free space back in general, 2787 * we want it to be added as untrimmed for async. But in the case where we add 2788 * it on loading of a block group, we want to consider it trimmed. 2789 */ 2790 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2791 u64 bytenr, u64 size) 2792 { 2793 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2794 2795 if (btrfs_is_zoned(block_group->fs_info)) 2796 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2797 true); 2798 2799 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2800 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2801 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2802 2803 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2804 } 2805 2806 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2807 u64 offset, u64 bytes) 2808 { 2809 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2810 struct btrfs_free_space *info; 2811 int ret; 2812 bool re_search = false; 2813 2814 if (btrfs_is_zoned(block_group->fs_info)) { 2815 /* 2816 * This can happen with conventional zones when replaying log. 2817 * Since the allocation info of tree-log nodes are not recorded 2818 * to the extent-tree, calculate_alloc_pointer() failed to 2819 * advance the allocation pointer after last allocated tree log 2820 * node blocks. 2821 * 2822 * This function is called from 2823 * btrfs_pin_extent_for_log_replay() when replaying the log. 2824 * Advance the pointer not to overwrite the tree-log nodes. 2825 */ 2826 if (block_group->start + block_group->alloc_offset < 2827 offset + bytes) { 2828 block_group->alloc_offset = 2829 offset + bytes - block_group->start; 2830 } 2831 return 0; 2832 } 2833 2834 spin_lock(&ctl->tree_lock); 2835 2836 again: 2837 ret = 0; 2838 if (!bytes) 2839 goto out_lock; 2840 2841 info = tree_search_offset(ctl, offset, 0, 0); 2842 if (!info) { 2843 /* 2844 * oops didn't find an extent that matched the space we wanted 2845 * to remove, look for a bitmap instead 2846 */ 2847 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2848 1, 0); 2849 if (!info) { 2850 /* 2851 * If we found a partial bit of our free space in a 2852 * bitmap but then couldn't find the other part this may 2853 * be a problem, so WARN about it. 2854 */ 2855 WARN_ON(re_search); 2856 goto out_lock; 2857 } 2858 } 2859 2860 re_search = false; 2861 if (!info->bitmap) { 2862 unlink_free_space(ctl, info, true); 2863 if (offset == info->offset) { 2864 u64 to_free = min(bytes, info->bytes); 2865 2866 info->bytes -= to_free; 2867 info->offset += to_free; 2868 if (info->bytes) { 2869 ret = link_free_space(ctl, info); 2870 WARN_ON(ret); 2871 } else { 2872 kmem_cache_free(btrfs_free_space_cachep, info); 2873 } 2874 2875 offset += to_free; 2876 bytes -= to_free; 2877 goto again; 2878 } else { 2879 u64 old_end = info->bytes + info->offset; 2880 2881 info->bytes = offset - info->offset; 2882 ret = link_free_space(ctl, info); 2883 WARN_ON(ret); 2884 if (ret) 2885 goto out_lock; 2886 2887 /* Not enough bytes in this entry to satisfy us */ 2888 if (old_end < offset + bytes) { 2889 bytes -= old_end - offset; 2890 offset = old_end; 2891 goto again; 2892 } else if (old_end == offset + bytes) { 2893 /* all done */ 2894 goto out_lock; 2895 } 2896 spin_unlock(&ctl->tree_lock); 2897 2898 ret = __btrfs_add_free_space(block_group, 2899 offset + bytes, 2900 old_end - (offset + bytes), 2901 info->trim_state); 2902 WARN_ON(ret); 2903 goto out; 2904 } 2905 } 2906 2907 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2908 if (ret == -EAGAIN) { 2909 re_search = true; 2910 goto again; 2911 } 2912 out_lock: 2913 btrfs_discard_update_discardable(block_group); 2914 spin_unlock(&ctl->tree_lock); 2915 out: 2916 return ret; 2917 } 2918 2919 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2920 u64 bytes) 2921 { 2922 struct btrfs_fs_info *fs_info = block_group->fs_info; 2923 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2924 struct btrfs_free_space *info; 2925 struct rb_node *n; 2926 int count = 0; 2927 2928 /* 2929 * Zoned btrfs does not use free space tree and cluster. Just print 2930 * out the free space after the allocation offset. 2931 */ 2932 if (btrfs_is_zoned(fs_info)) { 2933 btrfs_info(fs_info, "free space %llu active %d", 2934 block_group->zone_capacity - block_group->alloc_offset, 2935 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2936 &block_group->runtime_flags)); 2937 return; 2938 } 2939 2940 spin_lock(&ctl->tree_lock); 2941 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2942 info = rb_entry(n, struct btrfs_free_space, offset_index); 2943 if (info->bytes >= bytes && !block_group->ro) 2944 count++; 2945 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2946 info->offset, info->bytes, str_yes_no(info->bitmap)); 2947 } 2948 spin_unlock(&ctl->tree_lock); 2949 btrfs_info(fs_info, "block group has cluster?: %s", 2950 str_no_yes(list_empty(&block_group->cluster_list))); 2951 btrfs_info(fs_info, 2952 "%d free space entries at or bigger than %llu bytes", 2953 count, bytes); 2954 } 2955 2956 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2957 struct btrfs_free_space_ctl *ctl) 2958 { 2959 struct btrfs_fs_info *fs_info = block_group->fs_info; 2960 2961 spin_lock_init(&ctl->tree_lock); 2962 ctl->unit = fs_info->sectorsize; 2963 ctl->start = block_group->start; 2964 ctl->block_group = block_group; 2965 ctl->op = &free_space_op; 2966 ctl->free_space_bytes = RB_ROOT_CACHED; 2967 INIT_LIST_HEAD(&ctl->trimming_ranges); 2968 mutex_init(&ctl->cache_writeout_mutex); 2969 2970 /* 2971 * we only want to have 32k of ram per block group for keeping 2972 * track of free space, and if we pass 1/2 of that we want to 2973 * start converting things over to using bitmaps 2974 */ 2975 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2976 } 2977 2978 /* 2979 * for a given cluster, put all of its extents back into the free 2980 * space cache. If the block group passed doesn't match the block group 2981 * pointed to by the cluster, someone else raced in and freed the 2982 * cluster already. In that case, we just return without changing anything 2983 */ 2984 static void __btrfs_return_cluster_to_free_space( 2985 struct btrfs_block_group *block_group, 2986 struct btrfs_free_cluster *cluster) 2987 { 2988 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2989 struct rb_node *node; 2990 2991 lockdep_assert_held(&ctl->tree_lock); 2992 2993 spin_lock(&cluster->lock); 2994 if (cluster->block_group != block_group) { 2995 spin_unlock(&cluster->lock); 2996 return; 2997 } 2998 2999 cluster->block_group = NULL; 3000 cluster->window_start = 0; 3001 list_del_init(&cluster->block_group_list); 3002 3003 node = rb_first(&cluster->root); 3004 while (node) { 3005 struct btrfs_free_space *entry; 3006 3007 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3008 node = rb_next(&entry->offset_index); 3009 rb_erase(&entry->offset_index, &cluster->root); 3010 RB_CLEAR_NODE(&entry->offset_index); 3011 3012 if (!entry->bitmap) { 3013 /* Merging treats extents as if they were new */ 3014 if (!btrfs_free_space_trimmed(entry)) { 3015 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3016 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 3017 entry->bytes; 3018 } 3019 3020 try_merge_free_space(ctl, entry, false); 3021 steal_from_bitmap(ctl, entry, false); 3022 3023 /* As we insert directly, update these statistics */ 3024 if (!btrfs_free_space_trimmed(entry)) { 3025 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3026 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3027 entry->bytes; 3028 } 3029 } 3030 tree_insert_offset(ctl, NULL, entry); 3031 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3032 entry_less); 3033 } 3034 cluster->root = RB_ROOT; 3035 spin_unlock(&cluster->lock); 3036 btrfs_put_block_group(block_group); 3037 } 3038 3039 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3040 { 3041 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3042 struct btrfs_free_cluster *cluster; 3043 struct list_head *head; 3044 3045 spin_lock(&ctl->tree_lock); 3046 while ((head = block_group->cluster_list.next) != 3047 &block_group->cluster_list) { 3048 cluster = list_entry(head, struct btrfs_free_cluster, 3049 block_group_list); 3050 3051 WARN_ON(cluster->block_group != block_group); 3052 __btrfs_return_cluster_to_free_space(block_group, cluster); 3053 3054 cond_resched_lock(&ctl->tree_lock); 3055 } 3056 __btrfs_remove_free_space_cache(ctl); 3057 btrfs_discard_update_discardable(block_group); 3058 spin_unlock(&ctl->tree_lock); 3059 3060 } 3061 3062 /* 3063 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 3064 */ 3065 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3066 { 3067 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3068 struct btrfs_free_space *info; 3069 struct rb_node *node; 3070 bool ret = true; 3071 3072 spin_lock(&ctl->tree_lock); 3073 node = rb_first(&ctl->free_space_offset); 3074 3075 while (node) { 3076 info = rb_entry(node, struct btrfs_free_space, offset_index); 3077 3078 if (!btrfs_free_space_trimmed(info)) { 3079 ret = false; 3080 break; 3081 } 3082 3083 node = rb_next(node); 3084 } 3085 3086 spin_unlock(&ctl->tree_lock); 3087 return ret; 3088 } 3089 3090 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3091 u64 offset, u64 bytes, u64 empty_size, 3092 u64 *max_extent_size) 3093 { 3094 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3095 struct btrfs_discard_ctl *discard_ctl = 3096 &block_group->fs_info->discard_ctl; 3097 struct btrfs_free_space *entry = NULL; 3098 u64 bytes_search = bytes + empty_size; 3099 u64 ret = 0; 3100 u64 align_gap = 0; 3101 u64 align_gap_len = 0; 3102 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3103 bool use_bytes_index = (offset == block_group->start); 3104 3105 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3106 3107 spin_lock(&ctl->tree_lock); 3108 entry = find_free_space(ctl, &offset, &bytes_search, 3109 block_group->full_stripe_len, max_extent_size, 3110 use_bytes_index); 3111 if (!entry) 3112 goto out; 3113 3114 ret = offset; 3115 if (entry->bitmap) { 3116 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3117 3118 if (!btrfs_free_space_trimmed(entry)) 3119 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3120 3121 if (!entry->bytes) 3122 free_bitmap(ctl, entry); 3123 } else { 3124 unlink_free_space(ctl, entry, true); 3125 align_gap_len = offset - entry->offset; 3126 align_gap = entry->offset; 3127 align_gap_trim_state = entry->trim_state; 3128 3129 if (!btrfs_free_space_trimmed(entry)) 3130 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3131 3132 entry->offset = offset + bytes; 3133 WARN_ON(entry->bytes < bytes + align_gap_len); 3134 3135 entry->bytes -= bytes + align_gap_len; 3136 if (!entry->bytes) 3137 kmem_cache_free(btrfs_free_space_cachep, entry); 3138 else 3139 link_free_space(ctl, entry); 3140 } 3141 out: 3142 btrfs_discard_update_discardable(block_group); 3143 spin_unlock(&ctl->tree_lock); 3144 3145 if (align_gap_len) 3146 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3147 align_gap_trim_state); 3148 return ret; 3149 } 3150 3151 /* 3152 * given a cluster, put all of its extents back into the free space 3153 * cache. If a block group is passed, this function will only free 3154 * a cluster that belongs to the passed block group. 3155 * 3156 * Otherwise, it'll get a reference on the block group pointed to by the 3157 * cluster and remove the cluster from it. 3158 */ 3159 void btrfs_return_cluster_to_free_space( 3160 struct btrfs_block_group *block_group, 3161 struct btrfs_free_cluster *cluster) 3162 { 3163 struct btrfs_free_space_ctl *ctl; 3164 3165 /* first, get a safe pointer to the block group */ 3166 spin_lock(&cluster->lock); 3167 if (!block_group) { 3168 block_group = cluster->block_group; 3169 if (!block_group) { 3170 spin_unlock(&cluster->lock); 3171 return; 3172 } 3173 } else if (cluster->block_group != block_group) { 3174 /* someone else has already freed it don't redo their work */ 3175 spin_unlock(&cluster->lock); 3176 return; 3177 } 3178 btrfs_get_block_group(block_group); 3179 spin_unlock(&cluster->lock); 3180 3181 ctl = block_group->free_space_ctl; 3182 3183 /* now return any extents the cluster had on it */ 3184 spin_lock(&ctl->tree_lock); 3185 __btrfs_return_cluster_to_free_space(block_group, cluster); 3186 spin_unlock(&ctl->tree_lock); 3187 3188 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3189 3190 /* finally drop our ref */ 3191 btrfs_put_block_group(block_group); 3192 } 3193 3194 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3195 struct btrfs_free_cluster *cluster, 3196 struct btrfs_free_space *entry, 3197 u64 bytes, u64 min_start, 3198 u64 *max_extent_size) 3199 { 3200 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3201 int err; 3202 u64 search_start = cluster->window_start; 3203 u64 search_bytes = bytes; 3204 u64 ret = 0; 3205 3206 search_start = min_start; 3207 search_bytes = bytes; 3208 3209 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3210 if (err) { 3211 *max_extent_size = max(get_max_extent_size(entry), 3212 *max_extent_size); 3213 return 0; 3214 } 3215 3216 ret = search_start; 3217 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3218 3219 return ret; 3220 } 3221 3222 /* 3223 * given a cluster, try to allocate 'bytes' from it, returns 0 3224 * if it couldn't find anything suitably large, or a logical disk offset 3225 * if things worked out 3226 */ 3227 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3228 struct btrfs_free_cluster *cluster, u64 bytes, 3229 u64 min_start, u64 *max_extent_size) 3230 { 3231 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3232 struct btrfs_discard_ctl *discard_ctl = 3233 &block_group->fs_info->discard_ctl; 3234 struct btrfs_free_space *entry = NULL; 3235 struct rb_node *node; 3236 u64 ret = 0; 3237 3238 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3239 3240 spin_lock(&cluster->lock); 3241 if (bytes > cluster->max_size) 3242 goto out; 3243 3244 if (cluster->block_group != block_group) 3245 goto out; 3246 3247 node = rb_first(&cluster->root); 3248 if (!node) 3249 goto out; 3250 3251 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3252 while (1) { 3253 if (entry->bytes < bytes) 3254 *max_extent_size = max(get_max_extent_size(entry), 3255 *max_extent_size); 3256 3257 if (entry->bytes < bytes || 3258 (!entry->bitmap && entry->offset < min_start)) { 3259 node = rb_next(&entry->offset_index); 3260 if (!node) 3261 break; 3262 entry = rb_entry(node, struct btrfs_free_space, 3263 offset_index); 3264 continue; 3265 } 3266 3267 if (entry->bitmap) { 3268 ret = btrfs_alloc_from_bitmap(block_group, 3269 cluster, entry, bytes, 3270 cluster->window_start, 3271 max_extent_size); 3272 if (ret == 0) { 3273 node = rb_next(&entry->offset_index); 3274 if (!node) 3275 break; 3276 entry = rb_entry(node, struct btrfs_free_space, 3277 offset_index); 3278 continue; 3279 } 3280 cluster->window_start += bytes; 3281 } else { 3282 ret = entry->offset; 3283 3284 entry->offset += bytes; 3285 entry->bytes -= bytes; 3286 } 3287 3288 break; 3289 } 3290 out: 3291 spin_unlock(&cluster->lock); 3292 3293 if (!ret) 3294 return 0; 3295 3296 spin_lock(&ctl->tree_lock); 3297 3298 if (!btrfs_free_space_trimmed(entry)) 3299 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3300 3301 ctl->free_space -= bytes; 3302 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3303 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3304 3305 spin_lock(&cluster->lock); 3306 if (entry->bytes == 0) { 3307 rb_erase(&entry->offset_index, &cluster->root); 3308 ctl->free_extents--; 3309 if (entry->bitmap) { 3310 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3311 entry->bitmap); 3312 ctl->total_bitmaps--; 3313 recalculate_thresholds(ctl); 3314 } else if (!btrfs_free_space_trimmed(entry)) { 3315 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3316 } 3317 kmem_cache_free(btrfs_free_space_cachep, entry); 3318 } 3319 3320 spin_unlock(&cluster->lock); 3321 spin_unlock(&ctl->tree_lock); 3322 3323 return ret; 3324 } 3325 3326 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3327 struct btrfs_free_space *entry, 3328 struct btrfs_free_cluster *cluster, 3329 u64 offset, u64 bytes, 3330 u64 cont1_bytes, u64 min_bytes) 3331 { 3332 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3333 unsigned long next_zero; 3334 unsigned long i; 3335 unsigned long want_bits; 3336 unsigned long min_bits; 3337 unsigned long found_bits; 3338 unsigned long max_bits = 0; 3339 unsigned long start = 0; 3340 unsigned long total_found = 0; 3341 int ret; 3342 3343 lockdep_assert_held(&ctl->tree_lock); 3344 3345 i = offset_to_bit(entry->offset, ctl->unit, 3346 max_t(u64, offset, entry->offset)); 3347 want_bits = bytes_to_bits(bytes, ctl->unit); 3348 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3349 3350 /* 3351 * Don't bother looking for a cluster in this bitmap if it's heavily 3352 * fragmented. 3353 */ 3354 if (entry->max_extent_size && 3355 entry->max_extent_size < cont1_bytes) 3356 return -ENOSPC; 3357 again: 3358 found_bits = 0; 3359 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3360 next_zero = find_next_zero_bit(entry->bitmap, 3361 BITS_PER_BITMAP, i); 3362 if (next_zero - i >= min_bits) { 3363 found_bits = next_zero - i; 3364 if (found_bits > max_bits) 3365 max_bits = found_bits; 3366 break; 3367 } 3368 if (next_zero - i > max_bits) 3369 max_bits = next_zero - i; 3370 i = next_zero; 3371 } 3372 3373 if (!found_bits) { 3374 entry->max_extent_size = (u64)max_bits * ctl->unit; 3375 return -ENOSPC; 3376 } 3377 3378 if (!total_found) { 3379 start = i; 3380 cluster->max_size = 0; 3381 } 3382 3383 total_found += found_bits; 3384 3385 if (cluster->max_size < found_bits * ctl->unit) 3386 cluster->max_size = found_bits * ctl->unit; 3387 3388 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3389 i = next_zero + 1; 3390 goto again; 3391 } 3392 3393 cluster->window_start = start * ctl->unit + entry->offset; 3394 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3395 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3396 3397 /* 3398 * We need to know if we're currently on the normal space index when we 3399 * manipulate the bitmap so that we know we need to remove and re-insert 3400 * it into the space_index tree. Clear the bytes_index node here so the 3401 * bitmap manipulation helpers know not to mess with the space_index 3402 * until this bitmap entry is added back into the normal cache. 3403 */ 3404 RB_CLEAR_NODE(&entry->bytes_index); 3405 3406 ret = tree_insert_offset(ctl, cluster, entry); 3407 ASSERT(!ret); /* -EEXIST; Logic error */ 3408 3409 trace_btrfs_setup_cluster(block_group, cluster, 3410 total_found * ctl->unit, 1); 3411 return 0; 3412 } 3413 3414 /* 3415 * This searches the block group for just extents to fill the cluster with. 3416 * Try to find a cluster with at least bytes total bytes, at least one 3417 * extent of cont1_bytes, and other clusters of at least min_bytes. 3418 */ 3419 static noinline int 3420 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3421 struct btrfs_free_cluster *cluster, 3422 struct list_head *bitmaps, u64 offset, u64 bytes, 3423 u64 cont1_bytes, u64 min_bytes) 3424 { 3425 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3426 struct btrfs_free_space *first = NULL; 3427 struct btrfs_free_space *entry = NULL; 3428 struct btrfs_free_space *last; 3429 struct rb_node *node; 3430 u64 window_free; 3431 u64 max_extent; 3432 u64 total_size = 0; 3433 3434 lockdep_assert_held(&ctl->tree_lock); 3435 3436 entry = tree_search_offset(ctl, offset, 0, 1); 3437 if (!entry) 3438 return -ENOSPC; 3439 3440 /* 3441 * We don't want bitmaps, so just move along until we find a normal 3442 * extent entry. 3443 */ 3444 while (entry->bitmap || entry->bytes < min_bytes) { 3445 if (entry->bitmap && list_empty(&entry->list)) 3446 list_add_tail(&entry->list, bitmaps); 3447 node = rb_next(&entry->offset_index); 3448 if (!node) 3449 return -ENOSPC; 3450 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3451 } 3452 3453 window_free = entry->bytes; 3454 max_extent = entry->bytes; 3455 first = entry; 3456 last = entry; 3457 3458 for (node = rb_next(&entry->offset_index); node; 3459 node = rb_next(&entry->offset_index)) { 3460 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3461 3462 if (entry->bitmap) { 3463 if (list_empty(&entry->list)) 3464 list_add_tail(&entry->list, bitmaps); 3465 continue; 3466 } 3467 3468 if (entry->bytes < min_bytes) 3469 continue; 3470 3471 last = entry; 3472 window_free += entry->bytes; 3473 if (entry->bytes > max_extent) 3474 max_extent = entry->bytes; 3475 } 3476 3477 if (window_free < bytes || max_extent < cont1_bytes) 3478 return -ENOSPC; 3479 3480 cluster->window_start = first->offset; 3481 3482 node = &first->offset_index; 3483 3484 /* 3485 * now we've found our entries, pull them out of the free space 3486 * cache and put them into the cluster rbtree 3487 */ 3488 do { 3489 int ret; 3490 3491 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3492 node = rb_next(&entry->offset_index); 3493 if (entry->bitmap || entry->bytes < min_bytes) 3494 continue; 3495 3496 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3497 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3498 ret = tree_insert_offset(ctl, cluster, entry); 3499 total_size += entry->bytes; 3500 ASSERT(!ret); /* -EEXIST; Logic error */ 3501 } while (node && entry != last); 3502 3503 cluster->max_size = max_extent; 3504 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3505 return 0; 3506 } 3507 3508 /* 3509 * This specifically looks for bitmaps that may work in the cluster, we assume 3510 * that we have already failed to find extents that will work. 3511 */ 3512 static noinline int 3513 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3514 struct btrfs_free_cluster *cluster, 3515 struct list_head *bitmaps, u64 offset, u64 bytes, 3516 u64 cont1_bytes, u64 min_bytes) 3517 { 3518 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3519 struct btrfs_free_space *entry = NULL; 3520 int ret = -ENOSPC; 3521 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3522 3523 if (ctl->total_bitmaps == 0) 3524 return -ENOSPC; 3525 3526 /* 3527 * The bitmap that covers offset won't be in the list unless offset 3528 * is just its start offset. 3529 */ 3530 if (!list_empty(bitmaps)) 3531 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3532 3533 if (!entry || entry->offset != bitmap_offset) { 3534 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3535 if (entry && list_empty(&entry->list)) 3536 list_add(&entry->list, bitmaps); 3537 } 3538 3539 list_for_each_entry(entry, bitmaps, list) { 3540 if (entry->bytes < bytes) 3541 continue; 3542 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3543 bytes, cont1_bytes, min_bytes); 3544 if (!ret) 3545 return 0; 3546 } 3547 3548 /* 3549 * The bitmaps list has all the bitmaps that record free space 3550 * starting after offset, so no more search is required. 3551 */ 3552 return -ENOSPC; 3553 } 3554 3555 /* 3556 * here we try to find a cluster of blocks in a block group. The goal 3557 * is to find at least bytes+empty_size. 3558 * We might not find them all in one contiguous area. 3559 * 3560 * returns zero and sets up cluster if things worked out, otherwise 3561 * it returns -enospc 3562 */ 3563 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3564 struct btrfs_free_cluster *cluster, 3565 u64 offset, u64 bytes, u64 empty_size) 3566 { 3567 struct btrfs_fs_info *fs_info = block_group->fs_info; 3568 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3569 struct btrfs_free_space *entry, *tmp; 3570 LIST_HEAD(bitmaps); 3571 u64 min_bytes; 3572 u64 cont1_bytes; 3573 int ret; 3574 3575 /* 3576 * Choose the minimum extent size we'll require for this 3577 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3578 * For metadata, allow allocates with smaller extents. For 3579 * data, keep it dense. 3580 */ 3581 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3582 cont1_bytes = bytes + empty_size; 3583 min_bytes = cont1_bytes; 3584 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3585 cont1_bytes = bytes; 3586 min_bytes = fs_info->sectorsize; 3587 } else { 3588 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3589 min_bytes = fs_info->sectorsize; 3590 } 3591 3592 spin_lock(&ctl->tree_lock); 3593 3594 /* 3595 * If we know we don't have enough space to make a cluster don't even 3596 * bother doing all the work to try and find one. 3597 */ 3598 if (ctl->free_space < bytes) { 3599 spin_unlock(&ctl->tree_lock); 3600 return -ENOSPC; 3601 } 3602 3603 spin_lock(&cluster->lock); 3604 3605 /* someone already found a cluster, hooray */ 3606 if (cluster->block_group) { 3607 ret = 0; 3608 goto out; 3609 } 3610 3611 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3612 min_bytes); 3613 3614 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3615 bytes + empty_size, 3616 cont1_bytes, min_bytes); 3617 if (ret) 3618 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3619 offset, bytes + empty_size, 3620 cont1_bytes, min_bytes); 3621 3622 /* Clear our temporary list */ 3623 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3624 list_del_init(&entry->list); 3625 3626 if (!ret) { 3627 btrfs_get_block_group(block_group); 3628 list_add_tail(&cluster->block_group_list, 3629 &block_group->cluster_list); 3630 cluster->block_group = block_group; 3631 } else { 3632 trace_btrfs_failed_cluster_setup(block_group); 3633 } 3634 out: 3635 spin_unlock(&cluster->lock); 3636 spin_unlock(&ctl->tree_lock); 3637 3638 return ret; 3639 } 3640 3641 /* 3642 * simple code to zero out a cluster 3643 */ 3644 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3645 { 3646 spin_lock_init(&cluster->lock); 3647 spin_lock_init(&cluster->refill_lock); 3648 cluster->root = RB_ROOT; 3649 cluster->max_size = 0; 3650 cluster->fragmented = false; 3651 INIT_LIST_HEAD(&cluster->block_group_list); 3652 cluster->block_group = NULL; 3653 } 3654 3655 static int do_trimming(struct btrfs_block_group *block_group, 3656 u64 *total_trimmed, u64 start, u64 bytes, 3657 u64 reserved_start, u64 reserved_bytes, 3658 enum btrfs_trim_state reserved_trim_state, 3659 struct btrfs_trim_range *trim_entry) 3660 { 3661 struct btrfs_space_info *space_info = block_group->space_info; 3662 struct btrfs_fs_info *fs_info = block_group->fs_info; 3663 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3664 int ret; 3665 int update = 0; 3666 const u64 end = start + bytes; 3667 const u64 reserved_end = reserved_start + reserved_bytes; 3668 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3669 u64 trimmed = 0; 3670 3671 spin_lock(&space_info->lock); 3672 spin_lock(&block_group->lock); 3673 if (!block_group->ro) { 3674 block_group->reserved += reserved_bytes; 3675 space_info->bytes_reserved += reserved_bytes; 3676 update = 1; 3677 } 3678 spin_unlock(&block_group->lock); 3679 spin_unlock(&space_info->lock); 3680 3681 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3682 if (!ret) { 3683 *total_trimmed += trimmed; 3684 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3685 } 3686 3687 mutex_lock(&ctl->cache_writeout_mutex); 3688 if (reserved_start < start) 3689 __btrfs_add_free_space(block_group, reserved_start, 3690 start - reserved_start, 3691 reserved_trim_state); 3692 if (end < reserved_end) 3693 __btrfs_add_free_space(block_group, end, reserved_end - end, 3694 reserved_trim_state); 3695 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3696 list_del(&trim_entry->list); 3697 mutex_unlock(&ctl->cache_writeout_mutex); 3698 3699 if (update) { 3700 spin_lock(&space_info->lock); 3701 spin_lock(&block_group->lock); 3702 if (block_group->ro) 3703 space_info->bytes_readonly += reserved_bytes; 3704 block_group->reserved -= reserved_bytes; 3705 space_info->bytes_reserved -= reserved_bytes; 3706 spin_unlock(&block_group->lock); 3707 spin_unlock(&space_info->lock); 3708 } 3709 3710 return ret; 3711 } 3712 3713 /* 3714 * If @async is set, then we will trim 1 region and return. 3715 */ 3716 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3717 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3718 bool async) 3719 { 3720 struct btrfs_discard_ctl *discard_ctl = 3721 &block_group->fs_info->discard_ctl; 3722 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3723 struct btrfs_free_space *entry; 3724 struct rb_node *node; 3725 int ret = 0; 3726 u64 extent_start; 3727 u64 extent_bytes; 3728 enum btrfs_trim_state extent_trim_state; 3729 u64 bytes; 3730 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3731 3732 while (start < end) { 3733 struct btrfs_trim_range trim_entry; 3734 3735 mutex_lock(&ctl->cache_writeout_mutex); 3736 spin_lock(&ctl->tree_lock); 3737 3738 if (ctl->free_space < minlen) 3739 goto out_unlock; 3740 3741 entry = tree_search_offset(ctl, start, 0, 1); 3742 if (!entry) 3743 goto out_unlock; 3744 3745 /* Skip bitmaps and if async, already trimmed entries */ 3746 while (entry->bitmap || 3747 (async && btrfs_free_space_trimmed(entry))) { 3748 node = rb_next(&entry->offset_index); 3749 if (!node) 3750 goto out_unlock; 3751 entry = rb_entry(node, struct btrfs_free_space, 3752 offset_index); 3753 } 3754 3755 if (entry->offset >= end) 3756 goto out_unlock; 3757 3758 extent_start = entry->offset; 3759 extent_bytes = entry->bytes; 3760 extent_trim_state = entry->trim_state; 3761 if (async) { 3762 start = entry->offset; 3763 bytes = entry->bytes; 3764 if (bytes < minlen) { 3765 spin_unlock(&ctl->tree_lock); 3766 mutex_unlock(&ctl->cache_writeout_mutex); 3767 goto next; 3768 } 3769 unlink_free_space(ctl, entry, true); 3770 /* 3771 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3772 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3773 * X when we come back around. So trim it now. 3774 */ 3775 if (max_discard_size && 3776 bytes >= (max_discard_size + 3777 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3778 bytes = max_discard_size; 3779 extent_bytes = max_discard_size; 3780 entry->offset += max_discard_size; 3781 entry->bytes -= max_discard_size; 3782 link_free_space(ctl, entry); 3783 } else { 3784 kmem_cache_free(btrfs_free_space_cachep, entry); 3785 } 3786 } else { 3787 start = max(start, extent_start); 3788 bytes = min(extent_start + extent_bytes, end) - start; 3789 if (bytes < minlen) { 3790 spin_unlock(&ctl->tree_lock); 3791 mutex_unlock(&ctl->cache_writeout_mutex); 3792 goto next; 3793 } 3794 3795 unlink_free_space(ctl, entry, true); 3796 kmem_cache_free(btrfs_free_space_cachep, entry); 3797 } 3798 3799 spin_unlock(&ctl->tree_lock); 3800 trim_entry.start = extent_start; 3801 trim_entry.bytes = extent_bytes; 3802 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3803 mutex_unlock(&ctl->cache_writeout_mutex); 3804 3805 ret = do_trimming(block_group, total_trimmed, start, bytes, 3806 extent_start, extent_bytes, extent_trim_state, 3807 &trim_entry); 3808 if (ret) { 3809 block_group->discard_cursor = start + bytes; 3810 break; 3811 } 3812 next: 3813 start += bytes; 3814 block_group->discard_cursor = start; 3815 if (async && *total_trimmed) 3816 break; 3817 3818 if (btrfs_trim_interrupted()) { 3819 ret = -ERESTARTSYS; 3820 break; 3821 } 3822 3823 cond_resched(); 3824 } 3825 3826 return ret; 3827 3828 out_unlock: 3829 block_group->discard_cursor = btrfs_block_group_end(block_group); 3830 spin_unlock(&ctl->tree_lock); 3831 mutex_unlock(&ctl->cache_writeout_mutex); 3832 3833 return ret; 3834 } 3835 3836 /* 3837 * If we break out of trimming a bitmap prematurely, we should reset the 3838 * trimming bit. In a rather contrieved case, it's possible to race here so 3839 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3840 * 3841 * start = start of bitmap 3842 * end = near end of bitmap 3843 * 3844 * Thread 1: Thread 2: 3845 * trim_bitmaps(start) 3846 * trim_bitmaps(end) 3847 * end_trimming_bitmap() 3848 * reset_trimming_bitmap() 3849 */ 3850 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3851 { 3852 struct btrfs_free_space *entry; 3853 3854 spin_lock(&ctl->tree_lock); 3855 entry = tree_search_offset(ctl, offset, 1, 0); 3856 if (entry) { 3857 if (btrfs_free_space_trimmed(entry)) { 3858 ctl->discardable_extents[BTRFS_STAT_CURR] += 3859 entry->bitmap_extents; 3860 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3861 } 3862 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3863 } 3864 3865 spin_unlock(&ctl->tree_lock); 3866 } 3867 3868 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3869 struct btrfs_free_space *entry) 3870 { 3871 if (btrfs_free_space_trimming_bitmap(entry)) { 3872 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3873 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3874 entry->bitmap_extents; 3875 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3876 } 3877 } 3878 3879 /* 3880 * If @async is set, then we will trim 1 region and return. 3881 */ 3882 static int trim_bitmaps(struct btrfs_block_group *block_group, 3883 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3884 u64 maxlen, bool async) 3885 { 3886 struct btrfs_discard_ctl *discard_ctl = 3887 &block_group->fs_info->discard_ctl; 3888 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3889 struct btrfs_free_space *entry; 3890 int ret = 0; 3891 int ret2; 3892 u64 bytes; 3893 u64 offset = offset_to_bitmap(ctl, start); 3894 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3895 3896 while (offset < end) { 3897 bool next_bitmap = false; 3898 struct btrfs_trim_range trim_entry; 3899 3900 mutex_lock(&ctl->cache_writeout_mutex); 3901 spin_lock(&ctl->tree_lock); 3902 3903 if (ctl->free_space < minlen) { 3904 block_group->discard_cursor = 3905 btrfs_block_group_end(block_group); 3906 spin_unlock(&ctl->tree_lock); 3907 mutex_unlock(&ctl->cache_writeout_mutex); 3908 break; 3909 } 3910 3911 entry = tree_search_offset(ctl, offset, 1, 0); 3912 /* 3913 * Bitmaps are marked trimmed lossily now to prevent constant 3914 * discarding of the same bitmap (the reason why we are bound 3915 * by the filters). So, retrim the block group bitmaps when we 3916 * are preparing to punt to the unused_bgs list. This uses 3917 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED 3918 * which is the only discard index which sets minlen to 0. 3919 */ 3920 if (!entry || (async && minlen && start == offset && 3921 btrfs_free_space_trimmed(entry))) { 3922 spin_unlock(&ctl->tree_lock); 3923 mutex_unlock(&ctl->cache_writeout_mutex); 3924 next_bitmap = true; 3925 goto next; 3926 } 3927 3928 /* 3929 * Async discard bitmap trimming begins at by setting the start 3930 * to be key.objectid and the offset_to_bitmap() aligns to the 3931 * start of the bitmap. This lets us know we are fully 3932 * scanning the bitmap rather than only some portion of it. 3933 */ 3934 if (start == offset) 3935 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; 3936 3937 bytes = minlen; 3938 ret2 = search_bitmap(ctl, entry, &start, &bytes, false); 3939 if (ret2 || start >= end) { 3940 /* 3941 * We lossily consider a bitmap trimmed if we only skip 3942 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. 3943 */ 3944 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) 3945 end_trimming_bitmap(ctl, entry); 3946 else 3947 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3948 spin_unlock(&ctl->tree_lock); 3949 mutex_unlock(&ctl->cache_writeout_mutex); 3950 next_bitmap = true; 3951 goto next; 3952 } 3953 3954 /* 3955 * We already trimmed a region, but are using the locking above 3956 * to reset the trim_state. 3957 */ 3958 if (async && *total_trimmed) { 3959 spin_unlock(&ctl->tree_lock); 3960 mutex_unlock(&ctl->cache_writeout_mutex); 3961 goto out; 3962 } 3963 3964 bytes = min(bytes, end - start); 3965 if (bytes < minlen || (async && maxlen && bytes > maxlen)) { 3966 spin_unlock(&ctl->tree_lock); 3967 mutex_unlock(&ctl->cache_writeout_mutex); 3968 goto next; 3969 } 3970 3971 /* 3972 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3973 * If X < @minlen, we won't trim X when we come back around. 3974 * So trim it now. We differ here from trimming extents as we 3975 * don't keep individual state per bit. 3976 */ 3977 if (async && 3978 max_discard_size && 3979 bytes > (max_discard_size + minlen)) 3980 bytes = max_discard_size; 3981 3982 bitmap_clear_bits(ctl, entry, start, bytes, true); 3983 if (entry->bytes == 0) 3984 free_bitmap(ctl, entry); 3985 3986 spin_unlock(&ctl->tree_lock); 3987 trim_entry.start = start; 3988 trim_entry.bytes = bytes; 3989 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3990 mutex_unlock(&ctl->cache_writeout_mutex); 3991 3992 ret = do_trimming(block_group, total_trimmed, start, bytes, 3993 start, bytes, 0, &trim_entry); 3994 if (ret) { 3995 reset_trimming_bitmap(ctl, offset); 3996 block_group->discard_cursor = 3997 btrfs_block_group_end(block_group); 3998 break; 3999 } 4000 next: 4001 if (next_bitmap) { 4002 offset += BITS_PER_BITMAP * ctl->unit; 4003 start = offset; 4004 } else { 4005 start += bytes; 4006 } 4007 block_group->discard_cursor = start; 4008 4009 if (btrfs_trim_interrupted()) { 4010 if (start != offset) 4011 reset_trimming_bitmap(ctl, offset); 4012 ret = -ERESTARTSYS; 4013 break; 4014 } 4015 4016 cond_resched(); 4017 } 4018 4019 if (offset >= end) 4020 block_group->discard_cursor = end; 4021 4022 out: 4023 return ret; 4024 } 4025 4026 int btrfs_trim_block_group(struct btrfs_block_group *block_group, 4027 u64 *trimmed, u64 start, u64 end, u64 minlen) 4028 { 4029 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 4030 int ret; 4031 u64 rem = 0; 4032 4033 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 4034 4035 *trimmed = 0; 4036 4037 spin_lock(&block_group->lock); 4038 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4039 spin_unlock(&block_group->lock); 4040 return 0; 4041 } 4042 btrfs_freeze_block_group(block_group); 4043 spin_unlock(&block_group->lock); 4044 4045 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); 4046 if (ret) 4047 goto out; 4048 4049 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); 4050 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); 4051 /* If we ended in the middle of a bitmap, reset the trimming flag */ 4052 if (rem) 4053 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); 4054 out: 4055 btrfs_unfreeze_block_group(block_group); 4056 return ret; 4057 } 4058 4059 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, 4060 u64 *trimmed, u64 start, u64 end, u64 minlen, 4061 bool async) 4062 { 4063 int ret; 4064 4065 *trimmed = 0; 4066 4067 spin_lock(&block_group->lock); 4068 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4069 spin_unlock(&block_group->lock); 4070 return 0; 4071 } 4072 btrfs_freeze_block_group(block_group); 4073 spin_unlock(&block_group->lock); 4074 4075 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); 4076 btrfs_unfreeze_block_group(block_group); 4077 4078 return ret; 4079 } 4080 4081 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, 4082 u64 *trimmed, u64 start, u64 end, u64 minlen, 4083 u64 maxlen, bool async) 4084 { 4085 int ret; 4086 4087 *trimmed = 0; 4088 4089 spin_lock(&block_group->lock); 4090 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4091 spin_unlock(&block_group->lock); 4092 return 0; 4093 } 4094 btrfs_freeze_block_group(block_group); 4095 spin_unlock(&block_group->lock); 4096 4097 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, 4098 async); 4099 4100 btrfs_unfreeze_block_group(block_group); 4101 4102 return ret; 4103 } 4104 4105 bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) 4106 { 4107 return btrfs_super_cache_generation(fs_info->super_copy); 4108 } 4109 4110 static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, 4111 struct btrfs_trans_handle *trans) 4112 { 4113 struct btrfs_block_group *block_group; 4114 struct rb_node *node; 4115 int ret = 0; 4116 4117 btrfs_info(fs_info, "cleaning free space cache v1"); 4118 4119 node = rb_first_cached(&fs_info->block_group_cache_tree); 4120 while (node) { 4121 block_group = rb_entry(node, struct btrfs_block_group, cache_node); 4122 ret = btrfs_remove_free_space_inode(trans, NULL, block_group); 4123 if (ret) 4124 goto out; 4125 node = rb_next(node); 4126 } 4127 out: 4128 return ret; 4129 } 4130 4131 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) 4132 { 4133 struct btrfs_trans_handle *trans; 4134 int ret; 4135 4136 /* 4137 * update_super_roots will appropriately set or unset 4138 * super_copy->cache_generation based on SPACE_CACHE and 4139 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a 4140 * transaction commit whether we are enabling space cache v1 and don't 4141 * have any other work to do, or are disabling it and removing free 4142 * space inodes. 4143 */ 4144 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4145 if (IS_ERR(trans)) 4146 return PTR_ERR(trans); 4147 4148 if (!active) { 4149 set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 4150 ret = cleanup_free_space_cache_v1(fs_info, trans); 4151 if (ret) { 4152 btrfs_abort_transaction(trans, ret); 4153 btrfs_end_transaction(trans); 4154 goto out; 4155 } 4156 } 4157 4158 ret = btrfs_commit_transaction(trans); 4159 out: 4160 clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 4161 4162 return ret; 4163 } 4164 4165 int __init btrfs_free_space_init(void) 4166 { 4167 btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); 4168 if (!btrfs_free_space_cachep) 4169 return -ENOMEM; 4170 4171 btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", 4172 PAGE_SIZE, PAGE_SIZE, 4173 0, NULL); 4174 if (!btrfs_free_space_bitmap_cachep) { 4175 kmem_cache_destroy(btrfs_free_space_cachep); 4176 return -ENOMEM; 4177 } 4178 4179 return 0; 4180 } 4181 4182 void __cold btrfs_free_space_exit(void) 4183 { 4184 kmem_cache_destroy(btrfs_free_space_cachep); 4185 kmem_cache_destroy(btrfs_free_space_bitmap_cachep); 4186 } 4187 4188 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4189 /* 4190 * Use this if you need to make a bitmap or extent entry specifically, it 4191 * doesn't do any of the merging that add_free_space does, this acts a lot like 4192 * how the free space cache loading stuff works, so you can get really weird 4193 * configurations. 4194 */ 4195 int test_add_free_space_entry(struct btrfs_block_group *cache, 4196 u64 offset, u64 bytes, bool bitmap) 4197 { 4198 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 4199 struct btrfs_free_space *info = NULL, *bitmap_info; 4200 void *map = NULL; 4201 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; 4202 u64 bytes_added; 4203 int ret; 4204 4205 again: 4206 if (!info) { 4207 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 4208 if (!info) 4209 return -ENOMEM; 4210 } 4211 4212 if (!bitmap) { 4213 spin_lock(&ctl->tree_lock); 4214 info->offset = offset; 4215 info->bytes = bytes; 4216 info->max_extent_size = 0; 4217 ret = link_free_space(ctl, info); 4218 spin_unlock(&ctl->tree_lock); 4219 if (ret) 4220 kmem_cache_free(btrfs_free_space_cachep, info); 4221 return ret; 4222 } 4223 4224 if (!map) { 4225 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); 4226 if (!map) { 4227 kmem_cache_free(btrfs_free_space_cachep, info); 4228 return -ENOMEM; 4229 } 4230 } 4231 4232 spin_lock(&ctl->tree_lock); 4233 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 4234 1, 0); 4235 if (!bitmap_info) { 4236 info->bitmap = map; 4237 map = NULL; 4238 add_new_bitmap(ctl, info, offset); 4239 bitmap_info = info; 4240 info = NULL; 4241 } 4242 4243 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 4244 trim_state); 4245 4246 bytes -= bytes_added; 4247 offset += bytes_added; 4248 spin_unlock(&ctl->tree_lock); 4249 4250 if (bytes) 4251 goto again; 4252 4253 if (info) 4254 kmem_cache_free(btrfs_free_space_cachep, info); 4255 if (map) 4256 kmem_cache_free(btrfs_free_space_bitmap_cachep, map); 4257 return 0; 4258 } 4259 4260 /* 4261 * Checks to see if the given range is in the free space cache. This is really 4262 * just used to check the absence of space, so if there is free space in the 4263 * range at all we will return 1. 4264 */ 4265 int test_check_exists(struct btrfs_block_group *cache, 4266 u64 offset, u64 bytes) 4267 { 4268 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 4269 struct btrfs_free_space *info; 4270 int ret = 0; 4271 4272 spin_lock(&ctl->tree_lock); 4273 info = tree_search_offset(ctl, offset, 0, 0); 4274 if (!info) { 4275 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 4276 1, 0); 4277 if (!info) 4278 goto out; 4279 } 4280 4281 have_info: 4282 if (info->bitmap) { 4283 u64 bit_off, bit_bytes; 4284 struct rb_node *n; 4285 struct btrfs_free_space *tmp; 4286 4287 bit_off = offset; 4288 bit_bytes = ctl->unit; 4289 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); 4290 if (!ret) { 4291 if (bit_off == offset) { 4292 ret = 1; 4293 goto out; 4294 } else if (bit_off > offset && 4295 offset + bytes > bit_off) { 4296 ret = 1; 4297 goto out; 4298 } 4299 } 4300 4301 n = rb_prev(&info->offset_index); 4302 while (n) { 4303 tmp = rb_entry(n, struct btrfs_free_space, 4304 offset_index); 4305 if (tmp->offset + tmp->bytes < offset) 4306 break; 4307 if (offset + bytes < tmp->offset) { 4308 n = rb_prev(&tmp->offset_index); 4309 continue; 4310 } 4311 info = tmp; 4312 goto have_info; 4313 } 4314 4315 n = rb_next(&info->offset_index); 4316 while (n) { 4317 tmp = rb_entry(n, struct btrfs_free_space, 4318 offset_index); 4319 if (offset + bytes < tmp->offset) 4320 break; 4321 if (tmp->offset + tmp->bytes < offset) { 4322 n = rb_next(&tmp->offset_index); 4323 continue; 4324 } 4325 info = tmp; 4326 goto have_info; 4327 } 4328 4329 ret = 0; 4330 goto out; 4331 } 4332 4333 if (info->offset == offset) { 4334 ret = 1; 4335 goto out; 4336 } 4337 4338 if (offset > info->offset && offset < info->offset + info->bytes) 4339 ret = 1; 4340 out: 4341 spin_unlock(&ctl->tree_lock); 4342 return ret; 4343 } 4344 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 4345