1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Red Hat. All rights reserved. 4 */ 5 6 #include <linux/pagemap.h> 7 #include <linux/sched.h> 8 #include <linux/sched/signal.h> 9 #include <linux/slab.h> 10 #include <linux/math64.h> 11 #include <linux/ratelimit.h> 12 #include <linux/error-injection.h> 13 #include <linux/sched/mm.h> 14 #include <linux/string_choices.h> 15 #include "extent-tree.h" 16 #include "fs.h" 17 #include "messages.h" 18 #include "misc.h" 19 #include "free-space-cache.h" 20 #include "transaction.h" 21 #include "disk-io.h" 22 #include "extent_io.h" 23 #include "space-info.h" 24 #include "block-group.h" 25 #include "discard.h" 26 #include "subpage.h" 27 #include "inode-item.h" 28 #include "accessors.h" 29 #include "file-item.h" 30 #include "file.h" 31 #include "super.h" 32 #include "relocation.h" 33 34 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 35 #define MAX_CACHE_BYTES_PER_GIG SZ_64K 36 #define FORCE_EXTENT_THRESHOLD SZ_1M 37 38 static struct kmem_cache *btrfs_free_space_cachep; 39 static struct kmem_cache *btrfs_free_space_bitmap_cachep; 40 41 struct btrfs_trim_range { 42 u64 start; 43 u64 bytes; 44 struct list_head list; 45 }; 46 47 static int link_free_space(struct btrfs_free_space_ctl *ctl, 48 struct btrfs_free_space *info); 49 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 50 struct btrfs_free_space *info, bool update_stat); 51 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 52 struct btrfs_free_space *bitmap_info, u64 *offset, 53 u64 *bytes, bool for_alloc); 54 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 55 struct btrfs_free_space *bitmap_info); 56 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 57 struct btrfs_free_space *info, u64 offset, 58 u64 bytes, bool update_stats); 59 60 static void btrfs_crc32c_final(u32 crc, u8 *result) 61 { 62 put_unaligned_le32(~crc, result); 63 } 64 65 static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 66 { 67 struct btrfs_free_space *info; 68 struct rb_node *node; 69 70 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 71 info = rb_entry(node, struct btrfs_free_space, offset_index); 72 if (!info->bitmap) { 73 unlink_free_space(ctl, info, true); 74 kmem_cache_free(btrfs_free_space_cachep, info); 75 } else { 76 free_bitmap(ctl, info); 77 } 78 79 cond_resched_lock(&ctl->tree_lock); 80 } 81 } 82 83 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 84 struct btrfs_path *path, 85 u64 offset) 86 { 87 struct btrfs_key key; 88 struct btrfs_key location; 89 struct btrfs_disk_key disk_key; 90 struct btrfs_free_space_header *header; 91 struct extent_buffer *leaf; 92 struct btrfs_inode *inode; 93 unsigned nofs_flag; 94 int ret; 95 96 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 97 key.type = 0; 98 key.offset = offset; 99 100 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 101 if (ret < 0) 102 return ERR_PTR(ret); 103 if (ret > 0) { 104 btrfs_release_path(path); 105 return ERR_PTR(-ENOENT); 106 } 107 108 leaf = path->nodes[0]; 109 header = btrfs_item_ptr(leaf, path->slots[0], 110 struct btrfs_free_space_header); 111 btrfs_free_space_key(leaf, header, &disk_key); 112 btrfs_disk_key_to_cpu(&location, &disk_key); 113 btrfs_release_path(path); 114 115 /* 116 * We are often under a trans handle at this point, so we need to make 117 * sure NOFS is set to keep us from deadlocking. 118 */ 119 nofs_flag = memalloc_nofs_save(); 120 inode = btrfs_iget_path(location.objectid, root, path); 121 btrfs_release_path(path); 122 memalloc_nofs_restore(nofs_flag); 123 if (IS_ERR(inode)) 124 return ERR_CAST(inode); 125 126 mapping_set_gfp_mask(inode->vfs_inode.i_mapping, 127 mapping_gfp_constraint(inode->vfs_inode.i_mapping, 128 ~(__GFP_FS | __GFP_HIGHMEM))); 129 130 return &inode->vfs_inode; 131 } 132 133 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, 134 struct btrfs_path *path) 135 { 136 struct btrfs_fs_info *fs_info = block_group->fs_info; 137 struct inode *inode = NULL; 138 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 139 140 spin_lock(&block_group->lock); 141 if (block_group->inode) 142 inode = igrab(&block_group->inode->vfs_inode); 143 spin_unlock(&block_group->lock); 144 if (inode) 145 return inode; 146 147 inode = __lookup_free_space_inode(fs_info->tree_root, path, 148 block_group->start); 149 if (IS_ERR(inode)) 150 return inode; 151 152 spin_lock(&block_group->lock); 153 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 154 btrfs_info(fs_info, "Old style space inode found, converting."); 155 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 156 BTRFS_INODE_NODATACOW; 157 block_group->disk_cache_state = BTRFS_DC_CLEAR; 158 } 159 160 if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) 161 block_group->inode = BTRFS_I(igrab(inode)); 162 spin_unlock(&block_group->lock); 163 164 return inode; 165 } 166 167 static int __create_free_space_inode(struct btrfs_root *root, 168 struct btrfs_trans_handle *trans, 169 struct btrfs_path *path, 170 u64 ino, u64 offset) 171 { 172 struct btrfs_key key; 173 struct btrfs_disk_key disk_key; 174 struct btrfs_free_space_header *header; 175 struct btrfs_inode_item *inode_item; 176 struct extent_buffer *leaf; 177 /* We inline CRCs for the free disk space cache */ 178 const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | 179 BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 180 int ret; 181 182 ret = btrfs_insert_empty_inode(trans, root, path, ino); 183 if (ret) 184 return ret; 185 186 leaf = path->nodes[0]; 187 inode_item = btrfs_item_ptr(leaf, path->slots[0], 188 struct btrfs_inode_item); 189 btrfs_item_key(leaf, &disk_key, path->slots[0]); 190 memzero_extent_buffer(leaf, (unsigned long)inode_item, 191 sizeof(*inode_item)); 192 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 193 btrfs_set_inode_size(leaf, inode_item, 0); 194 btrfs_set_inode_nbytes(leaf, inode_item, 0); 195 btrfs_set_inode_uid(leaf, inode_item, 0); 196 btrfs_set_inode_gid(leaf, inode_item, 0); 197 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 198 btrfs_set_inode_flags(leaf, inode_item, flags); 199 btrfs_set_inode_nlink(leaf, inode_item, 1); 200 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 201 btrfs_set_inode_block_group(leaf, inode_item, offset); 202 btrfs_release_path(path); 203 204 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 205 key.type = 0; 206 key.offset = offset; 207 ret = btrfs_insert_empty_item(trans, root, path, &key, 208 sizeof(struct btrfs_free_space_header)); 209 if (ret < 0) { 210 btrfs_release_path(path); 211 return ret; 212 } 213 214 leaf = path->nodes[0]; 215 header = btrfs_item_ptr(leaf, path->slots[0], 216 struct btrfs_free_space_header); 217 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); 218 btrfs_set_free_space_key(leaf, header, &disk_key); 219 btrfs_release_path(path); 220 221 return 0; 222 } 223 224 int create_free_space_inode(struct btrfs_trans_handle *trans, 225 struct btrfs_block_group *block_group, 226 struct btrfs_path *path) 227 { 228 int ret; 229 u64 ino; 230 231 ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); 232 if (ret < 0) 233 return ret; 234 235 return __create_free_space_inode(trans->fs_info->tree_root, trans, path, 236 ino, block_group->start); 237 } 238 239 /* 240 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode 241 * handles lookup, otherwise it takes ownership and iputs the inode. 242 * Don't reuse an inode pointer after passing it into this function. 243 */ 244 int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, 245 struct inode *inode, 246 struct btrfs_block_group *block_group) 247 { 248 BTRFS_PATH_AUTO_FREE(path); 249 struct btrfs_key key; 250 int ret = 0; 251 252 path = btrfs_alloc_path(); 253 if (!path) 254 return -ENOMEM; 255 256 if (!inode) 257 inode = lookup_free_space_inode(block_group, path); 258 if (IS_ERR(inode)) { 259 if (PTR_ERR(inode) != -ENOENT) 260 ret = PTR_ERR(inode); 261 return ret; 262 } 263 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 264 if (ret) { 265 btrfs_add_delayed_iput(BTRFS_I(inode)); 266 return ret; 267 } 268 clear_nlink(inode); 269 /* One for the block groups ref */ 270 spin_lock(&block_group->lock); 271 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { 272 block_group->inode = NULL; 273 spin_unlock(&block_group->lock); 274 iput(inode); 275 } else { 276 spin_unlock(&block_group->lock); 277 } 278 /* One for the lookup ref */ 279 btrfs_add_delayed_iput(BTRFS_I(inode)); 280 281 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 282 key.type = 0; 283 key.offset = block_group->start; 284 ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, 285 -1, 1); 286 if (ret) { 287 if (ret > 0) 288 ret = 0; 289 return ret; 290 } 291 return btrfs_del_item(trans, trans->fs_info->tree_root, path); 292 } 293 294 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, 295 struct btrfs_block_group *block_group, 296 struct inode *vfs_inode) 297 { 298 struct btrfs_truncate_control control = { 299 .inode = BTRFS_I(vfs_inode), 300 .new_size = 0, 301 .ino = btrfs_ino(BTRFS_I(vfs_inode)), 302 .min_type = BTRFS_EXTENT_DATA_KEY, 303 .clear_extent_range = true, 304 }; 305 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 306 struct btrfs_root *root = inode->root; 307 struct extent_state *cached_state = NULL; 308 int ret = 0; 309 bool locked = false; 310 311 if (block_group) { 312 BTRFS_PATH_AUTO_FREE(path); 313 314 path = btrfs_alloc_path(); 315 if (!path) { 316 ret = -ENOMEM; 317 goto fail; 318 } 319 locked = true; 320 mutex_lock(&trans->transaction->cache_write_mutex); 321 if (!list_empty(&block_group->io_list)) { 322 list_del_init(&block_group->io_list); 323 324 btrfs_wait_cache_io(trans, block_group, path); 325 btrfs_put_block_group(block_group); 326 } 327 328 /* 329 * now that we've truncated the cache away, its no longer 330 * setup or written 331 */ 332 spin_lock(&block_group->lock); 333 block_group->disk_cache_state = BTRFS_DC_CLEAR; 334 spin_unlock(&block_group->lock); 335 } 336 337 btrfs_i_size_write(inode, 0); 338 truncate_pagecache(vfs_inode, 0); 339 340 btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 341 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); 342 343 /* 344 * We skip the throttling logic for free space cache inodes, so we don't 345 * need to check for -EAGAIN. 346 */ 347 ret = btrfs_truncate_inode_items(trans, root, &control); 348 349 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); 350 btrfs_inode_safe_disk_i_size_write(inode, control.last_size); 351 352 btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 353 if (ret) 354 goto fail; 355 356 ret = btrfs_update_inode(trans, inode); 357 358 fail: 359 if (locked) 360 mutex_unlock(&trans->transaction->cache_write_mutex); 361 if (ret) 362 btrfs_abort_transaction(trans, ret); 363 364 return ret; 365 } 366 367 static void readahead_cache(struct inode *inode) 368 { 369 struct file_ra_state ra; 370 pgoff_t last_index; 371 372 file_ra_state_init(&ra, inode->i_mapping); 373 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 374 375 page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); 376 } 377 378 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, 379 int write) 380 { 381 int num_pages; 382 383 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 384 385 /* Make sure we can fit our crcs and generation into the first page */ 386 if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) 387 return -ENOSPC; 388 389 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); 390 391 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); 392 if (!io_ctl->pages) 393 return -ENOMEM; 394 395 io_ctl->num_pages = num_pages; 396 io_ctl->fs_info = inode_to_fs_info(inode); 397 io_ctl->inode = inode; 398 399 return 0; 400 } 401 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); 402 403 static void io_ctl_free(struct btrfs_io_ctl *io_ctl) 404 { 405 kfree(io_ctl->pages); 406 io_ctl->pages = NULL; 407 } 408 409 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) 410 { 411 if (io_ctl->cur) { 412 io_ctl->cur = NULL; 413 io_ctl->orig = NULL; 414 } 415 } 416 417 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) 418 { 419 ASSERT(io_ctl->index < io_ctl->num_pages); 420 io_ctl->page = io_ctl->pages[io_ctl->index++]; 421 io_ctl->cur = page_address(io_ctl->page); 422 io_ctl->orig = io_ctl->cur; 423 io_ctl->size = PAGE_SIZE; 424 if (clear) 425 clear_page(io_ctl->cur); 426 } 427 428 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) 429 { 430 int i; 431 432 io_ctl_unmap_page(io_ctl); 433 434 for (i = 0; i < io_ctl->num_pages; i++) { 435 if (io_ctl->pages[i]) { 436 btrfs_folio_clear_checked(io_ctl->fs_info, 437 page_folio(io_ctl->pages[i]), 438 page_offset(io_ctl->pages[i]), 439 PAGE_SIZE); 440 unlock_page(io_ctl->pages[i]); 441 put_page(io_ctl->pages[i]); 442 } 443 } 444 } 445 446 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) 447 { 448 struct folio *folio; 449 struct inode *inode = io_ctl->inode; 450 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 451 int i; 452 453 for (i = 0; i < io_ctl->num_pages; i++) { 454 int ret; 455 456 folio = __filemap_get_folio(inode->i_mapping, i, 457 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, 458 mask); 459 if (IS_ERR(folio)) { 460 io_ctl_drop_pages(io_ctl); 461 return PTR_ERR(folio); 462 } 463 464 ret = set_folio_extent_mapped(folio); 465 if (ret < 0) { 466 folio_unlock(folio); 467 folio_put(folio); 468 io_ctl_drop_pages(io_ctl); 469 return ret; 470 } 471 472 io_ctl->pages[i] = &folio->page; 473 if (uptodate && !folio_test_uptodate(folio)) { 474 btrfs_read_folio(NULL, folio); 475 folio_lock(folio); 476 if (folio->mapping != inode->i_mapping) { 477 btrfs_err(BTRFS_I(inode)->root->fs_info, 478 "free space cache page truncated"); 479 io_ctl_drop_pages(io_ctl); 480 return -EIO; 481 } 482 if (!folio_test_uptodate(folio)) { 483 btrfs_err(BTRFS_I(inode)->root->fs_info, 484 "error reading free space cache"); 485 io_ctl_drop_pages(io_ctl); 486 return -EIO; 487 } 488 } 489 } 490 491 for (i = 0; i < io_ctl->num_pages; i++) 492 clear_page_dirty_for_io(io_ctl->pages[i]); 493 494 return 0; 495 } 496 497 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 498 { 499 io_ctl_map_page(io_ctl, 1); 500 501 /* 502 * Skip the csum areas. If we don't check crcs then we just have a 503 * 64bit chunk at the front of the first page. 504 */ 505 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 506 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 507 508 put_unaligned_le64(generation, io_ctl->cur); 509 io_ctl->cur += sizeof(u64); 510 } 511 512 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 513 { 514 u64 cache_gen; 515 516 /* 517 * Skip the crc area. If we don't check crcs then we just have a 64bit 518 * chunk at the front of the first page. 519 */ 520 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 521 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 522 523 cache_gen = get_unaligned_le64(io_ctl->cur); 524 if (cache_gen != generation) { 525 btrfs_err_rl(io_ctl->fs_info, 526 "space cache generation (%llu) does not match inode (%llu)", 527 cache_gen, generation); 528 io_ctl_unmap_page(io_ctl); 529 return -EIO; 530 } 531 io_ctl->cur += sizeof(u64); 532 return 0; 533 } 534 535 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) 536 { 537 u32 *tmp; 538 u32 crc = ~(u32)0; 539 unsigned offset = 0; 540 541 if (index == 0) 542 offset = sizeof(u32) * io_ctl->num_pages; 543 544 crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 545 btrfs_crc32c_final(crc, (u8 *)&crc); 546 io_ctl_unmap_page(io_ctl); 547 tmp = page_address(io_ctl->pages[0]); 548 tmp += index; 549 *tmp = crc; 550 } 551 552 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) 553 { 554 u32 *tmp, val; 555 u32 crc = ~(u32)0; 556 unsigned offset = 0; 557 558 if (index == 0) 559 offset = sizeof(u32) * io_ctl->num_pages; 560 561 tmp = page_address(io_ctl->pages[0]); 562 tmp += index; 563 val = *tmp; 564 565 io_ctl_map_page(io_ctl, 0); 566 crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 567 btrfs_crc32c_final(crc, (u8 *)&crc); 568 if (val != crc) { 569 btrfs_err_rl(io_ctl->fs_info, 570 "csum mismatch on free space cache"); 571 io_ctl_unmap_page(io_ctl); 572 return -EIO; 573 } 574 575 return 0; 576 } 577 578 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, 579 void *bitmap) 580 { 581 struct btrfs_free_space_entry *entry; 582 583 if (!io_ctl->cur) 584 return -ENOSPC; 585 586 entry = io_ctl->cur; 587 put_unaligned_le64(offset, &entry->offset); 588 put_unaligned_le64(bytes, &entry->bytes); 589 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : 590 BTRFS_FREE_SPACE_EXTENT; 591 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 592 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 593 594 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 595 return 0; 596 597 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 598 599 /* No more pages to map */ 600 if (io_ctl->index >= io_ctl->num_pages) 601 return 0; 602 603 /* map the next page */ 604 io_ctl_map_page(io_ctl, 1); 605 return 0; 606 } 607 608 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) 609 { 610 if (!io_ctl->cur) 611 return -ENOSPC; 612 613 /* 614 * If we aren't at the start of the current page, unmap this one and 615 * map the next one if there is any left. 616 */ 617 if (io_ctl->cur != io_ctl->orig) { 618 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 619 if (io_ctl->index >= io_ctl->num_pages) 620 return -ENOSPC; 621 io_ctl_map_page(io_ctl, 0); 622 } 623 624 copy_page(io_ctl->cur, bitmap); 625 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 626 if (io_ctl->index < io_ctl->num_pages) 627 io_ctl_map_page(io_ctl, 0); 628 return 0; 629 } 630 631 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) 632 { 633 /* 634 * If we're not on the boundary we know we've modified the page and we 635 * need to crc the page. 636 */ 637 if (io_ctl->cur != io_ctl->orig) 638 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 639 else 640 io_ctl_unmap_page(io_ctl); 641 642 while (io_ctl->index < io_ctl->num_pages) { 643 io_ctl_map_page(io_ctl, 1); 644 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 645 } 646 } 647 648 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, 649 struct btrfs_free_space *entry, u8 *type) 650 { 651 struct btrfs_free_space_entry *e; 652 int ret; 653 654 if (!io_ctl->cur) { 655 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 656 if (ret) 657 return ret; 658 } 659 660 e = io_ctl->cur; 661 entry->offset = get_unaligned_le64(&e->offset); 662 entry->bytes = get_unaligned_le64(&e->bytes); 663 *type = e->type; 664 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 665 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 666 667 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 668 return 0; 669 670 io_ctl_unmap_page(io_ctl); 671 672 return 0; 673 } 674 675 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, 676 struct btrfs_free_space *entry) 677 { 678 int ret; 679 680 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 681 if (ret) 682 return ret; 683 684 copy_page(entry->bitmap, io_ctl->cur); 685 io_ctl_unmap_page(io_ctl); 686 687 return 0; 688 } 689 690 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) 691 { 692 struct btrfs_block_group *block_group = ctl->block_group; 693 u64 max_bytes; 694 u64 bitmap_bytes; 695 u64 extent_bytes; 696 u64 size = block_group->length; 697 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 698 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 699 700 max_bitmaps = max_t(u64, max_bitmaps, 1); 701 702 if (ctl->total_bitmaps > max_bitmaps) 703 btrfs_err(block_group->fs_info, 704 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", 705 block_group->start, block_group->length, 706 ctl->total_bitmaps, ctl->unit, max_bitmaps, 707 bytes_per_bg); 708 ASSERT(ctl->total_bitmaps <= max_bitmaps); 709 710 /* 711 * We are trying to keep the total amount of memory used per 1GiB of 712 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation 713 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of 714 * bitmaps, we may end up using more memory than this. 715 */ 716 if (size < SZ_1G) 717 max_bytes = MAX_CACHE_BYTES_PER_GIG; 718 else 719 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); 720 721 bitmap_bytes = ctl->total_bitmaps * ctl->unit; 722 723 /* 724 * we want the extent entry threshold to always be at most 1/2 the max 725 * bytes we can have, or whatever is less than that. 726 */ 727 extent_bytes = max_bytes - bitmap_bytes; 728 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); 729 730 ctl->extents_thresh = 731 div_u64(extent_bytes, sizeof(struct btrfs_free_space)); 732 } 733 734 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 735 struct btrfs_free_space_ctl *ctl, 736 struct btrfs_path *path, u64 offset) 737 { 738 struct btrfs_fs_info *fs_info = root->fs_info; 739 struct btrfs_free_space_header *header; 740 struct extent_buffer *leaf; 741 struct btrfs_io_ctl io_ctl; 742 struct btrfs_key key; 743 struct btrfs_free_space *e, *n; 744 LIST_HEAD(bitmaps); 745 u64 num_entries; 746 u64 num_bitmaps; 747 u64 generation; 748 u8 type; 749 int ret = 0; 750 751 /* Nothing in the space cache, goodbye */ 752 if (!i_size_read(inode)) 753 return 0; 754 755 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 756 key.type = 0; 757 key.offset = offset; 758 759 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 760 if (ret < 0) 761 return 0; 762 else if (ret > 0) { 763 btrfs_release_path(path); 764 return 0; 765 } 766 767 ret = -1; 768 769 leaf = path->nodes[0]; 770 header = btrfs_item_ptr(leaf, path->slots[0], 771 struct btrfs_free_space_header); 772 num_entries = btrfs_free_space_entries(leaf, header); 773 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 774 generation = btrfs_free_space_generation(leaf, header); 775 btrfs_release_path(path); 776 777 if (!BTRFS_I(inode)->generation) { 778 btrfs_info(fs_info, 779 "the free space cache file (%llu) is invalid, skip it", 780 offset); 781 return 0; 782 } 783 784 if (BTRFS_I(inode)->generation != generation) { 785 btrfs_err(fs_info, 786 "free space inode generation (%llu) did not match free space cache generation (%llu)", 787 BTRFS_I(inode)->generation, generation); 788 return 0; 789 } 790 791 if (!num_entries) 792 return 0; 793 794 ret = io_ctl_init(&io_ctl, inode, 0); 795 if (ret) 796 return ret; 797 798 readahead_cache(inode); 799 800 ret = io_ctl_prepare_pages(&io_ctl, true); 801 if (ret) 802 goto out; 803 804 ret = io_ctl_check_crc(&io_ctl, 0); 805 if (ret) 806 goto free_cache; 807 808 ret = io_ctl_check_generation(&io_ctl, generation); 809 if (ret) 810 goto free_cache; 811 812 while (num_entries) { 813 e = kmem_cache_zalloc(btrfs_free_space_cachep, 814 GFP_NOFS); 815 if (!e) { 816 ret = -ENOMEM; 817 goto free_cache; 818 } 819 820 ret = io_ctl_read_entry(&io_ctl, e, &type); 821 if (ret) { 822 kmem_cache_free(btrfs_free_space_cachep, e); 823 goto free_cache; 824 } 825 826 if (!e->bytes) { 827 ret = -1; 828 kmem_cache_free(btrfs_free_space_cachep, e); 829 goto free_cache; 830 } 831 832 if (type == BTRFS_FREE_SPACE_EXTENT) { 833 spin_lock(&ctl->tree_lock); 834 ret = link_free_space(ctl, e); 835 spin_unlock(&ctl->tree_lock); 836 if (ret) { 837 btrfs_err(fs_info, 838 "Duplicate entries in free space cache, dumping"); 839 kmem_cache_free(btrfs_free_space_cachep, e); 840 goto free_cache; 841 } 842 } else { 843 ASSERT(num_bitmaps); 844 num_bitmaps--; 845 e->bitmap = kmem_cache_zalloc( 846 btrfs_free_space_bitmap_cachep, GFP_NOFS); 847 if (!e->bitmap) { 848 ret = -ENOMEM; 849 kmem_cache_free( 850 btrfs_free_space_cachep, e); 851 goto free_cache; 852 } 853 spin_lock(&ctl->tree_lock); 854 ret = link_free_space(ctl, e); 855 if (ret) { 856 spin_unlock(&ctl->tree_lock); 857 btrfs_err(fs_info, 858 "Duplicate entries in free space cache, dumping"); 859 kmem_cache_free(btrfs_free_space_bitmap_cachep, e->bitmap); 860 kmem_cache_free(btrfs_free_space_cachep, e); 861 goto free_cache; 862 } 863 ctl->total_bitmaps++; 864 recalculate_thresholds(ctl); 865 spin_unlock(&ctl->tree_lock); 866 list_add_tail(&e->list, &bitmaps); 867 } 868 869 num_entries--; 870 } 871 872 io_ctl_unmap_page(&io_ctl); 873 874 /* 875 * We add the bitmaps at the end of the entries in order that 876 * the bitmap entries are added to the cache. 877 */ 878 list_for_each_entry_safe(e, n, &bitmaps, list) { 879 list_del_init(&e->list); 880 ret = io_ctl_read_bitmap(&io_ctl, e); 881 if (ret) 882 goto free_cache; 883 } 884 885 io_ctl_drop_pages(&io_ctl); 886 ret = 1; 887 out: 888 io_ctl_free(&io_ctl); 889 return ret; 890 free_cache: 891 io_ctl_drop_pages(&io_ctl); 892 893 spin_lock(&ctl->tree_lock); 894 __btrfs_remove_free_space_cache(ctl); 895 spin_unlock(&ctl->tree_lock); 896 goto out; 897 } 898 899 static int copy_free_space_cache(struct btrfs_block_group *block_group, 900 struct btrfs_free_space_ctl *ctl) 901 { 902 struct btrfs_free_space *info; 903 struct rb_node *n; 904 int ret = 0; 905 906 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { 907 info = rb_entry(n, struct btrfs_free_space, offset_index); 908 if (!info->bitmap) { 909 const u64 offset = info->offset; 910 const u64 bytes = info->bytes; 911 912 unlink_free_space(ctl, info, true); 913 spin_unlock(&ctl->tree_lock); 914 kmem_cache_free(btrfs_free_space_cachep, info); 915 ret = btrfs_add_free_space(block_group, offset, bytes); 916 spin_lock(&ctl->tree_lock); 917 } else { 918 u64 offset = info->offset; 919 u64 bytes = ctl->unit; 920 921 ret = search_bitmap(ctl, info, &offset, &bytes, false); 922 if (ret == 0) { 923 bitmap_clear_bits(ctl, info, offset, bytes, true); 924 spin_unlock(&ctl->tree_lock); 925 ret = btrfs_add_free_space(block_group, offset, 926 bytes); 927 spin_lock(&ctl->tree_lock); 928 } else { 929 free_bitmap(ctl, info); 930 ret = 0; 931 } 932 } 933 cond_resched_lock(&ctl->tree_lock); 934 } 935 return ret; 936 } 937 938 static struct lock_class_key btrfs_free_space_inode_key; 939 940 int load_free_space_cache(struct btrfs_block_group *block_group) 941 { 942 struct btrfs_fs_info *fs_info = block_group->fs_info; 943 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 944 struct btrfs_free_space_ctl tmp_ctl = {}; 945 struct inode *inode; 946 struct btrfs_path *path; 947 int ret = 0; 948 bool matched; 949 u64 used = block_group->used; 950 951 /* 952 * Because we could potentially discard our loaded free space, we want 953 * to load everything into a temporary structure first, and then if it's 954 * valid copy it all into the actual free space ctl. 955 */ 956 btrfs_init_free_space_ctl(block_group, &tmp_ctl); 957 958 /* 959 * If this block group has been marked to be cleared for one reason or 960 * another then we can't trust the on disk cache, so just return. 961 */ 962 spin_lock(&block_group->lock); 963 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 964 spin_unlock(&block_group->lock); 965 return 0; 966 } 967 spin_unlock(&block_group->lock); 968 969 path = btrfs_alloc_path(); 970 if (!path) 971 return 0; 972 path->search_commit_root = true; 973 path->skip_locking = true; 974 975 /* 976 * We must pass a path with search_commit_root set to btrfs_iget in 977 * order to avoid a deadlock when allocating extents for the tree root. 978 * 979 * When we are COWing an extent buffer from the tree root, when looking 980 * for a free extent, at extent-tree.c:find_free_extent(), we can find 981 * block group without its free space cache loaded. When we find one 982 * we must load its space cache which requires reading its free space 983 * cache's inode item from the root tree. If this inode item is located 984 * in the same leaf that we started COWing before, then we end up in 985 * deadlock on the extent buffer (trying to read lock it when we 986 * previously write locked it). 987 * 988 * It's safe to read the inode item using the commit root because 989 * block groups, once loaded, stay in memory forever (until they are 990 * removed) as well as their space caches once loaded. New block groups 991 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 992 * we will never try to read their inode item while the fs is mounted. 993 */ 994 inode = lookup_free_space_inode(block_group, path); 995 if (IS_ERR(inode)) { 996 btrfs_free_path(path); 997 return 0; 998 } 999 1000 /* We may have converted the inode and made the cache invalid. */ 1001 spin_lock(&block_group->lock); 1002 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 1003 spin_unlock(&block_group->lock); 1004 btrfs_free_path(path); 1005 goto out; 1006 } 1007 spin_unlock(&block_group->lock); 1008 1009 /* 1010 * Reinitialize the class of struct inode's mapping->invalidate_lock for 1011 * free space inodes to prevent false positives related to locks for normal 1012 * inodes. 1013 */ 1014 lockdep_set_class(&(&inode->i_data)->invalidate_lock, 1015 &btrfs_free_space_inode_key); 1016 1017 ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, 1018 path, block_group->start); 1019 btrfs_free_path(path); 1020 if (ret <= 0) 1021 goto out; 1022 1023 matched = (tmp_ctl.free_space == (block_group->length - used - 1024 block_group->bytes_super)); 1025 1026 if (matched) { 1027 spin_lock(&tmp_ctl.tree_lock); 1028 ret = copy_free_space_cache(block_group, &tmp_ctl); 1029 spin_unlock(&tmp_ctl.tree_lock); 1030 /* 1031 * ret == 1 means we successfully loaded the free space cache, 1032 * so we need to re-set it here. 1033 */ 1034 if (ret == 0) 1035 ret = 1; 1036 } else { 1037 /* 1038 * We need to call the _locked variant so we don't try to update 1039 * the discard counters. 1040 */ 1041 spin_lock(&tmp_ctl.tree_lock); 1042 __btrfs_remove_free_space_cache(&tmp_ctl); 1043 spin_unlock(&tmp_ctl.tree_lock); 1044 btrfs_warn(fs_info, 1045 "block group %llu has wrong amount of free space", 1046 block_group->start); 1047 ret = -1; 1048 } 1049 out: 1050 if (ret < 0) { 1051 /* This cache is bogus, make sure it gets cleared */ 1052 spin_lock(&block_group->lock); 1053 block_group->disk_cache_state = BTRFS_DC_CLEAR; 1054 spin_unlock(&block_group->lock); 1055 ret = 0; 1056 1057 btrfs_warn(fs_info, 1058 "failed to load free space cache for block group %llu, rebuilding it now", 1059 block_group->start); 1060 } 1061 1062 spin_lock(&ctl->tree_lock); 1063 btrfs_discard_update_discardable(block_group); 1064 spin_unlock(&ctl->tree_lock); 1065 iput(inode); 1066 return ret; 1067 } 1068 1069 static noinline_for_stack 1070 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, 1071 struct btrfs_free_space_ctl *ctl, 1072 struct btrfs_block_group *block_group, 1073 int *entries, int *bitmaps, 1074 struct list_head *bitmap_list) 1075 { 1076 int ret; 1077 struct btrfs_free_cluster *cluster = NULL; 1078 struct btrfs_free_cluster *cluster_locked = NULL; 1079 struct rb_node *node = rb_first(&ctl->free_space_offset); 1080 struct btrfs_trim_range *trim_entry; 1081 1082 /* Get the cluster for this block_group if it exists */ 1083 if (!list_empty(&block_group->cluster_list)) { 1084 cluster = list_first_entry(&block_group->cluster_list, 1085 struct btrfs_free_cluster, block_group_list); 1086 } 1087 1088 if (!node && cluster) { 1089 cluster_locked = cluster; 1090 spin_lock(&cluster_locked->lock); 1091 node = rb_first(&cluster->root); 1092 cluster = NULL; 1093 } 1094 1095 /* Write out the extent entries */ 1096 while (node) { 1097 struct btrfs_free_space *e; 1098 1099 e = rb_entry(node, struct btrfs_free_space, offset_index); 1100 *entries += 1; 1101 1102 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 1103 e->bitmap); 1104 if (ret) 1105 goto fail; 1106 1107 if (e->bitmap) { 1108 list_add_tail(&e->list, bitmap_list); 1109 *bitmaps += 1; 1110 } 1111 node = rb_next(node); 1112 if (!node && cluster) { 1113 node = rb_first(&cluster->root); 1114 cluster_locked = cluster; 1115 spin_lock(&cluster_locked->lock); 1116 cluster = NULL; 1117 } 1118 } 1119 if (cluster_locked) { 1120 spin_unlock(&cluster_locked->lock); 1121 cluster_locked = NULL; 1122 } 1123 1124 /* 1125 * Make sure we don't miss any range that was removed from our rbtree 1126 * because trimming is running. Otherwise after a umount+mount (or crash 1127 * after committing the transaction) we would leak free space and get 1128 * an inconsistent free space cache report from fsck. 1129 */ 1130 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 1131 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 1132 trim_entry->bytes, NULL); 1133 if (ret) 1134 goto fail; 1135 *entries += 1; 1136 } 1137 1138 return 0; 1139 fail: 1140 if (cluster_locked) 1141 spin_unlock(&cluster_locked->lock); 1142 return -ENOSPC; 1143 } 1144 1145 static noinline_for_stack int 1146 update_cache_item(struct btrfs_trans_handle *trans, 1147 struct btrfs_root *root, 1148 struct inode *inode, 1149 struct btrfs_path *path, u64 offset, 1150 int entries, int bitmaps) 1151 { 1152 struct btrfs_key key; 1153 struct btrfs_free_space_header *header; 1154 struct extent_buffer *leaf; 1155 int ret; 1156 1157 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1158 key.type = 0; 1159 key.offset = offset; 1160 1161 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1162 if (ret < 0) { 1163 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1164 EXTENT_DELALLOC, NULL); 1165 return ret; 1166 } 1167 leaf = path->nodes[0]; 1168 if (ret > 0) { 1169 struct btrfs_key found_key; 1170 ASSERT(path->slots[0]); 1171 path->slots[0]--; 1172 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1173 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1174 found_key.offset != offset) { 1175 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 1176 inode->i_size - 1, EXTENT_DELALLOC, 1177 NULL); 1178 btrfs_release_path(path); 1179 return -ENOENT; 1180 } 1181 } 1182 1183 BTRFS_I(inode)->generation = trans->transid; 1184 header = btrfs_item_ptr(leaf, path->slots[0], 1185 struct btrfs_free_space_header); 1186 btrfs_set_free_space_entries(leaf, header, entries); 1187 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1188 btrfs_set_free_space_generation(leaf, header, trans->transid); 1189 btrfs_release_path(path); 1190 1191 return 0; 1192 } 1193 1194 static noinline_for_stack int write_pinned_extent_entries( 1195 struct btrfs_trans_handle *trans, 1196 struct btrfs_block_group *block_group, 1197 struct btrfs_io_ctl *io_ctl, 1198 int *entries) 1199 { 1200 u64 start, extent_start, extent_end, len; 1201 const u64 block_group_end = btrfs_block_group_end(block_group); 1202 struct extent_io_tree *unpin = NULL; 1203 int ret; 1204 1205 /* 1206 * We want to add any pinned extents to our free space cache 1207 * so we don't leak the space 1208 * 1209 * We shouldn't have switched the pinned extents yet so this is the 1210 * right one 1211 */ 1212 unpin = &trans->transaction->pinned_extents; 1213 1214 start = block_group->start; 1215 1216 while (start < block_group_end) { 1217 if (!btrfs_find_first_extent_bit(unpin, start, 1218 &extent_start, &extent_end, 1219 EXTENT_DIRTY, NULL)) 1220 return 0; 1221 1222 /* This pinned extent is out of our range */ 1223 if (extent_start >= block_group_end) 1224 return 0; 1225 1226 extent_start = max(extent_start, start); 1227 extent_end = min(block_group_end, extent_end + 1); 1228 len = extent_end - extent_start; 1229 1230 *entries += 1; 1231 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1232 if (ret) 1233 return -ENOSPC; 1234 1235 start = extent_end; 1236 } 1237 1238 return 0; 1239 } 1240 1241 static noinline_for_stack int 1242 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) 1243 { 1244 struct btrfs_free_space *entry, *next; 1245 int ret; 1246 1247 /* Write out the bitmaps */ 1248 list_for_each_entry_safe(entry, next, bitmap_list, list) { 1249 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1250 if (ret) 1251 return -ENOSPC; 1252 list_del_init(&entry->list); 1253 } 1254 1255 return 0; 1256 } 1257 1258 static int flush_dirty_cache(struct inode *inode) 1259 { 1260 int ret; 1261 1262 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); 1263 if (ret) 1264 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1265 EXTENT_DELALLOC, NULL); 1266 1267 return ret; 1268 } 1269 1270 static void noinline_for_stack 1271 cleanup_bitmap_list(struct list_head *bitmap_list) 1272 { 1273 struct btrfs_free_space *entry, *next; 1274 1275 list_for_each_entry_safe(entry, next, bitmap_list, list) 1276 list_del_init(&entry->list); 1277 } 1278 1279 static void noinline_for_stack 1280 cleanup_write_cache_enospc(struct inode *inode, 1281 struct btrfs_io_ctl *io_ctl, 1282 struct extent_state **cached_state) 1283 { 1284 io_ctl_drop_pages(io_ctl); 1285 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1286 cached_state); 1287 } 1288 1289 static int __btrfs_wait_cache_io(struct btrfs_root *root, 1290 struct btrfs_trans_handle *trans, 1291 struct btrfs_block_group *block_group, 1292 struct btrfs_io_ctl *io_ctl, 1293 struct btrfs_path *path, u64 offset) 1294 { 1295 int ret; 1296 struct inode *inode = io_ctl->inode; 1297 1298 if (!inode) 1299 return 0; 1300 1301 /* Flush the dirty pages in the cache file. */ 1302 ret = flush_dirty_cache(inode); 1303 if (ret) 1304 goto out; 1305 1306 /* Update the cache item to tell everyone this cache file is valid. */ 1307 ret = update_cache_item(trans, root, inode, path, offset, 1308 io_ctl->entries, io_ctl->bitmaps); 1309 out: 1310 if (ret) { 1311 invalidate_inode_pages2(inode->i_mapping); 1312 BTRFS_I(inode)->generation = 0; 1313 if (block_group) 1314 btrfs_debug(root->fs_info, 1315 "failed to write free space cache for block group %llu error %d", 1316 block_group->start, ret); 1317 } 1318 btrfs_update_inode(trans, BTRFS_I(inode)); 1319 1320 if (block_group) { 1321 /* the dirty list is protected by the dirty_bgs_lock */ 1322 spin_lock(&trans->transaction->dirty_bgs_lock); 1323 1324 /* the disk_cache_state is protected by the block group lock */ 1325 spin_lock(&block_group->lock); 1326 1327 /* 1328 * only mark this as written if we didn't get put back on 1329 * the dirty list while waiting for IO. Otherwise our 1330 * cache state won't be right, and we won't get written again 1331 */ 1332 if (!ret && list_empty(&block_group->dirty_list)) 1333 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1334 else if (ret) 1335 block_group->disk_cache_state = BTRFS_DC_ERROR; 1336 1337 spin_unlock(&block_group->lock); 1338 spin_unlock(&trans->transaction->dirty_bgs_lock); 1339 io_ctl->inode = NULL; 1340 iput(inode); 1341 } 1342 1343 return ret; 1344 1345 } 1346 1347 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 1348 struct btrfs_block_group *block_group, 1349 struct btrfs_path *path) 1350 { 1351 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, 1352 block_group, &block_group->io_ctl, 1353 path, block_group->start); 1354 } 1355 1356 /* 1357 * Write out cached info to an inode. 1358 * 1359 * @inode: freespace inode we are writing out 1360 * @ctl: free space cache we are going to write out 1361 * @block_group: block_group for this cache if it belongs to a block_group 1362 * @io_ctl: holds context for the io 1363 * @trans: the trans handle 1364 * 1365 * This function writes out a free space cache struct to disk for quick recovery 1366 * on mount. This will return 0 if it was successful in writing the cache out, 1367 * or an errno if it was not. 1368 */ 1369 static int __btrfs_write_out_cache(struct inode *inode, 1370 struct btrfs_free_space_ctl *ctl, 1371 struct btrfs_block_group *block_group, 1372 struct btrfs_trans_handle *trans) 1373 { 1374 struct btrfs_io_ctl *io_ctl = &block_group->io_ctl; 1375 struct extent_state *cached_state = NULL; 1376 LIST_HEAD(bitmap_list); 1377 int entries = 0; 1378 int bitmaps = 0; 1379 int ret; 1380 int must_iput = 0; 1381 int i_size; 1382 1383 if (!i_size_read(inode)) 1384 return -EIO; 1385 1386 WARN_ON(io_ctl->pages); 1387 ret = io_ctl_init(io_ctl, inode, 1); 1388 if (ret) 1389 return ret; 1390 1391 if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) { 1392 down_write(&block_group->data_rwsem); 1393 spin_lock(&block_group->lock); 1394 if (block_group->delalloc_bytes) { 1395 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1396 spin_unlock(&block_group->lock); 1397 up_write(&block_group->data_rwsem); 1398 BTRFS_I(inode)->generation = 0; 1399 ret = 0; 1400 must_iput = 1; 1401 goto out; 1402 } 1403 spin_unlock(&block_group->lock); 1404 } 1405 1406 /* Lock all pages first so we can lock the extent safely. */ 1407 ret = io_ctl_prepare_pages(io_ctl, false); 1408 if (ret) 1409 goto out_unlock; 1410 1411 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1412 &cached_state); 1413 1414 io_ctl_set_generation(io_ctl, trans->transid); 1415 1416 mutex_lock(&ctl->cache_writeout_mutex); 1417 /* Write out the extent entries in the free space cache */ 1418 spin_lock(&ctl->tree_lock); 1419 ret = write_cache_extent_entries(io_ctl, ctl, 1420 block_group, &entries, &bitmaps, 1421 &bitmap_list); 1422 if (ret) 1423 goto out_nospc_locked; 1424 1425 /* 1426 * Some spaces that are freed in the current transaction are pinned, 1427 * they will be added into free space cache after the transaction is 1428 * committed, we shouldn't lose them. 1429 * 1430 * If this changes while we are working we'll get added back to 1431 * the dirty list and redo it. No locking needed 1432 */ 1433 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); 1434 if (ret) 1435 goto out_nospc_locked; 1436 1437 /* 1438 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1439 * locked while doing it because a concurrent trim can be manipulating 1440 * or freeing the bitmap. 1441 */ 1442 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1443 spin_unlock(&ctl->tree_lock); 1444 mutex_unlock(&ctl->cache_writeout_mutex); 1445 if (ret) 1446 goto out_nospc; 1447 1448 /* Zero out the rest of the pages just to make sure */ 1449 io_ctl_zero_remaining_pages(io_ctl); 1450 1451 /* Everything is written out, now we dirty the pages in the file. */ 1452 i_size = i_size_read(inode); 1453 for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) { 1454 u64 dirty_start = i * PAGE_SIZE; 1455 u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start; 1456 1457 ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]), 1458 dirty_start, dirty_len, &cached_state, false); 1459 if (ret < 0) 1460 goto out_nospc; 1461 } 1462 1463 if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) 1464 up_write(&block_group->data_rwsem); 1465 /* 1466 * Release the pages and unlock the extent, we will flush 1467 * them out later 1468 */ 1469 io_ctl_drop_pages(io_ctl); 1470 io_ctl_free(io_ctl); 1471 1472 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1473 &cached_state); 1474 1475 /* 1476 * at this point the pages are under IO and we're happy, 1477 * The caller is responsible for waiting on them and updating 1478 * the cache and the inode 1479 */ 1480 io_ctl->entries = entries; 1481 io_ctl->bitmaps = bitmaps; 1482 1483 ret = btrfs_fdatawrite_range(BTRFS_I(inode), 0, (u64)-1); 1484 if (ret) 1485 goto out; 1486 1487 return 0; 1488 1489 out_nospc_locked: 1490 cleanup_bitmap_list(&bitmap_list); 1491 spin_unlock(&ctl->tree_lock); 1492 mutex_unlock(&ctl->cache_writeout_mutex); 1493 1494 out_nospc: 1495 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1496 1497 out_unlock: 1498 if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) 1499 up_write(&block_group->data_rwsem); 1500 1501 out: 1502 io_ctl->inode = NULL; 1503 io_ctl_free(io_ctl); 1504 if (ret) { 1505 invalidate_inode_pages2(inode->i_mapping); 1506 BTRFS_I(inode)->generation = 0; 1507 } 1508 btrfs_update_inode(trans, BTRFS_I(inode)); 1509 if (must_iput) 1510 iput(inode); 1511 return ret; 1512 } 1513 1514 int btrfs_write_out_cache(struct btrfs_trans_handle *trans, 1515 struct btrfs_block_group *block_group, 1516 struct btrfs_path *path) 1517 { 1518 struct btrfs_fs_info *fs_info = trans->fs_info; 1519 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1520 struct inode *inode; 1521 int ret = 0; 1522 1523 spin_lock(&block_group->lock); 1524 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1525 spin_unlock(&block_group->lock); 1526 return 0; 1527 } 1528 spin_unlock(&block_group->lock); 1529 1530 inode = lookup_free_space_inode(block_group, path); 1531 if (IS_ERR(inode)) 1532 return 0; 1533 1534 ret = __btrfs_write_out_cache(inode, ctl, block_group, trans); 1535 if (ret) { 1536 btrfs_debug(fs_info, 1537 "failed to write free space cache for block group %llu error %d", 1538 block_group->start, ret); 1539 spin_lock(&block_group->lock); 1540 block_group->disk_cache_state = BTRFS_DC_ERROR; 1541 spin_unlock(&block_group->lock); 1542 1543 block_group->io_ctl.inode = NULL; 1544 iput(inode); 1545 } 1546 1547 /* 1548 * if ret == 0 the caller is expected to call btrfs_wait_cache_io 1549 * to wait for IO and put the inode 1550 */ 1551 1552 return ret; 1553 } 1554 1555 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1556 u64 offset) 1557 { 1558 ASSERT(offset >= bitmap_start); 1559 offset -= bitmap_start; 1560 return (unsigned long)(div_u64(offset, unit)); 1561 } 1562 1563 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1564 { 1565 return (unsigned long)(div_u64(bytes, unit)); 1566 } 1567 1568 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1569 u64 offset) 1570 { 1571 u64 bitmap_start; 1572 u64 bytes_per_bitmap; 1573 1574 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1575 bitmap_start = offset - ctl->start; 1576 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1577 bitmap_start *= bytes_per_bitmap; 1578 bitmap_start += ctl->start; 1579 1580 return bitmap_start; 1581 } 1582 1583 static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, 1584 struct btrfs_free_cluster *cluster, 1585 struct btrfs_free_space *new_entry) 1586 { 1587 struct rb_root *root; 1588 struct rb_node **p; 1589 struct rb_node *parent = NULL; 1590 1591 lockdep_assert_held(&ctl->tree_lock); 1592 1593 if (cluster) { 1594 lockdep_assert_held(&cluster->lock); 1595 root = &cluster->root; 1596 } else { 1597 root = &ctl->free_space_offset; 1598 } 1599 1600 p = &root->rb_node; 1601 1602 while (*p) { 1603 struct btrfs_free_space *info; 1604 1605 parent = *p; 1606 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1607 1608 if (new_entry->offset < info->offset) { 1609 p = &(*p)->rb_left; 1610 } else if (new_entry->offset > info->offset) { 1611 p = &(*p)->rb_right; 1612 } else { 1613 /* 1614 * we could have a bitmap entry and an extent entry 1615 * share the same offset. If this is the case, we want 1616 * the extent entry to always be found first if we do a 1617 * linear search through the tree, since we want to have 1618 * the quickest allocation time, and allocating from an 1619 * extent is faster than allocating from a bitmap. So 1620 * if we're inserting a bitmap and we find an entry at 1621 * this offset, we want to go right, or after this entry 1622 * logically. If we are inserting an extent and we've 1623 * found a bitmap, we want to go left, or before 1624 * logically. 1625 */ 1626 if (new_entry->bitmap) { 1627 if (info->bitmap) { 1628 WARN_ON_ONCE(1); 1629 return -EEXIST; 1630 } 1631 p = &(*p)->rb_right; 1632 } else { 1633 if (!info->bitmap) { 1634 WARN_ON_ONCE(1); 1635 return -EEXIST; 1636 } 1637 p = &(*p)->rb_left; 1638 } 1639 } 1640 } 1641 1642 rb_link_node(&new_entry->offset_index, parent, p); 1643 rb_insert_color(&new_entry->offset_index, root); 1644 1645 return 0; 1646 } 1647 1648 /* 1649 * This is a little subtle. We *only* have ->max_extent_size set if we actually 1650 * searched through the bitmap and figured out the largest ->max_extent_size, 1651 * otherwise it's 0. In the case that it's 0 we don't want to tell the 1652 * allocator the wrong thing, we want to use the actual real max_extent_size 1653 * we've found already if it's larger, or we want to use ->bytes. 1654 * 1655 * This matters because find_free_space() will skip entries who's ->bytes is 1656 * less than the required bytes. So if we didn't search down this bitmap, we 1657 * may pick some previous entry that has a smaller ->max_extent_size than we 1658 * have. For example, assume we have two entries, one that has 1659 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set 1660 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will 1661 * call into find_free_space(), and return with max_extent_size == 4K, because 1662 * that first bitmap entry had ->max_extent_size set, but the second one did 1663 * not. If instead we returned 8K we'd come in searching for 8K, and find the 1664 * 8K contiguous range. 1665 * 1666 * Consider the other case, we have 2 8K chunks in that second entry and still 1667 * don't have ->max_extent_size set. We'll return 16K, and the next time the 1668 * allocator comes in it'll fully search our second bitmap, and this time it'll 1669 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the 1670 * right allocation the next loop through. 1671 */ 1672 static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) 1673 { 1674 if (entry->bitmap && entry->max_extent_size) 1675 return entry->max_extent_size; 1676 return entry->bytes; 1677 } 1678 1679 /* 1680 * We want the largest entry to be leftmost, so this is inverted from what you'd 1681 * normally expect. 1682 */ 1683 static bool entry_less(struct rb_node *node, const struct rb_node *parent) 1684 { 1685 const struct btrfs_free_space *entry, *exist; 1686 1687 entry = rb_entry(node, struct btrfs_free_space, bytes_index); 1688 exist = rb_entry(parent, struct btrfs_free_space, bytes_index); 1689 return get_max_extent_size(exist) < get_max_extent_size(entry); 1690 } 1691 1692 /* 1693 * searches the tree for the given offset. 1694 * 1695 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1696 * want a section that has at least bytes size and comes at or after the given 1697 * offset. 1698 */ 1699 static struct btrfs_free_space * 1700 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1701 u64 offset, int bitmap_only, int fuzzy) 1702 { 1703 struct rb_node *n = ctl->free_space_offset.rb_node; 1704 struct btrfs_free_space *entry = NULL, *prev = NULL; 1705 1706 lockdep_assert_held(&ctl->tree_lock); 1707 1708 /* find entry that is closest to the 'offset' */ 1709 while (n) { 1710 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1711 prev = entry; 1712 1713 if (offset < entry->offset) 1714 n = n->rb_left; 1715 else if (offset > entry->offset) 1716 n = n->rb_right; 1717 else 1718 break; 1719 1720 entry = NULL; 1721 } 1722 1723 if (bitmap_only) { 1724 if (!entry) 1725 return NULL; 1726 if (entry->bitmap) 1727 return entry; 1728 1729 /* 1730 * bitmap entry and extent entry may share same offset, 1731 * in that case, bitmap entry comes after extent entry. 1732 */ 1733 n = rb_next(n); 1734 if (!n) 1735 return NULL; 1736 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1737 if (entry->offset != offset) 1738 return NULL; 1739 1740 WARN_ON(!entry->bitmap); 1741 return entry; 1742 } else if (entry) { 1743 if (entry->bitmap) { 1744 /* 1745 * if previous extent entry covers the offset, 1746 * we should return it instead of the bitmap entry 1747 */ 1748 n = rb_prev(&entry->offset_index); 1749 if (n) { 1750 prev = rb_entry(n, struct btrfs_free_space, 1751 offset_index); 1752 if (!prev->bitmap && 1753 prev->offset + prev->bytes > offset) 1754 entry = prev; 1755 } 1756 } 1757 return entry; 1758 } 1759 1760 if (!prev) 1761 return NULL; 1762 1763 /* find last entry before the 'offset' */ 1764 entry = prev; 1765 if (entry->offset > offset) { 1766 n = rb_prev(&entry->offset_index); 1767 if (n) { 1768 entry = rb_entry(n, struct btrfs_free_space, 1769 offset_index); 1770 ASSERT(entry->offset <= offset); 1771 } else { 1772 if (fuzzy) 1773 return entry; 1774 else 1775 return NULL; 1776 } 1777 } 1778 1779 if (entry->bitmap) { 1780 n = rb_prev(&entry->offset_index); 1781 if (n) { 1782 prev = rb_entry(n, struct btrfs_free_space, 1783 offset_index); 1784 if (!prev->bitmap && 1785 prev->offset + prev->bytes > offset) 1786 return prev; 1787 } 1788 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1789 return entry; 1790 } else if (entry->offset + entry->bytes > offset) 1791 return entry; 1792 1793 if (!fuzzy) 1794 return NULL; 1795 1796 while (1) { 1797 n = rb_next(&entry->offset_index); 1798 if (!n) 1799 return NULL; 1800 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1801 if (entry->bitmap) { 1802 if (entry->offset + BITS_PER_BITMAP * 1803 ctl->unit > offset) 1804 break; 1805 } else { 1806 if (entry->offset + entry->bytes > offset) 1807 break; 1808 } 1809 } 1810 return entry; 1811 } 1812 1813 static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1814 struct btrfs_free_space *info, 1815 bool update_stat) 1816 { 1817 lockdep_assert_held(&ctl->tree_lock); 1818 1819 rb_erase(&info->offset_index, &ctl->free_space_offset); 1820 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1821 ctl->free_extents--; 1822 1823 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1824 ctl->discardable_extents[BTRFS_STAT_CURR]--; 1825 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; 1826 } 1827 1828 if (update_stat) 1829 ctl->free_space -= info->bytes; 1830 } 1831 1832 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1833 struct btrfs_free_space *info) 1834 { 1835 int ret = 0; 1836 1837 lockdep_assert_held(&ctl->tree_lock); 1838 1839 ASSERT(info->bytes || info->bitmap); 1840 ret = tree_insert_offset(ctl, NULL, info); 1841 if (ret) 1842 return ret; 1843 1844 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1845 1846 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1847 ctl->discardable_extents[BTRFS_STAT_CURR]++; 1848 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 1849 } 1850 1851 ctl->free_space += info->bytes; 1852 ctl->free_extents++; 1853 return ret; 1854 } 1855 1856 static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, 1857 struct btrfs_free_space *info) 1858 { 1859 ASSERT(info->bitmap); 1860 1861 /* 1862 * If our entry is empty it's because we're on a cluster and we don't 1863 * want to re-link it into our ctl bytes index. 1864 */ 1865 if (RB_EMPTY_NODE(&info->bytes_index)) 1866 return; 1867 1868 lockdep_assert_held(&ctl->tree_lock); 1869 1870 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1871 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1872 } 1873 1874 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1875 struct btrfs_free_space *info, 1876 u64 offset, u64 bytes, bool update_stat) 1877 { 1878 unsigned long start, count, end; 1879 int extent_delta = -1; 1880 1881 start = offset_to_bit(info->offset, ctl->unit, offset); 1882 count = bytes_to_bits(bytes, ctl->unit); 1883 end = start + count; 1884 ASSERT(end <= BITS_PER_BITMAP); 1885 1886 bitmap_clear(info->bitmap, start, count); 1887 1888 info->bytes -= bytes; 1889 if (info->max_extent_size > ctl->unit) 1890 info->max_extent_size = 0; 1891 1892 relink_bitmap_entry(ctl, info); 1893 1894 if (start && test_bit(start - 1, info->bitmap)) 1895 extent_delta++; 1896 1897 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1898 extent_delta++; 1899 1900 info->bitmap_extents += extent_delta; 1901 if (!btrfs_free_space_trimmed(info)) { 1902 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1903 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 1904 } 1905 1906 if (update_stat) 1907 ctl->free_space -= bytes; 1908 } 1909 1910 static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1911 struct btrfs_free_space *info, u64 offset, 1912 u64 bytes) 1913 { 1914 unsigned long start, count, end; 1915 int extent_delta = 1; 1916 1917 start = offset_to_bit(info->offset, ctl->unit, offset); 1918 count = bytes_to_bits(bytes, ctl->unit); 1919 end = start + count; 1920 ASSERT(end <= BITS_PER_BITMAP); 1921 1922 bitmap_set(info->bitmap, start, count); 1923 1924 /* 1925 * We set some bytes, we have no idea what the max extent size is 1926 * anymore. 1927 */ 1928 info->max_extent_size = 0; 1929 info->bytes += bytes; 1930 ctl->free_space += bytes; 1931 1932 relink_bitmap_entry(ctl, info); 1933 1934 if (start && test_bit(start - 1, info->bitmap)) 1935 extent_delta--; 1936 1937 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1938 extent_delta--; 1939 1940 info->bitmap_extents += extent_delta; 1941 if (!btrfs_free_space_trimmed(info)) { 1942 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1943 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; 1944 } 1945 } 1946 1947 /* 1948 * If we can not find suitable extent, we will use bytes to record 1949 * the size of the max extent. 1950 */ 1951 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1952 struct btrfs_free_space *bitmap_info, u64 *offset, 1953 u64 *bytes, bool for_alloc) 1954 { 1955 unsigned long found_bits = 0; 1956 unsigned long max_bits = 0; 1957 unsigned long bits, i; 1958 unsigned long next_zero; 1959 unsigned long extent_bits; 1960 1961 /* 1962 * Skip searching the bitmap if we don't have a contiguous section that 1963 * is large enough for this allocation. 1964 */ 1965 if (for_alloc && 1966 bitmap_info->max_extent_size && 1967 bitmap_info->max_extent_size < *bytes) { 1968 *bytes = bitmap_info->max_extent_size; 1969 return -1; 1970 } 1971 1972 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1973 max_t(u64, *offset, bitmap_info->offset)); 1974 bits = bytes_to_bits(*bytes, ctl->unit); 1975 1976 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1977 if (for_alloc && bits == 1) { 1978 found_bits = 1; 1979 break; 1980 } 1981 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1982 BITS_PER_BITMAP, i); 1983 extent_bits = next_zero - i; 1984 if (extent_bits >= bits) { 1985 found_bits = extent_bits; 1986 break; 1987 } else if (extent_bits > max_bits) { 1988 max_bits = extent_bits; 1989 } 1990 i = next_zero; 1991 } 1992 1993 if (found_bits) { 1994 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1995 *bytes = (u64)(found_bits) * ctl->unit; 1996 return 0; 1997 } 1998 1999 *bytes = (u64)(max_bits) * ctl->unit; 2000 bitmap_info->max_extent_size = *bytes; 2001 relink_bitmap_entry(ctl, bitmap_info); 2002 return -1; 2003 } 2004 2005 /* Cache the size of the max extent in bytes */ 2006 static struct btrfs_free_space * 2007 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2008 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2009 { 2010 struct btrfs_free_space *entry; 2011 struct rb_node *node; 2012 u64 tmp; 2013 u64 align_off; 2014 int ret; 2015 2016 if (!ctl->free_space_offset.rb_node) 2017 return NULL; 2018 again: 2019 if (use_bytes_index) { 2020 node = rb_first_cached(&ctl->free_space_bytes); 2021 } else { 2022 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2023 0, 1); 2024 if (!entry) 2025 return NULL; 2026 node = &entry->offset_index; 2027 } 2028 2029 for (; node; node = rb_next(node)) { 2030 if (use_bytes_index) 2031 entry = rb_entry(node, struct btrfs_free_space, 2032 bytes_index); 2033 else 2034 entry = rb_entry(node, struct btrfs_free_space, 2035 offset_index); 2036 2037 /* 2038 * If we are using the bytes index then all subsequent entries 2039 * in this tree are going to be < bytes, so simply set the max 2040 * extent size and exit the loop. 2041 * 2042 * If we're using the offset index then we need to keep going 2043 * through the rest of the tree. 2044 */ 2045 if (entry->bytes < *bytes) { 2046 *max_extent_size = max(get_max_extent_size(entry), 2047 *max_extent_size); 2048 if (use_bytes_index) 2049 break; 2050 continue; 2051 } 2052 2053 /* make sure the space returned is big enough 2054 * to match our requested alignment 2055 */ 2056 if (*bytes >= align) { 2057 tmp = entry->offset - ctl->start + align - 1; 2058 tmp = div64_u64(tmp, align); 2059 tmp = tmp * align + ctl->start; 2060 align_off = tmp - entry->offset; 2061 } else { 2062 align_off = 0; 2063 tmp = entry->offset; 2064 } 2065 2066 /* 2067 * We don't break here if we're using the bytes index because we 2068 * may have another entry that has the correct alignment that is 2069 * the right size, so we don't want to miss that possibility. 2070 * At worst this adds another loop through the logic, but if we 2071 * broke here we could prematurely ENOSPC. 2072 */ 2073 if (entry->bytes < *bytes + align_off) { 2074 *max_extent_size = max(get_max_extent_size(entry), 2075 *max_extent_size); 2076 continue; 2077 } 2078 2079 if (entry->bitmap) { 2080 struct rb_node *old_next = rb_next(node); 2081 u64 size = *bytes; 2082 2083 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2084 if (!ret) { 2085 *offset = tmp; 2086 *bytes = size; 2087 return entry; 2088 } else { 2089 *max_extent_size = 2090 max(get_max_extent_size(entry), 2091 *max_extent_size); 2092 } 2093 2094 /* 2095 * The bitmap may have gotten re-arranged in the space 2096 * index here because the max_extent_size may have been 2097 * updated. Start from the beginning again if this 2098 * happened. 2099 */ 2100 if (use_bytes_index && old_next != rb_next(node)) 2101 goto again; 2102 continue; 2103 } 2104 2105 *offset = tmp; 2106 *bytes = entry->bytes - align_off; 2107 return entry; 2108 } 2109 2110 return NULL; 2111 } 2112 2113 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2114 struct btrfs_free_space *info, u64 offset) 2115 { 2116 info->offset = offset_to_bitmap(ctl, offset); 2117 info->bytes = 0; 2118 info->bitmap_extents = 0; 2119 INIT_LIST_HEAD(&info->list); 2120 link_free_space(ctl, info); 2121 ctl->total_bitmaps++; 2122 recalculate_thresholds(ctl); 2123 } 2124 2125 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2126 struct btrfs_free_space *bitmap_info) 2127 { 2128 /* 2129 * Normally when this is called, the bitmap is completely empty. However, 2130 * if we are blowing up the free space cache for one reason or another 2131 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2132 * we may leave stats on the table. 2133 */ 2134 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2135 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2136 bitmap_info->bitmap_extents; 2137 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2138 2139 } 2140 unlink_free_space(ctl, bitmap_info, true); 2141 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2142 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2143 ctl->total_bitmaps--; 2144 recalculate_thresholds(ctl); 2145 } 2146 2147 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2148 struct btrfs_free_space *bitmap_info, 2149 u64 *offset, u64 *bytes) 2150 { 2151 u64 end; 2152 u64 search_start, search_bytes; 2153 int ret; 2154 2155 again: 2156 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 2157 2158 /* 2159 * We need to search for bits in this bitmap. We could only cover some 2160 * of the extent in this bitmap thanks to how we add space, so we need 2161 * to search for as much as it as we can and clear that amount, and then 2162 * go searching for the next bit. 2163 */ 2164 search_start = *offset; 2165 search_bytes = ctl->unit; 2166 search_bytes = min(search_bytes, end - search_start + 1); 2167 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2168 false); 2169 if (ret < 0 || search_start != *offset) 2170 return -EINVAL; 2171 2172 /* We may have found more bits than what we need */ 2173 search_bytes = min(search_bytes, *bytes); 2174 2175 /* Cannot clear past the end of the bitmap */ 2176 search_bytes = min(search_bytes, end - search_start + 1); 2177 2178 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2179 *offset += search_bytes; 2180 *bytes -= search_bytes; 2181 2182 if (*bytes) { 2183 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2184 if (!bitmap_info->bytes) 2185 free_bitmap(ctl, bitmap_info); 2186 2187 /* 2188 * no entry after this bitmap, but we still have bytes to 2189 * remove, so something has gone wrong. 2190 */ 2191 if (!next) 2192 return -EINVAL; 2193 2194 bitmap_info = rb_entry(next, struct btrfs_free_space, 2195 offset_index); 2196 2197 /* 2198 * if the next entry isn't a bitmap we need to return to let the 2199 * extent stuff do its work. 2200 */ 2201 if (!bitmap_info->bitmap) 2202 return -EAGAIN; 2203 2204 /* 2205 * Ok the next item is a bitmap, but it may not actually hold 2206 * the information for the rest of this free space stuff, so 2207 * look for it, and if we don't find it return so we can try 2208 * everything over again. 2209 */ 2210 search_start = *offset; 2211 search_bytes = ctl->unit; 2212 ret = search_bitmap(ctl, bitmap_info, &search_start, 2213 &search_bytes, false); 2214 if (ret < 0 || search_start != *offset) 2215 return -EAGAIN; 2216 2217 goto again; 2218 } else if (!bitmap_info->bytes) 2219 free_bitmap(ctl, bitmap_info); 2220 2221 return 0; 2222 } 2223 2224 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2225 struct btrfs_free_space *info, u64 offset, 2226 u64 bytes, enum btrfs_trim_state trim_state) 2227 { 2228 u64 bytes_to_set = 0; 2229 u64 end; 2230 2231 /* 2232 * This is a tradeoff to make bitmap trim state minimal. We mark the 2233 * whole bitmap untrimmed if at any point we add untrimmed regions. 2234 */ 2235 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2236 if (btrfs_free_space_trimmed(info)) { 2237 ctl->discardable_extents[BTRFS_STAT_CURR] += 2238 info->bitmap_extents; 2239 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2240 } 2241 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2242 } 2243 2244 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2245 2246 bytes_to_set = min(end - offset, bytes); 2247 2248 btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set); 2249 2250 return bytes_to_set; 2251 2252 } 2253 2254 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2255 struct btrfs_free_space *info) 2256 { 2257 struct btrfs_block_group *block_group = ctl->block_group; 2258 struct btrfs_fs_info *fs_info = block_group->fs_info; 2259 bool forced = false; 2260 2261 #ifdef CONFIG_BTRFS_DEBUG 2262 if (btrfs_should_fragment_free_space(block_group)) 2263 forced = true; 2264 #endif 2265 2266 /* This is a way to reclaim large regions from the bitmaps. */ 2267 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) 2268 return false; 2269 2270 /* 2271 * If we are below the extents threshold then we can add this as an 2272 * extent, and don't have to deal with the bitmap 2273 */ 2274 if (!forced && ctl->free_extents < ctl->extents_thresh) { 2275 /* 2276 * If this block group has some small extents we don't want to 2277 * use up all of our free slots in the cache with them, we want 2278 * to reserve them to larger extents, however if we have plenty 2279 * of cache left then go ahead and add them, no sense in adding 2280 * the overhead of a bitmap if we don't have to. 2281 */ 2282 if (info->bytes <= fs_info->sectorsize * 8) { 2283 if (ctl->free_extents * 3 <= ctl->extents_thresh) 2284 return false; 2285 } else { 2286 return false; 2287 } 2288 } 2289 2290 /* 2291 * The original block groups from mkfs can be really small, like 8 2292 * megabytes, so don't bother with a bitmap for those entries. However 2293 * some block groups can be smaller than what a bitmap would cover but 2294 * are still large enough that they could overflow the 32k memory limit, 2295 * so allow those block groups to still be allowed to have a bitmap 2296 * entry. 2297 */ 2298 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) 2299 return false; 2300 2301 return true; 2302 } 2303 2304 static const struct btrfs_free_space_op free_space_op = { 2305 .use_bitmap = use_bitmap, 2306 }; 2307 2308 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 2309 struct btrfs_free_space *info) 2310 { 2311 struct btrfs_free_space *bitmap_info; 2312 struct btrfs_block_group *block_group = NULL; 2313 int added = 0; 2314 u64 bytes, offset, bytes_added; 2315 enum btrfs_trim_state trim_state; 2316 int ret; 2317 2318 bytes = info->bytes; 2319 offset = info->offset; 2320 trim_state = info->trim_state; 2321 2322 if (!ctl->op->use_bitmap(ctl, info)) 2323 return 0; 2324 2325 if (ctl->op == &free_space_op) 2326 block_group = ctl->block_group; 2327 again: 2328 /* 2329 * Since we link bitmaps right into the cluster we need to see if we 2330 * have a cluster here, and if so and it has our bitmap we need to add 2331 * the free space to that bitmap. 2332 */ 2333 if (block_group && !list_empty(&block_group->cluster_list)) { 2334 struct btrfs_free_cluster *cluster; 2335 struct rb_node *node; 2336 struct btrfs_free_space *entry; 2337 2338 cluster = list_first_entry(&block_group->cluster_list, 2339 struct btrfs_free_cluster, block_group_list); 2340 spin_lock(&cluster->lock); 2341 node = rb_first(&cluster->root); 2342 if (!node) { 2343 spin_unlock(&cluster->lock); 2344 goto no_cluster_bitmap; 2345 } 2346 2347 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2348 if (!entry->bitmap) { 2349 spin_unlock(&cluster->lock); 2350 goto no_cluster_bitmap; 2351 } 2352 2353 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2354 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2355 bytes, trim_state); 2356 bytes -= bytes_added; 2357 offset += bytes_added; 2358 } 2359 spin_unlock(&cluster->lock); 2360 if (!bytes) { 2361 ret = 1; 2362 goto out; 2363 } 2364 } 2365 2366 no_cluster_bitmap: 2367 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2368 1, 0); 2369 if (!bitmap_info) { 2370 ASSERT(added == 0); 2371 goto new_bitmap; 2372 } 2373 2374 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2375 trim_state); 2376 bytes -= bytes_added; 2377 offset += bytes_added; 2378 added = 0; 2379 2380 if (!bytes) { 2381 ret = 1; 2382 goto out; 2383 } else 2384 goto again; 2385 2386 new_bitmap: 2387 if (info && info->bitmap) { 2388 add_new_bitmap(ctl, info, offset); 2389 added = 1; 2390 info = NULL; 2391 goto again; 2392 } else { 2393 spin_unlock(&ctl->tree_lock); 2394 2395 /* no pre-allocated info, allocate a new one */ 2396 if (!info) { 2397 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2398 GFP_NOFS); 2399 if (!info) { 2400 spin_lock(&ctl->tree_lock); 2401 ret = -ENOMEM; 2402 goto out; 2403 } 2404 } 2405 2406 /* allocate the bitmap */ 2407 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2408 GFP_NOFS); 2409 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2410 spin_lock(&ctl->tree_lock); 2411 if (!info->bitmap) { 2412 ret = -ENOMEM; 2413 goto out; 2414 } 2415 goto again; 2416 } 2417 2418 out: 2419 if (info) { 2420 if (info->bitmap) 2421 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2422 info->bitmap); 2423 kmem_cache_free(btrfs_free_space_cachep, info); 2424 } 2425 2426 return ret; 2427 } 2428 2429 /* 2430 * Free space merging rules: 2431 * 1) Merge trimmed areas together 2432 * 2) Let untrimmed areas coalesce with trimmed areas 2433 * 3) Always pull neighboring regions from bitmaps 2434 * 2435 * The above rules are for when we merge free space based on btrfs_trim_state. 2436 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2437 * same reason: to promote larger extent regions which makes life easier for 2438 * find_free_extent(). Rule 2 enables coalescing based on the common path 2439 * being returning free space from btrfs_finish_extent_commit(). So when free 2440 * space is trimmed, it will prevent aggregating trimmed new region and 2441 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2442 * and provide find_free_extent() with the largest extents possible hoping for 2443 * the reuse path. 2444 */ 2445 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2446 struct btrfs_free_space *info, bool update_stat) 2447 { 2448 struct btrfs_free_space *left_info = NULL; 2449 struct btrfs_free_space *right_info; 2450 bool merged = false; 2451 u64 offset = info->offset; 2452 u64 bytes = info->bytes; 2453 const bool is_trimmed = btrfs_free_space_trimmed(info); 2454 struct rb_node *right_prev = NULL; 2455 2456 /* 2457 * first we want to see if there is free space adjacent to the range we 2458 * are adding, if there is remove that struct and add a new one to 2459 * cover the entire range 2460 */ 2461 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2462 if (right_info) 2463 right_prev = rb_prev(&right_info->offset_index); 2464 2465 if (right_prev) 2466 left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); 2467 else if (!right_info) 2468 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2469 2470 /* See try_merge_free_space() comment. */ 2471 if (right_info && !right_info->bitmap && 2472 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2473 unlink_free_space(ctl, right_info, update_stat); 2474 info->bytes += right_info->bytes; 2475 kmem_cache_free(btrfs_free_space_cachep, right_info); 2476 merged = true; 2477 } 2478 2479 /* See try_merge_free_space() comment. */ 2480 if (left_info && !left_info->bitmap && 2481 left_info->offset + left_info->bytes == offset && 2482 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2483 unlink_free_space(ctl, left_info, update_stat); 2484 info->offset = left_info->offset; 2485 info->bytes += left_info->bytes; 2486 kmem_cache_free(btrfs_free_space_cachep, left_info); 2487 merged = true; 2488 } 2489 2490 return merged; 2491 } 2492 2493 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2494 struct btrfs_free_space *info, 2495 bool update_stat) 2496 { 2497 struct btrfs_free_space *bitmap; 2498 unsigned long i; 2499 unsigned long j; 2500 const u64 end = info->offset + info->bytes; 2501 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2502 u64 bytes; 2503 2504 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2505 if (!bitmap) 2506 return false; 2507 2508 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2509 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2510 if (j == i) 2511 return false; 2512 bytes = (j - i) * ctl->unit; 2513 info->bytes += bytes; 2514 2515 /* See try_merge_free_space() comment. */ 2516 if (!btrfs_free_space_trimmed(bitmap)) 2517 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2518 2519 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2520 2521 if (!bitmap->bytes) 2522 free_bitmap(ctl, bitmap); 2523 2524 return true; 2525 } 2526 2527 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2528 struct btrfs_free_space *info, 2529 bool update_stat) 2530 { 2531 struct btrfs_free_space *bitmap; 2532 u64 bitmap_offset; 2533 unsigned long i; 2534 unsigned long j; 2535 unsigned long prev_j; 2536 u64 bytes; 2537 2538 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2539 /* If we're on a boundary, try the previous logical bitmap. */ 2540 if (bitmap_offset == info->offset) { 2541 if (info->offset == 0) 2542 return false; 2543 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2544 } 2545 2546 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2547 if (!bitmap) 2548 return false; 2549 2550 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2551 j = 0; 2552 prev_j = (unsigned long)-1; 2553 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2554 if (j > i) 2555 break; 2556 prev_j = j; 2557 } 2558 if (prev_j == i) 2559 return false; 2560 2561 if (prev_j == (unsigned long)-1) 2562 bytes = (i + 1) * ctl->unit; 2563 else 2564 bytes = (i - prev_j) * ctl->unit; 2565 2566 info->offset -= bytes; 2567 info->bytes += bytes; 2568 2569 /* See try_merge_free_space() comment. */ 2570 if (!btrfs_free_space_trimmed(bitmap)) 2571 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2572 2573 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2574 2575 if (!bitmap->bytes) 2576 free_bitmap(ctl, bitmap); 2577 2578 return true; 2579 } 2580 2581 /* 2582 * We prefer always to allocate from extent entries, both for clustered and 2583 * non-clustered allocation requests. So when attempting to add a new extent 2584 * entry, try to see if there's adjacent free space in bitmap entries, and if 2585 * there is, migrate that space from the bitmaps to the extent. 2586 * Like this we get better chances of satisfying space allocation requests 2587 * because we attempt to satisfy them based on a single cache entry, and never 2588 * on 2 or more entries - even if the entries represent a contiguous free space 2589 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2590 * ends). 2591 */ 2592 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2593 struct btrfs_free_space *info, 2594 bool update_stat) 2595 { 2596 /* 2597 * Only work with disconnected entries, as we can change their offset, 2598 * and must be extent entries. 2599 */ 2600 ASSERT(!info->bitmap); 2601 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2602 2603 if (ctl->total_bitmaps > 0) { 2604 bool stole_end; 2605 bool stole_front = false; 2606 2607 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2608 if (ctl->total_bitmaps > 0) 2609 stole_front = steal_from_bitmap_to_front(ctl, info, 2610 update_stat); 2611 2612 if (stole_end || stole_front) 2613 try_merge_free_space(ctl, info, update_stat); 2614 } 2615 } 2616 2617 static int __btrfs_add_free_space(struct btrfs_block_group *block_group, 2618 u64 offset, u64 bytes, 2619 enum btrfs_trim_state trim_state) 2620 { 2621 struct btrfs_fs_info *fs_info = block_group->fs_info; 2622 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2623 struct btrfs_free_space *info; 2624 int ret = 0; 2625 u64 filter_bytes = bytes; 2626 2627 ASSERT(!btrfs_is_zoned(fs_info)); 2628 2629 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2630 if (!info) 2631 return -ENOMEM; 2632 2633 info->offset = offset; 2634 info->bytes = bytes; 2635 info->trim_state = trim_state; 2636 RB_CLEAR_NODE(&info->offset_index); 2637 RB_CLEAR_NODE(&info->bytes_index); 2638 2639 spin_lock(&ctl->tree_lock); 2640 2641 if (try_merge_free_space(ctl, info, true)) 2642 goto link; 2643 2644 /* 2645 * There was no extent directly to the left or right of this new 2646 * extent then we know we're going to have to allocate a new extent, so 2647 * before we do that see if we need to drop this into a bitmap 2648 */ 2649 ret = insert_into_bitmap(ctl, info); 2650 if (ret < 0) { 2651 goto out; 2652 } else if (ret) { 2653 ret = 0; 2654 goto out; 2655 } 2656 link: 2657 /* 2658 * Only steal free space from adjacent bitmaps if we're sure we're not 2659 * going to add the new free space to existing bitmap entries - because 2660 * that would mean unnecessary work that would be reverted. Therefore 2661 * attempt to steal space from bitmaps if we're adding an extent entry. 2662 */ 2663 steal_from_bitmap(ctl, info, true); 2664 2665 filter_bytes = max(filter_bytes, info->bytes); 2666 2667 ret = link_free_space(ctl, info); 2668 if (ret) 2669 kmem_cache_free(btrfs_free_space_cachep, info); 2670 out: 2671 btrfs_discard_update_discardable(block_group); 2672 spin_unlock(&ctl->tree_lock); 2673 2674 if (ret) { 2675 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2676 ASSERT(ret != -EEXIST); 2677 } 2678 2679 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2680 btrfs_discard_check_filter(block_group, filter_bytes); 2681 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2682 } 2683 2684 return ret; 2685 } 2686 2687 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2688 u64 bytenr, u64 size, bool used) 2689 { 2690 struct btrfs_space_info *sinfo = block_group->space_info; 2691 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2692 u64 offset = bytenr - block_group->start; 2693 u64 to_free, to_unusable; 2694 int bg_reclaim_threshold = 0; 2695 bool initial; 2696 u64 reclaimable_unusable; 2697 2698 spin_lock(&block_group->lock); 2699 2700 initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); 2701 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2702 if (!initial) 2703 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2704 2705 if (!used) 2706 to_free = size; 2707 else if (initial) 2708 to_free = block_group->zone_capacity; 2709 else if (offset >= block_group->alloc_offset) 2710 to_free = size; 2711 else if (offset + size <= block_group->alloc_offset) 2712 to_free = 0; 2713 else 2714 to_free = offset + size - block_group->alloc_offset; 2715 to_unusable = size - to_free; 2716 2717 spin_lock(&ctl->tree_lock); 2718 ctl->free_space += to_free; 2719 spin_unlock(&ctl->tree_lock); 2720 /* 2721 * If the block group is read-only, we should account freed space into 2722 * bytes_readonly. 2723 */ 2724 if (!block_group->ro) { 2725 block_group->zone_unusable += to_unusable; 2726 WARN_ON(block_group->zone_unusable > block_group->length); 2727 } 2728 if (!used) { 2729 block_group->alloc_offset -= size; 2730 } 2731 2732 reclaimable_unusable = block_group->zone_unusable - 2733 (block_group->length - block_group->zone_capacity); 2734 /* All the region is now unusable. Mark it as unused and reclaim */ 2735 if (block_group->zone_unusable == block_group->length) { 2736 btrfs_mark_bg_unused(block_group); 2737 } else if (bg_reclaim_threshold && 2738 reclaimable_unusable >= 2739 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2740 btrfs_mark_bg_to_reclaim(block_group); 2741 } 2742 2743 spin_unlock(&block_group->lock); 2744 2745 return 0; 2746 } 2747 2748 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2749 u64 bytenr, u64 size) 2750 { 2751 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2752 2753 if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) 2754 return 0; 2755 2756 if (btrfs_is_zoned(block_group->fs_info)) 2757 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2758 true); 2759 2760 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2761 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2762 2763 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2764 } 2765 2766 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2767 u64 bytenr, u64 size) 2768 { 2769 if (btrfs_is_zoned(block_group->fs_info)) 2770 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2771 false); 2772 2773 return btrfs_add_free_space(block_group, bytenr, size); 2774 } 2775 2776 /* 2777 * This is a subtle distinction because when adding free space back in general, 2778 * we want it to be added as untrimmed for async. But in the case where we add 2779 * it on loading of a block group, we want to consider it trimmed. 2780 */ 2781 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2782 u64 bytenr, u64 size) 2783 { 2784 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2785 2786 if (btrfs_is_zoned(block_group->fs_info)) 2787 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2788 true); 2789 2790 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2791 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2792 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2793 2794 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2795 } 2796 2797 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2798 u64 offset, u64 bytes) 2799 { 2800 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2801 struct btrfs_free_space *info; 2802 int ret; 2803 bool re_search = false; 2804 2805 if (btrfs_is_zoned(block_group->fs_info)) { 2806 /* 2807 * This can happen with conventional zones when replaying log. 2808 * Since the allocation info of tree-log nodes are not recorded 2809 * to the extent-tree, calculate_alloc_pointer() failed to 2810 * advance the allocation pointer after last allocated tree log 2811 * node blocks. 2812 * 2813 * This function is called from 2814 * btrfs_pin_extent_for_log_replay() when replaying the log. 2815 * Advance the pointer not to overwrite the tree-log nodes. 2816 */ 2817 if (block_group->start + block_group->alloc_offset < 2818 offset + bytes) { 2819 block_group->alloc_offset = 2820 offset + bytes - block_group->start; 2821 } 2822 return 0; 2823 } 2824 2825 spin_lock(&ctl->tree_lock); 2826 2827 again: 2828 ret = 0; 2829 if (!bytes) 2830 goto out_lock; 2831 2832 info = tree_search_offset(ctl, offset, 0, 0); 2833 if (!info) { 2834 /* 2835 * oops didn't find an extent that matched the space we wanted 2836 * to remove, look for a bitmap instead 2837 */ 2838 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2839 1, 0); 2840 if (!info) { 2841 /* 2842 * If we found a partial bit of our free space in a 2843 * bitmap but then couldn't find the other part this may 2844 * be a problem, so WARN about it. 2845 */ 2846 WARN_ON(re_search); 2847 goto out_lock; 2848 } 2849 } 2850 2851 re_search = false; 2852 if (!info->bitmap) { 2853 unlink_free_space(ctl, info, true); 2854 if (offset == info->offset) { 2855 u64 to_free = min(bytes, info->bytes); 2856 2857 info->bytes -= to_free; 2858 info->offset += to_free; 2859 if (info->bytes) { 2860 ret = link_free_space(ctl, info); 2861 WARN_ON(ret); 2862 } else { 2863 kmem_cache_free(btrfs_free_space_cachep, info); 2864 } 2865 2866 offset += to_free; 2867 bytes -= to_free; 2868 goto again; 2869 } else { 2870 u64 old_end = info->bytes + info->offset; 2871 2872 info->bytes = offset - info->offset; 2873 ret = link_free_space(ctl, info); 2874 WARN_ON(ret); 2875 if (ret) 2876 goto out_lock; 2877 2878 /* Not enough bytes in this entry to satisfy us */ 2879 if (old_end < offset + bytes) { 2880 bytes -= old_end - offset; 2881 offset = old_end; 2882 goto again; 2883 } else if (old_end == offset + bytes) { 2884 /* all done */ 2885 goto out_lock; 2886 } 2887 spin_unlock(&ctl->tree_lock); 2888 2889 ret = __btrfs_add_free_space(block_group, 2890 offset + bytes, 2891 old_end - (offset + bytes), 2892 info->trim_state); 2893 WARN_ON(ret); 2894 return ret; 2895 } 2896 } 2897 2898 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2899 if (ret == -EAGAIN) { 2900 re_search = true; 2901 goto again; 2902 } 2903 out_lock: 2904 btrfs_discard_update_discardable(block_group); 2905 spin_unlock(&ctl->tree_lock); 2906 2907 return ret; 2908 } 2909 2910 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2911 u64 bytes) 2912 { 2913 struct btrfs_fs_info *fs_info = block_group->fs_info; 2914 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2915 struct btrfs_free_space *info; 2916 struct rb_node *n; 2917 int count = 0; 2918 2919 /* 2920 * Zoned btrfs does not use free space tree and cluster. Just print 2921 * out the free space after the allocation offset. 2922 */ 2923 if (btrfs_is_zoned(fs_info)) { 2924 btrfs_info(fs_info, "free space %llu active %d", 2925 block_group->zone_capacity - block_group->alloc_offset, 2926 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2927 &block_group->runtime_flags)); 2928 return; 2929 } 2930 2931 spin_lock(&ctl->tree_lock); 2932 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2933 info = rb_entry(n, struct btrfs_free_space, offset_index); 2934 if (info->bytes >= bytes && !block_group->ro) 2935 count++; 2936 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2937 info->offset, info->bytes, str_yes_no(info->bitmap)); 2938 } 2939 spin_unlock(&ctl->tree_lock); 2940 btrfs_info(fs_info, "block group has cluster?: %s", 2941 str_no_yes(list_empty(&block_group->cluster_list))); 2942 btrfs_info(fs_info, 2943 "%d free space entries at or bigger than %llu bytes", 2944 count, bytes); 2945 } 2946 2947 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2948 struct btrfs_free_space_ctl *ctl) 2949 { 2950 struct btrfs_fs_info *fs_info = block_group->fs_info; 2951 2952 spin_lock_init(&ctl->tree_lock); 2953 ctl->unit = fs_info->sectorsize; 2954 ctl->start = block_group->start; 2955 ctl->block_group = block_group; 2956 ctl->op = &free_space_op; 2957 ctl->free_space_bytes = RB_ROOT_CACHED; 2958 INIT_LIST_HEAD(&ctl->trimming_ranges); 2959 mutex_init(&ctl->cache_writeout_mutex); 2960 2961 /* 2962 * we only want to have 32k of ram per block group for keeping 2963 * track of free space, and if we pass 1/2 of that we want to 2964 * start converting things over to using bitmaps 2965 */ 2966 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2967 } 2968 2969 /* 2970 * for a given cluster, put all of its extents back into the free 2971 * space cache. If the block group passed doesn't match the block group 2972 * pointed to by the cluster, someone else raced in and freed the 2973 * cluster already. In that case, we just return without changing anything 2974 */ 2975 static void __btrfs_return_cluster_to_free_space( 2976 struct btrfs_block_group *block_group, 2977 struct btrfs_free_cluster *cluster) 2978 { 2979 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2980 struct rb_node *node; 2981 2982 lockdep_assert_held(&ctl->tree_lock); 2983 2984 spin_lock(&cluster->lock); 2985 if (cluster->block_group != block_group) { 2986 spin_unlock(&cluster->lock); 2987 return; 2988 } 2989 2990 cluster->block_group = NULL; 2991 cluster->window_start = 0; 2992 list_del_init(&cluster->block_group_list); 2993 2994 node = rb_first(&cluster->root); 2995 while (node) { 2996 struct btrfs_free_space *entry; 2997 2998 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2999 node = rb_next(&entry->offset_index); 3000 rb_erase(&entry->offset_index, &cluster->root); 3001 RB_CLEAR_NODE(&entry->offset_index); 3002 3003 if (!entry->bitmap) { 3004 /* Merging treats extents as if they were new */ 3005 if (!btrfs_free_space_trimmed(entry)) { 3006 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3007 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 3008 entry->bytes; 3009 } 3010 3011 try_merge_free_space(ctl, entry, false); 3012 steal_from_bitmap(ctl, entry, false); 3013 3014 /* As we insert directly, update these statistics */ 3015 if (!btrfs_free_space_trimmed(entry)) { 3016 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3017 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3018 entry->bytes; 3019 } 3020 } 3021 tree_insert_offset(ctl, NULL, entry); 3022 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3023 entry_less); 3024 } 3025 cluster->root = RB_ROOT; 3026 spin_unlock(&cluster->lock); 3027 btrfs_put_block_group(block_group); 3028 } 3029 3030 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3031 { 3032 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3033 struct btrfs_free_cluster *cluster; 3034 struct list_head *head; 3035 3036 spin_lock(&ctl->tree_lock); 3037 while ((head = block_group->cluster_list.next) != 3038 &block_group->cluster_list) { 3039 cluster = list_entry(head, struct btrfs_free_cluster, 3040 block_group_list); 3041 3042 WARN_ON(cluster->block_group != block_group); 3043 __btrfs_return_cluster_to_free_space(block_group, cluster); 3044 3045 cond_resched_lock(&ctl->tree_lock); 3046 } 3047 __btrfs_remove_free_space_cache(ctl); 3048 btrfs_discard_update_discardable(block_group); 3049 spin_unlock(&ctl->tree_lock); 3050 3051 } 3052 3053 /* 3054 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 3055 */ 3056 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3057 { 3058 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3059 struct btrfs_free_space *info; 3060 struct rb_node *node; 3061 bool ret = true; 3062 3063 if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && 3064 !test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &block_group->runtime_flags) && 3065 block_group->identity_remap_count == 0) { 3066 return true; 3067 } 3068 3069 spin_lock(&ctl->tree_lock); 3070 node = rb_first(&ctl->free_space_offset); 3071 3072 while (node) { 3073 info = rb_entry(node, struct btrfs_free_space, offset_index); 3074 3075 if (!btrfs_free_space_trimmed(info)) { 3076 ret = false; 3077 break; 3078 } 3079 3080 node = rb_next(node); 3081 } 3082 3083 spin_unlock(&ctl->tree_lock); 3084 return ret; 3085 } 3086 3087 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3088 u64 offset, u64 bytes, u64 empty_size, 3089 u64 *max_extent_size) 3090 { 3091 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3092 struct btrfs_discard_ctl *discard_ctl = 3093 &block_group->fs_info->discard_ctl; 3094 struct btrfs_free_space *entry = NULL; 3095 u64 bytes_search = bytes + empty_size; 3096 u64 ret = 0; 3097 u64 align_gap = 0; 3098 u64 align_gap_len = 0; 3099 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3100 bool use_bytes_index = (offset == block_group->start); 3101 3102 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3103 3104 spin_lock(&ctl->tree_lock); 3105 entry = find_free_space(ctl, &offset, &bytes_search, 3106 block_group->full_stripe_len, max_extent_size, 3107 use_bytes_index); 3108 if (!entry) 3109 goto out; 3110 3111 ret = offset; 3112 if (entry->bitmap) { 3113 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3114 3115 if (!btrfs_free_space_trimmed(entry)) 3116 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3117 3118 if (!entry->bytes) 3119 free_bitmap(ctl, entry); 3120 } else { 3121 unlink_free_space(ctl, entry, true); 3122 align_gap_len = offset - entry->offset; 3123 align_gap = entry->offset; 3124 align_gap_trim_state = entry->trim_state; 3125 3126 if (!btrfs_free_space_trimmed(entry)) 3127 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3128 3129 entry->offset = offset + bytes; 3130 WARN_ON(entry->bytes < bytes + align_gap_len); 3131 3132 entry->bytes -= bytes + align_gap_len; 3133 if (!entry->bytes) 3134 kmem_cache_free(btrfs_free_space_cachep, entry); 3135 else 3136 link_free_space(ctl, entry); 3137 } 3138 out: 3139 btrfs_discard_update_discardable(block_group); 3140 spin_unlock(&ctl->tree_lock); 3141 3142 if (align_gap_len) 3143 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3144 align_gap_trim_state); 3145 return ret; 3146 } 3147 3148 /* 3149 * given a cluster, put all of its extents back into the free space 3150 * cache. If a block group is passed, this function will only free 3151 * a cluster that belongs to the passed block group. 3152 * 3153 * Otherwise, it'll get a reference on the block group pointed to by the 3154 * cluster and remove the cluster from it. 3155 */ 3156 void btrfs_return_cluster_to_free_space( 3157 struct btrfs_block_group *block_group, 3158 struct btrfs_free_cluster *cluster) 3159 { 3160 struct btrfs_free_space_ctl *ctl; 3161 3162 /* first, get a safe pointer to the block group */ 3163 spin_lock(&cluster->lock); 3164 if (!block_group) { 3165 block_group = cluster->block_group; 3166 if (!block_group) { 3167 spin_unlock(&cluster->lock); 3168 return; 3169 } 3170 } else if (cluster->block_group != block_group) { 3171 /* someone else has already freed it don't redo their work */ 3172 spin_unlock(&cluster->lock); 3173 return; 3174 } 3175 btrfs_get_block_group(block_group); 3176 spin_unlock(&cluster->lock); 3177 3178 ctl = block_group->free_space_ctl; 3179 3180 /* now return any extents the cluster had on it */ 3181 spin_lock(&ctl->tree_lock); 3182 __btrfs_return_cluster_to_free_space(block_group, cluster); 3183 spin_unlock(&ctl->tree_lock); 3184 3185 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3186 3187 /* finally drop our ref */ 3188 btrfs_put_block_group(block_group); 3189 } 3190 3191 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3192 struct btrfs_free_cluster *cluster, 3193 struct btrfs_free_space *entry, 3194 u64 bytes, u64 min_start, 3195 u64 *max_extent_size) 3196 { 3197 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3198 int ret2; 3199 u64 search_start = cluster->window_start; 3200 u64 search_bytes = bytes; 3201 u64 ret = 0; 3202 3203 search_start = min_start; 3204 search_bytes = bytes; 3205 3206 ret2 = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3207 if (ret2) { 3208 *max_extent_size = max(get_max_extent_size(entry), 3209 *max_extent_size); 3210 return 0; 3211 } 3212 3213 ret = search_start; 3214 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3215 3216 return ret; 3217 } 3218 3219 /* 3220 * given a cluster, try to allocate 'bytes' from it, returns 0 3221 * if it couldn't find anything suitably large, or a logical disk offset 3222 * if things worked out 3223 */ 3224 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3225 struct btrfs_free_cluster *cluster, u64 bytes, 3226 u64 min_start, u64 *max_extent_size) 3227 { 3228 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3229 struct btrfs_discard_ctl *discard_ctl = 3230 &block_group->fs_info->discard_ctl; 3231 struct btrfs_free_space *entry = NULL; 3232 struct rb_node *node; 3233 u64 ret = 0; 3234 3235 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3236 3237 spin_lock(&cluster->lock); 3238 if (bytes > cluster->max_size) 3239 goto out; 3240 3241 if (cluster->block_group != block_group) 3242 goto out; 3243 3244 node = rb_first(&cluster->root); 3245 if (!node) 3246 goto out; 3247 3248 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3249 while (1) { 3250 if (entry->bytes < bytes) 3251 *max_extent_size = max(get_max_extent_size(entry), 3252 *max_extent_size); 3253 3254 if (entry->bytes < bytes || 3255 (!entry->bitmap && entry->offset < min_start)) { 3256 node = rb_next(&entry->offset_index); 3257 if (!node) 3258 break; 3259 entry = rb_entry(node, struct btrfs_free_space, 3260 offset_index); 3261 continue; 3262 } 3263 3264 if (entry->bitmap) { 3265 ret = btrfs_alloc_from_bitmap(block_group, 3266 cluster, entry, bytes, 3267 cluster->window_start, 3268 max_extent_size); 3269 if (ret == 0) { 3270 node = rb_next(&entry->offset_index); 3271 if (!node) 3272 break; 3273 entry = rb_entry(node, struct btrfs_free_space, 3274 offset_index); 3275 continue; 3276 } 3277 cluster->window_start += bytes; 3278 } else { 3279 ret = entry->offset; 3280 3281 entry->offset += bytes; 3282 entry->bytes -= bytes; 3283 } 3284 3285 break; 3286 } 3287 out: 3288 spin_unlock(&cluster->lock); 3289 3290 if (!ret) 3291 return 0; 3292 3293 spin_lock(&ctl->tree_lock); 3294 3295 if (!btrfs_free_space_trimmed(entry)) 3296 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3297 3298 ctl->free_space -= bytes; 3299 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3300 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3301 3302 spin_lock(&cluster->lock); 3303 if (entry->bytes == 0) { 3304 rb_erase(&entry->offset_index, &cluster->root); 3305 ctl->free_extents--; 3306 if (entry->bitmap) { 3307 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3308 entry->bitmap); 3309 ctl->total_bitmaps--; 3310 recalculate_thresholds(ctl); 3311 } else if (!btrfs_free_space_trimmed(entry)) { 3312 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3313 } 3314 kmem_cache_free(btrfs_free_space_cachep, entry); 3315 } 3316 3317 spin_unlock(&cluster->lock); 3318 spin_unlock(&ctl->tree_lock); 3319 3320 return ret; 3321 } 3322 3323 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3324 struct btrfs_free_space *entry, 3325 struct btrfs_free_cluster *cluster, 3326 u64 offset, u64 bytes, 3327 u64 cont1_bytes, u64 min_bytes) 3328 { 3329 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3330 unsigned long next_zero; 3331 unsigned long i; 3332 unsigned long want_bits; 3333 unsigned long min_bits; 3334 unsigned long found_bits; 3335 unsigned long max_bits = 0; 3336 unsigned long start = 0; 3337 unsigned long total_found = 0; 3338 int ret; 3339 3340 lockdep_assert_held(&ctl->tree_lock); 3341 3342 i = offset_to_bit(entry->offset, ctl->unit, 3343 max_t(u64, offset, entry->offset)); 3344 want_bits = bytes_to_bits(bytes, ctl->unit); 3345 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3346 3347 /* 3348 * Don't bother looking for a cluster in this bitmap if it's heavily 3349 * fragmented. 3350 */ 3351 if (entry->max_extent_size && 3352 entry->max_extent_size < cont1_bytes) 3353 return -ENOSPC; 3354 again: 3355 found_bits = 0; 3356 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3357 next_zero = find_next_zero_bit(entry->bitmap, 3358 BITS_PER_BITMAP, i); 3359 if (next_zero - i >= min_bits) { 3360 found_bits = next_zero - i; 3361 if (found_bits > max_bits) 3362 max_bits = found_bits; 3363 break; 3364 } 3365 if (next_zero - i > max_bits) 3366 max_bits = next_zero - i; 3367 i = next_zero; 3368 } 3369 3370 if (!found_bits) { 3371 entry->max_extent_size = (u64)max_bits * ctl->unit; 3372 return -ENOSPC; 3373 } 3374 3375 if (!total_found) { 3376 start = i; 3377 cluster->max_size = 0; 3378 } 3379 3380 total_found += found_bits; 3381 3382 if (cluster->max_size < found_bits * ctl->unit) 3383 cluster->max_size = found_bits * ctl->unit; 3384 3385 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3386 i = next_zero + 1; 3387 goto again; 3388 } 3389 3390 cluster->window_start = start * ctl->unit + entry->offset; 3391 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3392 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3393 3394 /* 3395 * We need to know if we're currently on the normal space index when we 3396 * manipulate the bitmap so that we know we need to remove and re-insert 3397 * it into the space_index tree. Clear the bytes_index node here so the 3398 * bitmap manipulation helpers know not to mess with the space_index 3399 * until this bitmap entry is added back into the normal cache. 3400 */ 3401 RB_CLEAR_NODE(&entry->bytes_index); 3402 3403 ret = tree_insert_offset(ctl, cluster, entry); 3404 ASSERT(!ret); /* -EEXIST; Logic error */ 3405 3406 trace_btrfs_setup_cluster(block_group, cluster, 3407 total_found * ctl->unit, 1); 3408 return 0; 3409 } 3410 3411 /* 3412 * This searches the block group for just extents to fill the cluster with. 3413 * Try to find a cluster with at least bytes total bytes, at least one 3414 * extent of cont1_bytes, and other clusters of at least min_bytes. 3415 */ 3416 static noinline int 3417 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3418 struct btrfs_free_cluster *cluster, 3419 struct list_head *bitmaps, u64 offset, u64 bytes, 3420 u64 cont1_bytes, u64 min_bytes) 3421 { 3422 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3423 struct btrfs_free_space *first = NULL; 3424 struct btrfs_free_space *entry = NULL; 3425 struct btrfs_free_space *last; 3426 struct rb_node *node; 3427 u64 window_free; 3428 u64 max_extent; 3429 u64 total_size = 0; 3430 3431 lockdep_assert_held(&ctl->tree_lock); 3432 3433 entry = tree_search_offset(ctl, offset, 0, 1); 3434 if (!entry) 3435 return -ENOSPC; 3436 3437 /* 3438 * We don't want bitmaps, so just move along until we find a normal 3439 * extent entry. 3440 */ 3441 while (entry->bitmap || entry->bytes < min_bytes) { 3442 if (entry->bitmap && list_empty(&entry->list)) 3443 list_add_tail(&entry->list, bitmaps); 3444 node = rb_next(&entry->offset_index); 3445 if (!node) 3446 return -ENOSPC; 3447 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3448 } 3449 3450 window_free = entry->bytes; 3451 max_extent = entry->bytes; 3452 first = entry; 3453 last = entry; 3454 3455 for (node = rb_next(&entry->offset_index); node; 3456 node = rb_next(&entry->offset_index)) { 3457 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3458 3459 if (entry->bitmap) { 3460 if (list_empty(&entry->list)) 3461 list_add_tail(&entry->list, bitmaps); 3462 continue; 3463 } 3464 3465 if (entry->bytes < min_bytes) 3466 continue; 3467 3468 last = entry; 3469 window_free += entry->bytes; 3470 if (entry->bytes > max_extent) 3471 max_extent = entry->bytes; 3472 } 3473 3474 if (window_free < bytes || max_extent < cont1_bytes) 3475 return -ENOSPC; 3476 3477 cluster->window_start = first->offset; 3478 3479 node = &first->offset_index; 3480 3481 /* 3482 * now we've found our entries, pull them out of the free space 3483 * cache and put them into the cluster rbtree 3484 */ 3485 do { 3486 int ret; 3487 3488 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3489 node = rb_next(&entry->offset_index); 3490 if (entry->bitmap || entry->bytes < min_bytes) 3491 continue; 3492 3493 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3494 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3495 ret = tree_insert_offset(ctl, cluster, entry); 3496 total_size += entry->bytes; 3497 ASSERT(!ret); /* -EEXIST; Logic error */ 3498 } while (node && entry != last); 3499 3500 cluster->max_size = max_extent; 3501 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3502 return 0; 3503 } 3504 3505 /* 3506 * This specifically looks for bitmaps that may work in the cluster, we assume 3507 * that we have already failed to find extents that will work. 3508 */ 3509 static noinline int 3510 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3511 struct btrfs_free_cluster *cluster, 3512 struct list_head *bitmaps, u64 offset, u64 bytes, 3513 u64 cont1_bytes, u64 min_bytes) 3514 { 3515 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3516 struct btrfs_free_space *entry = NULL; 3517 int ret = -ENOSPC; 3518 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3519 3520 if (ctl->total_bitmaps == 0) 3521 return -ENOSPC; 3522 3523 /* 3524 * The bitmap that covers offset won't be in the list unless offset 3525 * is just its start offset. 3526 */ 3527 if (!list_empty(bitmaps)) 3528 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3529 3530 if (!entry || entry->offset != bitmap_offset) { 3531 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3532 if (entry && list_empty(&entry->list)) 3533 list_add(&entry->list, bitmaps); 3534 } 3535 3536 list_for_each_entry(entry, bitmaps, list) { 3537 if (entry->bytes < bytes) 3538 continue; 3539 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3540 bytes, cont1_bytes, min_bytes); 3541 if (!ret) 3542 return 0; 3543 } 3544 3545 /* 3546 * The bitmaps list has all the bitmaps that record free space 3547 * starting after offset, so no more search is required. 3548 */ 3549 return -ENOSPC; 3550 } 3551 3552 /* 3553 * here we try to find a cluster of blocks in a block group. The goal 3554 * is to find at least bytes+empty_size. 3555 * We might not find them all in one contiguous area. 3556 * 3557 * returns zero and sets up cluster if things worked out, otherwise 3558 * it returns -enospc 3559 */ 3560 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3561 struct btrfs_free_cluster *cluster, 3562 u64 offset, u64 bytes, u64 empty_size) 3563 { 3564 struct btrfs_fs_info *fs_info = block_group->fs_info; 3565 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3566 struct btrfs_free_space *entry, *tmp; 3567 LIST_HEAD(bitmaps); 3568 u64 min_bytes; 3569 u64 cont1_bytes; 3570 int ret; 3571 3572 /* 3573 * Choose the minimum extent size we'll require for this 3574 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3575 * For metadata, allow allocates with smaller extents. For 3576 * data, keep it dense. 3577 */ 3578 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3579 cont1_bytes = bytes + empty_size; 3580 min_bytes = cont1_bytes; 3581 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3582 cont1_bytes = bytes; 3583 min_bytes = fs_info->sectorsize; 3584 } else { 3585 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3586 min_bytes = fs_info->sectorsize; 3587 } 3588 3589 spin_lock(&ctl->tree_lock); 3590 3591 /* 3592 * If we know we don't have enough space to make a cluster don't even 3593 * bother doing all the work to try and find one. 3594 */ 3595 if (ctl->free_space < bytes) { 3596 spin_unlock(&ctl->tree_lock); 3597 return -ENOSPC; 3598 } 3599 3600 spin_lock(&cluster->lock); 3601 3602 /* someone already found a cluster, hooray */ 3603 if (cluster->block_group) { 3604 ret = 0; 3605 goto out; 3606 } 3607 3608 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3609 min_bytes); 3610 3611 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3612 bytes + empty_size, 3613 cont1_bytes, min_bytes); 3614 if (ret) 3615 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3616 offset, bytes + empty_size, 3617 cont1_bytes, min_bytes); 3618 3619 /* Clear our temporary list */ 3620 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3621 list_del_init(&entry->list); 3622 3623 if (!ret) { 3624 btrfs_get_block_group(block_group); 3625 list_add_tail(&cluster->block_group_list, 3626 &block_group->cluster_list); 3627 cluster->block_group = block_group; 3628 } else { 3629 trace_btrfs_failed_cluster_setup(block_group); 3630 } 3631 out: 3632 spin_unlock(&cluster->lock); 3633 spin_unlock(&ctl->tree_lock); 3634 3635 return ret; 3636 } 3637 3638 /* 3639 * simple code to zero out a cluster 3640 */ 3641 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3642 { 3643 spin_lock_init(&cluster->lock); 3644 spin_lock_init(&cluster->refill_lock); 3645 cluster->root = RB_ROOT; 3646 cluster->max_size = 0; 3647 cluster->fragmented = false; 3648 INIT_LIST_HEAD(&cluster->block_group_list); 3649 cluster->block_group = NULL; 3650 } 3651 3652 static int do_trimming(struct btrfs_block_group *block_group, 3653 u64 *total_trimmed, u64 start, u64 bytes, 3654 u64 reserved_start, u64 reserved_bytes, 3655 enum btrfs_trim_state reserved_trim_state, 3656 struct btrfs_trim_range *trim_entry) 3657 { 3658 struct btrfs_space_info *space_info = block_group->space_info; 3659 struct btrfs_fs_info *fs_info = block_group->fs_info; 3660 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3661 int ret; 3662 bool bg_ro; 3663 const u64 end = start + bytes; 3664 const u64 reserved_end = reserved_start + reserved_bytes; 3665 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3666 u64 trimmed = 0; 3667 3668 spin_lock(&space_info->lock); 3669 spin_lock(&block_group->lock); 3670 bg_ro = block_group->ro; 3671 if (!bg_ro) { 3672 block_group->reserved += reserved_bytes; 3673 spin_unlock(&block_group->lock); 3674 space_info->bytes_reserved += reserved_bytes; 3675 } else { 3676 spin_unlock(&block_group->lock); 3677 } 3678 spin_unlock(&space_info->lock); 3679 3680 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed, false); 3681 if (!ret) { 3682 *total_trimmed += trimmed; 3683 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3684 } 3685 3686 mutex_lock(&ctl->cache_writeout_mutex); 3687 if (reserved_start < start) 3688 __btrfs_add_free_space(block_group, reserved_start, 3689 start - reserved_start, 3690 reserved_trim_state); 3691 if (end < reserved_end) 3692 __btrfs_add_free_space(block_group, end, reserved_end - end, 3693 reserved_trim_state); 3694 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3695 list_del(&trim_entry->list); 3696 mutex_unlock(&ctl->cache_writeout_mutex); 3697 3698 if (!bg_ro) { 3699 spin_lock(&space_info->lock); 3700 spin_lock(&block_group->lock); 3701 bg_ro = block_group->ro; 3702 block_group->reserved -= reserved_bytes; 3703 spin_unlock(&block_group->lock); 3704 3705 space_info->bytes_reserved -= reserved_bytes; 3706 if (bg_ro) 3707 space_info->bytes_readonly += reserved_bytes; 3708 spin_unlock(&space_info->lock); 3709 } 3710 3711 return ret; 3712 } 3713 3714 /* 3715 * If @async is set, then we will trim 1 region and return. 3716 */ 3717 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3718 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3719 bool async) 3720 { 3721 struct btrfs_discard_ctl *discard_ctl = 3722 &block_group->fs_info->discard_ctl; 3723 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3724 struct btrfs_free_space *entry; 3725 struct rb_node *node; 3726 int ret = 0; 3727 u64 extent_start; 3728 u64 extent_bytes; 3729 enum btrfs_trim_state extent_trim_state; 3730 u64 bytes; 3731 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3732 3733 while (start < end) { 3734 struct btrfs_trim_range trim_entry; 3735 3736 mutex_lock(&ctl->cache_writeout_mutex); 3737 spin_lock(&ctl->tree_lock); 3738 3739 if (ctl->free_space < minlen) 3740 goto out_unlock; 3741 3742 entry = tree_search_offset(ctl, start, 0, 1); 3743 if (!entry) 3744 goto out_unlock; 3745 3746 /* Skip bitmaps and if async, already trimmed entries */ 3747 while (entry->bitmap || 3748 (async && btrfs_free_space_trimmed(entry))) { 3749 node = rb_next(&entry->offset_index); 3750 if (!node) 3751 goto out_unlock; 3752 entry = rb_entry(node, struct btrfs_free_space, 3753 offset_index); 3754 } 3755 3756 if (entry->offset >= end) 3757 goto out_unlock; 3758 3759 extent_start = entry->offset; 3760 extent_bytes = entry->bytes; 3761 extent_trim_state = entry->trim_state; 3762 if (async) { 3763 start = entry->offset; 3764 bytes = entry->bytes; 3765 if (bytes < minlen) { 3766 spin_unlock(&ctl->tree_lock); 3767 mutex_unlock(&ctl->cache_writeout_mutex); 3768 goto next; 3769 } 3770 unlink_free_space(ctl, entry, true); 3771 /* 3772 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3773 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3774 * X when we come back around. So trim it now. 3775 */ 3776 if (max_discard_size && 3777 bytes >= (max_discard_size + 3778 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3779 bytes = max_discard_size; 3780 extent_bytes = max_discard_size; 3781 entry->offset += max_discard_size; 3782 entry->bytes -= max_discard_size; 3783 link_free_space(ctl, entry); 3784 } else { 3785 kmem_cache_free(btrfs_free_space_cachep, entry); 3786 } 3787 } else { 3788 start = max(start, extent_start); 3789 bytes = min(extent_start + extent_bytes, end) - start; 3790 if (bytes < minlen) { 3791 spin_unlock(&ctl->tree_lock); 3792 mutex_unlock(&ctl->cache_writeout_mutex); 3793 goto next; 3794 } 3795 3796 unlink_free_space(ctl, entry, true); 3797 kmem_cache_free(btrfs_free_space_cachep, entry); 3798 } 3799 3800 spin_unlock(&ctl->tree_lock); 3801 trim_entry.start = extent_start; 3802 trim_entry.bytes = extent_bytes; 3803 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3804 mutex_unlock(&ctl->cache_writeout_mutex); 3805 3806 ret = do_trimming(block_group, total_trimmed, start, bytes, 3807 extent_start, extent_bytes, extent_trim_state, 3808 &trim_entry); 3809 if (ret) { 3810 block_group->discard_cursor = start + bytes; 3811 break; 3812 } 3813 next: 3814 start += bytes; 3815 block_group->discard_cursor = start; 3816 if (async && *total_trimmed) 3817 break; 3818 3819 if (btrfs_trim_interrupted()) { 3820 ret = -ERESTARTSYS; 3821 break; 3822 } 3823 3824 cond_resched(); 3825 } 3826 3827 return ret; 3828 3829 out_unlock: 3830 block_group->discard_cursor = btrfs_block_group_end(block_group); 3831 spin_unlock(&ctl->tree_lock); 3832 mutex_unlock(&ctl->cache_writeout_mutex); 3833 3834 return ret; 3835 } 3836 3837 void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg) 3838 { 3839 struct btrfs_fs_info *fs_info = bg->fs_info; 3840 struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; 3841 int ret = 0; 3842 u64 bytes, trimmed; 3843 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3844 u64 end = btrfs_block_group_end(bg); 3845 3846 if (!test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags)) { 3847 bg->discard_cursor = end; 3848 3849 if (bg->used == 0) { 3850 spin_lock(&fs_info->unused_bgs_lock); 3851 if (!list_empty(&bg->bg_list)) { 3852 list_del_init(&bg->bg_list); 3853 btrfs_put_block_group(bg); 3854 } 3855 spin_unlock(&fs_info->unused_bgs_lock); 3856 3857 btrfs_mark_bg_unused(bg); 3858 } 3859 3860 return; 3861 } 3862 3863 bytes = end - bg->discard_cursor; 3864 3865 if (max_discard_size && 3866 bytes >= (max_discard_size + BTRFS_ASYNC_DISCARD_MIN_FILTER)) 3867 bytes = max_discard_size; 3868 3869 ret = btrfs_discard_extent(fs_info, bg->discard_cursor, bytes, &trimmed, false); 3870 if (ret) 3871 return; 3872 3873 bg->discard_cursor += trimmed; 3874 3875 if (bg->discard_cursor < end) 3876 return; 3877 3878 btrfs_complete_bg_remapping(bg); 3879 } 3880 3881 /* 3882 * If we break out of trimming a bitmap prematurely, we should reset the 3883 * trimming bit. In a rather contrived case, it's possible to race here so 3884 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3885 * 3886 * start = start of bitmap 3887 * end = near end of bitmap 3888 * 3889 * Thread 1: Thread 2: 3890 * trim_bitmaps(start) 3891 * trim_bitmaps(end) 3892 * end_trimming_bitmap() 3893 * reset_trimming_bitmap() 3894 */ 3895 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3896 { 3897 struct btrfs_free_space *entry; 3898 3899 spin_lock(&ctl->tree_lock); 3900 entry = tree_search_offset(ctl, offset, 1, 0); 3901 if (entry) { 3902 if (btrfs_free_space_trimmed(entry)) { 3903 ctl->discardable_extents[BTRFS_STAT_CURR] += 3904 entry->bitmap_extents; 3905 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3906 } 3907 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3908 } 3909 3910 spin_unlock(&ctl->tree_lock); 3911 } 3912 3913 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3914 struct btrfs_free_space *entry) 3915 { 3916 if (btrfs_free_space_trimming_bitmap(entry)) { 3917 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3918 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3919 entry->bitmap_extents; 3920 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3921 } 3922 } 3923 3924 /* 3925 * If @async is set, then we will trim 1 region and return. 3926 */ 3927 static int trim_bitmaps(struct btrfs_block_group *block_group, 3928 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3929 u64 maxlen, bool async) 3930 { 3931 struct btrfs_discard_ctl *discard_ctl = 3932 &block_group->fs_info->discard_ctl; 3933 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3934 struct btrfs_free_space *entry; 3935 int ret = 0; 3936 int ret2; 3937 u64 bytes; 3938 u64 offset = offset_to_bitmap(ctl, start); 3939 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3940 3941 while (offset < end) { 3942 bool next_bitmap = false; 3943 struct btrfs_trim_range trim_entry; 3944 3945 mutex_lock(&ctl->cache_writeout_mutex); 3946 spin_lock(&ctl->tree_lock); 3947 3948 if (ctl->free_space < minlen) { 3949 block_group->discard_cursor = 3950 btrfs_block_group_end(block_group); 3951 spin_unlock(&ctl->tree_lock); 3952 mutex_unlock(&ctl->cache_writeout_mutex); 3953 break; 3954 } 3955 3956 entry = tree_search_offset(ctl, offset, 1, 0); 3957 /* 3958 * Bitmaps are marked trimmed lossily now to prevent constant 3959 * discarding of the same bitmap (the reason why we are bound 3960 * by the filters). So, retrim the block group bitmaps when we 3961 * are preparing to punt to the unused_bgs list. This uses 3962 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED 3963 * which is the only discard index which sets minlen to 0. 3964 */ 3965 if (!entry || (async && minlen && start == offset && 3966 btrfs_free_space_trimmed(entry))) { 3967 spin_unlock(&ctl->tree_lock); 3968 mutex_unlock(&ctl->cache_writeout_mutex); 3969 next_bitmap = true; 3970 goto next; 3971 } 3972 3973 /* 3974 * Async discard bitmap trimming begins at by setting the start 3975 * to be key.objectid and the offset_to_bitmap() aligns to the 3976 * start of the bitmap. This lets us know we are fully 3977 * scanning the bitmap rather than only some portion of it. 3978 */ 3979 if (start == offset) 3980 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; 3981 3982 bytes = minlen; 3983 ret2 = search_bitmap(ctl, entry, &start, &bytes, false); 3984 if (ret2 || start >= end) { 3985 /* 3986 * We lossily consider a bitmap trimmed if we only skip 3987 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. 3988 */ 3989 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) 3990 end_trimming_bitmap(ctl, entry); 3991 else 3992 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3993 spin_unlock(&ctl->tree_lock); 3994 mutex_unlock(&ctl->cache_writeout_mutex); 3995 next_bitmap = true; 3996 goto next; 3997 } 3998 3999 /* 4000 * We already trimmed a region, but are using the locking above 4001 * to reset the trim_state. 4002 */ 4003 if (async && *total_trimmed) { 4004 spin_unlock(&ctl->tree_lock); 4005 mutex_unlock(&ctl->cache_writeout_mutex); 4006 return ret; 4007 } 4008 4009 bytes = min(bytes, end - start); 4010 if (bytes < minlen || (async && maxlen && bytes > maxlen)) { 4011 spin_unlock(&ctl->tree_lock); 4012 mutex_unlock(&ctl->cache_writeout_mutex); 4013 goto next; 4014 } 4015 4016 /* 4017 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 4018 * If X < @minlen, we won't trim X when we come back around. 4019 * So trim it now. We differ here from trimming extents as we 4020 * don't keep individual state per bit. 4021 */ 4022 if (async && 4023 max_discard_size && 4024 bytes > (max_discard_size + minlen)) 4025 bytes = max_discard_size; 4026 4027 bitmap_clear_bits(ctl, entry, start, bytes, true); 4028 if (entry->bytes == 0) 4029 free_bitmap(ctl, entry); 4030 4031 spin_unlock(&ctl->tree_lock); 4032 trim_entry.start = start; 4033 trim_entry.bytes = bytes; 4034 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 4035 mutex_unlock(&ctl->cache_writeout_mutex); 4036 4037 ret = do_trimming(block_group, total_trimmed, start, bytes, 4038 start, bytes, 0, &trim_entry); 4039 if (ret) { 4040 reset_trimming_bitmap(ctl, offset); 4041 block_group->discard_cursor = 4042 btrfs_block_group_end(block_group); 4043 break; 4044 } 4045 next: 4046 if (next_bitmap) { 4047 offset += BITS_PER_BITMAP * ctl->unit; 4048 start = offset; 4049 } else { 4050 start += bytes; 4051 } 4052 block_group->discard_cursor = start; 4053 4054 if (btrfs_trim_interrupted()) { 4055 if (start != offset) 4056 reset_trimming_bitmap(ctl, offset); 4057 ret = -ERESTARTSYS; 4058 break; 4059 } 4060 4061 cond_resched(); 4062 } 4063 4064 if (offset >= end) 4065 block_group->discard_cursor = end; 4066 4067 return ret; 4068 } 4069 4070 int btrfs_trim_block_group(struct btrfs_block_group *block_group, 4071 u64 *trimmed, u64 start, u64 end, u64 minlen) 4072 { 4073 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 4074 int ret; 4075 u64 rem = 0; 4076 4077 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 4078 4079 *trimmed = 0; 4080 4081 spin_lock(&block_group->lock); 4082 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4083 spin_unlock(&block_group->lock); 4084 return 0; 4085 } 4086 btrfs_freeze_block_group(block_group); 4087 spin_unlock(&block_group->lock); 4088 4089 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); 4090 if (ret) 4091 goto out; 4092 4093 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); 4094 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); 4095 /* If we ended in the middle of a bitmap, reset the trimming flag */ 4096 if (rem) 4097 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); 4098 out: 4099 btrfs_unfreeze_block_group(block_group); 4100 return ret; 4101 } 4102 4103 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, 4104 u64 *trimmed, u64 start, u64 end, u64 minlen, 4105 bool async) 4106 { 4107 int ret; 4108 4109 *trimmed = 0; 4110 4111 spin_lock(&block_group->lock); 4112 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4113 spin_unlock(&block_group->lock); 4114 return 0; 4115 } 4116 btrfs_freeze_block_group(block_group); 4117 spin_unlock(&block_group->lock); 4118 4119 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); 4120 btrfs_unfreeze_block_group(block_group); 4121 4122 return ret; 4123 } 4124 4125 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, 4126 u64 *trimmed, u64 start, u64 end, u64 minlen, 4127 u64 maxlen, bool async) 4128 { 4129 int ret; 4130 4131 *trimmed = 0; 4132 4133 spin_lock(&block_group->lock); 4134 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4135 spin_unlock(&block_group->lock); 4136 return 0; 4137 } 4138 btrfs_freeze_block_group(block_group); 4139 spin_unlock(&block_group->lock); 4140 4141 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, 4142 async); 4143 4144 btrfs_unfreeze_block_group(block_group); 4145 4146 return ret; 4147 } 4148 4149 bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) 4150 { 4151 return btrfs_super_cache_generation(fs_info->super_copy); 4152 } 4153 4154 static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, 4155 struct btrfs_trans_handle *trans) 4156 { 4157 struct btrfs_block_group *block_group; 4158 struct rb_node *node; 4159 4160 btrfs_info(fs_info, "cleaning free space cache v1"); 4161 4162 node = rb_first_cached(&fs_info->block_group_cache_tree); 4163 while (node) { 4164 int ret; 4165 4166 block_group = rb_entry(node, struct btrfs_block_group, cache_node); 4167 ret = btrfs_remove_free_space_inode(trans, NULL, block_group); 4168 if (ret) 4169 return ret; 4170 node = rb_next(node); 4171 } 4172 return 0; 4173 } 4174 4175 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) 4176 { 4177 struct btrfs_trans_handle *trans; 4178 int ret; 4179 4180 /* 4181 * update_super_roots will appropriately set or unset 4182 * super_copy->cache_generation based on SPACE_CACHE and 4183 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a 4184 * transaction commit whether we are enabling space cache v1 and don't 4185 * have any other work to do, or are disabling it and removing free 4186 * space inodes. 4187 */ 4188 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4189 if (IS_ERR(trans)) 4190 return PTR_ERR(trans); 4191 4192 if (!active) { 4193 set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 4194 ret = cleanup_free_space_cache_v1(fs_info, trans); 4195 if (unlikely(ret)) { 4196 btrfs_abort_transaction(trans, ret); 4197 btrfs_end_transaction(trans); 4198 goto out; 4199 } 4200 } 4201 4202 ret = btrfs_commit_transaction(trans); 4203 out: 4204 clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 4205 4206 return ret; 4207 } 4208 4209 int __init btrfs_free_space_init(void) 4210 { 4211 btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); 4212 if (!btrfs_free_space_cachep) 4213 return -ENOMEM; 4214 4215 btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", 4216 PAGE_SIZE, PAGE_SIZE, 4217 0, NULL); 4218 if (!btrfs_free_space_bitmap_cachep) { 4219 kmem_cache_destroy(btrfs_free_space_cachep); 4220 return -ENOMEM; 4221 } 4222 4223 return 0; 4224 } 4225 4226 void __cold btrfs_free_space_exit(void) 4227 { 4228 kmem_cache_destroy(btrfs_free_space_cachep); 4229 kmem_cache_destroy(btrfs_free_space_bitmap_cachep); 4230 } 4231 4232 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4233 /* 4234 * Use this if you need to make a bitmap or extent entry specifically, it 4235 * doesn't do any of the merging that add_free_space does, this acts a lot like 4236 * how the free space cache loading stuff works, so you can get really weird 4237 * configurations. 4238 */ 4239 int test_add_free_space_entry(struct btrfs_block_group *cache, 4240 u64 offset, u64 bytes, bool bitmap) 4241 { 4242 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 4243 struct btrfs_free_space *info = NULL, *bitmap_info; 4244 void *map = NULL; 4245 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; 4246 u64 bytes_added; 4247 int ret; 4248 4249 again: 4250 if (!info) { 4251 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 4252 if (!info) 4253 return -ENOMEM; 4254 } 4255 4256 if (!bitmap) { 4257 spin_lock(&ctl->tree_lock); 4258 info->offset = offset; 4259 info->bytes = bytes; 4260 info->max_extent_size = 0; 4261 ret = link_free_space(ctl, info); 4262 spin_unlock(&ctl->tree_lock); 4263 if (ret) 4264 kmem_cache_free(btrfs_free_space_cachep, info); 4265 return ret; 4266 } 4267 4268 if (!map) { 4269 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); 4270 if (!map) { 4271 kmem_cache_free(btrfs_free_space_cachep, info); 4272 return -ENOMEM; 4273 } 4274 } 4275 4276 spin_lock(&ctl->tree_lock); 4277 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 4278 1, 0); 4279 if (!bitmap_info) { 4280 info->bitmap = map; 4281 map = NULL; 4282 add_new_bitmap(ctl, info, offset); 4283 bitmap_info = info; 4284 info = NULL; 4285 } 4286 4287 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 4288 trim_state); 4289 4290 bytes -= bytes_added; 4291 offset += bytes_added; 4292 spin_unlock(&ctl->tree_lock); 4293 4294 if (bytes) 4295 goto again; 4296 4297 if (info) 4298 kmem_cache_free(btrfs_free_space_cachep, info); 4299 if (map) 4300 kmem_cache_free(btrfs_free_space_bitmap_cachep, map); 4301 return 0; 4302 } 4303 4304 /* 4305 * Checks to see if the given range is in the free space cache. This is really 4306 * just used to check the absence of space, so if there is free space in the 4307 * range at all we will return 1. 4308 */ 4309 int test_check_exists(struct btrfs_block_group *cache, 4310 u64 offset, u64 bytes) 4311 { 4312 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 4313 struct btrfs_free_space *info; 4314 int ret = 0; 4315 4316 spin_lock(&ctl->tree_lock); 4317 info = tree_search_offset(ctl, offset, 0, 0); 4318 if (!info) { 4319 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 4320 1, 0); 4321 if (!info) 4322 goto out; 4323 } 4324 4325 have_info: 4326 if (info->bitmap) { 4327 u64 bit_off, bit_bytes; 4328 struct rb_node *n; 4329 struct btrfs_free_space *tmp; 4330 4331 bit_off = offset; 4332 bit_bytes = ctl->unit; 4333 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); 4334 if (!ret) { 4335 if (bit_off == offset) { 4336 ret = 1; 4337 goto out; 4338 } else if (bit_off > offset && 4339 offset + bytes > bit_off) { 4340 ret = 1; 4341 goto out; 4342 } 4343 } 4344 4345 n = rb_prev(&info->offset_index); 4346 while (n) { 4347 tmp = rb_entry(n, struct btrfs_free_space, 4348 offset_index); 4349 if (tmp->offset + tmp->bytes < offset) 4350 break; 4351 if (offset + bytes < tmp->offset) { 4352 n = rb_prev(&tmp->offset_index); 4353 continue; 4354 } 4355 info = tmp; 4356 goto have_info; 4357 } 4358 4359 n = rb_next(&info->offset_index); 4360 while (n) { 4361 tmp = rb_entry(n, struct btrfs_free_space, 4362 offset_index); 4363 if (offset + bytes < tmp->offset) 4364 break; 4365 if (tmp->offset + tmp->bytes < offset) { 4366 n = rb_next(&tmp->offset_index); 4367 continue; 4368 } 4369 info = tmp; 4370 goto have_info; 4371 } 4372 4373 ret = 0; 4374 goto out; 4375 } 4376 4377 if (info->offset == offset) { 4378 ret = 1; 4379 goto out; 4380 } 4381 4382 if (offset > info->offset && offset < info->offset + info->bytes) 4383 ret = 1; 4384 out: 4385 spin_unlock(&ctl->tree_lock); 4386 return ret; 4387 } 4388 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 4389