// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat. All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
#include "inode-item.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "super.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M

static struct kmem_cache *btrfs_free_space_cachep;
static struct kmem_cache *btrfs_free_space_bitmap_cachep;

struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes, bool update_stats);

static void btrfs_crc32c_final(u32 crc, u8 *result)
{
	put_unaligned_le32(~crc, result);
}

static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *node;

	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info, true);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(ctl, info);
		}

		cond_resched_lock(&ctl->tree_lock);
	}
}

static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
	btrfs_release_path(path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));

	return inode;
}

struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
				      struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct inode *inode = NULL;
	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(fs_info->tree_root, path,
					  block_group->start);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&block_group->lock);
	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
		btrfs_info(fs_info, "Old style space inode found, converting.");
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
			BTRFS_INODE_NODATACOW;
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

	if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
		block_group->inode = igrab(inode);
	spin_unlock(&block_group->lock);

	return inode;
}

static int __create_free_space_inode(struct btrfs_root *root,
				     struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     u64 ino, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	/* We inline CRCs for the free disk space cache */
	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
	int ret;

	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memzero_extent_buffer(leaf, (unsigned long)inode_item,
			      sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, flags);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item, offset);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	return 0;
}

int create_free_space_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
					 ino, block_group->start);
}

/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
				  struct inode *inode,
				  struct btrfs_block_group *block_group)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!inode)
		inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) != -ENOENT)
			ret = PTR_ERR(inode);
		goto out;
	}
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		btrfs_add_delayed_iput(BTRFS_I(inode));
		goto out;
	}
	clear_nlink(inode);
	/* One for the block groups ref */
	spin_lock(&block_group->lock);
	if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
	} else {
		spin_unlock(&block_group->lock);
	}
	/* One for the lookup ref */
	btrfs_add_delayed_iput(BTRFS_I(inode));

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.type = 0;
	key.offset = block_group->start;
	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
				-1, 1);
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto out;
	}
	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group *block_group,
				    struct inode *vfs_inode)
{
	struct btrfs_truncate_control control = {
		.inode = BTRFS_I(vfs_inode),
		.new_size = 0,
		.ino = btrfs_ino(BTRFS_I(vfs_inode)),
		.min_type = BTRFS_EXTENT_DATA_KEY,
		.clear_extent_range = true,
	};
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	int ret = 0;
	bool locked = false;

	if (block_group) {
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
		locked = true;
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * Now that we've truncated the cache away, it's no longer
		 * set up or written.
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
	}

	btrfs_i_size_write(inode, 0);
	truncate_pagecache(vfs_inode, 0);

	lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);

	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
	 */
	ret = btrfs_truncate_inode_items(trans, root, &control);

	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

	unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
	if (ret)
		goto fail;

	ret = btrfs_update_inode(trans, inode);

fail:
	if (locked)
		mutex_unlock(&trans->transaction->cache_write_mutex);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

static void readahead_cache(struct inode *inode)
{
	struct file_ra_state ra;
	unsigned long last_index;

	file_ra_state_init(&ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}

static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
		       int write)
{
	int num_pages;

	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	/* Make sure we can fit our crcs and generation into the first page */
	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
		return -ENOSPC;

	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
	if (!io_ctl->pages)
		return -ENOMEM;

	io_ctl->num_pages = num_pages;
	io_ctl->fs_info = btrfs_sb(inode->i_sb);
	io_ctl->inode = inode;

	return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
	kfree(io_ctl->pages);
	io_ctl->pages = NULL;
}

static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}

static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
	ASSERT(io_ctl->index < io_ctl->num_pages);
	io_ctl->page = io_ctl->pages[io_ctl->index++];
	io_ctl->cur = page_address(io_ctl->page);
	io_ctl->orig = io_ctl->cur;
	io_ctl->size = PAGE_SIZE;
	if (clear)
		clear_page(io_ctl->cur);
}

static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
		if (io_ctl->pages[i]) {
			btrfs_page_clear_checked(io_ctl->fs_info,
						 io_ctl->pages[i],
						 page_offset(io_ctl->pages[i]),
						 PAGE_SIZE);
			unlock_page(io_ctl->pages[i]);
			put_page(io_ctl->pages[i]);
		}
	}
}

static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
	struct page *page;
	struct inode *inode = io_ctl->inode;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
		int ret;

		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}

		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			io_ctl_drop_pages(io_ctl);
			return ret;
		}

		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_read_folio(NULL, page_folio(page));
			lock_page(page);
			if (page->mapping != inode->i_mapping) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "free space cache page truncated");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
			if (!PageUptodate(page)) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
		}
	}

	for (i = 0; i < io_ctl->num_pages; i++)
		clear_page_dirty_for_io(io_ctl->pages[i]);

	return 0;
}
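
/*
 * Illustrative summary of the cache file layout implied by the helpers below
 * (a sketch derived from the code, not an on-disk format definition):
 *
 *	page 0:      [u32 crc of page 0][u32 crc of page 1]...[u32 crc of page N-1]
 *	             [le64 generation][free space entries...]
 *	page 1..N-1: [more entries and/or PAGE_SIZE bitmaps]
 *
 * io_ctl_set_generation() and io_ctl_check_generation() skip the crc array
 * before touching the generation, while io_ctl_set_crc() and io_ctl_check_crc()
 * skip only the crc array when checksumming page 0, so the generation is
 * covered by page 0's checksum.
 */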

static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	io_ctl_map_page(io_ctl, 1);

	/*
	 * Skip the csum areas.  If we don't check crcs then we just have a
	 * 64bit chunk at the front of the first page.
	 */
	io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	put_unaligned_le64(generation, io_ctl->cur);
	io_ctl->cur += sizeof(u64);
}

static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	u64 cache_gen;

	/*
	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
	 * chunk at the front of the first page.
	 */
	io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	cache_gen = get_unaligned_le64(io_ctl->cur);
	if (cache_gen != generation) {
		btrfs_err_rl(io_ctl->fs_info,
			"space cache generation (%llu) does not match inode (%llu)",
			cache_gen, generation);
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}
	io_ctl->cur += sizeof(u64);
	return 0;
}

static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	io_ctl_unmap_page(io_ctl);
	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	*tmp = crc;
}

static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp, val;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	val = *tmp;

	io_ctl_map_page(io_ctl, 0);
	crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	if (val != crc) {
		btrfs_err_rl(io_ctl->fs_info,
			"csum mismatch on free space cache");
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}

	return 0;
}

static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
			    void *bitmap)
{
	struct btrfs_free_space_entry *entry;

	if (!io_ctl->cur)
		return -ENOSPC;

	entry = io_ctl->cur;
	put_unaligned_le64(offset, &entry->offset);
	put_unaligned_le64(bytes, &entry->bytes);
	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
		BTRFS_FREE_SPACE_EXTENT;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_set_crc(io_ctl, io_ctl->index - 1);

	/* No more pages to map */
	if (io_ctl->index >= io_ctl->num_pages)
		return 0;

	/* map the next page */
	io_ctl_map_page(io_ctl, 1);
	return 0;
}

static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
	if (!io_ctl->cur)
		return -ENOSPC;

	/*
	 * If we aren't at the start of the current page, unmap this one and
	 * map the next one if there is any left.
	 */
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

	copy_page(io_ctl->cur, bitmap);
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}

static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
	/*
	 * If we're not on the boundary we know we've modified the page and we
	 * need to crc the page.
	 */
	if (io_ctl->cur != io_ctl->orig)
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	else
		io_ctl_unmap_page(io_ctl);

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	}
}

static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
			     struct btrfs_free_space *entry, u8 *type)
{
	struct btrfs_free_space_entry *e;
	int ret;

	if (!io_ctl->cur) {
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}

	e = io_ctl->cur;
	entry->offset = get_unaligned_le64(&e->offset);
	entry->bytes = get_unaligned_le64(&e->bytes);
	*type = e->type;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_unmap_page(io_ctl);

	return 0;
}

static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space *entry)
{
	int ret;

	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
	if (ret)
		return ret;

	copy_page(entry->bitmap, io_ctl->cur);
	io_ctl_unmap_page(io_ctl);

	return 0;
}

static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group *block_group = ctl->block_group;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->length;
	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	max_bitmaps = max_t(u64, max_bitmaps, 1);

	if (ctl->total_bitmaps > max_bitmaps)
		btrfs_err(block_group->fs_info,
"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
			  block_group->start, block_group->length,
			  ctl->total_bitmaps, ctl->unit, max_bitmaps,
			  bytes_per_bg);
	ASSERT(ctl->total_bitmaps <= max_bitmaps);

	/*
	 * We are trying to keep the total amount of memory used per 1GiB of
	 * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
	 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
	 * bitmaps, we may end up using more memory than this.
	 */
	if (size < SZ_1G)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);

	bitmap_bytes = ctl->total_bitmaps * ctl->unit;

	/*
	 * we want the extent entry threshold to always be at most 1/2 the max
	 * bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);

	ctl->extents_thresh =
		div_u64(extent_bytes, sizeof(struct btrfs_free_space));
}
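
/*
 * Worked example of the thresholds above, assuming 4KiB pages and a 4KiB
 * ctl->unit (both configuration dependent): BITS_PER_BITMAP is 32768, so one
 * bitmap covers 32768 * 4KiB = 128MiB and a 1GiB block group needs at most 8
 * bitmaps.  max_bytes is 64KiB (MAX_CACHE_BYTES_PER_GIG); with, say, 3 bitmaps
 * in use, bitmap_bytes = 3 * 4KiB = 12KiB, extent_bytes =
 * min(64KiB - 12KiB, 32KiB) = 32KiB, and extents_thresh becomes 32KiB divided
 * by sizeof(struct btrfs_free_space).
 */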

static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_path *path, u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct btrfs_io_ctl io_ctl;
	struct btrfs_key key;
	struct btrfs_free_space *e, *n;
	LIST_HEAD(bitmaps);
	u64 num_entries;
	u64 num_bitmaps;
	u64 generation;
	u8 type;
	int ret = 0;

	/* Nothing in the space cache, goodbye */
	if (!i_size_read(inode))
		return 0;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return 0;
	else if (ret > 0) {
		btrfs_release_path(path);
		return 0;
	}

	ret = -1;

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
	btrfs_release_path(path);

	if (!BTRFS_I(inode)->generation) {
		btrfs_info(fs_info,
			   "the free space cache file (%llu) is invalid, skip it",
			   offset);
		return 0;
	}

	if (BTRFS_I(inode)->generation != generation) {
		btrfs_err(fs_info,
			  "free space inode generation (%llu) did not match free space cache generation (%llu)",
			  BTRFS_I(inode)->generation, generation);
		return 0;
	}

	if (!num_entries)
		return 0;

	ret = io_ctl_init(&io_ctl, inode, 0);
	if (ret)
		return ret;

	readahead_cache(inode);

	ret = io_ctl_prepare_pages(&io_ctl, true);
	if (ret)
		goto out;

	ret = io_ctl_check_crc(&io_ctl, 0);
	if (ret)
		goto free_cache;

	ret = io_ctl_check_generation(&io_ctl, generation);
	if (ret)
		goto free_cache;

	while (num_entries) {
		e = kmem_cache_zalloc(btrfs_free_space_cachep,
				      GFP_NOFS);
		if (!e) {
			ret = -ENOMEM;
			goto free_cache;
		}

		ret = io_ctl_read_entry(&io_ctl, e, &type);
		if (ret) {
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (!e->bytes) {
			ret = -1;
			kmem_cache_free(btrfs_free_space_cachep, e);
			goto free_cache;
		}

		if (type == BTRFS_FREE_SPACE_EXTENT) {
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			spin_unlock(&ctl->tree_lock);
			if (ret) {
				btrfs_err(fs_info,
					"Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
		} else {
			ASSERT(num_bitmaps);
			num_bitmaps--;
			e->bitmap = kmem_cache_zalloc(
					btrfs_free_space_bitmap_cachep, GFP_NOFS);
			if (!e->bitmap) {
				ret = -ENOMEM;
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
			spin_lock(&ctl->tree_lock);
			ret = link_free_space(ctl, e);
			if (ret) {
				spin_unlock(&ctl->tree_lock);
				btrfs_err(fs_info,
					"Duplicate entries in free space cache, dumping");
				kmem_cache_free(btrfs_free_space_cachep, e);
				goto free_cache;
			}
			ctl->total_bitmaps++;
			recalculate_thresholds(ctl);
			spin_unlock(&ctl->tree_lock);
			list_add_tail(&e->list, &bitmaps);
		}

		num_entries--;
	}

	io_ctl_unmap_page(&io_ctl);

	/*
	 * We read the bitmap data at the end, after all the entries, in the
	 * same order that the bitmap entries were added to the cache.
	 */
	list_for_each_entry_safe(e, n, &bitmaps, list) {
		list_del_init(&e->list);
		ret = io_ctl_read_bitmap(&io_ctl, e);
		if (ret)
			goto free_cache;
	}

	io_ctl_drop_pages(&io_ctl);
	ret = 1;
out:
	io_ctl_free(&io_ctl);
	return ret;
free_cache:
	io_ctl_drop_pages(&io_ctl);

	spin_lock(&ctl->tree_lock);
	__btrfs_remove_free_space_cache(ctl);
	spin_unlock(&ctl->tree_lock);
	goto out;
}

static int copy_free_space_cache(struct btrfs_block_group *block_group,
				 struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int ret = 0;

	while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			const u64 offset = info->offset;
			const u64 bytes = info->bytes;

			unlink_free_space(ctl, info, true);
			spin_unlock(&ctl->tree_lock);
			kmem_cache_free(btrfs_free_space_cachep, info);
			ret = btrfs_add_free_space(block_group, offset, bytes);
			spin_lock(&ctl->tree_lock);
		} else {
			u64 offset = info->offset;
			u64 bytes = ctl->unit;

			ret = search_bitmap(ctl, info, &offset, &bytes, false);
			if (ret == 0) {
				bitmap_clear_bits(ctl, info, offset, bytes, true);
				spin_unlock(&ctl->tree_lock);
				ret = btrfs_add_free_space(block_group, offset,
							   bytes);
				spin_lock(&ctl->tree_lock);
			} else {
				free_bitmap(ctl, info);
				ret = 0;
			}
		}
		cond_resched_lock(&ctl->tree_lock);
	}
	return ret;
}

static struct lock_class_key btrfs_free_space_inode_key;

int load_free_space_cache(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space_ctl tmp_ctl = {};
	struct inode *inode;
	struct btrfs_path *path;
	int ret = 0;
	bool matched;
	u64 used = block_group->used;

	/*
	 * Because we could potentially discard our loaded free space, we want
	 * to load everything into a temporary structure first, and then if it's
	 * valid copy it all into the actual free space ctl.
	 */
	btrfs_init_free_space_ctl(block_group, &tmp_ctl);

	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	path = btrfs_alloc_path();
	if (!path)
		return 0;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * We must pass a path with search_commit_root set to btrfs_iget in
	 * order to avoid a deadlock when allocating extents for the tree root.
	 *
	 * When we are COWing an extent buffer from the tree root, when looking
	 * for a free extent, at extent-tree.c:find_free_extent(), we can find
	 * a block group without its free space cache loaded.  When we find one
	 * we must load its space cache which requires reading its free space
	 * cache's inode item from the root tree.
	 * If this inode item is located in the same leaf that we started
	 * COWing before, then we end up in deadlock on the extent buffer
	 * (trying to read lock it when we previously write locked it).
	 *
	 * It's safe to read the inode item using the commit root because
	 * block groups, once loaded, stay in memory forever (until they are
	 * removed) as well as their space caches once loaded.  New block groups
	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
	 * we will never try to read their inode item while the fs is mounted.
	 */
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

	/* We may have converted the inode and made the cache invalid. */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
		goto out;
	}
	spin_unlock(&block_group->lock);

	/*
	 * Reinitialize the class of struct inode's mapping->invalidate_lock for
	 * free space inodes to prevent false positives related to locks for normal
	 * inodes.
	 */
	lockdep_set_class(&(&inode->i_data)->invalidate_lock,
			  &btrfs_free_space_inode_key);

	ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
				      path, block_group->start);
	btrfs_free_path(path);
	if (ret <= 0)
		goto out;

	matched = (tmp_ctl.free_space == (block_group->length - used -
					  block_group->bytes_super));

	if (matched) {
		spin_lock(&tmp_ctl.tree_lock);
		ret = copy_free_space_cache(block_group, &tmp_ctl);
		spin_unlock(&tmp_ctl.tree_lock);
		/*
		 * ret == 1 means we successfully loaded the free space cache,
		 * so we need to re-set it here.
		 */
		if (ret == 0)
			ret = 1;
	} else {
		/*
		 * We need to call the _locked variant so we don't try to update
		 * the discard counters.
		 */
		spin_lock(&tmp_ctl.tree_lock);
		__btrfs_remove_free_space_cache(&tmp_ctl);
		spin_unlock(&tmp_ctl.tree_lock);
		btrfs_warn(fs_info,
			   "block group %llu has wrong amount of free space",
			   block_group->start);
		ret = -1;
	}
out:
	if (ret < 0) {
		/* This cache is bogus, make sure it gets cleared */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		ret = 0;

		btrfs_warn(fs_info,
			   "failed to load free space cache for block group %llu, rebuilding it now",
			   block_group->start);
	}

	spin_lock(&ctl->tree_lock);
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);
	iput(inode);
	return ret;
}

static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
			       struct btrfs_free_space_ctl *ctl,
			       struct btrfs_block_group *block_group,
			       int *entries, int *bitmaps,
			       struct list_head *bitmap_list)
{
	int ret;
	struct btrfs_free_cluster *cluster = NULL;
	struct btrfs_free_cluster *cluster_locked = NULL;
	struct rb_node *node = rb_first(&ctl->free_space_offset);
	struct btrfs_trim_range *trim_entry;

	/* Get the cluster for this block_group if it exists */
	if (block_group && !list_empty(&block_group->cluster_list)) {
		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);
	}

	if (!node && cluster) {
		cluster_locked = cluster;
		spin_lock(&cluster_locked->lock);
		node = rb_first(&cluster->root);
		cluster = NULL;
	}

	/* Write out the extent entries */
	while (node) {
		struct btrfs_free_space *e;

		e = rb_entry(node, struct btrfs_free_space, offset_index);
		*entries += 1;

		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
				       e->bitmap);
		if (ret)
			goto fail;

		if (e->bitmap) {
			list_add_tail(&e->list, bitmap_list);
			*bitmaps += 1;
		}
		node = rb_next(node);
		if (!node && cluster) {
			node = rb_first(&cluster->root);
			cluster_locked = cluster;
			spin_lock(&cluster_locked->lock);
			cluster = NULL;
		}
	}
	if (cluster_locked) {
		spin_unlock(&cluster_locked->lock);
		cluster_locked = NULL;
	}

	/*
	 * Make sure we don't miss any range that was removed from our rbtree
	 * because trimming is running.  Otherwise after a umount+mount (or crash
	 * after committing the transaction) we would leak free space and get
	 * an inconsistent free space cache report from fsck.
	 */
	list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
		ret = io_ctl_add_entry(io_ctl, trim_entry->start,
				       trim_entry->bytes, NULL);
		if (ret)
			goto fail;
		*entries += 1;
	}

	return 0;
fail:
	if (cluster_locked)
		spin_unlock(&cluster_locked->lock);
	return -ENOSPC;
}

static noinline_for_stack int
update_cache_item(struct btrfs_trans_handle *trans,
		  struct btrfs_root *root,
		  struct inode *inode,
		  struct btrfs_path *path, u64 offset,
		  int entries, int bitmaps)
{
	struct btrfs_key key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, NULL);
		goto fail;
	}
	leaf = path->nodes[0];
	if (ret > 0) {
		struct btrfs_key found_key;
		ASSERT(path->slots[0]);
		path->slots[0]--;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
		    found_key.offset != offset) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
					 inode->i_size - 1, EXTENT_DELALLOC,
					 NULL);
			btrfs_release_path(path);
			goto fail;
		}
	}

	BTRFS_I(inode)->generation = trans->transid;
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_set_free_space_entries(leaf, header, entries);
	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_mark_buffer_dirty(trans, leaf);
	btrfs_release_path(path);

	return 0;

fail:
	return -1;
}

static noinline_for_stack int write_pinned_extent_entries(
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group *block_group,
			    struct btrfs_io_ctl *io_ctl,
			    int *entries)
{
	u64 start, extent_start, extent_end, len;
	struct extent_io_tree *unpin = NULL;
	int ret;

	if (!block_group)
		return 0;

	/*
	 * We want to add any pinned extents to our free space cache
	 * so we don't leak the space
	 *
	 * We shouldn't have switched the pinned extents yet so this is the
	 * right one
	 */
	unpin = &trans->transaction->pinned_extents;

	start = block_group->start;

	while (start < block_group->start + block_group->length) {
		if (!find_first_extent_bit(unpin, start,
					   &extent_start, &extent_end,
					   EXTENT_DIRTY, NULL))
			return 0;

		/* This pinned extent is out of our range */
		if (extent_start >= block_group->start + block_group->length)
			return 0;

		extent_start = max(extent_start, start);
		extent_end = min(block_group->start + block_group->length,
				 extent_end + 1);
		len = extent_end - extent_start;

		*entries += 1;
		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
		if (ret)
			return -ENOSPC;

		start = extent_end;
	}

	return 0;
}

static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;
	int ret;

	/* Write out the bitmaps */
	list_for_each_entry_safe(entry, next, bitmap_list, list) {
		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
		if (ret)
			return -ENOSPC;
		list_del_init(&entry->list);
	}

	return 0;
}

static int flush_dirty_cache(struct inode *inode)
{
	int ret;

	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret)
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DELALLOC, NULL);

	return ret;
}

static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
	struct btrfs_free_space *entry, *next;

	list_for_each_entry_safe(entry, next, bitmap_list, list)
		list_del_init(&entry->list);
}

static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
			   struct btrfs_io_ctl *io_ctl,
			   struct extent_state **cached_state)
{
	io_ctl_drop_pages(io_ctl);
	unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
		      cached_state);
}

static int __btrfs_wait_cache_io(struct btrfs_root *root,
				 struct btrfs_trans_handle *trans,
				 struct btrfs_block_group *block_group,
				 struct btrfs_io_ctl *io_ctl,
				 struct btrfs_path *path, u64 offset)
{
	int ret;
	struct inode *inode = io_ctl->inode;

	if (!inode)
		return 0;

	/* Flush the dirty pages in the cache file. */
	ret = flush_dirty_cache(inode);
	if (ret)
		goto out;

	/* Update the cache item to tell everyone this cache file is valid. */
	ret = update_cache_item(trans, root, inode, path, offset,
				io_ctl->entries, io_ctl->bitmaps);
out:
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
		if (block_group)
			btrfs_debug(root->fs_info,
				    "failed to write free space cache for block group %llu error %d",
				    block_group->start, ret);
	}
	btrfs_update_inode(trans, BTRFS_I(inode));

	if (block_group) {
		/* the dirty list is protected by the dirty_bgs_lock */
		spin_lock(&trans->transaction->dirty_bgs_lock);

		/* the disk_cache_state is protected by the block group lock */
		spin_lock(&block_group->lock);

		/*
		 * only mark this as written if we didn't get put back on
		 * the dirty list while waiting for IO.  Otherwise our
		 * cache state won't be right, and we won't get written again
		 */
		if (!ret && list_empty(&block_group->dirty_list))
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		else if (ret)
			block_group->disk_cache_state = BTRFS_DC_ERROR;

		spin_unlock(&block_group->lock);
		spin_unlock(&trans->transaction->dirty_bgs_lock);
		io_ctl->inode = NULL;
		iput(inode);
	}

	return ret;

}

int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
			struct btrfs_block_group *block_group,
			struct btrfs_path *path)
{
	return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
				     block_group, &block_group->io_ctl,
				     path, block_group->start);
}

/*
 * Write out cached info to an inode.
 *
 * @inode:       freespace inode we are writing out
 * @ctl:         free space cache we are going to write out
 * @block_group: block_group for this cache if it belongs to a block_group
 * @io_ctl:      holds context for the io
 * @trans:       the trans handle
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount.  This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 */
static int __btrfs_write_out_cache(struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_block_group *block_group,
				   struct btrfs_io_ctl *io_ctl,
				   struct btrfs_trans_handle *trans)
{
	struct extent_state *cached_state = NULL;
	LIST_HEAD(bitmap_list);
	int entries = 0;
	int bitmaps = 0;
	int ret;
	int must_iput = 0;

	if (!i_size_read(inode))
		return -EIO;

	WARN_ON(io_ctl->pages);
	ret = io_ctl_init(io_ctl, inode, 1);
	if (ret)
		return ret;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
		down_write(&block_group->data_rwsem);
		spin_lock(&block_group->lock);
		if (block_group->delalloc_bytes) {
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
			spin_unlock(&block_group->lock);
			up_write(&block_group->data_rwsem);
			BTRFS_I(inode)->generation = 0;
			ret = 0;
			must_iput = 1;
			goto out;
		}
		spin_unlock(&block_group->lock);
	}

	/* Lock all pages first so we can lock the extent safely. */
	ret = io_ctl_prepare_pages(io_ctl, false);
	if (ret)
		goto out_unlock;

	lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
		    &cached_state);

	io_ctl_set_generation(io_ctl, trans->transid);

	mutex_lock(&ctl->cache_writeout_mutex);
	/* Write out the extent entries in the free space cache */
	spin_lock(&ctl->tree_lock);
	ret = write_cache_extent_entries(io_ctl, ctl,
					 block_group, &entries, &bitmaps,
					 &bitmap_list);
	if (ret)
		goto out_nospc_locked;

	/*
	 * Some spaces that are freed in the current transaction are pinned,
	 * they will be added into free space cache after the transaction is
	 * committed, we shouldn't lose them.
	 *
	 * If this changes while we are working we'll get added back to
	 * the dirty list and redo it.  No locking needed
	 */
	ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
	if (ret)
		goto out_nospc_locked;

	/*
	 * At last, we write out all the bitmaps and keep cache_writeout_mutex
	 * locked while doing it because a concurrent trim can be manipulating
	 * or freeing the bitmap.
	 */
	ret = write_bitmap_entries(io_ctl, &bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);
	if (ret)
		goto out_nospc;

	/* Zero out the rest of the pages just to make sure */
	io_ctl_zero_remaining_pages(io_ctl);

	/* Everything is written out, now we dirty the pages in the file. */
	ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
				io_ctl->num_pages, 0, i_size_read(inode),
				&cached_state, false);
	if (ret)
		goto out_nospc;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);
	/*
	 * Release the pages and unlock the extent, we will flush
	 * them out later
	 */
	io_ctl_drop_pages(io_ctl);
	io_ctl_free(io_ctl);

	unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
		      &cached_state);

	/*
	 * at this point the pages are under IO and we're happy,
	 * The caller is responsible for waiting on them and updating
	 * the cache and the inode
	 */
	io_ctl->entries = entries;
	io_ctl->bitmaps = bitmaps;

	ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
	if (ret)
		goto out;

	return 0;

out_nospc_locked:
	cleanup_bitmap_list(&bitmap_list);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

out_nospc:
	cleanup_write_cache_enospc(inode, io_ctl, &cached_state);

out_unlock:
	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		up_write(&block_group->data_rwsem);

out:
	io_ctl->inode = NULL;
	io_ctl_free(io_ctl);
	if (ret) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
	}
	btrfs_update_inode(trans, BTRFS_I(inode));
	if (must_iput)
		iput(inode);
	return ret;
}

int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
			  struct btrfs_block_group *block_group,
			  struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct inode *inode;
	int ret = 0;

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode))
		return 0;

	ret = __btrfs_write_out_cache(inode, ctl, block_group,
				      &block_group->io_ctl, trans);
	if (ret) {
		btrfs_debug(fs_info,
			    "failed to write free space cache for block group %llu error %d",
			    block_group->start, ret);
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);

		block_group->io_ctl.inode = NULL;
		iput(inode);
	}

	/*
	 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
	 * to wait for IO and put the inode
	 */

	return ret;
}

static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
					  u64 offset)
{
	ASSERT(offset >= bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
	return (unsigned long)(div_u64(bytes, unit));
}

static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += ctl->start;

	return bitmap_start;
}
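
/*
 * Worked example of the helpers above, assuming a 4KiB ctl->unit and 4KiB
 * pages (so one bitmap covers 128MiB): for ctl->start = 1GiB and
 * offset = 1GiB + 130MiB, offset_to_bitmap() rounds down to
 * bitmap_start = 1GiB + 128MiB, and offset_to_bit(bitmap_start, 4KiB, offset)
 * returns 2MiB / 4KiB = 512, the bit index inside that bitmap.
 */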

static int tree_insert_offset(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_cluster *cluster,
			      struct btrfs_free_space *new_entry)
{
	struct rb_root *root;
	struct rb_node **p;
	struct rb_node *parent = NULL;

	lockdep_assert_held(&ctl->tree_lock);

	if (cluster) {
		lockdep_assert_held(&cluster->lock);
		root = &cluster->root;
	} else {
		root = &ctl->free_space_offset;
	}

	p = &root->rb_node;

	while (*p) {
		struct btrfs_free_space *info;

		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (new_entry->offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (new_entry->offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset.  If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap.  So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically.  If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
			if (new_entry->bitmap) {
				if (info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_right;
			} else {
				if (!info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_left;
			}
		}
	}

	rb_link_node(&new_entry->offset_index, parent, p);
	rb_insert_color(&new_entry->offset_index, root);

	return 0;
}

/*
 * This is a little subtle.  We *only* have ->max_extent_size set if we actually
 * searched through the bitmap and figured out the largest ->max_extent_size,
 * otherwise it's 0.  In the case that it's 0 we don't want to tell the
 * allocator the wrong thing, we want to use the actual real max_extent_size
 * we've found already if it's larger, or we want to use ->bytes.
 *
 * This matters because find_free_space() will skip entries whose ->bytes is
 * less than the required bytes.  So if we didn't search down this bitmap, we
 * may pick some previous entry that has a smaller ->max_extent_size than we
 * have.  For example, assume we have two entries, one that has
 * ->max_extent_size set to 4K and ->bytes set to 1M.  A second entry hasn't set
 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.  We will
 * call into find_free_space(), and return with max_extent_size == 4K, because
 * that first bitmap entry had ->max_extent_size set, but the second one did
 * not.  If instead we returned 8K we'd come in searching for 8K, and find the
 * 8K contiguous range.
 *
 * Consider the other case, we have 2 8K chunks in that second entry and still
 * don't have ->max_extent_size set.  We'll return 16K, and the next time the
 * allocator comes in it'll fully search our second bitmap, and this time it'll
 * get an uptodate value of 8K as the maximum chunk size.  Then we'll get the
 * right allocation the next loop through.
 */
static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
{
	if (entry->bitmap && entry->max_extent_size)
		return entry->max_extent_size;
	return entry->bytes;
}

/*
 * We want the largest entry to be leftmost, so this is inverted from what you'd
 * normally expect.
 */
static bool entry_less(struct rb_node *node, const struct rb_node *parent)
{
	const struct btrfs_free_space *entry, *exist;

	entry = rb_entry(node, struct btrfs_free_space, bytes_index);
	exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
	return get_max_extent_size(exist) < get_max_extent_size(entry);
}

/*
 * searches the tree for the given offset.
 *
 * fuzzy - If this is set, then we are trying to make an allocation, and we just
 * want a section that has at least bytes size and comes at or after the given
 * offset.
 */
static struct btrfs_free_space *
tree_search_offset(struct btrfs_free_space_ctl *ctl,
		   u64 offset, int bitmap_only, int fuzzy)
{
	struct rb_node *n = ctl->free_space_offset.rb_node;
	struct btrfs_free_space *entry = NULL, *prev = NULL;

	lockdep_assert_held(&ctl->tree_lock);

	/* find entry that is closest to the 'offset' */
	while (n) {
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		prev = entry;

		if (offset < entry->offset)
			n = n->rb_left;
		else if (offset > entry->offset)
			n = n->rb_right;
		else
			break;

		entry = NULL;
	}

	if (bitmap_only) {
		if (!entry)
			return NULL;
		if (entry->bitmap)
			return entry;

		/*
		 * bitmap entry and extent entry may share same offset,
		 * in that case, bitmap entry comes after extent entry.
		 */
		n = rb_next(n);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->offset != offset)
			return NULL;

		WARN_ON(!entry->bitmap);
		return entry;
	} else if (entry) {
		if (entry->bitmap) {
			/*
			 * if previous extent entry covers the offset,
			 * we should return it instead of the bitmap entry
			 */
			n = rb_prev(&entry->offset_index);
			if (n) {
				prev = rb_entry(n, struct btrfs_free_space,
						offset_index);
				if (!prev->bitmap &&
				    prev->offset + prev->bytes > offset)
					entry = prev;
			}
		}
		return entry;
	}

	if (!prev)
		return NULL;

	/* find last entry before the 'offset' */
	entry = prev;
	if (entry->offset > offset) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			entry = rb_entry(n, struct btrfs_free_space,
					 offset_index);
			ASSERT(entry->offset <= offset);
		} else {
			if (fuzzy)
				return entry;
			else
				return NULL;
		}
	}

	if (entry->bitmap) {
		n = rb_prev(&entry->offset_index);
		if (n) {
			prev = rb_entry(n, struct btrfs_free_space,
					offset_index);
			if (!prev->bitmap &&
			    prev->offset + prev->bytes > offset)
				return prev;
		}
		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
			return entry;
	} else if (entry->offset + entry->bytes > offset)
		return entry;

	if (!fuzzy)
		return NULL;

	while (1) {
		n = rb_next(&entry->offset_index);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
		if (entry->bitmap) {
			if (entry->offset + BITS_PER_BITMAP *
			    ctl->unit > offset)
				break;
		} else {
			if (entry->offset + entry->bytes > offset)
				break;
		}
	}
	return entry;
}

static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     bool update_stat)
{
	lockdep_assert_held(&ctl->tree_lock);

	rb_erase(&info->offset_index, &ctl->free_space_offset);
	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
	ctl->free_extents--;

	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR]--;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
	}

	if (update_stat)
		ctl->free_space -= info->bytes;
}

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info)
{
	int ret = 0;

	lockdep_assert_held(&ctl->tree_lock);

	ASSERT(info->bytes || info->bitmap);
	ret = tree_insert_offset(ctl, NULL, info);
	if (ret)
		return ret;

	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);

	if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR]++;
		ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
	}

	ctl->free_space += info->bytes;
	ctl->free_extents++;
	return ret;
}

static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
				struct btrfs_free_space *info)
{
	ASSERT(info->bitmap);

	/*
	 * If our entry is empty it's because we're on a cluster and we don't
	 * want to re-link it into our ctl bytes index.
	 */
	if (RB_EMPTY_NODE(&info->bytes_index))
		return;

	lockdep_assert_held(&ctl->tree_lock);

	rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
	rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
}

static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
				     struct btrfs_free_space *info,
				     u64 offset, u64 bytes, bool update_stat)
{
	unsigned long start, count, end;
	int extent_delta = -1;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);

	bitmap_clear(info->bitmap, start, count);

	info->bytes -= bytes;
	if (info->max_extent_size > ctl->unit)
		info->max_extent_size = 0;

	relink_bitmap_entry(ctl, info);

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta++;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta++;

	info->bitmap_extents += extent_delta;
	if (!btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
	}

	if (update_stat)
		ctl->free_space -= bytes;
}

static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
{
	unsigned long start, count, end;
	int extent_delta = 1;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	end = start + count;
	ASSERT(end <= BITS_PER_BITMAP);

	bitmap_set(info->bitmap, start, count);

	/*
	 * We set some bytes, we have no idea what the max extent size is
	 * anymore.
	 */
	info->max_extent_size = 0;
	info->bytes += bytes;
	ctl->free_space += bytes;

	relink_bitmap_entry(ctl, info);

	if (start && test_bit(start - 1, info->bitmap))
		extent_delta--;

	if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
		extent_delta--;

	info->bitmap_extents += extent_delta;
	if (!btrfs_free_space_trimmed(info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
		ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
	}
}
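
/*
 * Example of the extent_delta accounting used by bitmap_clear_bits() and
 * bitmap_set_bits() above (illustrative): clearing a bit in the middle of a
 * set run, e.g. 1111111 -> 1110111, leaves both neighbouring bits set, so
 * extent_delta = -1 + 1 + 1 = +1 and one free extent becomes two.  Clearing
 * an isolated run removes a whole extent (delta -1), and clearing at either
 * edge of a run keeps the extent count unchanged (delta 0).  Setting bits is
 * the mirror image, starting from +1 and subtracting one per set neighbour.
 */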
1970 */ 1971 if (for_alloc && 1972 bitmap_info->max_extent_size && 1973 bitmap_info->max_extent_size < *bytes) { 1974 *bytes = bitmap_info->max_extent_size; 1975 return -1; 1976 } 1977 1978 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1979 max_t(u64, *offset, bitmap_info->offset)); 1980 bits = bytes_to_bits(*bytes, ctl->unit); 1981 1982 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1983 if (for_alloc && bits == 1) { 1984 found_bits = 1; 1985 break; 1986 } 1987 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1988 BITS_PER_BITMAP, i); 1989 extent_bits = next_zero - i; 1990 if (extent_bits >= bits) { 1991 found_bits = extent_bits; 1992 break; 1993 } else if (extent_bits > max_bits) { 1994 max_bits = extent_bits; 1995 } 1996 i = next_zero; 1997 } 1998 1999 if (found_bits) { 2000 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 2001 *bytes = (u64)(found_bits) * ctl->unit; 2002 return 0; 2003 } 2004 2005 *bytes = (u64)(max_bits) * ctl->unit; 2006 bitmap_info->max_extent_size = *bytes; 2007 relink_bitmap_entry(ctl, bitmap_info); 2008 return -1; 2009 } 2010 2011 /* Cache the size of the max extent in bytes */ 2012 static struct btrfs_free_space * 2013 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2014 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2015 { 2016 struct btrfs_free_space *entry; 2017 struct rb_node *node; 2018 u64 tmp; 2019 u64 align_off; 2020 int ret; 2021 2022 if (!ctl->free_space_offset.rb_node) 2023 goto out; 2024 again: 2025 if (use_bytes_index) { 2026 node = rb_first_cached(&ctl->free_space_bytes); 2027 } else { 2028 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2029 0, 1); 2030 if (!entry) 2031 goto out; 2032 node = &entry->offset_index; 2033 } 2034 2035 for (; node; node = rb_next(node)) { 2036 if (use_bytes_index) 2037 entry = rb_entry(node, struct btrfs_free_space, 2038 bytes_index); 2039 else 2040 entry = rb_entry(node, struct btrfs_free_space, 2041 offset_index); 2042 2043 /* 2044 * If we are using the bytes index then all subsequent entries 2045 * in this tree are going to be < bytes, so simply set the max 2046 * extent size and exit the loop. 2047 * 2048 * If we're using the offset index then we need to keep going 2049 * through the rest of the tree. 2050 */ 2051 if (entry->bytes < *bytes) { 2052 *max_extent_size = max(get_max_extent_size(entry), 2053 *max_extent_size); 2054 if (use_bytes_index) 2055 break; 2056 continue; 2057 } 2058 2059 /* make sure the space returned is big enough 2060 * to match our requested alignment 2061 */ 2062 if (*bytes >= align) { 2063 tmp = entry->offset - ctl->start + align - 1; 2064 tmp = div64_u64(tmp, align); 2065 tmp = tmp * align + ctl->start; 2066 align_off = tmp - entry->offset; 2067 } else { 2068 align_off = 0; 2069 tmp = entry->offset; 2070 } 2071 2072 /* 2073 * We don't break here if we're using the bytes index because we 2074 * may have another entry that has the correct alignment that is 2075 * the right size, so we don't want to miss that possibility. 2076 * At worst this adds another loop through the logic, but if we 2077 * broke here we could prematurely ENOSPC. 
2078 */ 2079 if (entry->bytes < *bytes + align_off) { 2080 *max_extent_size = max(get_max_extent_size(entry), 2081 *max_extent_size); 2082 continue; 2083 } 2084 2085 if (entry->bitmap) { 2086 struct rb_node *old_next = rb_next(node); 2087 u64 size = *bytes; 2088 2089 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2090 if (!ret) { 2091 *offset = tmp; 2092 *bytes = size; 2093 return entry; 2094 } else { 2095 *max_extent_size = 2096 max(get_max_extent_size(entry), 2097 *max_extent_size); 2098 } 2099 2100 /* 2101 * The bitmap may have gotten re-arranged in the space 2102 * index here because the max_extent_size may have been 2103 * updated. Start from the beginning again if this 2104 * happened. 2105 */ 2106 if (use_bytes_index && old_next != rb_next(node)) 2107 goto again; 2108 continue; 2109 } 2110 2111 *offset = tmp; 2112 *bytes = entry->bytes - align_off; 2113 return entry; 2114 } 2115 out: 2116 return NULL; 2117 } 2118 2119 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2120 struct btrfs_free_space *info, u64 offset) 2121 { 2122 info->offset = offset_to_bitmap(ctl, offset); 2123 info->bytes = 0; 2124 info->bitmap_extents = 0; 2125 INIT_LIST_HEAD(&info->list); 2126 link_free_space(ctl, info); 2127 ctl->total_bitmaps++; 2128 recalculate_thresholds(ctl); 2129 } 2130 2131 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2132 struct btrfs_free_space *bitmap_info) 2133 { 2134 /* 2135 * Normally when this is called, the bitmap is completely empty. However, 2136 * if we are blowing up the free space cache for one reason or another 2137 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2138 * we may leave stats on the table. 2139 */ 2140 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2141 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2142 bitmap_info->bitmap_extents; 2143 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2144 2145 } 2146 unlink_free_space(ctl, bitmap_info, true); 2147 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2148 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2149 ctl->total_bitmaps--; 2150 recalculate_thresholds(ctl); 2151 } 2152 2153 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2154 struct btrfs_free_space *bitmap_info, 2155 u64 *offset, u64 *bytes) 2156 { 2157 u64 end; 2158 u64 search_start, search_bytes; 2159 int ret; 2160 2161 again: 2162 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 2163 2164 /* 2165 * We need to search for bits in this bitmap. We could only cover some 2166 * of the extent in this bitmap thanks to how we add space, so we need 2167 * to search for as much of it as we can and clear that amount, and then 2168 * go searching for the next bit.
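 * (a single bitmap entry covers BITS_PER_BITMAP * ctl->unit bytes, e.g. 128MiB with 4K pages and a 4K sectorsize, so a large removal may have to walk into the following bitmap entries.)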
2169 */ 2170 search_start = *offset; 2171 search_bytes = ctl->unit; 2172 search_bytes = min(search_bytes, end - search_start + 1); 2173 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2174 false); 2175 if (ret < 0 || search_start != *offset) 2176 return -EINVAL; 2177 2178 /* We may have found more bits than what we need */ 2179 search_bytes = min(search_bytes, *bytes); 2180 2181 /* Cannot clear past the end of the bitmap */ 2182 search_bytes = min(search_bytes, end - search_start + 1); 2183 2184 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2185 *offset += search_bytes; 2186 *bytes -= search_bytes; 2187 2188 if (*bytes) { 2189 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2190 if (!bitmap_info->bytes) 2191 free_bitmap(ctl, bitmap_info); 2192 2193 /* 2194 * no entry after this bitmap, but we still have bytes to 2195 * remove, so something has gone wrong. 2196 */ 2197 if (!next) 2198 return -EINVAL; 2199 2200 bitmap_info = rb_entry(next, struct btrfs_free_space, 2201 offset_index); 2202 2203 /* 2204 * if the next entry isn't a bitmap we need to return to let the 2205 * extent stuff do its work. 2206 */ 2207 if (!bitmap_info->bitmap) 2208 return -EAGAIN; 2209 2210 /* 2211 * Ok the next item is a bitmap, but it may not actually hold 2212 * the information for the rest of this free space stuff, so 2213 * look for it, and if we don't find it return so we can try 2214 * everything over again. 2215 */ 2216 search_start = *offset; 2217 search_bytes = ctl->unit; 2218 ret = search_bitmap(ctl, bitmap_info, &search_start, 2219 &search_bytes, false); 2220 if (ret < 0 || search_start != *offset) 2221 return -EAGAIN; 2222 2223 goto again; 2224 } else if (!bitmap_info->bytes) 2225 free_bitmap(ctl, bitmap_info); 2226 2227 return 0; 2228 } 2229 2230 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2231 struct btrfs_free_space *info, u64 offset, 2232 u64 bytes, enum btrfs_trim_state trim_state) 2233 { 2234 u64 bytes_to_set = 0; 2235 u64 end; 2236 2237 /* 2238 * This is a tradeoff to make bitmap trim state minimal. We mark the 2239 * whole bitmap untrimmed if at any point we add untrimmed regions. 2240 */ 2241 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2242 if (btrfs_free_space_trimmed(info)) { 2243 ctl->discardable_extents[BTRFS_STAT_CURR] += 2244 info->bitmap_extents; 2245 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2246 } 2247 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2248 } 2249 2250 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2251 2252 bytes_to_set = min(end - offset, bytes); 2253 2254 bitmap_set_bits(ctl, info, offset, bytes_to_set); 2255 2256 return bytes_to_set; 2257 2258 } 2259 2260 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2261 struct btrfs_free_space *info) 2262 { 2263 struct btrfs_block_group *block_group = ctl->block_group; 2264 struct btrfs_fs_info *fs_info = block_group->fs_info; 2265 bool forced = false; 2266 2267 #ifdef CONFIG_BTRFS_DEBUG 2268 if (btrfs_should_fragment_free_space(block_group)) 2269 forced = true; 2270 #endif 2271 2272 /* This is a way to reclaim large regions from the bitmaps. 
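 * Unless fragmentation is being forced for debugging, extents of at least FORCE_EXTENT_THRESHOLD (1M) are kept as extent entries even when we are over the extents threshold.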
*/ 2273 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) 2274 return false; 2275 2276 /* 2277 * If we are below the extents threshold then we can add this as an 2278 * extent, and don't have to deal with the bitmap. 2279 */ 2280 if (!forced && ctl->free_extents < ctl->extents_thresh) { 2281 /* 2282 * If this block group has some small extents we don't want to 2283 * use up all of our free slots in the cache with them, we want 2284 * to reserve them for larger extents; however, if we have plenty 2285 * of cache left then go ahead and add them, no sense in adding 2286 * the overhead of a bitmap if we don't have to. 2287 */ 2288 if (info->bytes <= fs_info->sectorsize * 8) { 2289 if (ctl->free_extents * 3 <= ctl->extents_thresh) 2290 return false; 2291 } else { 2292 return false; 2293 } 2294 } 2295 2296 /* 2297 * The original block groups from mkfs can be really small, like 8 2298 * megabytes, so don't bother with a bitmap for those entries. However, 2299 * some block groups can be smaller than what a bitmap would cover but 2300 * are still large enough that they could overflow the 32k memory limit, 2301 * so still allow those block groups to have a bitmap 2302 * entry. 2303 */ 2304 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) 2305 return false; 2306 2307 return true; 2308 } 2309 2310 static const struct btrfs_free_space_op free_space_op = { 2311 .use_bitmap = use_bitmap, 2312 }; 2313 2314 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 2315 struct btrfs_free_space *info) 2316 { 2317 struct btrfs_free_space *bitmap_info; 2318 struct btrfs_block_group *block_group = NULL; 2319 int added = 0; 2320 u64 bytes, offset, bytes_added; 2321 enum btrfs_trim_state trim_state; 2322 int ret; 2323 2324 bytes = info->bytes; 2325 offset = info->offset; 2326 trim_state = info->trim_state; 2327 2328 if (!ctl->op->use_bitmap(ctl, info)) 2329 return 0; 2330 2331 if (ctl->op == &free_space_op) 2332 block_group = ctl->block_group; 2333 again: 2334 /* 2335 * Since we link bitmaps right into the cluster we need to see if we 2336 * have a cluster here, and if so, whether it has our bitmap, in which case we add 2337 * the free space to that bitmap.
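 * Such a bitmap lives in the cluster's private rbtree rather than in ctl->free_space_offset, so the tree_search_offset() call below would not find it.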
2338 */ 2339 if (block_group && !list_empty(&block_group->cluster_list)) { 2340 struct btrfs_free_cluster *cluster; 2341 struct rb_node *node; 2342 struct btrfs_free_space *entry; 2343 2344 cluster = list_entry(block_group->cluster_list.next, 2345 struct btrfs_free_cluster, 2346 block_group_list); 2347 spin_lock(&cluster->lock); 2348 node = rb_first(&cluster->root); 2349 if (!node) { 2350 spin_unlock(&cluster->lock); 2351 goto no_cluster_bitmap; 2352 } 2353 2354 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2355 if (!entry->bitmap) { 2356 spin_unlock(&cluster->lock); 2357 goto no_cluster_bitmap; 2358 } 2359 2360 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2361 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2362 bytes, trim_state); 2363 bytes -= bytes_added; 2364 offset += bytes_added; 2365 } 2366 spin_unlock(&cluster->lock); 2367 if (!bytes) { 2368 ret = 1; 2369 goto out; 2370 } 2371 } 2372 2373 no_cluster_bitmap: 2374 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2375 1, 0); 2376 if (!bitmap_info) { 2377 ASSERT(added == 0); 2378 goto new_bitmap; 2379 } 2380 2381 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2382 trim_state); 2383 bytes -= bytes_added; 2384 offset += bytes_added; 2385 added = 0; 2386 2387 if (!bytes) { 2388 ret = 1; 2389 goto out; 2390 } else 2391 goto again; 2392 2393 new_bitmap: 2394 if (info && info->bitmap) { 2395 add_new_bitmap(ctl, info, offset); 2396 added = 1; 2397 info = NULL; 2398 goto again; 2399 } else { 2400 spin_unlock(&ctl->tree_lock); 2401 2402 /* no pre-allocated info, allocate a new one */ 2403 if (!info) { 2404 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2405 GFP_NOFS); 2406 if (!info) { 2407 spin_lock(&ctl->tree_lock); 2408 ret = -ENOMEM; 2409 goto out; 2410 } 2411 } 2412 2413 /* allocate the bitmap */ 2414 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2415 GFP_NOFS); 2416 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2417 spin_lock(&ctl->tree_lock); 2418 if (!info->bitmap) { 2419 ret = -ENOMEM; 2420 goto out; 2421 } 2422 goto again; 2423 } 2424 2425 out: 2426 if (info) { 2427 if (info->bitmap) 2428 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2429 info->bitmap); 2430 kmem_cache_free(btrfs_free_space_cachep, info); 2431 } 2432 2433 return ret; 2434 } 2435 2436 /* 2437 * Free space merging rules: 2438 * 1) Merge trimmed areas together 2439 * 2) Let untrimmed areas coalesce with trimmed areas 2440 * 3) Always pull neighboring regions from bitmaps 2441 * 2442 * The above rules are for when we merge free space based on btrfs_trim_state. 2443 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2444 * same reason: to promote larger extent regions which makes life easier for 2445 * find_free_extent(). Rule 2 enables coalescing based on the common path 2446 * being returning free space from btrfs_finish_extent_commit(). So when free 2447 * space is trimmed, it will prevent aggregating trimmed new region and 2448 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2449 * and provide find_free_extent() with the largest extents possible hoping for 2450 * the reuse path. 
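 * Put differently: an untrimmed entry being added may absorb trimmed neighbours and the merged result stays untrimmed, while a trimmed entry only merges with neighbours that are themselves trimmed.
 * E.g. adding an untrimmed [8K, 12K) next to a trimmed [4K, 8K) yields one untrimmed [4K, 12K) entry, whereas adding a trimmed [8K, 12K) next to an untrimmed [4K, 8K) leaves the two entries separate.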
2451 */ 2452 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2453 struct btrfs_free_space *info, bool update_stat) 2454 { 2455 struct btrfs_free_space *left_info = NULL; 2456 struct btrfs_free_space *right_info; 2457 bool merged = false; 2458 u64 offset = info->offset; 2459 u64 bytes = info->bytes; 2460 const bool is_trimmed = btrfs_free_space_trimmed(info); 2461 struct rb_node *right_prev = NULL; 2462 2463 /* 2464 * first we want to see if there is free space adjacent to the range we 2465 * are adding, if there is remove that struct and add a new one to 2466 * cover the entire range 2467 */ 2468 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2469 if (right_info) 2470 right_prev = rb_prev(&right_info->offset_index); 2471 2472 if (right_prev) 2473 left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); 2474 else if (!right_info) 2475 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2476 2477 /* See try_merge_free_space() comment. */ 2478 if (right_info && !right_info->bitmap && 2479 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2480 unlink_free_space(ctl, right_info, update_stat); 2481 info->bytes += right_info->bytes; 2482 kmem_cache_free(btrfs_free_space_cachep, right_info); 2483 merged = true; 2484 } 2485 2486 /* See try_merge_free_space() comment. */ 2487 if (left_info && !left_info->bitmap && 2488 left_info->offset + left_info->bytes == offset && 2489 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2490 unlink_free_space(ctl, left_info, update_stat); 2491 info->offset = left_info->offset; 2492 info->bytes += left_info->bytes; 2493 kmem_cache_free(btrfs_free_space_cachep, left_info); 2494 merged = true; 2495 } 2496 2497 return merged; 2498 } 2499 2500 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2501 struct btrfs_free_space *info, 2502 bool update_stat) 2503 { 2504 struct btrfs_free_space *bitmap; 2505 unsigned long i; 2506 unsigned long j; 2507 const u64 end = info->offset + info->bytes; 2508 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2509 u64 bytes; 2510 2511 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2512 if (!bitmap) 2513 return false; 2514 2515 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2516 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2517 if (j == i) 2518 return false; 2519 bytes = (j - i) * ctl->unit; 2520 info->bytes += bytes; 2521 2522 /* See try_merge_free_space() comment. */ 2523 if (!btrfs_free_space_trimmed(bitmap)) 2524 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2525 2526 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2527 2528 if (!bitmap->bytes) 2529 free_bitmap(ctl, bitmap); 2530 2531 return true; 2532 } 2533 2534 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2535 struct btrfs_free_space *info, 2536 bool update_stat) 2537 { 2538 struct btrfs_free_space *bitmap; 2539 u64 bitmap_offset; 2540 unsigned long i; 2541 unsigned long j; 2542 unsigned long prev_j; 2543 u64 bytes; 2544 2545 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2546 /* If we're on a boundary, try the previous logical bitmap. 
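 * (a bitmap that starts exactly at info->offset cannot describe any free space in front of it)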
*/ 2547 if (bitmap_offset == info->offset) { 2548 if (info->offset == 0) 2549 return false; 2550 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2551 } 2552 2553 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2554 if (!bitmap) 2555 return false; 2556 2557 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2558 j = 0; 2559 prev_j = (unsigned long)-1; 2560 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2561 if (j > i) 2562 break; 2563 prev_j = j; 2564 } 2565 if (prev_j == i) 2566 return false; 2567 2568 if (prev_j == (unsigned long)-1) 2569 bytes = (i + 1) * ctl->unit; 2570 else 2571 bytes = (i - prev_j) * ctl->unit; 2572 2573 info->offset -= bytes; 2574 info->bytes += bytes; 2575 2576 /* See try_merge_free_space() comment. */ 2577 if (!btrfs_free_space_trimmed(bitmap)) 2578 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2579 2580 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2581 2582 if (!bitmap->bytes) 2583 free_bitmap(ctl, bitmap); 2584 2585 return true; 2586 } 2587 2588 /* 2589 * We prefer always to allocate from extent entries, both for clustered and 2590 * non-clustered allocation requests. So when attempting to add a new extent 2591 * entry, try to see if there's adjacent free space in bitmap entries, and if 2592 * there is, migrate that space from the bitmaps to the extent. 2593 * Like this we get better chances of satisfying space allocation requests 2594 * because we attempt to satisfy them based on a single cache entry, and never 2595 * on 2 or more entries - even if the entries represent a contiguous free space 2596 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2597 * ends). 2598 */ 2599 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2600 struct btrfs_free_space *info, 2601 bool update_stat) 2602 { 2603 /* 2604 * Only work with disconnected entries, as we can change their offset, 2605 * and must be extent entries. 
2606 */ 2607 ASSERT(!info->bitmap); 2608 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2609 2610 if (ctl->total_bitmaps > 0) { 2611 bool stole_end; 2612 bool stole_front = false; 2613 2614 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2615 if (ctl->total_bitmaps > 0) 2616 stole_front = steal_from_bitmap_to_front(ctl, info, 2617 update_stat); 2618 2619 if (stole_end || stole_front) 2620 try_merge_free_space(ctl, info, update_stat); 2621 } 2622 } 2623 2624 int __btrfs_add_free_space(struct btrfs_block_group *block_group, 2625 u64 offset, u64 bytes, 2626 enum btrfs_trim_state trim_state) 2627 { 2628 struct btrfs_fs_info *fs_info = block_group->fs_info; 2629 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2630 struct btrfs_free_space *info; 2631 int ret = 0; 2632 u64 filter_bytes = bytes; 2633 2634 ASSERT(!btrfs_is_zoned(fs_info)); 2635 2636 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2637 if (!info) 2638 return -ENOMEM; 2639 2640 info->offset = offset; 2641 info->bytes = bytes; 2642 info->trim_state = trim_state; 2643 RB_CLEAR_NODE(&info->offset_index); 2644 RB_CLEAR_NODE(&info->bytes_index); 2645 2646 spin_lock(&ctl->tree_lock); 2647 2648 if (try_merge_free_space(ctl, info, true)) 2649 goto link; 2650 2651 /* 2652 * There was no extent directly to the left or right of this new 2653 * extent then we know we're going to have to allocate a new extent, so 2654 * before we do that see if we need to drop this into a bitmap 2655 */ 2656 ret = insert_into_bitmap(ctl, info); 2657 if (ret < 0) { 2658 goto out; 2659 } else if (ret) { 2660 ret = 0; 2661 goto out; 2662 } 2663 link: 2664 /* 2665 * Only steal free space from adjacent bitmaps if we're sure we're not 2666 * going to add the new free space to existing bitmap entries - because 2667 * that would mean unnecessary work that would be reverted. Therefore 2668 * attempt to steal space from bitmaps if we're adding an extent entry. 
2669 */ 2670 steal_from_bitmap(ctl, info, true); 2671 2672 filter_bytes = max(filter_bytes, info->bytes); 2673 2674 ret = link_free_space(ctl, info); 2675 if (ret) 2676 kmem_cache_free(btrfs_free_space_cachep, info); 2677 out: 2678 btrfs_discard_update_discardable(block_group); 2679 spin_unlock(&ctl->tree_lock); 2680 2681 if (ret) { 2682 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2683 ASSERT(ret != -EEXIST); 2684 } 2685 2686 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2687 btrfs_discard_check_filter(block_group, filter_bytes); 2688 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2689 } 2690 2691 return ret; 2692 } 2693 2694 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2695 u64 bytenr, u64 size, bool used) 2696 { 2697 struct btrfs_space_info *sinfo = block_group->space_info; 2698 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2699 u64 offset = bytenr - block_group->start; 2700 u64 to_free, to_unusable; 2701 int bg_reclaim_threshold = 0; 2702 bool initial = (size == block_group->length); 2703 u64 reclaimable_unusable; 2704 2705 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2706 2707 if (!initial) 2708 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2709 2710 spin_lock(&ctl->tree_lock); 2711 if (!used) 2712 to_free = size; 2713 else if (initial) 2714 to_free = block_group->zone_capacity; 2715 else if (offset >= block_group->alloc_offset) 2716 to_free = size; 2717 else if (offset + size <= block_group->alloc_offset) 2718 to_free = 0; 2719 else 2720 to_free = offset + size - block_group->alloc_offset; 2721 to_unusable = size - to_free; 2722 2723 ctl->free_space += to_free; 2724 /* 2725 * If the block group is read-only, we should account freed space into 2726 * bytes_readonly. 2727 */ 2728 if (!block_group->ro) 2729 block_group->zone_unusable += to_unusable; 2730 spin_unlock(&ctl->tree_lock); 2731 if (!used) { 2732 spin_lock(&block_group->lock); 2733 block_group->alloc_offset -= size; 2734 spin_unlock(&block_group->lock); 2735 } 2736 2737 reclaimable_unusable = block_group->zone_unusable - 2738 (block_group->length - block_group->zone_capacity); 2739 /* All the region is now unusable. Mark it as unused and reclaim */ 2740 if (block_group->zone_unusable == block_group->length) { 2741 btrfs_mark_bg_unused(block_group); 2742 } else if (bg_reclaim_threshold && 2743 reclaimable_unusable >= 2744 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2745 btrfs_mark_bg_to_reclaim(block_group); 2746 } 2747 2748 return 0; 2749 } 2750 2751 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2752 u64 bytenr, u64 size) 2753 { 2754 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2755 2756 if (btrfs_is_zoned(block_group->fs_info)) 2757 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2758 true); 2759 2760 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2761 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2762 2763 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2764 } 2765 2766 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2767 u64 bytenr, u64 size) 2768 { 2769 if (btrfs_is_zoned(block_group->fs_info)) 2770 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2771 false); 2772 2773 return btrfs_add_free_space(block_group, bytenr, size); 2774 } 2775 2776 /* 2777 * This is a subtle distinction because when adding free space back in general, 2778 * we want it to be added as untrimmed for async. 
But in the case where we add 2779 * it on loading of a block group, we want to consider it trimmed. 2780 */ 2781 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2782 u64 bytenr, u64 size) 2783 { 2784 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2785 2786 if (btrfs_is_zoned(block_group->fs_info)) 2787 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2788 true); 2789 2790 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2791 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2792 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2793 2794 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2795 } 2796 2797 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2798 u64 offset, u64 bytes) 2799 { 2800 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2801 struct btrfs_free_space *info; 2802 int ret; 2803 bool re_search = false; 2804 2805 if (btrfs_is_zoned(block_group->fs_info)) { 2806 /* 2807 * This can happen with conventional zones when replaying log. 2808 * Since the allocation info of tree-log nodes are not recorded 2809 * to the extent-tree, calculate_alloc_pointer() failed to 2810 * advance the allocation pointer after last allocated tree log 2811 * node blocks. 2812 * 2813 * This function is called from 2814 * btrfs_pin_extent_for_log_replay() when replaying the log. 2815 * Advance the pointer not to overwrite the tree-log nodes. 2816 */ 2817 if (block_group->start + block_group->alloc_offset < 2818 offset + bytes) { 2819 block_group->alloc_offset = 2820 offset + bytes - block_group->start; 2821 } 2822 return 0; 2823 } 2824 2825 spin_lock(&ctl->tree_lock); 2826 2827 again: 2828 ret = 0; 2829 if (!bytes) 2830 goto out_lock; 2831 2832 info = tree_search_offset(ctl, offset, 0, 0); 2833 if (!info) { 2834 /* 2835 * oops didn't find an extent that matched the space we wanted 2836 * to remove, look for a bitmap instead 2837 */ 2838 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2839 1, 0); 2840 if (!info) { 2841 /* 2842 * If we found a partial bit of our free space in a 2843 * bitmap but then couldn't find the other part this may 2844 * be a problem, so WARN about it. 
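 * (re_search is only set after remove_from_bitmap() returned -EAGAIN, i.e. part of the requested range was already cleared out of a bitmap.)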
2845 */ 2846 WARN_ON(re_search); 2847 goto out_lock; 2848 } 2849 } 2850 2851 re_search = false; 2852 if (!info->bitmap) { 2853 unlink_free_space(ctl, info, true); 2854 if (offset == info->offset) { 2855 u64 to_free = min(bytes, info->bytes); 2856 2857 info->bytes -= to_free; 2858 info->offset += to_free; 2859 if (info->bytes) { 2860 ret = link_free_space(ctl, info); 2861 WARN_ON(ret); 2862 } else { 2863 kmem_cache_free(btrfs_free_space_cachep, info); 2864 } 2865 2866 offset += to_free; 2867 bytes -= to_free; 2868 goto again; 2869 } else { 2870 u64 old_end = info->bytes + info->offset; 2871 2872 info->bytes = offset - info->offset; 2873 ret = link_free_space(ctl, info); 2874 WARN_ON(ret); 2875 if (ret) 2876 goto out_lock; 2877 2878 /* Not enough bytes in this entry to satisfy us */ 2879 if (old_end < offset + bytes) { 2880 bytes -= old_end - offset; 2881 offset = old_end; 2882 goto again; 2883 } else if (old_end == offset + bytes) { 2884 /* all done */ 2885 goto out_lock; 2886 } 2887 spin_unlock(&ctl->tree_lock); 2888 2889 ret = __btrfs_add_free_space(block_group, 2890 offset + bytes, 2891 old_end - (offset + bytes), 2892 info->trim_state); 2893 WARN_ON(ret); 2894 goto out; 2895 } 2896 } 2897 2898 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2899 if (ret == -EAGAIN) { 2900 re_search = true; 2901 goto again; 2902 } 2903 out_lock: 2904 btrfs_discard_update_discardable(block_group); 2905 spin_unlock(&ctl->tree_lock); 2906 out: 2907 return ret; 2908 } 2909 2910 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2911 u64 bytes) 2912 { 2913 struct btrfs_fs_info *fs_info = block_group->fs_info; 2914 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2915 struct btrfs_free_space *info; 2916 struct rb_node *n; 2917 int count = 0; 2918 2919 /* 2920 * Zoned btrfs does not use free space tree and cluster. Just print 2921 * out the free space after the allocation offset. 2922 */ 2923 if (btrfs_is_zoned(fs_info)) { 2924 btrfs_info(fs_info, "free space %llu active %d", 2925 block_group->zone_capacity - block_group->alloc_offset, 2926 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2927 &block_group->runtime_flags)); 2928 return; 2929 } 2930 2931 spin_lock(&ctl->tree_lock); 2932 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2933 info = rb_entry(n, struct btrfs_free_space, offset_index); 2934 if (info->bytes >= bytes && !block_group->ro) 2935 count++; 2936 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2937 info->offset, info->bytes, 2938 (info->bitmap) ? "yes" : "no"); 2939 } 2940 spin_unlock(&ctl->tree_lock); 2941 btrfs_info(fs_info, "block group has cluster?: %s", 2942 list_empty(&block_group->cluster_list) ? 
"no" : "yes"); 2943 btrfs_info(fs_info, 2944 "%d free space entries at or bigger than %llu bytes", 2945 count, bytes); 2946 } 2947 2948 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2949 struct btrfs_free_space_ctl *ctl) 2950 { 2951 struct btrfs_fs_info *fs_info = block_group->fs_info; 2952 2953 spin_lock_init(&ctl->tree_lock); 2954 ctl->unit = fs_info->sectorsize; 2955 ctl->start = block_group->start; 2956 ctl->block_group = block_group; 2957 ctl->op = &free_space_op; 2958 ctl->free_space_bytes = RB_ROOT_CACHED; 2959 INIT_LIST_HEAD(&ctl->trimming_ranges); 2960 mutex_init(&ctl->cache_writeout_mutex); 2961 2962 /* 2963 * we only want to have 32k of ram per block group for keeping 2964 * track of free space, and if we pass 1/2 of that we want to 2965 * start converting things over to using bitmaps 2966 */ 2967 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2968 } 2969 2970 /* 2971 * for a given cluster, put all of its extents back into the free 2972 * space cache. If the block group passed doesn't match the block group 2973 * pointed to by the cluster, someone else raced in and freed the 2974 * cluster already. In that case, we just return without changing anything 2975 */ 2976 static void __btrfs_return_cluster_to_free_space( 2977 struct btrfs_block_group *block_group, 2978 struct btrfs_free_cluster *cluster) 2979 { 2980 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2981 struct rb_node *node; 2982 2983 lockdep_assert_held(&ctl->tree_lock); 2984 2985 spin_lock(&cluster->lock); 2986 if (cluster->block_group != block_group) { 2987 spin_unlock(&cluster->lock); 2988 return; 2989 } 2990 2991 cluster->block_group = NULL; 2992 cluster->window_start = 0; 2993 list_del_init(&cluster->block_group_list); 2994 2995 node = rb_first(&cluster->root); 2996 while (node) { 2997 struct btrfs_free_space *entry; 2998 2999 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3000 node = rb_next(&entry->offset_index); 3001 rb_erase(&entry->offset_index, &cluster->root); 3002 RB_CLEAR_NODE(&entry->offset_index); 3003 3004 if (!entry->bitmap) { 3005 /* Merging treats extents as if they were new */ 3006 if (!btrfs_free_space_trimmed(entry)) { 3007 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3008 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 3009 entry->bytes; 3010 } 3011 3012 try_merge_free_space(ctl, entry, false); 3013 steal_from_bitmap(ctl, entry, false); 3014 3015 /* As we insert directly, update these statistics */ 3016 if (!btrfs_free_space_trimmed(entry)) { 3017 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3018 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3019 entry->bytes; 3020 } 3021 } 3022 tree_insert_offset(ctl, NULL, entry); 3023 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3024 entry_less); 3025 } 3026 cluster->root = RB_ROOT; 3027 spin_unlock(&cluster->lock); 3028 btrfs_put_block_group(block_group); 3029 } 3030 3031 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3032 { 3033 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3034 struct btrfs_free_cluster *cluster; 3035 struct list_head *head; 3036 3037 spin_lock(&ctl->tree_lock); 3038 while ((head = block_group->cluster_list.next) != 3039 &block_group->cluster_list) { 3040 cluster = list_entry(head, struct btrfs_free_cluster, 3041 block_group_list); 3042 3043 WARN_ON(cluster->block_group != block_group); 3044 __btrfs_return_cluster_to_free_space(block_group, cluster); 3045 3046 cond_resched_lock(&ctl->tree_lock); 3047 } 3048 
__btrfs_remove_free_space_cache(ctl); 3049 btrfs_discard_update_discardable(block_group); 3050 spin_unlock(&ctl->tree_lock); 3051 3052 } 3053 3054 /* 3055 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 3056 */ 3057 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3058 { 3059 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3060 struct btrfs_free_space *info; 3061 struct rb_node *node; 3062 bool ret = true; 3063 3064 spin_lock(&ctl->tree_lock); 3065 node = rb_first(&ctl->free_space_offset); 3066 3067 while (node) { 3068 info = rb_entry(node, struct btrfs_free_space, offset_index); 3069 3070 if (!btrfs_free_space_trimmed(info)) { 3071 ret = false; 3072 break; 3073 } 3074 3075 node = rb_next(node); 3076 } 3077 3078 spin_unlock(&ctl->tree_lock); 3079 return ret; 3080 } 3081 3082 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3083 u64 offset, u64 bytes, u64 empty_size, 3084 u64 *max_extent_size) 3085 { 3086 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3087 struct btrfs_discard_ctl *discard_ctl = 3088 &block_group->fs_info->discard_ctl; 3089 struct btrfs_free_space *entry = NULL; 3090 u64 bytes_search = bytes + empty_size; 3091 u64 ret = 0; 3092 u64 align_gap = 0; 3093 u64 align_gap_len = 0; 3094 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3095 bool use_bytes_index = (offset == block_group->start); 3096 3097 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3098 3099 spin_lock(&ctl->tree_lock); 3100 entry = find_free_space(ctl, &offset, &bytes_search, 3101 block_group->full_stripe_len, max_extent_size, 3102 use_bytes_index); 3103 if (!entry) 3104 goto out; 3105 3106 ret = offset; 3107 if (entry->bitmap) { 3108 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3109 3110 if (!btrfs_free_space_trimmed(entry)) 3111 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3112 3113 if (!entry->bytes) 3114 free_bitmap(ctl, entry); 3115 } else { 3116 unlink_free_space(ctl, entry, true); 3117 align_gap_len = offset - entry->offset; 3118 align_gap = entry->offset; 3119 align_gap_trim_state = entry->trim_state; 3120 3121 if (!btrfs_free_space_trimmed(entry)) 3122 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3123 3124 entry->offset = offset + bytes; 3125 WARN_ON(entry->bytes < bytes + align_gap_len); 3126 3127 entry->bytes -= bytes + align_gap_len; 3128 if (!entry->bytes) 3129 kmem_cache_free(btrfs_free_space_cachep, entry); 3130 else 3131 link_free_space(ctl, entry); 3132 } 3133 out: 3134 btrfs_discard_update_discardable(block_group); 3135 spin_unlock(&ctl->tree_lock); 3136 3137 if (align_gap_len) 3138 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3139 align_gap_trim_state); 3140 return ret; 3141 } 3142 3143 /* 3144 * given a cluster, put all of its extents back into the free space 3145 * cache. If a block group is passed, this function will only free 3146 * a cluster that belongs to the passed block group. 3147 * 3148 * Otherwise, it'll get a reference on the block group pointed to by the 3149 * cluster and remove the cluster from it. 
3150 */ 3151 void btrfs_return_cluster_to_free_space( 3152 struct btrfs_block_group *block_group, 3153 struct btrfs_free_cluster *cluster) 3154 { 3155 struct btrfs_free_space_ctl *ctl; 3156 3157 /* first, get a safe pointer to the block group */ 3158 spin_lock(&cluster->lock); 3159 if (!block_group) { 3160 block_group = cluster->block_group; 3161 if (!block_group) { 3162 spin_unlock(&cluster->lock); 3163 return; 3164 } 3165 } else if (cluster->block_group != block_group) { 3166 /* someone else has already freed it don't redo their work */ 3167 spin_unlock(&cluster->lock); 3168 return; 3169 } 3170 btrfs_get_block_group(block_group); 3171 spin_unlock(&cluster->lock); 3172 3173 ctl = block_group->free_space_ctl; 3174 3175 /* now return any extents the cluster had on it */ 3176 spin_lock(&ctl->tree_lock); 3177 __btrfs_return_cluster_to_free_space(block_group, cluster); 3178 spin_unlock(&ctl->tree_lock); 3179 3180 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3181 3182 /* finally drop our ref */ 3183 btrfs_put_block_group(block_group); 3184 } 3185 3186 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3187 struct btrfs_free_cluster *cluster, 3188 struct btrfs_free_space *entry, 3189 u64 bytes, u64 min_start, 3190 u64 *max_extent_size) 3191 { 3192 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3193 int err; 3194 u64 search_start = cluster->window_start; 3195 u64 search_bytes = bytes; 3196 u64 ret = 0; 3197 3198 search_start = min_start; 3199 search_bytes = bytes; 3200 3201 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3202 if (err) { 3203 *max_extent_size = max(get_max_extent_size(entry), 3204 *max_extent_size); 3205 return 0; 3206 } 3207 3208 ret = search_start; 3209 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3210 3211 return ret; 3212 } 3213 3214 /* 3215 * given a cluster, try to allocate 'bytes' from it, returns 0 3216 * if it couldn't find anything suitably large, or a logical disk offset 3217 * if things worked out 3218 */ 3219 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3220 struct btrfs_free_cluster *cluster, u64 bytes, 3221 u64 min_start, u64 *max_extent_size) 3222 { 3223 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3224 struct btrfs_discard_ctl *discard_ctl = 3225 &block_group->fs_info->discard_ctl; 3226 struct btrfs_free_space *entry = NULL; 3227 struct rb_node *node; 3228 u64 ret = 0; 3229 3230 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3231 3232 spin_lock(&cluster->lock); 3233 if (bytes > cluster->max_size) 3234 goto out; 3235 3236 if (cluster->block_group != block_group) 3237 goto out; 3238 3239 node = rb_first(&cluster->root); 3240 if (!node) 3241 goto out; 3242 3243 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3244 while (1) { 3245 if (entry->bytes < bytes) 3246 *max_extent_size = max(get_max_extent_size(entry), 3247 *max_extent_size); 3248 3249 if (entry->bytes < bytes || 3250 (!entry->bitmap && entry->offset < min_start)) { 3251 node = rb_next(&entry->offset_index); 3252 if (!node) 3253 break; 3254 entry = rb_entry(node, struct btrfs_free_space, 3255 offset_index); 3256 continue; 3257 } 3258 3259 if (entry->bitmap) { 3260 ret = btrfs_alloc_from_bitmap(block_group, 3261 cluster, entry, bytes, 3262 cluster->window_start, 3263 max_extent_size); 3264 if (ret == 0) { 3265 node = rb_next(&entry->offset_index); 3266 if (!node) 3267 break; 3268 entry = rb_entry(node, struct btrfs_free_space, 3269 offset_index); 3270 
continue; 3271 } 3272 cluster->window_start += bytes; 3273 } else { 3274 ret = entry->offset; 3275 3276 entry->offset += bytes; 3277 entry->bytes -= bytes; 3278 } 3279 3280 break; 3281 } 3282 out: 3283 spin_unlock(&cluster->lock); 3284 3285 if (!ret) 3286 return 0; 3287 3288 spin_lock(&ctl->tree_lock); 3289 3290 if (!btrfs_free_space_trimmed(entry)) 3291 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3292 3293 ctl->free_space -= bytes; 3294 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3295 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3296 3297 spin_lock(&cluster->lock); 3298 if (entry->bytes == 0) { 3299 rb_erase(&entry->offset_index, &cluster->root); 3300 ctl->free_extents--; 3301 if (entry->bitmap) { 3302 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3303 entry->bitmap); 3304 ctl->total_bitmaps--; 3305 recalculate_thresholds(ctl); 3306 } else if (!btrfs_free_space_trimmed(entry)) { 3307 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3308 } 3309 kmem_cache_free(btrfs_free_space_cachep, entry); 3310 } 3311 3312 spin_unlock(&cluster->lock); 3313 spin_unlock(&ctl->tree_lock); 3314 3315 return ret; 3316 } 3317 3318 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3319 struct btrfs_free_space *entry, 3320 struct btrfs_free_cluster *cluster, 3321 u64 offset, u64 bytes, 3322 u64 cont1_bytes, u64 min_bytes) 3323 { 3324 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3325 unsigned long next_zero; 3326 unsigned long i; 3327 unsigned long want_bits; 3328 unsigned long min_bits; 3329 unsigned long found_bits; 3330 unsigned long max_bits = 0; 3331 unsigned long start = 0; 3332 unsigned long total_found = 0; 3333 int ret; 3334 3335 lockdep_assert_held(&ctl->tree_lock); 3336 3337 i = offset_to_bit(entry->offset, ctl->unit, 3338 max_t(u64, offset, entry->offset)); 3339 want_bits = bytes_to_bits(bytes, ctl->unit); 3340 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3341 3342 /* 3343 * Don't bother looking for a cluster in this bitmap if it's heavily 3344 * fragmented. 3345 */ 3346 if (entry->max_extent_size && 3347 entry->max_extent_size < cont1_bytes) 3348 return -ENOSPC; 3349 again: 3350 found_bits = 0; 3351 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3352 next_zero = find_next_zero_bit(entry->bitmap, 3353 BITS_PER_BITMAP, i); 3354 if (next_zero - i >= min_bits) { 3355 found_bits = next_zero - i; 3356 if (found_bits > max_bits) 3357 max_bits = found_bits; 3358 break; 3359 } 3360 if (next_zero - i > max_bits) 3361 max_bits = next_zero - i; 3362 i = next_zero; 3363 } 3364 3365 if (!found_bits) { 3366 entry->max_extent_size = (u64)max_bits * ctl->unit; 3367 return -ENOSPC; 3368 } 3369 3370 if (!total_found) { 3371 start = i; 3372 cluster->max_size = 0; 3373 } 3374 3375 total_found += found_bits; 3376 3377 if (cluster->max_size < found_bits * ctl->unit) 3378 cluster->max_size = found_bits * ctl->unit; 3379 3380 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3381 i = next_zero + 1; 3382 goto again; 3383 } 3384 3385 cluster->window_start = start * ctl->unit + entry->offset; 3386 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3387 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3388 3389 /* 3390 * We need to know if we're currently on the normal space index when we 3391 * manipulate the bitmap so that we know we need to remove and re-insert 3392 * it into the space_index tree. 
Clear the bytes_index node here so the 3393 * bitmap manipulation helpers know not to mess with the space_index 3394 * until this bitmap entry is added back into the normal cache. 3395 */ 3396 RB_CLEAR_NODE(&entry->bytes_index); 3397 3398 ret = tree_insert_offset(ctl, cluster, entry); 3399 ASSERT(!ret); /* -EEXIST; Logic error */ 3400 3401 trace_btrfs_setup_cluster(block_group, cluster, 3402 total_found * ctl->unit, 1); 3403 return 0; 3404 } 3405 3406 /* 3407 * This searches the block group for just extents to fill the cluster with. 3408 * Try to find a cluster with at least bytes total bytes, at least one 3409 * extent of cont1_bytes, and other clusters of at least min_bytes. 3410 */ 3411 static noinline int 3412 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3413 struct btrfs_free_cluster *cluster, 3414 struct list_head *bitmaps, u64 offset, u64 bytes, 3415 u64 cont1_bytes, u64 min_bytes) 3416 { 3417 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3418 struct btrfs_free_space *first = NULL; 3419 struct btrfs_free_space *entry = NULL; 3420 struct btrfs_free_space *last; 3421 struct rb_node *node; 3422 u64 window_free; 3423 u64 max_extent; 3424 u64 total_size = 0; 3425 3426 lockdep_assert_held(&ctl->tree_lock); 3427 3428 entry = tree_search_offset(ctl, offset, 0, 1); 3429 if (!entry) 3430 return -ENOSPC; 3431 3432 /* 3433 * We don't want bitmaps, so just move along until we find a normal 3434 * extent entry. 3435 */ 3436 while (entry->bitmap || entry->bytes < min_bytes) { 3437 if (entry->bitmap && list_empty(&entry->list)) 3438 list_add_tail(&entry->list, bitmaps); 3439 node = rb_next(&entry->offset_index); 3440 if (!node) 3441 return -ENOSPC; 3442 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3443 } 3444 3445 window_free = entry->bytes; 3446 max_extent = entry->bytes; 3447 first = entry; 3448 last = entry; 3449 3450 for (node = rb_next(&entry->offset_index); node; 3451 node = rb_next(&entry->offset_index)) { 3452 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3453 3454 if (entry->bitmap) { 3455 if (list_empty(&entry->list)) 3456 list_add_tail(&entry->list, bitmaps); 3457 continue; 3458 } 3459 3460 if (entry->bytes < min_bytes) 3461 continue; 3462 3463 last = entry; 3464 window_free += entry->bytes; 3465 if (entry->bytes > max_extent) 3466 max_extent = entry->bytes; 3467 } 3468 3469 if (window_free < bytes || max_extent < cont1_bytes) 3470 return -ENOSPC; 3471 3472 cluster->window_start = first->offset; 3473 3474 node = &first->offset_index; 3475 3476 /* 3477 * now we've found our entries, pull them out of the free space 3478 * cache and put them into the cluster rbtree 3479 */ 3480 do { 3481 int ret; 3482 3483 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3484 node = rb_next(&entry->offset_index); 3485 if (entry->bitmap || entry->bytes < min_bytes) 3486 continue; 3487 3488 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3489 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3490 ret = tree_insert_offset(ctl, cluster, entry); 3491 total_size += entry->bytes; 3492 ASSERT(!ret); /* -EEXIST; Logic error */ 3493 } while (node && entry != last); 3494 3495 cluster->max_size = max_extent; 3496 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3497 return 0; 3498 } 3499 3500 /* 3501 * This specifically looks for bitmaps that may work in the cluster, we assume 3502 * that we have already failed to find extents that will work. 
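 * (@bitmaps was filled by setup_cluster_no_bitmap() with the bitmap entries it skipped over, so normally only the bitmap covering @offset still has to be looked up here.)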
3503 */ 3504 static noinline int 3505 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3506 struct btrfs_free_cluster *cluster, 3507 struct list_head *bitmaps, u64 offset, u64 bytes, 3508 u64 cont1_bytes, u64 min_bytes) 3509 { 3510 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3511 struct btrfs_free_space *entry = NULL; 3512 int ret = -ENOSPC; 3513 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3514 3515 if (ctl->total_bitmaps == 0) 3516 return -ENOSPC; 3517 3518 /* 3519 * The bitmap that covers offset won't be in the list unless offset 3520 * is just its start offset. 3521 */ 3522 if (!list_empty(bitmaps)) 3523 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3524 3525 if (!entry || entry->offset != bitmap_offset) { 3526 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3527 if (entry && list_empty(&entry->list)) 3528 list_add(&entry->list, bitmaps); 3529 } 3530 3531 list_for_each_entry(entry, bitmaps, list) { 3532 if (entry->bytes < bytes) 3533 continue; 3534 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3535 bytes, cont1_bytes, min_bytes); 3536 if (!ret) 3537 return 0; 3538 } 3539 3540 /* 3541 * The bitmaps list has all the bitmaps that record free space 3542 * starting after offset, so no more search is required. 3543 */ 3544 return -ENOSPC; 3545 } 3546 3547 /* 3548 * here we try to find a cluster of blocks in a block group. The goal 3549 * is to find at least bytes+empty_size. 3550 * We might not find them all in one contiguous area. 3551 * 3552 * returns zero and sets up cluster if things worked out, otherwise 3553 * it returns -enospc 3554 */ 3555 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3556 struct btrfs_free_cluster *cluster, 3557 u64 offset, u64 bytes, u64 empty_size) 3558 { 3559 struct btrfs_fs_info *fs_info = block_group->fs_info; 3560 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3561 struct btrfs_free_space *entry, *tmp; 3562 LIST_HEAD(bitmaps); 3563 u64 min_bytes; 3564 u64 cont1_bytes; 3565 int ret; 3566 3567 /* 3568 * Choose the minimum extent size we'll require for this 3569 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3570 * For metadata, allow allocates with smaller extents. For 3571 * data, keep it dense. 3572 */ 3573 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3574 cont1_bytes = bytes + empty_size; 3575 min_bytes = cont1_bytes; 3576 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3577 cont1_bytes = bytes; 3578 min_bytes = fs_info->sectorsize; 3579 } else { 3580 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3581 min_bytes = fs_info->sectorsize; 3582 } 3583 3584 spin_lock(&ctl->tree_lock); 3585 3586 /* 3587 * If we know we don't have enough space to make a cluster don't even 3588 * bother doing all the work to try and find one. 
3589 */ 3590 if (ctl->free_space < bytes) { 3591 spin_unlock(&ctl->tree_lock); 3592 return -ENOSPC; 3593 } 3594 3595 spin_lock(&cluster->lock); 3596 3597 /* someone already found a cluster, hooray */ 3598 if (cluster->block_group) { 3599 ret = 0; 3600 goto out; 3601 } 3602 3603 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3604 min_bytes); 3605 3606 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3607 bytes + empty_size, 3608 cont1_bytes, min_bytes); 3609 if (ret) 3610 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3611 offset, bytes + empty_size, 3612 cont1_bytes, min_bytes); 3613 3614 /* Clear our temporary list */ 3615 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3616 list_del_init(&entry->list); 3617 3618 if (!ret) { 3619 btrfs_get_block_group(block_group); 3620 list_add_tail(&cluster->block_group_list, 3621 &block_group->cluster_list); 3622 cluster->block_group = block_group; 3623 } else { 3624 trace_btrfs_failed_cluster_setup(block_group); 3625 } 3626 out: 3627 spin_unlock(&cluster->lock); 3628 spin_unlock(&ctl->tree_lock); 3629 3630 return ret; 3631 } 3632 3633 /* 3634 * simple code to zero out a cluster 3635 */ 3636 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3637 { 3638 spin_lock_init(&cluster->lock); 3639 spin_lock_init(&cluster->refill_lock); 3640 cluster->root = RB_ROOT; 3641 cluster->max_size = 0; 3642 cluster->fragmented = false; 3643 INIT_LIST_HEAD(&cluster->block_group_list); 3644 cluster->block_group = NULL; 3645 } 3646 3647 static int do_trimming(struct btrfs_block_group *block_group, 3648 u64 *total_trimmed, u64 start, u64 bytes, 3649 u64 reserved_start, u64 reserved_bytes, 3650 enum btrfs_trim_state reserved_trim_state, 3651 struct btrfs_trim_range *trim_entry) 3652 { 3653 struct btrfs_space_info *space_info = block_group->space_info; 3654 struct btrfs_fs_info *fs_info = block_group->fs_info; 3655 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3656 int ret; 3657 int update = 0; 3658 const u64 end = start + bytes; 3659 const u64 reserved_end = reserved_start + reserved_bytes; 3660 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3661 u64 trimmed = 0; 3662 3663 spin_lock(&space_info->lock); 3664 spin_lock(&block_group->lock); 3665 if (!block_group->ro) { 3666 block_group->reserved += reserved_bytes; 3667 space_info->bytes_reserved += reserved_bytes; 3668 update = 1; 3669 } 3670 spin_unlock(&block_group->lock); 3671 spin_unlock(&space_info->lock); 3672 3673 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3674 if (!ret) { 3675 *total_trimmed += trimmed; 3676 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3677 } 3678 3679 mutex_lock(&ctl->cache_writeout_mutex); 3680 if (reserved_start < start) 3681 __btrfs_add_free_space(block_group, reserved_start, 3682 start - reserved_start, 3683 reserved_trim_state); 3684 if (end < reserved_end) 3685 __btrfs_add_free_space(block_group, end, reserved_end - end, 3686 reserved_trim_state); 3687 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3688 list_del(&trim_entry->list); 3689 mutex_unlock(&ctl->cache_writeout_mutex); 3690 3691 if (update) { 3692 spin_lock(&space_info->lock); 3693 spin_lock(&block_group->lock); 3694 if (block_group->ro) 3695 space_info->bytes_readonly += reserved_bytes; 3696 block_group->reserved -= reserved_bytes; 3697 space_info->bytes_reserved -= reserved_bytes; 3698 spin_unlock(&block_group->lock); 3699 spin_unlock(&space_info->lock); 3700 } 3701 3702 return ret; 3703 } 3704 3705 /* 
3706 * If @async is set, then we will trim 1 region and return. 3707 */ 3708 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3709 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3710 bool async) 3711 { 3712 struct btrfs_discard_ctl *discard_ctl = 3713 &block_group->fs_info->discard_ctl; 3714 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3715 struct btrfs_free_space *entry; 3716 struct rb_node *node; 3717 int ret = 0; 3718 u64 extent_start; 3719 u64 extent_bytes; 3720 enum btrfs_trim_state extent_trim_state; 3721 u64 bytes; 3722 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3723 3724 while (start < end) { 3725 struct btrfs_trim_range trim_entry; 3726 3727 mutex_lock(&ctl->cache_writeout_mutex); 3728 spin_lock(&ctl->tree_lock); 3729 3730 if (ctl->free_space < minlen) 3731 goto out_unlock; 3732 3733 entry = tree_search_offset(ctl, start, 0, 1); 3734 if (!entry) 3735 goto out_unlock; 3736 3737 /* Skip bitmaps and if async, already trimmed entries */ 3738 while (entry->bitmap || 3739 (async && btrfs_free_space_trimmed(entry))) { 3740 node = rb_next(&entry->offset_index); 3741 if (!node) 3742 goto out_unlock; 3743 entry = rb_entry(node, struct btrfs_free_space, 3744 offset_index); 3745 } 3746 3747 if (entry->offset >= end) 3748 goto out_unlock; 3749 3750 extent_start = entry->offset; 3751 extent_bytes = entry->bytes; 3752 extent_trim_state = entry->trim_state; 3753 if (async) { 3754 start = entry->offset; 3755 bytes = entry->bytes; 3756 if (bytes < minlen) { 3757 spin_unlock(&ctl->tree_lock); 3758 mutex_unlock(&ctl->cache_writeout_mutex); 3759 goto next; 3760 } 3761 unlink_free_space(ctl, entry, true); 3762 /* 3763 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3764 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3765 * X when we come back around. So trim it now. 
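 * In other words, only cap this round at max_discard_size when the remainder would still be at least BTRFS_ASYNC_DISCARD_MIN_FILTER; otherwise discard the whole extent in one go.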
3766 */ 3767 if (max_discard_size && 3768 bytes >= (max_discard_size + 3769 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3770 bytes = max_discard_size; 3771 extent_bytes = max_discard_size; 3772 entry->offset += max_discard_size; 3773 entry->bytes -= max_discard_size; 3774 link_free_space(ctl, entry); 3775 } else { 3776 kmem_cache_free(btrfs_free_space_cachep, entry); 3777 } 3778 } else { 3779 start = max(start, extent_start); 3780 bytes = min(extent_start + extent_bytes, end) - start; 3781 if (bytes < minlen) { 3782 spin_unlock(&ctl->tree_lock); 3783 mutex_unlock(&ctl->cache_writeout_mutex); 3784 goto next; 3785 } 3786 3787 unlink_free_space(ctl, entry, true); 3788 kmem_cache_free(btrfs_free_space_cachep, entry); 3789 } 3790 3791 spin_unlock(&ctl->tree_lock); 3792 trim_entry.start = extent_start; 3793 trim_entry.bytes = extent_bytes; 3794 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3795 mutex_unlock(&ctl->cache_writeout_mutex); 3796 3797 ret = do_trimming(block_group, total_trimmed, start, bytes, 3798 extent_start, extent_bytes, extent_trim_state, 3799 &trim_entry); 3800 if (ret) { 3801 block_group->discard_cursor = start + bytes; 3802 break; 3803 } 3804 next: 3805 start += bytes; 3806 block_group->discard_cursor = start; 3807 if (async && *total_trimmed) 3808 break; 3809 3810 if (fatal_signal_pending(current)) { 3811 ret = -ERESTARTSYS; 3812 break; 3813 } 3814 3815 cond_resched(); 3816 } 3817 3818 return ret; 3819 3820 out_unlock: 3821 block_group->discard_cursor = btrfs_block_group_end(block_group); 3822 spin_unlock(&ctl->tree_lock); 3823 mutex_unlock(&ctl->cache_writeout_mutex); 3824 3825 return ret; 3826 } 3827 3828 /* 3829 * If we break out of trimming a bitmap prematurely, we should reset the 3830 * trimming bit. In a rather contrived case, it's possible to race here, so 3831 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3832 * 3833 * start = start of bitmap 3834 * end = near end of bitmap 3835 * 3836 * Thread 1: Thread 2: 3837 * trim_bitmaps(start) 3838 * trim_bitmaps(end) 3839 * end_trimming_bitmap() 3840 * reset_trimming_bitmap() 3841 */ 3842 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3843 { 3844 struct btrfs_free_space *entry; 3845 3846 spin_lock(&ctl->tree_lock); 3847 entry = tree_search_offset(ctl, offset, 1, 0); 3848 if (entry) { 3849 if (btrfs_free_space_trimmed(entry)) { 3850 ctl->discardable_extents[BTRFS_STAT_CURR] += 3851 entry->bitmap_extents; 3852 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3853 } 3854 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3855 } 3856 3857 spin_unlock(&ctl->tree_lock); 3858 } 3859 3860 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3861 struct btrfs_free_space *entry) 3862 { 3863 if (btrfs_free_space_trimming_bitmap(entry)) { 3864 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3865 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3866 entry->bitmap_extents; 3867 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3868 } 3869 } 3870 3871 /* 3872 * If @async is set, then we will trim 1 region and return.
 */
static int trim_bitmaps(struct btrfs_block_group *block_group,
			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			u64 maxlen, bool async)
{
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	int ret = 0;
	int ret2;
	u64 bytes;
	u64 offset = offset_to_bitmap(ctl, start);
	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

	while (offset < end) {
		bool next_bitmap = false;
		struct btrfs_trim_range trim_entry;

		mutex_lock(&ctl->cache_writeout_mutex);
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen) {
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			break;
		}

		entry = tree_search_offset(ctl, offset, 1, 0);
		/*
		 * Bitmaps are marked trimmed lossily now to prevent constant
		 * discarding of the same bitmap (the reason why we are bound
		 * by the filters).  So, retrim the block group bitmaps when we
		 * are preparing to punt to the unused_bgs list.  This uses
		 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
		 * which is the only discard index which sets minlen to 0.
		 */
		if (!entry || (async && minlen && start == offset &&
			       btrfs_free_space_trimmed(entry))) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * Async discard bitmap trimming begins by setting the start
		 * to key.objectid, and offset_to_bitmap() aligns to the
		 * start of the bitmap.  This lets us know we are fully
		 * scanning the bitmap rather than only some portion of it.
		 */
		if (start == offset)
			entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;

		bytes = minlen;
		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
		if (ret2 || start >= end) {
			/*
			 * We lossily consider a bitmap trimmed if we only skip
			 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
			 */
			if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
				end_trimming_bitmap(ctl, entry);
			else
				entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * We already trimmed a region, but are using the locking above
		 * to reset the trim_state.
		 */
		if (async && *total_trimmed) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto out;
		}

		bytes = min(bytes, end - start);
		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto next;
		}

		/*
		 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
		 * If X < @minlen, we won't trim X when we come back around.
		 * So trim it now.  We differ here from trimming extents as we
		 * don't keep individual state per bit.
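		 *
		 * Continuing the illustrative numbers from trim_no_bitmap()
		 * (64M max_discard_size, 4K @minlen): a 64M+2K run found in
		 * the bitmap is discarded whole, while a 64M+8K run is clamped
		 * to 64M; the trailing 8K simply stays set in the bitmap and
		 * is big enough to pass the @minlen filter on the next pass.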
		 */
		if (async &&
		    max_discard_size &&
		    bytes > (max_discard_size + minlen))
			bytes = max_discard_size;

		bitmap_clear_bits(ctl, entry, start, bytes, true);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = start;
		trim_entry.bytes = bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  start, bytes, 0, &trim_entry);
		if (ret) {
			reset_trimming_bitmap(ctl, offset);
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			break;
		}
next:
		if (next_bitmap) {
			offset += BITS_PER_BITMAP * ctl->unit;
			start = offset;
		} else {
			start += bytes;
		}
		block_group->discard_cursor = start;

		if (fatal_signal_pending(current)) {
			if (start != offset)
				reset_trimming_bitmap(ctl, offset);
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	if (offset >= end)
		block_group->discard_cursor = end;

out:
	return ret;
}

int btrfs_trim_block_group(struct btrfs_block_group *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int ret;
	u64 rem = 0;

	ASSERT(!btrfs_is_zoned(block_group->fs_info));

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
	if (ret)
		goto out;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
	/* If we ended in the middle of a bitmap, reset the trimming flag */
	if (rem)
		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
	btrfs_unfreeze_block_group(block_group);
	return ret;
}

int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
	btrfs_unfreeze_block_group(block_group);

	return ret;
}

int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   u64 maxlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
			   async);

	btrfs_unfreeze_block_group(block_group);

	return ret;
}

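/*
 * The v1 free space cache is considered active when the superblock still
 * records a nonzero cache generation.
 */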
bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}

static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first_cached(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}

int __init btrfs_free_space_init(void)
{
	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
			sizeof(struct btrfs_free_space), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_cachep)
		return -ENOMEM;

	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
			PAGE_SIZE, PAGE_SIZE,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_bitmap_cachep) {
		kmem_cache_destroy(btrfs_free_space_cachep);
		return -ENOMEM;
	}

	return 0;
}

void __cold btrfs_free_space_exit(void)
{
	kmem_cache_destroy(btrfs_free_space_cachep);
	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Use this if you need to make a bitmap or extent entry specifically.  It
 * doesn't do any of the merging that add_free_space does; this acts a lot like
 * how the free space cache loading stuff works, so you can get really weird
 * configurations.
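 *
 * For example, test_add_free_space_entry(cache, SZ_1M, SZ_4M, true) forces a
 * 4M range starting at 1M straight into a bitmap, bypassing the usual
 * extent/bitmap heuristics (the values here are only for illustration).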
 */
int test_add_free_space_entry(struct btrfs_block_group *cache,
			      u64 offset, u64 bytes, bool bitmap)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info = NULL, *bitmap_info;
	void *map = NULL;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
	u64 bytes_added;
	int ret;

again:
	if (!info) {
		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
		if (!info)
			return -ENOMEM;
	}

	if (!bitmap) {
		spin_lock(&ctl->tree_lock);
		info->offset = offset;
		info->bytes = bytes;
		info->max_extent_size = 0;
		ret = link_free_space(ctl, info);
		spin_unlock(&ctl->tree_lock);
		if (ret)
			kmem_cache_free(btrfs_free_space_cachep, info);
		return ret;
	}

	if (!map) {
		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
		if (!map) {
			kmem_cache_free(btrfs_free_space_cachep, info);
			return -ENOMEM;
		}
	}

	spin_lock(&ctl->tree_lock);
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		info->bitmap = map;
		map = NULL;
		add_new_bitmap(ctl, info, offset);
		bitmap_info = info;
		info = NULL;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);

	bytes -= bytes_added;
	offset += bytes_added;
	spin_unlock(&ctl->tree_lock);

	if (bytes)
		goto again;

	if (info)
		kmem_cache_free(btrfs_free_space_cachep, info);
	if (map)
		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
	return 0;
}

/*
 * Checks to see if the given range is in the free space cache.  This is really
 * just used to check the absence of space, so if there is free space in the
 * range at all we will return 1.
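 *
 * Both extent entries and bitmaps are consulted, and neighbouring entries are
 * walked so that any overlap with [offset, offset + bytes) is caught.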
 */
int test_check_exists(struct btrfs_block_group *cache,
		      u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;

	spin_lock(&ctl->tree_lock);
	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info)
			goto out;
	}

have_info:
	if (info->bitmap) {
		u64 bit_off, bit_bytes;
		struct rb_node *n;
		struct btrfs_free_space *tmp;

		bit_off = offset;
		bit_bytes = ctl->unit;
		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
		if (!ret) {
			if (bit_off == offset) {
				ret = 1;
				goto out;
			} else if (bit_off > offset &&
				   offset + bytes > bit_off) {
				ret = 1;
				goto out;
			}
		}

		n = rb_prev(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (tmp->offset + tmp->bytes < offset)
				break;
			if (offset + bytes < tmp->offset) {
				n = rb_prev(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		n = rb_next(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (offset + bytes < tmp->offset)
				break;
			if (tmp->offset + tmp->bytes < offset) {
				n = rb_next(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		ret = 0;
		goto out;
	}

	if (info->offset == offset) {
		ret = 1;
		goto out;
	}

	if (offset > info->offset && offset < info->offset + info->bytes)
		ret = 1;
out:
	spin_unlock(&ctl->tree_lock);
	return ret;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */