1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Red Hat. All rights reserved. 4 */ 5 6 #include <linux/pagemap.h> 7 #include <linux/sched.h> 8 #include <linux/sched/signal.h> 9 #include <linux/slab.h> 10 #include <linux/math64.h> 11 #include <linux/ratelimit.h> 12 #include <linux/error-injection.h> 13 #include <linux/sched/mm.h> 14 #include "ctree.h" 15 #include "fs.h" 16 #include "messages.h" 17 #include "misc.h" 18 #include "free-space-cache.h" 19 #include "transaction.h" 20 #include "disk-io.h" 21 #include "extent_io.h" 22 #include "space-info.h" 23 #include "block-group.h" 24 #include "discard.h" 25 #include "subpage.h" 26 #include "inode-item.h" 27 #include "accessors.h" 28 #include "file-item.h" 29 #include "file.h" 30 #include "super.h" 31 32 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 33 #define MAX_CACHE_BYTES_PER_GIG SZ_64K 34 #define FORCE_EXTENT_THRESHOLD SZ_1M 35 36 static struct kmem_cache *btrfs_free_space_cachep; 37 static struct kmem_cache *btrfs_free_space_bitmap_cachep; 38 39 struct btrfs_trim_range { 40 u64 start; 41 u64 bytes; 42 struct list_head list; 43 }; 44 45 static int link_free_space(struct btrfs_free_space_ctl *ctl, 46 struct btrfs_free_space *info); 47 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 48 struct btrfs_free_space *info, bool update_stat); 49 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 50 struct btrfs_free_space *bitmap_info, u64 *offset, 51 u64 *bytes, bool for_alloc); 52 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 53 struct btrfs_free_space *bitmap_info); 54 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 55 struct btrfs_free_space *info, u64 offset, 56 u64 bytes, bool update_stats); 57 58 static void btrfs_crc32c_final(u32 crc, u8 *result) 59 { 60 put_unaligned_le32(~crc, result); 61 } 62 63 static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 64 { 65 struct btrfs_free_space *info; 66 struct rb_node *node; 67 68 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 69 info = rb_entry(node, struct btrfs_free_space, offset_index); 70 if (!info->bitmap) { 71 unlink_free_space(ctl, info, true); 72 kmem_cache_free(btrfs_free_space_cachep, info); 73 } else { 74 free_bitmap(ctl, info); 75 } 76 77 cond_resched_lock(&ctl->tree_lock); 78 } 79 } 80 81 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 82 struct btrfs_path *path, 83 u64 offset) 84 { 85 struct btrfs_fs_info *fs_info = root->fs_info; 86 struct btrfs_key key; 87 struct btrfs_key location; 88 struct btrfs_disk_key disk_key; 89 struct btrfs_free_space_header *header; 90 struct extent_buffer *leaf; 91 struct inode *inode = NULL; 92 unsigned nofs_flag; 93 int ret; 94 95 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 96 key.offset = offset; 97 key.type = 0; 98 99 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 100 if (ret < 0) 101 return ERR_PTR(ret); 102 if (ret > 0) { 103 btrfs_release_path(path); 104 return ERR_PTR(-ENOENT); 105 } 106 107 leaf = path->nodes[0]; 108 header = btrfs_item_ptr(leaf, path->slots[0], 109 struct btrfs_free_space_header); 110 btrfs_free_space_key(leaf, header, &disk_key); 111 btrfs_disk_key_to_cpu(&location, &disk_key); 112 btrfs_release_path(path); 113 114 /* 115 * We are often under a trans handle at this point, so we need to make 116 * sure NOFS is set to keep us from deadlocking. 117 */ 118 nofs_flag = memalloc_nofs_save(); 119 inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); 120 btrfs_release_path(path); 121 memalloc_nofs_restore(nofs_flag); 122 if (IS_ERR(inode)) 123 return inode; 124 125 mapping_set_gfp_mask(inode->i_mapping, 126 mapping_gfp_constraint(inode->i_mapping, 127 ~(__GFP_FS | __GFP_HIGHMEM))); 128 129 return inode; 130 } 131 132 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, 133 struct btrfs_path *path) 134 { 135 struct btrfs_fs_info *fs_info = block_group->fs_info; 136 struct inode *inode = NULL; 137 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 138 139 spin_lock(&block_group->lock); 140 if (block_group->inode) 141 inode = igrab(block_group->inode); 142 spin_unlock(&block_group->lock); 143 if (inode) 144 return inode; 145 146 inode = __lookup_free_space_inode(fs_info->tree_root, path, 147 block_group->start); 148 if (IS_ERR(inode)) 149 return inode; 150 151 spin_lock(&block_group->lock); 152 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 153 btrfs_info(fs_info, "Old style space inode found, converting."); 154 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 155 BTRFS_INODE_NODATACOW; 156 block_group->disk_cache_state = BTRFS_DC_CLEAR; 157 } 158 159 if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) 160 block_group->inode = igrab(inode); 161 spin_unlock(&block_group->lock); 162 163 return inode; 164 } 165 166 static int __create_free_space_inode(struct btrfs_root *root, 167 struct btrfs_trans_handle *trans, 168 struct btrfs_path *path, 169 u64 ino, u64 offset) 170 { 171 struct btrfs_key key; 172 struct btrfs_disk_key disk_key; 173 struct btrfs_free_space_header *header; 174 struct btrfs_inode_item *inode_item; 175 struct extent_buffer *leaf; 176 /* We inline CRCs for the free disk space cache */ 177 const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | 178 BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 179 int ret; 180 181 ret = btrfs_insert_empty_inode(trans, root, path, ino); 182 if (ret) 183 return ret; 184 185 leaf = path->nodes[0]; 186 inode_item = btrfs_item_ptr(leaf, path->slots[0], 187 struct btrfs_inode_item); 188 btrfs_item_key(leaf, &disk_key, path->slots[0]); 189 memzero_extent_buffer(leaf, (unsigned long)inode_item, 190 sizeof(*inode_item)); 191 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 192 btrfs_set_inode_size(leaf, inode_item, 0); 193 btrfs_set_inode_nbytes(leaf, inode_item, 0); 194 btrfs_set_inode_uid(leaf, inode_item, 0); 195 btrfs_set_inode_gid(leaf, inode_item, 0); 196 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 197 btrfs_set_inode_flags(leaf, inode_item, flags); 198 btrfs_set_inode_nlink(leaf, inode_item, 1); 199 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 200 btrfs_set_inode_block_group(leaf, inode_item, offset); 201 btrfs_mark_buffer_dirty(trans, leaf); 202 btrfs_release_path(path); 203 204 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 205 key.offset = offset; 206 key.type = 0; 207 ret = btrfs_insert_empty_item(trans, root, path, &key, 208 sizeof(struct btrfs_free_space_header)); 209 if (ret < 0) { 210 btrfs_release_path(path); 211 return ret; 212 } 213 214 leaf = path->nodes[0]; 215 header = btrfs_item_ptr(leaf, path->slots[0], 216 struct btrfs_free_space_header); 217 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); 218 btrfs_set_free_space_key(leaf, header, &disk_key); 219 btrfs_mark_buffer_dirty(trans, leaf); 220 btrfs_release_path(path); 221 222 return 0; 223 } 224 225 int create_free_space_inode(struct btrfs_trans_handle *trans, 226 struct btrfs_block_group *block_group, 227 struct btrfs_path *path) 228 { 229 int ret; 230 u64 ino; 231 232 ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); 233 if (ret < 0) 234 return ret; 235 236 return __create_free_space_inode(trans->fs_info->tree_root, trans, path, 237 ino, block_group->start); 238 } 239 240 /* 241 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode 242 * handles lookup, otherwise it takes ownership and iputs the inode. 243 * Don't reuse an inode pointer after passing it into this function. 244 */ 245 int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, 246 struct inode *inode, 247 struct btrfs_block_group *block_group) 248 { 249 struct btrfs_path *path; 250 struct btrfs_key key; 251 int ret = 0; 252 253 path = btrfs_alloc_path(); 254 if (!path) 255 return -ENOMEM; 256 257 if (!inode) 258 inode = lookup_free_space_inode(block_group, path); 259 if (IS_ERR(inode)) { 260 if (PTR_ERR(inode) != -ENOENT) 261 ret = PTR_ERR(inode); 262 goto out; 263 } 264 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 265 if (ret) { 266 btrfs_add_delayed_iput(BTRFS_I(inode)); 267 goto out; 268 } 269 clear_nlink(inode); 270 /* One for the block groups ref */ 271 spin_lock(&block_group->lock); 272 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { 273 block_group->inode = NULL; 274 spin_unlock(&block_group->lock); 275 iput(inode); 276 } else { 277 spin_unlock(&block_group->lock); 278 } 279 /* One for the lookup ref */ 280 btrfs_add_delayed_iput(BTRFS_I(inode)); 281 282 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 283 key.type = 0; 284 key.offset = block_group->start; 285 ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, 286 -1, 1); 287 if (ret) { 288 if (ret > 0) 289 ret = 0; 290 goto out; 291 } 292 ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); 293 out: 294 btrfs_free_path(path); 295 return ret; 296 } 297 298 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, 299 struct btrfs_block_group *block_group, 300 struct inode *vfs_inode) 301 { 302 struct btrfs_truncate_control control = { 303 .inode = BTRFS_I(vfs_inode), 304 .new_size = 0, 305 .ino = btrfs_ino(BTRFS_I(vfs_inode)), 306 .min_type = BTRFS_EXTENT_DATA_KEY, 307 .clear_extent_range = true, 308 }; 309 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 310 struct btrfs_root *root = inode->root; 311 struct extent_state *cached_state = NULL; 312 int ret = 0; 313 bool locked = false; 314 315 if (block_group) { 316 struct btrfs_path *path = btrfs_alloc_path(); 317 318 if (!path) { 319 ret = -ENOMEM; 320 goto fail; 321 } 322 locked = true; 323 mutex_lock(&trans->transaction->cache_write_mutex); 324 if (!list_empty(&block_group->io_list)) { 325 list_del_init(&block_group->io_list); 326 327 btrfs_wait_cache_io(trans, block_group, path); 328 btrfs_put_block_group(block_group); 329 } 330 331 /* 332 * now that we've truncated the cache away, its no longer 333 * setup or written 334 */ 335 spin_lock(&block_group->lock); 336 block_group->disk_cache_state = BTRFS_DC_CLEAR; 337 spin_unlock(&block_group->lock); 338 btrfs_free_path(path); 339 } 340 341 btrfs_i_size_write(inode, 0); 342 truncate_pagecache(vfs_inode, 0); 343 344 lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 345 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); 346 347 /* 348 * We skip the throttling logic for free space cache inodes, so we don't 349 * need to check for -EAGAIN. 350 */ 351 ret = btrfs_truncate_inode_items(trans, root, &control); 352 353 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); 354 btrfs_inode_safe_disk_i_size_write(inode, control.last_size); 355 356 unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 357 if (ret) 358 goto fail; 359 360 ret = btrfs_update_inode(trans, inode); 361 362 fail: 363 if (locked) 364 mutex_unlock(&trans->transaction->cache_write_mutex); 365 if (ret) 366 btrfs_abort_transaction(trans, ret); 367 368 return ret; 369 } 370 371 static void readahead_cache(struct inode *inode) 372 { 373 struct file_ra_state ra; 374 unsigned long last_index; 375 376 file_ra_state_init(&ra, inode->i_mapping); 377 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 378 379 page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); 380 } 381 382 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, 383 int write) 384 { 385 int num_pages; 386 387 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 388 389 /* Make sure we can fit our crcs and generation into the first page */ 390 if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) 391 return -ENOSPC; 392 393 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); 394 395 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); 396 if (!io_ctl->pages) 397 return -ENOMEM; 398 399 io_ctl->num_pages = num_pages; 400 io_ctl->fs_info = inode_to_fs_info(inode); 401 io_ctl->inode = inode; 402 403 return 0; 404 } 405 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); 406 407 static void io_ctl_free(struct btrfs_io_ctl *io_ctl) 408 { 409 kfree(io_ctl->pages); 410 io_ctl->pages = NULL; 411 } 412 413 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) 414 { 415 if (io_ctl->cur) { 416 io_ctl->cur = NULL; 417 io_ctl->orig = NULL; 418 } 419 } 420 421 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) 422 { 423 ASSERT(io_ctl->index < io_ctl->num_pages); 424 io_ctl->page = io_ctl->pages[io_ctl->index++]; 425 io_ctl->cur = page_address(io_ctl->page); 426 io_ctl->orig = io_ctl->cur; 427 io_ctl->size = PAGE_SIZE; 428 if (clear) 429 clear_page(io_ctl->cur); 430 } 431 432 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) 433 { 434 int i; 435 436 io_ctl_unmap_page(io_ctl); 437 438 for (i = 0; i < io_ctl->num_pages; i++) { 439 if (io_ctl->pages[i]) { 440 btrfs_folio_clear_checked(io_ctl->fs_info, 441 page_folio(io_ctl->pages[i]), 442 page_offset(io_ctl->pages[i]), 443 PAGE_SIZE); 444 unlock_page(io_ctl->pages[i]); 445 put_page(io_ctl->pages[i]); 446 } 447 } 448 } 449 450 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) 451 { 452 struct page *page; 453 struct inode *inode = io_ctl->inode; 454 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 455 int i; 456 457 for (i = 0; i < io_ctl->num_pages; i++) { 458 int ret; 459 460 page = find_or_create_page(inode->i_mapping, i, mask); 461 if (!page) { 462 io_ctl_drop_pages(io_ctl); 463 return -ENOMEM; 464 } 465 466 ret = set_page_extent_mapped(page); 467 if (ret < 0) { 468 unlock_page(page); 469 put_page(page); 470 io_ctl_drop_pages(io_ctl); 471 return ret; 472 } 473 474 io_ctl->pages[i] = page; 475 if (uptodate && !PageUptodate(page)) { 476 btrfs_read_folio(NULL, page_folio(page)); 477 lock_page(page); 478 if (page->mapping != inode->i_mapping) { 479 btrfs_err(BTRFS_I(inode)->root->fs_info, 480 "free space cache page truncated"); 481 io_ctl_drop_pages(io_ctl); 482 return -EIO; 483 } 484 if (!PageUptodate(page)) { 485 btrfs_err(BTRFS_I(inode)->root->fs_info, 486 "error reading free space cache"); 487 io_ctl_drop_pages(io_ctl); 488 return -EIO; 489 } 490 } 491 } 492 493 for (i = 0; i < io_ctl->num_pages; i++) 494 clear_page_dirty_for_io(io_ctl->pages[i]); 495 496 return 0; 497 } 498 499 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 500 { 501 io_ctl_map_page(io_ctl, 1); 502 503 /* 504 * Skip the csum areas. If we don't check crcs then we just have a 505 * 64bit chunk at the front of the first page. 506 */ 507 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 508 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 509 510 put_unaligned_le64(generation, io_ctl->cur); 511 io_ctl->cur += sizeof(u64); 512 } 513 514 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 515 { 516 u64 cache_gen; 517 518 /* 519 * Skip the crc area. If we don't check crcs then we just have a 64bit 520 * chunk at the front of the first page. 521 */ 522 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 523 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 524 525 cache_gen = get_unaligned_le64(io_ctl->cur); 526 if (cache_gen != generation) { 527 btrfs_err_rl(io_ctl->fs_info, 528 "space cache generation (%llu) does not match inode (%llu)", 529 cache_gen, generation); 530 io_ctl_unmap_page(io_ctl); 531 return -EIO; 532 } 533 io_ctl->cur += sizeof(u64); 534 return 0; 535 } 536 537 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) 538 { 539 u32 *tmp; 540 u32 crc = ~(u32)0; 541 unsigned offset = 0; 542 543 if (index == 0) 544 offset = sizeof(u32) * io_ctl->num_pages; 545 546 crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 547 btrfs_crc32c_final(crc, (u8 *)&crc); 548 io_ctl_unmap_page(io_ctl); 549 tmp = page_address(io_ctl->pages[0]); 550 tmp += index; 551 *tmp = crc; 552 } 553 554 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) 555 { 556 u32 *tmp, val; 557 u32 crc = ~(u32)0; 558 unsigned offset = 0; 559 560 if (index == 0) 561 offset = sizeof(u32) * io_ctl->num_pages; 562 563 tmp = page_address(io_ctl->pages[0]); 564 tmp += index; 565 val = *tmp; 566 567 io_ctl_map_page(io_ctl, 0); 568 crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 569 btrfs_crc32c_final(crc, (u8 *)&crc); 570 if (val != crc) { 571 btrfs_err_rl(io_ctl->fs_info, 572 "csum mismatch on free space cache"); 573 io_ctl_unmap_page(io_ctl); 574 return -EIO; 575 } 576 577 return 0; 578 } 579 580 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, 581 void *bitmap) 582 { 583 struct btrfs_free_space_entry *entry; 584 585 if (!io_ctl->cur) 586 return -ENOSPC; 587 588 entry = io_ctl->cur; 589 put_unaligned_le64(offset, &entry->offset); 590 put_unaligned_le64(bytes, &entry->bytes); 591 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : 592 BTRFS_FREE_SPACE_EXTENT; 593 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 594 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 595 596 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 597 return 0; 598 599 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 600 601 /* No more pages to map */ 602 if (io_ctl->index >= io_ctl->num_pages) 603 return 0; 604 605 /* map the next page */ 606 io_ctl_map_page(io_ctl, 1); 607 return 0; 608 } 609 610 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) 611 { 612 if (!io_ctl->cur) 613 return -ENOSPC; 614 615 /* 616 * If we aren't at the start of the current page, unmap this one and 617 * map the next one if there is any left. 618 */ 619 if (io_ctl->cur != io_ctl->orig) { 620 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 621 if (io_ctl->index >= io_ctl->num_pages) 622 return -ENOSPC; 623 io_ctl_map_page(io_ctl, 0); 624 } 625 626 copy_page(io_ctl->cur, bitmap); 627 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 628 if (io_ctl->index < io_ctl->num_pages) 629 io_ctl_map_page(io_ctl, 0); 630 return 0; 631 } 632 633 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) 634 { 635 /* 636 * If we're not on the boundary we know we've modified the page and we 637 * need to crc the page. 638 */ 639 if (io_ctl->cur != io_ctl->orig) 640 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 641 else 642 io_ctl_unmap_page(io_ctl); 643 644 while (io_ctl->index < io_ctl->num_pages) { 645 io_ctl_map_page(io_ctl, 1); 646 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 647 } 648 } 649 650 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, 651 struct btrfs_free_space *entry, u8 *type) 652 { 653 struct btrfs_free_space_entry *e; 654 int ret; 655 656 if (!io_ctl->cur) { 657 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 658 if (ret) 659 return ret; 660 } 661 662 e = io_ctl->cur; 663 entry->offset = get_unaligned_le64(&e->offset); 664 entry->bytes = get_unaligned_le64(&e->bytes); 665 *type = e->type; 666 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 667 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 668 669 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 670 return 0; 671 672 io_ctl_unmap_page(io_ctl); 673 674 return 0; 675 } 676 677 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, 678 struct btrfs_free_space *entry) 679 { 680 int ret; 681 682 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 683 if (ret) 684 return ret; 685 686 copy_page(entry->bitmap, io_ctl->cur); 687 io_ctl_unmap_page(io_ctl); 688 689 return 0; 690 } 691 692 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) 693 { 694 struct btrfs_block_group *block_group = ctl->block_group; 695 u64 max_bytes; 696 u64 bitmap_bytes; 697 u64 extent_bytes; 698 u64 size = block_group->length; 699 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 700 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 701 702 max_bitmaps = max_t(u64, max_bitmaps, 1); 703 704 if (ctl->total_bitmaps > max_bitmaps) 705 btrfs_err(block_group->fs_info, 706 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", 707 block_group->start, block_group->length, 708 ctl->total_bitmaps, ctl->unit, max_bitmaps, 709 bytes_per_bg); 710 ASSERT(ctl->total_bitmaps <= max_bitmaps); 711 712 /* 713 * We are trying to keep the total amount of memory used per 1GiB of 714 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation 715 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of 716 * bitmaps, we may end up using more memory than this. 717 */ 718 if (size < SZ_1G) 719 max_bytes = MAX_CACHE_BYTES_PER_GIG; 720 else 721 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); 722 723 bitmap_bytes = ctl->total_bitmaps * ctl->unit; 724 725 /* 726 * we want the extent entry threshold to always be at most 1/2 the max 727 * bytes we can have, or whatever is less than that. 728 */ 729 extent_bytes = max_bytes - bitmap_bytes; 730 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); 731 732 ctl->extents_thresh = 733 div_u64(extent_bytes, sizeof(struct btrfs_free_space)); 734 } 735 736 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 737 struct btrfs_free_space_ctl *ctl, 738 struct btrfs_path *path, u64 offset) 739 { 740 struct btrfs_fs_info *fs_info = root->fs_info; 741 struct btrfs_free_space_header *header; 742 struct extent_buffer *leaf; 743 struct btrfs_io_ctl io_ctl; 744 struct btrfs_key key; 745 struct btrfs_free_space *e, *n; 746 LIST_HEAD(bitmaps); 747 u64 num_entries; 748 u64 num_bitmaps; 749 u64 generation; 750 u8 type; 751 int ret = 0; 752 753 /* Nothing in the space cache, goodbye */ 754 if (!i_size_read(inode)) 755 return 0; 756 757 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 758 key.offset = offset; 759 key.type = 0; 760 761 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 762 if (ret < 0) 763 return 0; 764 else if (ret > 0) { 765 btrfs_release_path(path); 766 return 0; 767 } 768 769 ret = -1; 770 771 leaf = path->nodes[0]; 772 header = btrfs_item_ptr(leaf, path->slots[0], 773 struct btrfs_free_space_header); 774 num_entries = btrfs_free_space_entries(leaf, header); 775 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 776 generation = btrfs_free_space_generation(leaf, header); 777 btrfs_release_path(path); 778 779 if (!BTRFS_I(inode)->generation) { 780 btrfs_info(fs_info, 781 "the free space cache file (%llu) is invalid, skip it", 782 offset); 783 return 0; 784 } 785 786 if (BTRFS_I(inode)->generation != generation) { 787 btrfs_err(fs_info, 788 "free space inode generation (%llu) did not match free space cache generation (%llu)", 789 BTRFS_I(inode)->generation, generation); 790 return 0; 791 } 792 793 if (!num_entries) 794 return 0; 795 796 ret = io_ctl_init(&io_ctl, inode, 0); 797 if (ret) 798 return ret; 799 800 readahead_cache(inode); 801 802 ret = io_ctl_prepare_pages(&io_ctl, true); 803 if (ret) 804 goto out; 805 806 ret = io_ctl_check_crc(&io_ctl, 0); 807 if (ret) 808 goto free_cache; 809 810 ret = io_ctl_check_generation(&io_ctl, generation); 811 if (ret) 812 goto free_cache; 813 814 while (num_entries) { 815 e = kmem_cache_zalloc(btrfs_free_space_cachep, 816 GFP_NOFS); 817 if (!e) { 818 ret = -ENOMEM; 819 goto free_cache; 820 } 821 822 ret = io_ctl_read_entry(&io_ctl, e, &type); 823 if (ret) { 824 kmem_cache_free(btrfs_free_space_cachep, e); 825 goto free_cache; 826 } 827 828 if (!e->bytes) { 829 ret = -1; 830 kmem_cache_free(btrfs_free_space_cachep, e); 831 goto free_cache; 832 } 833 834 if (type == BTRFS_FREE_SPACE_EXTENT) { 835 spin_lock(&ctl->tree_lock); 836 ret = link_free_space(ctl, e); 837 spin_unlock(&ctl->tree_lock); 838 if (ret) { 839 btrfs_err(fs_info, 840 "Duplicate entries in free space cache, dumping"); 841 kmem_cache_free(btrfs_free_space_cachep, e); 842 goto free_cache; 843 } 844 } else { 845 ASSERT(num_bitmaps); 846 num_bitmaps--; 847 e->bitmap = kmem_cache_zalloc( 848 btrfs_free_space_bitmap_cachep, GFP_NOFS); 849 if (!e->bitmap) { 850 ret = -ENOMEM; 851 kmem_cache_free( 852 btrfs_free_space_cachep, e); 853 goto free_cache; 854 } 855 spin_lock(&ctl->tree_lock); 856 ret = link_free_space(ctl, e); 857 if (ret) { 858 spin_unlock(&ctl->tree_lock); 859 btrfs_err(fs_info, 860 "Duplicate entries in free space cache, dumping"); 861 kmem_cache_free(btrfs_free_space_cachep, e); 862 goto free_cache; 863 } 864 ctl->total_bitmaps++; 865 recalculate_thresholds(ctl); 866 spin_unlock(&ctl->tree_lock); 867 list_add_tail(&e->list, &bitmaps); 868 } 869 870 num_entries--; 871 } 872 873 io_ctl_unmap_page(&io_ctl); 874 875 /* 876 * We add the bitmaps at the end of the entries in order that 877 * the bitmap entries are added to the cache. 878 */ 879 list_for_each_entry_safe(e, n, &bitmaps, list) { 880 list_del_init(&e->list); 881 ret = io_ctl_read_bitmap(&io_ctl, e); 882 if (ret) 883 goto free_cache; 884 } 885 886 io_ctl_drop_pages(&io_ctl); 887 ret = 1; 888 out: 889 io_ctl_free(&io_ctl); 890 return ret; 891 free_cache: 892 io_ctl_drop_pages(&io_ctl); 893 894 spin_lock(&ctl->tree_lock); 895 __btrfs_remove_free_space_cache(ctl); 896 spin_unlock(&ctl->tree_lock); 897 goto out; 898 } 899 900 static int copy_free_space_cache(struct btrfs_block_group *block_group, 901 struct btrfs_free_space_ctl *ctl) 902 { 903 struct btrfs_free_space *info; 904 struct rb_node *n; 905 int ret = 0; 906 907 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { 908 info = rb_entry(n, struct btrfs_free_space, offset_index); 909 if (!info->bitmap) { 910 const u64 offset = info->offset; 911 const u64 bytes = info->bytes; 912 913 unlink_free_space(ctl, info, true); 914 spin_unlock(&ctl->tree_lock); 915 kmem_cache_free(btrfs_free_space_cachep, info); 916 ret = btrfs_add_free_space(block_group, offset, bytes); 917 spin_lock(&ctl->tree_lock); 918 } else { 919 u64 offset = info->offset; 920 u64 bytes = ctl->unit; 921 922 ret = search_bitmap(ctl, info, &offset, &bytes, false); 923 if (ret == 0) { 924 bitmap_clear_bits(ctl, info, offset, bytes, true); 925 spin_unlock(&ctl->tree_lock); 926 ret = btrfs_add_free_space(block_group, offset, 927 bytes); 928 spin_lock(&ctl->tree_lock); 929 } else { 930 free_bitmap(ctl, info); 931 ret = 0; 932 } 933 } 934 cond_resched_lock(&ctl->tree_lock); 935 } 936 return ret; 937 } 938 939 static struct lock_class_key btrfs_free_space_inode_key; 940 941 int load_free_space_cache(struct btrfs_block_group *block_group) 942 { 943 struct btrfs_fs_info *fs_info = block_group->fs_info; 944 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 945 struct btrfs_free_space_ctl tmp_ctl = {}; 946 struct inode *inode; 947 struct btrfs_path *path; 948 int ret = 0; 949 bool matched; 950 u64 used = block_group->used; 951 952 /* 953 * Because we could potentially discard our loaded free space, we want 954 * to load everything into a temporary structure first, and then if it's 955 * valid copy it all into the actual free space ctl. 956 */ 957 btrfs_init_free_space_ctl(block_group, &tmp_ctl); 958 959 /* 960 * If this block group has been marked to be cleared for one reason or 961 * another then we can't trust the on disk cache, so just return. 962 */ 963 spin_lock(&block_group->lock); 964 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 965 spin_unlock(&block_group->lock); 966 return 0; 967 } 968 spin_unlock(&block_group->lock); 969 970 path = btrfs_alloc_path(); 971 if (!path) 972 return 0; 973 path->search_commit_root = 1; 974 path->skip_locking = 1; 975 976 /* 977 * We must pass a path with search_commit_root set to btrfs_iget in 978 * order to avoid a deadlock when allocating extents for the tree root. 979 * 980 * When we are COWing an extent buffer from the tree root, when looking 981 * for a free extent, at extent-tree.c:find_free_extent(), we can find 982 * block group without its free space cache loaded. When we find one 983 * we must load its space cache which requires reading its free space 984 * cache's inode item from the root tree. If this inode item is located 985 * in the same leaf that we started COWing before, then we end up in 986 * deadlock on the extent buffer (trying to read lock it when we 987 * previously write locked it). 988 * 989 * It's safe to read the inode item using the commit root because 990 * block groups, once loaded, stay in memory forever (until they are 991 * removed) as well as their space caches once loaded. New block groups 992 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 993 * we will never try to read their inode item while the fs is mounted. 994 */ 995 inode = lookup_free_space_inode(block_group, path); 996 if (IS_ERR(inode)) { 997 btrfs_free_path(path); 998 return 0; 999 } 1000 1001 /* We may have converted the inode and made the cache invalid. */ 1002 spin_lock(&block_group->lock); 1003 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 1004 spin_unlock(&block_group->lock); 1005 btrfs_free_path(path); 1006 goto out; 1007 } 1008 spin_unlock(&block_group->lock); 1009 1010 /* 1011 * Reinitialize the class of struct inode's mapping->invalidate_lock for 1012 * free space inodes to prevent false positives related to locks for normal 1013 * inodes. 1014 */ 1015 lockdep_set_class(&(&inode->i_data)->invalidate_lock, 1016 &btrfs_free_space_inode_key); 1017 1018 ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, 1019 path, block_group->start); 1020 btrfs_free_path(path); 1021 if (ret <= 0) 1022 goto out; 1023 1024 matched = (tmp_ctl.free_space == (block_group->length - used - 1025 block_group->bytes_super)); 1026 1027 if (matched) { 1028 spin_lock(&tmp_ctl.tree_lock); 1029 ret = copy_free_space_cache(block_group, &tmp_ctl); 1030 spin_unlock(&tmp_ctl.tree_lock); 1031 /* 1032 * ret == 1 means we successfully loaded the free space cache, 1033 * so we need to re-set it here. 1034 */ 1035 if (ret == 0) 1036 ret = 1; 1037 } else { 1038 /* 1039 * We need to call the _locked variant so we don't try to update 1040 * the discard counters. 1041 */ 1042 spin_lock(&tmp_ctl.tree_lock); 1043 __btrfs_remove_free_space_cache(&tmp_ctl); 1044 spin_unlock(&tmp_ctl.tree_lock); 1045 btrfs_warn(fs_info, 1046 "block group %llu has wrong amount of free space", 1047 block_group->start); 1048 ret = -1; 1049 } 1050 out: 1051 if (ret < 0) { 1052 /* This cache is bogus, make sure it gets cleared */ 1053 spin_lock(&block_group->lock); 1054 block_group->disk_cache_state = BTRFS_DC_CLEAR; 1055 spin_unlock(&block_group->lock); 1056 ret = 0; 1057 1058 btrfs_warn(fs_info, 1059 "failed to load free space cache for block group %llu, rebuilding it now", 1060 block_group->start); 1061 } 1062 1063 spin_lock(&ctl->tree_lock); 1064 btrfs_discard_update_discardable(block_group); 1065 spin_unlock(&ctl->tree_lock); 1066 iput(inode); 1067 return ret; 1068 } 1069 1070 static noinline_for_stack 1071 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, 1072 struct btrfs_free_space_ctl *ctl, 1073 struct btrfs_block_group *block_group, 1074 int *entries, int *bitmaps, 1075 struct list_head *bitmap_list) 1076 { 1077 int ret; 1078 struct btrfs_free_cluster *cluster = NULL; 1079 struct btrfs_free_cluster *cluster_locked = NULL; 1080 struct rb_node *node = rb_first(&ctl->free_space_offset); 1081 struct btrfs_trim_range *trim_entry; 1082 1083 /* Get the cluster for this block_group if it exists */ 1084 if (block_group && !list_empty(&block_group->cluster_list)) { 1085 cluster = list_entry(block_group->cluster_list.next, 1086 struct btrfs_free_cluster, 1087 block_group_list); 1088 } 1089 1090 if (!node && cluster) { 1091 cluster_locked = cluster; 1092 spin_lock(&cluster_locked->lock); 1093 node = rb_first(&cluster->root); 1094 cluster = NULL; 1095 } 1096 1097 /* Write out the extent entries */ 1098 while (node) { 1099 struct btrfs_free_space *e; 1100 1101 e = rb_entry(node, struct btrfs_free_space, offset_index); 1102 *entries += 1; 1103 1104 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 1105 e->bitmap); 1106 if (ret) 1107 goto fail; 1108 1109 if (e->bitmap) { 1110 list_add_tail(&e->list, bitmap_list); 1111 *bitmaps += 1; 1112 } 1113 node = rb_next(node); 1114 if (!node && cluster) { 1115 node = rb_first(&cluster->root); 1116 cluster_locked = cluster; 1117 spin_lock(&cluster_locked->lock); 1118 cluster = NULL; 1119 } 1120 } 1121 if (cluster_locked) { 1122 spin_unlock(&cluster_locked->lock); 1123 cluster_locked = NULL; 1124 } 1125 1126 /* 1127 * Make sure we don't miss any range that was removed from our rbtree 1128 * because trimming is running. Otherwise after a umount+mount (or crash 1129 * after committing the transaction) we would leak free space and get 1130 * an inconsistent free space cache report from fsck. 1131 */ 1132 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 1133 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 1134 trim_entry->bytes, NULL); 1135 if (ret) 1136 goto fail; 1137 *entries += 1; 1138 } 1139 1140 return 0; 1141 fail: 1142 if (cluster_locked) 1143 spin_unlock(&cluster_locked->lock); 1144 return -ENOSPC; 1145 } 1146 1147 static noinline_for_stack int 1148 update_cache_item(struct btrfs_trans_handle *trans, 1149 struct btrfs_root *root, 1150 struct inode *inode, 1151 struct btrfs_path *path, u64 offset, 1152 int entries, int bitmaps) 1153 { 1154 struct btrfs_key key; 1155 struct btrfs_free_space_header *header; 1156 struct extent_buffer *leaf; 1157 int ret; 1158 1159 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1160 key.offset = offset; 1161 key.type = 0; 1162 1163 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1164 if (ret < 0) { 1165 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1166 EXTENT_DELALLOC, NULL); 1167 goto fail; 1168 } 1169 leaf = path->nodes[0]; 1170 if (ret > 0) { 1171 struct btrfs_key found_key; 1172 ASSERT(path->slots[0]); 1173 path->slots[0]--; 1174 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1175 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1176 found_key.offset != offset) { 1177 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 1178 inode->i_size - 1, EXTENT_DELALLOC, 1179 NULL); 1180 btrfs_release_path(path); 1181 goto fail; 1182 } 1183 } 1184 1185 BTRFS_I(inode)->generation = trans->transid; 1186 header = btrfs_item_ptr(leaf, path->slots[0], 1187 struct btrfs_free_space_header); 1188 btrfs_set_free_space_entries(leaf, header, entries); 1189 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1190 btrfs_set_free_space_generation(leaf, header, trans->transid); 1191 btrfs_mark_buffer_dirty(trans, leaf); 1192 btrfs_release_path(path); 1193 1194 return 0; 1195 1196 fail: 1197 return -1; 1198 } 1199 1200 static noinline_for_stack int write_pinned_extent_entries( 1201 struct btrfs_trans_handle *trans, 1202 struct btrfs_block_group *block_group, 1203 struct btrfs_io_ctl *io_ctl, 1204 int *entries) 1205 { 1206 u64 start, extent_start, extent_end, len; 1207 struct extent_io_tree *unpin = NULL; 1208 int ret; 1209 1210 if (!block_group) 1211 return 0; 1212 1213 /* 1214 * We want to add any pinned extents to our free space cache 1215 * so we don't leak the space 1216 * 1217 * We shouldn't have switched the pinned extents yet so this is the 1218 * right one 1219 */ 1220 unpin = &trans->transaction->pinned_extents; 1221 1222 start = block_group->start; 1223 1224 while (start < block_group->start + block_group->length) { 1225 if (!find_first_extent_bit(unpin, start, 1226 &extent_start, &extent_end, 1227 EXTENT_DIRTY, NULL)) 1228 return 0; 1229 1230 /* This pinned extent is out of our range */ 1231 if (extent_start >= block_group->start + block_group->length) 1232 return 0; 1233 1234 extent_start = max(extent_start, start); 1235 extent_end = min(block_group->start + block_group->length, 1236 extent_end + 1); 1237 len = extent_end - extent_start; 1238 1239 *entries += 1; 1240 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1241 if (ret) 1242 return -ENOSPC; 1243 1244 start = extent_end; 1245 } 1246 1247 return 0; 1248 } 1249 1250 static noinline_for_stack int 1251 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) 1252 { 1253 struct btrfs_free_space *entry, *next; 1254 int ret; 1255 1256 /* Write out the bitmaps */ 1257 list_for_each_entry_safe(entry, next, bitmap_list, list) { 1258 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1259 if (ret) 1260 return -ENOSPC; 1261 list_del_init(&entry->list); 1262 } 1263 1264 return 0; 1265 } 1266 1267 static int flush_dirty_cache(struct inode *inode) 1268 { 1269 int ret; 1270 1271 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1272 if (ret) 1273 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1274 EXTENT_DELALLOC, NULL); 1275 1276 return ret; 1277 } 1278 1279 static void noinline_for_stack 1280 cleanup_bitmap_list(struct list_head *bitmap_list) 1281 { 1282 struct btrfs_free_space *entry, *next; 1283 1284 list_for_each_entry_safe(entry, next, bitmap_list, list) 1285 list_del_init(&entry->list); 1286 } 1287 1288 static void noinline_for_stack 1289 cleanup_write_cache_enospc(struct inode *inode, 1290 struct btrfs_io_ctl *io_ctl, 1291 struct extent_state **cached_state) 1292 { 1293 io_ctl_drop_pages(io_ctl); 1294 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1295 cached_state); 1296 } 1297 1298 static int __btrfs_wait_cache_io(struct btrfs_root *root, 1299 struct btrfs_trans_handle *trans, 1300 struct btrfs_block_group *block_group, 1301 struct btrfs_io_ctl *io_ctl, 1302 struct btrfs_path *path, u64 offset) 1303 { 1304 int ret; 1305 struct inode *inode = io_ctl->inode; 1306 1307 if (!inode) 1308 return 0; 1309 1310 /* Flush the dirty pages in the cache file. */ 1311 ret = flush_dirty_cache(inode); 1312 if (ret) 1313 goto out; 1314 1315 /* Update the cache item to tell everyone this cache file is valid. */ 1316 ret = update_cache_item(trans, root, inode, path, offset, 1317 io_ctl->entries, io_ctl->bitmaps); 1318 out: 1319 if (ret) { 1320 invalidate_inode_pages2(inode->i_mapping); 1321 BTRFS_I(inode)->generation = 0; 1322 if (block_group) 1323 btrfs_debug(root->fs_info, 1324 "failed to write free space cache for block group %llu error %d", 1325 block_group->start, ret); 1326 } 1327 btrfs_update_inode(trans, BTRFS_I(inode)); 1328 1329 if (block_group) { 1330 /* the dirty list is protected by the dirty_bgs_lock */ 1331 spin_lock(&trans->transaction->dirty_bgs_lock); 1332 1333 /* the disk_cache_state is protected by the block group lock */ 1334 spin_lock(&block_group->lock); 1335 1336 /* 1337 * only mark this as written if we didn't get put back on 1338 * the dirty list while waiting for IO. Otherwise our 1339 * cache state won't be right, and we won't get written again 1340 */ 1341 if (!ret && list_empty(&block_group->dirty_list)) 1342 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1343 else if (ret) 1344 block_group->disk_cache_state = BTRFS_DC_ERROR; 1345 1346 spin_unlock(&block_group->lock); 1347 spin_unlock(&trans->transaction->dirty_bgs_lock); 1348 io_ctl->inode = NULL; 1349 iput(inode); 1350 } 1351 1352 return ret; 1353 1354 } 1355 1356 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 1357 struct btrfs_block_group *block_group, 1358 struct btrfs_path *path) 1359 { 1360 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, 1361 block_group, &block_group->io_ctl, 1362 path, block_group->start); 1363 } 1364 1365 /* 1366 * Write out cached info to an inode. 1367 * 1368 * @inode: freespace inode we are writing out 1369 * @ctl: free space cache we are going to write out 1370 * @block_group: block_group for this cache if it belongs to a block_group 1371 * @io_ctl: holds context for the io 1372 * @trans: the trans handle 1373 * 1374 * This function writes out a free space cache struct to disk for quick recovery 1375 * on mount. This will return 0 if it was successful in writing the cache out, 1376 * or an errno if it was not. 1377 */ 1378 static int __btrfs_write_out_cache(struct inode *inode, 1379 struct btrfs_free_space_ctl *ctl, 1380 struct btrfs_block_group *block_group, 1381 struct btrfs_io_ctl *io_ctl, 1382 struct btrfs_trans_handle *trans) 1383 { 1384 struct extent_state *cached_state = NULL; 1385 LIST_HEAD(bitmap_list); 1386 int entries = 0; 1387 int bitmaps = 0; 1388 int ret; 1389 int must_iput = 0; 1390 1391 if (!i_size_read(inode)) 1392 return -EIO; 1393 1394 WARN_ON(io_ctl->pages); 1395 ret = io_ctl_init(io_ctl, inode, 1); 1396 if (ret) 1397 return ret; 1398 1399 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { 1400 down_write(&block_group->data_rwsem); 1401 spin_lock(&block_group->lock); 1402 if (block_group->delalloc_bytes) { 1403 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1404 spin_unlock(&block_group->lock); 1405 up_write(&block_group->data_rwsem); 1406 BTRFS_I(inode)->generation = 0; 1407 ret = 0; 1408 must_iput = 1; 1409 goto out; 1410 } 1411 spin_unlock(&block_group->lock); 1412 } 1413 1414 /* Lock all pages first so we can lock the extent safely. */ 1415 ret = io_ctl_prepare_pages(io_ctl, false); 1416 if (ret) 1417 goto out_unlock; 1418 1419 lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1420 &cached_state); 1421 1422 io_ctl_set_generation(io_ctl, trans->transid); 1423 1424 mutex_lock(&ctl->cache_writeout_mutex); 1425 /* Write out the extent entries in the free space cache */ 1426 spin_lock(&ctl->tree_lock); 1427 ret = write_cache_extent_entries(io_ctl, ctl, 1428 block_group, &entries, &bitmaps, 1429 &bitmap_list); 1430 if (ret) 1431 goto out_nospc_locked; 1432 1433 /* 1434 * Some spaces that are freed in the current transaction are pinned, 1435 * they will be added into free space cache after the transaction is 1436 * committed, we shouldn't lose them. 1437 * 1438 * If this changes while we are working we'll get added back to 1439 * the dirty list and redo it. No locking needed 1440 */ 1441 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); 1442 if (ret) 1443 goto out_nospc_locked; 1444 1445 /* 1446 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1447 * locked while doing it because a concurrent trim can be manipulating 1448 * or freeing the bitmap. 1449 */ 1450 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1451 spin_unlock(&ctl->tree_lock); 1452 mutex_unlock(&ctl->cache_writeout_mutex); 1453 if (ret) 1454 goto out_nospc; 1455 1456 /* Zero out the rest of the pages just to make sure */ 1457 io_ctl_zero_remaining_pages(io_ctl); 1458 1459 /* Everything is written out, now we dirty the pages in the file. */ 1460 ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 1461 io_ctl->num_pages, 0, i_size_read(inode), 1462 &cached_state, false); 1463 if (ret) 1464 goto out_nospc; 1465 1466 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1467 up_write(&block_group->data_rwsem); 1468 /* 1469 * Release the pages and unlock the extent, we will flush 1470 * them out later 1471 */ 1472 io_ctl_drop_pages(io_ctl); 1473 io_ctl_free(io_ctl); 1474 1475 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1476 &cached_state); 1477 1478 /* 1479 * at this point the pages are under IO and we're happy, 1480 * The caller is responsible for waiting on them and updating 1481 * the cache and the inode 1482 */ 1483 io_ctl->entries = entries; 1484 io_ctl->bitmaps = bitmaps; 1485 1486 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); 1487 if (ret) 1488 goto out; 1489 1490 return 0; 1491 1492 out_nospc_locked: 1493 cleanup_bitmap_list(&bitmap_list); 1494 spin_unlock(&ctl->tree_lock); 1495 mutex_unlock(&ctl->cache_writeout_mutex); 1496 1497 out_nospc: 1498 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1499 1500 out_unlock: 1501 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1502 up_write(&block_group->data_rwsem); 1503 1504 out: 1505 io_ctl->inode = NULL; 1506 io_ctl_free(io_ctl); 1507 if (ret) { 1508 invalidate_inode_pages2(inode->i_mapping); 1509 BTRFS_I(inode)->generation = 0; 1510 } 1511 btrfs_update_inode(trans, BTRFS_I(inode)); 1512 if (must_iput) 1513 iput(inode); 1514 return ret; 1515 } 1516 1517 int btrfs_write_out_cache(struct btrfs_trans_handle *trans, 1518 struct btrfs_block_group *block_group, 1519 struct btrfs_path *path) 1520 { 1521 struct btrfs_fs_info *fs_info = trans->fs_info; 1522 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1523 struct inode *inode; 1524 int ret = 0; 1525 1526 spin_lock(&block_group->lock); 1527 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1528 spin_unlock(&block_group->lock); 1529 return 0; 1530 } 1531 spin_unlock(&block_group->lock); 1532 1533 inode = lookup_free_space_inode(block_group, path); 1534 if (IS_ERR(inode)) 1535 return 0; 1536 1537 ret = __btrfs_write_out_cache(inode, ctl, block_group, 1538 &block_group->io_ctl, trans); 1539 if (ret) { 1540 btrfs_debug(fs_info, 1541 "failed to write free space cache for block group %llu error %d", 1542 block_group->start, ret); 1543 spin_lock(&block_group->lock); 1544 block_group->disk_cache_state = BTRFS_DC_ERROR; 1545 spin_unlock(&block_group->lock); 1546 1547 block_group->io_ctl.inode = NULL; 1548 iput(inode); 1549 } 1550 1551 /* 1552 * if ret == 0 the caller is expected to call btrfs_wait_cache_io 1553 * to wait for IO and put the inode 1554 */ 1555 1556 return ret; 1557 } 1558 1559 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1560 u64 offset) 1561 { 1562 ASSERT(offset >= bitmap_start); 1563 offset -= bitmap_start; 1564 return (unsigned long)(div_u64(offset, unit)); 1565 } 1566 1567 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1568 { 1569 return (unsigned long)(div_u64(bytes, unit)); 1570 } 1571 1572 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1573 u64 offset) 1574 { 1575 u64 bitmap_start; 1576 u64 bytes_per_bitmap; 1577 1578 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1579 bitmap_start = offset - ctl->start; 1580 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1581 bitmap_start *= bytes_per_bitmap; 1582 bitmap_start += ctl->start; 1583 1584 return bitmap_start; 1585 } 1586 1587 static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, 1588 struct btrfs_free_cluster *cluster, 1589 struct btrfs_free_space *new_entry) 1590 { 1591 struct rb_root *root; 1592 struct rb_node **p; 1593 struct rb_node *parent = NULL; 1594 1595 lockdep_assert_held(&ctl->tree_lock); 1596 1597 if (cluster) { 1598 lockdep_assert_held(&cluster->lock); 1599 root = &cluster->root; 1600 } else { 1601 root = &ctl->free_space_offset; 1602 } 1603 1604 p = &root->rb_node; 1605 1606 while (*p) { 1607 struct btrfs_free_space *info; 1608 1609 parent = *p; 1610 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1611 1612 if (new_entry->offset < info->offset) { 1613 p = &(*p)->rb_left; 1614 } else if (new_entry->offset > info->offset) { 1615 p = &(*p)->rb_right; 1616 } else { 1617 /* 1618 * we could have a bitmap entry and an extent entry 1619 * share the same offset. If this is the case, we want 1620 * the extent entry to always be found first if we do a 1621 * linear search through the tree, since we want to have 1622 * the quickest allocation time, and allocating from an 1623 * extent is faster than allocating from a bitmap. So 1624 * if we're inserting a bitmap and we find an entry at 1625 * this offset, we want to go right, or after this entry 1626 * logically. If we are inserting an extent and we've 1627 * found a bitmap, we want to go left, or before 1628 * logically. 1629 */ 1630 if (new_entry->bitmap) { 1631 if (info->bitmap) { 1632 WARN_ON_ONCE(1); 1633 return -EEXIST; 1634 } 1635 p = &(*p)->rb_right; 1636 } else { 1637 if (!info->bitmap) { 1638 WARN_ON_ONCE(1); 1639 return -EEXIST; 1640 } 1641 p = &(*p)->rb_left; 1642 } 1643 } 1644 } 1645 1646 rb_link_node(&new_entry->offset_index, parent, p); 1647 rb_insert_color(&new_entry->offset_index, root); 1648 1649 return 0; 1650 } 1651 1652 /* 1653 * This is a little subtle. We *only* have ->max_extent_size set if we actually 1654 * searched through the bitmap and figured out the largest ->max_extent_size, 1655 * otherwise it's 0. In the case that it's 0 we don't want to tell the 1656 * allocator the wrong thing, we want to use the actual real max_extent_size 1657 * we've found already if it's larger, or we want to use ->bytes. 1658 * 1659 * This matters because find_free_space() will skip entries who's ->bytes is 1660 * less than the required bytes. So if we didn't search down this bitmap, we 1661 * may pick some previous entry that has a smaller ->max_extent_size than we 1662 * have. For example, assume we have two entries, one that has 1663 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set 1664 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will 1665 * call into find_free_space(), and return with max_extent_size == 4K, because 1666 * that first bitmap entry had ->max_extent_size set, but the second one did 1667 * not. If instead we returned 8K we'd come in searching for 8K, and find the 1668 * 8K contiguous range. 1669 * 1670 * Consider the other case, we have 2 8K chunks in that second entry and still 1671 * don't have ->max_extent_size set. We'll return 16K, and the next time the 1672 * allocator comes in it'll fully search our second bitmap, and this time it'll 1673 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the 1674 * right allocation the next loop through. 1675 */ 1676 static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) 1677 { 1678 if (entry->bitmap && entry->max_extent_size) 1679 return entry->max_extent_size; 1680 return entry->bytes; 1681 } 1682 1683 /* 1684 * We want the largest entry to be leftmost, so this is inverted from what you'd 1685 * normally expect. 1686 */ 1687 static bool entry_less(struct rb_node *node, const struct rb_node *parent) 1688 { 1689 const struct btrfs_free_space *entry, *exist; 1690 1691 entry = rb_entry(node, struct btrfs_free_space, bytes_index); 1692 exist = rb_entry(parent, struct btrfs_free_space, bytes_index); 1693 return get_max_extent_size(exist) < get_max_extent_size(entry); 1694 } 1695 1696 /* 1697 * searches the tree for the given offset. 1698 * 1699 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1700 * want a section that has at least bytes size and comes at or after the given 1701 * offset. 1702 */ 1703 static struct btrfs_free_space * 1704 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1705 u64 offset, int bitmap_only, int fuzzy) 1706 { 1707 struct rb_node *n = ctl->free_space_offset.rb_node; 1708 struct btrfs_free_space *entry = NULL, *prev = NULL; 1709 1710 lockdep_assert_held(&ctl->tree_lock); 1711 1712 /* find entry that is closest to the 'offset' */ 1713 while (n) { 1714 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1715 prev = entry; 1716 1717 if (offset < entry->offset) 1718 n = n->rb_left; 1719 else if (offset > entry->offset) 1720 n = n->rb_right; 1721 else 1722 break; 1723 1724 entry = NULL; 1725 } 1726 1727 if (bitmap_only) { 1728 if (!entry) 1729 return NULL; 1730 if (entry->bitmap) 1731 return entry; 1732 1733 /* 1734 * bitmap entry and extent entry may share same offset, 1735 * in that case, bitmap entry comes after extent entry. 1736 */ 1737 n = rb_next(n); 1738 if (!n) 1739 return NULL; 1740 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1741 if (entry->offset != offset) 1742 return NULL; 1743 1744 WARN_ON(!entry->bitmap); 1745 return entry; 1746 } else if (entry) { 1747 if (entry->bitmap) { 1748 /* 1749 * if previous extent entry covers the offset, 1750 * we should return it instead of the bitmap entry 1751 */ 1752 n = rb_prev(&entry->offset_index); 1753 if (n) { 1754 prev = rb_entry(n, struct btrfs_free_space, 1755 offset_index); 1756 if (!prev->bitmap && 1757 prev->offset + prev->bytes > offset) 1758 entry = prev; 1759 } 1760 } 1761 return entry; 1762 } 1763 1764 if (!prev) 1765 return NULL; 1766 1767 /* find last entry before the 'offset' */ 1768 entry = prev; 1769 if (entry->offset > offset) { 1770 n = rb_prev(&entry->offset_index); 1771 if (n) { 1772 entry = rb_entry(n, struct btrfs_free_space, 1773 offset_index); 1774 ASSERT(entry->offset <= offset); 1775 } else { 1776 if (fuzzy) 1777 return entry; 1778 else 1779 return NULL; 1780 } 1781 } 1782 1783 if (entry->bitmap) { 1784 n = rb_prev(&entry->offset_index); 1785 if (n) { 1786 prev = rb_entry(n, struct btrfs_free_space, 1787 offset_index); 1788 if (!prev->bitmap && 1789 prev->offset + prev->bytes > offset) 1790 return prev; 1791 } 1792 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1793 return entry; 1794 } else if (entry->offset + entry->bytes > offset) 1795 return entry; 1796 1797 if (!fuzzy) 1798 return NULL; 1799 1800 while (1) { 1801 n = rb_next(&entry->offset_index); 1802 if (!n) 1803 return NULL; 1804 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1805 if (entry->bitmap) { 1806 if (entry->offset + BITS_PER_BITMAP * 1807 ctl->unit > offset) 1808 break; 1809 } else { 1810 if (entry->offset + entry->bytes > offset) 1811 break; 1812 } 1813 } 1814 return entry; 1815 } 1816 1817 static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1818 struct btrfs_free_space *info, 1819 bool update_stat) 1820 { 1821 lockdep_assert_held(&ctl->tree_lock); 1822 1823 rb_erase(&info->offset_index, &ctl->free_space_offset); 1824 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1825 ctl->free_extents--; 1826 1827 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1828 ctl->discardable_extents[BTRFS_STAT_CURR]--; 1829 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; 1830 } 1831 1832 if (update_stat) 1833 ctl->free_space -= info->bytes; 1834 } 1835 1836 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1837 struct btrfs_free_space *info) 1838 { 1839 int ret = 0; 1840 1841 lockdep_assert_held(&ctl->tree_lock); 1842 1843 ASSERT(info->bytes || info->bitmap); 1844 ret = tree_insert_offset(ctl, NULL, info); 1845 if (ret) 1846 return ret; 1847 1848 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1849 1850 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1851 ctl->discardable_extents[BTRFS_STAT_CURR]++; 1852 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 1853 } 1854 1855 ctl->free_space += info->bytes; 1856 ctl->free_extents++; 1857 return ret; 1858 } 1859 1860 static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, 1861 struct btrfs_free_space *info) 1862 { 1863 ASSERT(info->bitmap); 1864 1865 /* 1866 * If our entry is empty it's because we're on a cluster and we don't 1867 * want to re-link it into our ctl bytes index. 1868 */ 1869 if (RB_EMPTY_NODE(&info->bytes_index)) 1870 return; 1871 1872 lockdep_assert_held(&ctl->tree_lock); 1873 1874 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1875 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1876 } 1877 1878 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1879 struct btrfs_free_space *info, 1880 u64 offset, u64 bytes, bool update_stat) 1881 { 1882 unsigned long start, count, end; 1883 int extent_delta = -1; 1884 1885 start = offset_to_bit(info->offset, ctl->unit, offset); 1886 count = bytes_to_bits(bytes, ctl->unit); 1887 end = start + count; 1888 ASSERT(end <= BITS_PER_BITMAP); 1889 1890 bitmap_clear(info->bitmap, start, count); 1891 1892 info->bytes -= bytes; 1893 if (info->max_extent_size > ctl->unit) 1894 info->max_extent_size = 0; 1895 1896 relink_bitmap_entry(ctl, info); 1897 1898 if (start && test_bit(start - 1, info->bitmap)) 1899 extent_delta++; 1900 1901 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1902 extent_delta++; 1903 1904 info->bitmap_extents += extent_delta; 1905 if (!btrfs_free_space_trimmed(info)) { 1906 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1907 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 1908 } 1909 1910 if (update_stat) 1911 ctl->free_space -= bytes; 1912 } 1913 1914 static void btrfs_bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1915 struct btrfs_free_space *info, u64 offset, 1916 u64 bytes) 1917 { 1918 unsigned long start, count, end; 1919 int extent_delta = 1; 1920 1921 start = offset_to_bit(info->offset, ctl->unit, offset); 1922 count = bytes_to_bits(bytes, ctl->unit); 1923 end = start + count; 1924 ASSERT(end <= BITS_PER_BITMAP); 1925 1926 bitmap_set(info->bitmap, start, count); 1927 1928 /* 1929 * We set some bytes, we have no idea what the max extent size is 1930 * anymore. 1931 */ 1932 info->max_extent_size = 0; 1933 info->bytes += bytes; 1934 ctl->free_space += bytes; 1935 1936 relink_bitmap_entry(ctl, info); 1937 1938 if (start && test_bit(start - 1, info->bitmap)) 1939 extent_delta--; 1940 1941 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1942 extent_delta--; 1943 1944 info->bitmap_extents += extent_delta; 1945 if (!btrfs_free_space_trimmed(info)) { 1946 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1947 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; 1948 } 1949 } 1950 1951 /* 1952 * If we can not find suitable extent, we will use bytes to record 1953 * the size of the max extent. 1954 */ 1955 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1956 struct btrfs_free_space *bitmap_info, u64 *offset, 1957 u64 *bytes, bool for_alloc) 1958 { 1959 unsigned long found_bits = 0; 1960 unsigned long max_bits = 0; 1961 unsigned long bits, i; 1962 unsigned long next_zero; 1963 unsigned long extent_bits; 1964 1965 /* 1966 * Skip searching the bitmap if we don't have a contiguous section that 1967 * is large enough for this allocation. 1968 */ 1969 if (for_alloc && 1970 bitmap_info->max_extent_size && 1971 bitmap_info->max_extent_size < *bytes) { 1972 *bytes = bitmap_info->max_extent_size; 1973 return -1; 1974 } 1975 1976 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1977 max_t(u64, *offset, bitmap_info->offset)); 1978 bits = bytes_to_bits(*bytes, ctl->unit); 1979 1980 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1981 if (for_alloc && bits == 1) { 1982 found_bits = 1; 1983 break; 1984 } 1985 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1986 BITS_PER_BITMAP, i); 1987 extent_bits = next_zero - i; 1988 if (extent_bits >= bits) { 1989 found_bits = extent_bits; 1990 break; 1991 } else if (extent_bits > max_bits) { 1992 max_bits = extent_bits; 1993 } 1994 i = next_zero; 1995 } 1996 1997 if (found_bits) { 1998 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1999 *bytes = (u64)(found_bits) * ctl->unit; 2000 return 0; 2001 } 2002 2003 *bytes = (u64)(max_bits) * ctl->unit; 2004 bitmap_info->max_extent_size = *bytes; 2005 relink_bitmap_entry(ctl, bitmap_info); 2006 return -1; 2007 } 2008 2009 /* Cache the size of the max extent in bytes */ 2010 static struct btrfs_free_space * 2011 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2012 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2013 { 2014 struct btrfs_free_space *entry; 2015 struct rb_node *node; 2016 u64 tmp; 2017 u64 align_off; 2018 int ret; 2019 2020 if (!ctl->free_space_offset.rb_node) 2021 goto out; 2022 again: 2023 if (use_bytes_index) { 2024 node = rb_first_cached(&ctl->free_space_bytes); 2025 } else { 2026 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2027 0, 1); 2028 if (!entry) 2029 goto out; 2030 node = &entry->offset_index; 2031 } 2032 2033 for (; node; node = rb_next(node)) { 2034 if (use_bytes_index) 2035 entry = rb_entry(node, struct btrfs_free_space, 2036 bytes_index); 2037 else 2038 entry = rb_entry(node, struct btrfs_free_space, 2039 offset_index); 2040 2041 /* 2042 * If we are using the bytes index then all subsequent entries 2043 * in this tree are going to be < bytes, so simply set the max 2044 * extent size and exit the loop. 2045 * 2046 * If we're using the offset index then we need to keep going 2047 * through the rest of the tree. 2048 */ 2049 if (entry->bytes < *bytes) { 2050 *max_extent_size = max(get_max_extent_size(entry), 2051 *max_extent_size); 2052 if (use_bytes_index) 2053 break; 2054 continue; 2055 } 2056 2057 /* make sure the space returned is big enough 2058 * to match our requested alignment 2059 */ 2060 if (*bytes >= align) { 2061 tmp = entry->offset - ctl->start + align - 1; 2062 tmp = div64_u64(tmp, align); 2063 tmp = tmp * align + ctl->start; 2064 align_off = tmp - entry->offset; 2065 } else { 2066 align_off = 0; 2067 tmp = entry->offset; 2068 } 2069 2070 /* 2071 * We don't break here if we're using the bytes index because we 2072 * may have another entry that has the correct alignment that is 2073 * the right size, so we don't want to miss that possibility. 2074 * At worst this adds another loop through the logic, but if we 2075 * broke here we could prematurely ENOSPC. 2076 */ 2077 if (entry->bytes < *bytes + align_off) { 2078 *max_extent_size = max(get_max_extent_size(entry), 2079 *max_extent_size); 2080 continue; 2081 } 2082 2083 if (entry->bitmap) { 2084 struct rb_node *old_next = rb_next(node); 2085 u64 size = *bytes; 2086 2087 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2088 if (!ret) { 2089 *offset = tmp; 2090 *bytes = size; 2091 return entry; 2092 } else { 2093 *max_extent_size = 2094 max(get_max_extent_size(entry), 2095 *max_extent_size); 2096 } 2097 2098 /* 2099 * The bitmap may have gotten re-arranged in the space 2100 * index here because the max_extent_size may have been 2101 * updated. Start from the beginning again if this 2102 * happened. 2103 */ 2104 if (use_bytes_index && old_next != rb_next(node)) 2105 goto again; 2106 continue; 2107 } 2108 2109 *offset = tmp; 2110 *bytes = entry->bytes - align_off; 2111 return entry; 2112 } 2113 out: 2114 return NULL; 2115 } 2116 2117 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2118 struct btrfs_free_space *info, u64 offset) 2119 { 2120 info->offset = offset_to_bitmap(ctl, offset); 2121 info->bytes = 0; 2122 info->bitmap_extents = 0; 2123 INIT_LIST_HEAD(&info->list); 2124 link_free_space(ctl, info); 2125 ctl->total_bitmaps++; 2126 recalculate_thresholds(ctl); 2127 } 2128 2129 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2130 struct btrfs_free_space *bitmap_info) 2131 { 2132 /* 2133 * Normally when this is called, the bitmap is completely empty. However, 2134 * if we are blowing up the free space cache for one reason or another 2135 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2136 * we may leave stats on the table. 2137 */ 2138 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2139 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2140 bitmap_info->bitmap_extents; 2141 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2142 2143 } 2144 unlink_free_space(ctl, bitmap_info, true); 2145 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2146 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2147 ctl->total_bitmaps--; 2148 recalculate_thresholds(ctl); 2149 } 2150 2151 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2152 struct btrfs_free_space *bitmap_info, 2153 u64 *offset, u64 *bytes) 2154 { 2155 u64 end; 2156 u64 search_start, search_bytes; 2157 int ret; 2158 2159 again: 2160 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 2161 2162 /* 2163 * We need to search for bits in this bitmap. We could only cover some 2164 * of the extent in this bitmap thanks to how we add space, so we need 2165 * to search for as much as it as we can and clear that amount, and then 2166 * go searching for the next bit. 2167 */ 2168 search_start = *offset; 2169 search_bytes = ctl->unit; 2170 search_bytes = min(search_bytes, end - search_start + 1); 2171 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2172 false); 2173 if (ret < 0 || search_start != *offset) 2174 return -EINVAL; 2175 2176 /* We may have found more bits than what we need */ 2177 search_bytes = min(search_bytes, *bytes); 2178 2179 /* Cannot clear past the end of the bitmap */ 2180 search_bytes = min(search_bytes, end - search_start + 1); 2181 2182 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2183 *offset += search_bytes; 2184 *bytes -= search_bytes; 2185 2186 if (*bytes) { 2187 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2188 if (!bitmap_info->bytes) 2189 free_bitmap(ctl, bitmap_info); 2190 2191 /* 2192 * no entry after this bitmap, but we still have bytes to 2193 * remove, so something has gone wrong. 2194 */ 2195 if (!next) 2196 return -EINVAL; 2197 2198 bitmap_info = rb_entry(next, struct btrfs_free_space, 2199 offset_index); 2200 2201 /* 2202 * if the next entry isn't a bitmap we need to return to let the 2203 * extent stuff do its work. 2204 */ 2205 if (!bitmap_info->bitmap) 2206 return -EAGAIN; 2207 2208 /* 2209 * Ok the next item is a bitmap, but it may not actually hold 2210 * the information for the rest of this free space stuff, so 2211 * look for it, and if we don't find it return so we can try 2212 * everything over again. 2213 */ 2214 search_start = *offset; 2215 search_bytes = ctl->unit; 2216 ret = search_bitmap(ctl, bitmap_info, &search_start, 2217 &search_bytes, false); 2218 if (ret < 0 || search_start != *offset) 2219 return -EAGAIN; 2220 2221 goto again; 2222 } else if (!bitmap_info->bytes) 2223 free_bitmap(ctl, bitmap_info); 2224 2225 return 0; 2226 } 2227 2228 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2229 struct btrfs_free_space *info, u64 offset, 2230 u64 bytes, enum btrfs_trim_state trim_state) 2231 { 2232 u64 bytes_to_set = 0; 2233 u64 end; 2234 2235 /* 2236 * This is a tradeoff to make bitmap trim state minimal. We mark the 2237 * whole bitmap untrimmed if at any point we add untrimmed regions. 2238 */ 2239 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2240 if (btrfs_free_space_trimmed(info)) { 2241 ctl->discardable_extents[BTRFS_STAT_CURR] += 2242 info->bitmap_extents; 2243 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2244 } 2245 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2246 } 2247 2248 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2249 2250 bytes_to_set = min(end - offset, bytes); 2251 2252 btrfs_bitmap_set_bits(ctl, info, offset, bytes_to_set); 2253 2254 return bytes_to_set; 2255 2256 } 2257 2258 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2259 struct btrfs_free_space *info) 2260 { 2261 struct btrfs_block_group *block_group = ctl->block_group; 2262 struct btrfs_fs_info *fs_info = block_group->fs_info; 2263 bool forced = false; 2264 2265 #ifdef CONFIG_BTRFS_DEBUG 2266 if (btrfs_should_fragment_free_space(block_group)) 2267 forced = true; 2268 #endif 2269 2270 /* This is a way to reclaim large regions from the bitmaps. */ 2271 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) 2272 return false; 2273 2274 /* 2275 * If we are below the extents threshold then we can add this as an 2276 * extent, and don't have to deal with the bitmap 2277 */ 2278 if (!forced && ctl->free_extents < ctl->extents_thresh) { 2279 /* 2280 * If this block group has some small extents we don't want to 2281 * use up all of our free slots in the cache with them, we want 2282 * to reserve them to larger extents, however if we have plenty 2283 * of cache left then go ahead an dadd them, no sense in adding 2284 * the overhead of a bitmap if we don't have to. 2285 */ 2286 if (info->bytes <= fs_info->sectorsize * 8) { 2287 if (ctl->free_extents * 3 <= ctl->extents_thresh) 2288 return false; 2289 } else { 2290 return false; 2291 } 2292 } 2293 2294 /* 2295 * The original block groups from mkfs can be really small, like 8 2296 * megabytes, so don't bother with a bitmap for those entries. However 2297 * some block groups can be smaller than what a bitmap would cover but 2298 * are still large enough that they could overflow the 32k memory limit, 2299 * so allow those block groups to still be allowed to have a bitmap 2300 * entry. 2301 */ 2302 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) 2303 return false; 2304 2305 return true; 2306 } 2307 2308 static const struct btrfs_free_space_op free_space_op = { 2309 .use_bitmap = use_bitmap, 2310 }; 2311 2312 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 2313 struct btrfs_free_space *info) 2314 { 2315 struct btrfs_free_space *bitmap_info; 2316 struct btrfs_block_group *block_group = NULL; 2317 int added = 0; 2318 u64 bytes, offset, bytes_added; 2319 enum btrfs_trim_state trim_state; 2320 int ret; 2321 2322 bytes = info->bytes; 2323 offset = info->offset; 2324 trim_state = info->trim_state; 2325 2326 if (!ctl->op->use_bitmap(ctl, info)) 2327 return 0; 2328 2329 if (ctl->op == &free_space_op) 2330 block_group = ctl->block_group; 2331 again: 2332 /* 2333 * Since we link bitmaps right into the cluster we need to see if we 2334 * have a cluster here, and if so and it has our bitmap we need to add 2335 * the free space to that bitmap. 2336 */ 2337 if (block_group && !list_empty(&block_group->cluster_list)) { 2338 struct btrfs_free_cluster *cluster; 2339 struct rb_node *node; 2340 struct btrfs_free_space *entry; 2341 2342 cluster = list_entry(block_group->cluster_list.next, 2343 struct btrfs_free_cluster, 2344 block_group_list); 2345 spin_lock(&cluster->lock); 2346 node = rb_first(&cluster->root); 2347 if (!node) { 2348 spin_unlock(&cluster->lock); 2349 goto no_cluster_bitmap; 2350 } 2351 2352 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2353 if (!entry->bitmap) { 2354 spin_unlock(&cluster->lock); 2355 goto no_cluster_bitmap; 2356 } 2357 2358 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2359 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2360 bytes, trim_state); 2361 bytes -= bytes_added; 2362 offset += bytes_added; 2363 } 2364 spin_unlock(&cluster->lock); 2365 if (!bytes) { 2366 ret = 1; 2367 goto out; 2368 } 2369 } 2370 2371 no_cluster_bitmap: 2372 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2373 1, 0); 2374 if (!bitmap_info) { 2375 ASSERT(added == 0); 2376 goto new_bitmap; 2377 } 2378 2379 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2380 trim_state); 2381 bytes -= bytes_added; 2382 offset += bytes_added; 2383 added = 0; 2384 2385 if (!bytes) { 2386 ret = 1; 2387 goto out; 2388 } else 2389 goto again; 2390 2391 new_bitmap: 2392 if (info && info->bitmap) { 2393 add_new_bitmap(ctl, info, offset); 2394 added = 1; 2395 info = NULL; 2396 goto again; 2397 } else { 2398 spin_unlock(&ctl->tree_lock); 2399 2400 /* no pre-allocated info, allocate a new one */ 2401 if (!info) { 2402 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2403 GFP_NOFS); 2404 if (!info) { 2405 spin_lock(&ctl->tree_lock); 2406 ret = -ENOMEM; 2407 goto out; 2408 } 2409 } 2410 2411 /* allocate the bitmap */ 2412 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2413 GFP_NOFS); 2414 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2415 spin_lock(&ctl->tree_lock); 2416 if (!info->bitmap) { 2417 ret = -ENOMEM; 2418 goto out; 2419 } 2420 goto again; 2421 } 2422 2423 out: 2424 if (info) { 2425 if (info->bitmap) 2426 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2427 info->bitmap); 2428 kmem_cache_free(btrfs_free_space_cachep, info); 2429 } 2430 2431 return ret; 2432 } 2433 2434 /* 2435 * Free space merging rules: 2436 * 1) Merge trimmed areas together 2437 * 2) Let untrimmed areas coalesce with trimmed areas 2438 * 3) Always pull neighboring regions from bitmaps 2439 * 2440 * The above rules are for when we merge free space based on btrfs_trim_state. 2441 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2442 * same reason: to promote larger extent regions which makes life easier for 2443 * find_free_extent(). Rule 2 enables coalescing based on the common path 2444 * being returning free space from btrfs_finish_extent_commit(). So when free 2445 * space is trimmed, it will prevent aggregating trimmed new region and 2446 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2447 * and provide find_free_extent() with the largest extents possible hoping for 2448 * the reuse path. 2449 */ 2450 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2451 struct btrfs_free_space *info, bool update_stat) 2452 { 2453 struct btrfs_free_space *left_info = NULL; 2454 struct btrfs_free_space *right_info; 2455 bool merged = false; 2456 u64 offset = info->offset; 2457 u64 bytes = info->bytes; 2458 const bool is_trimmed = btrfs_free_space_trimmed(info); 2459 struct rb_node *right_prev = NULL; 2460 2461 /* 2462 * first we want to see if there is free space adjacent to the range we 2463 * are adding, if there is remove that struct and add a new one to 2464 * cover the entire range 2465 */ 2466 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2467 if (right_info) 2468 right_prev = rb_prev(&right_info->offset_index); 2469 2470 if (right_prev) 2471 left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); 2472 else if (!right_info) 2473 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2474 2475 /* See try_merge_free_space() comment. */ 2476 if (right_info && !right_info->bitmap && 2477 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2478 unlink_free_space(ctl, right_info, update_stat); 2479 info->bytes += right_info->bytes; 2480 kmem_cache_free(btrfs_free_space_cachep, right_info); 2481 merged = true; 2482 } 2483 2484 /* See try_merge_free_space() comment. */ 2485 if (left_info && !left_info->bitmap && 2486 left_info->offset + left_info->bytes == offset && 2487 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2488 unlink_free_space(ctl, left_info, update_stat); 2489 info->offset = left_info->offset; 2490 info->bytes += left_info->bytes; 2491 kmem_cache_free(btrfs_free_space_cachep, left_info); 2492 merged = true; 2493 } 2494 2495 return merged; 2496 } 2497 2498 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2499 struct btrfs_free_space *info, 2500 bool update_stat) 2501 { 2502 struct btrfs_free_space *bitmap; 2503 unsigned long i; 2504 unsigned long j; 2505 const u64 end = info->offset + info->bytes; 2506 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2507 u64 bytes; 2508 2509 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2510 if (!bitmap) 2511 return false; 2512 2513 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2514 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2515 if (j == i) 2516 return false; 2517 bytes = (j - i) * ctl->unit; 2518 info->bytes += bytes; 2519 2520 /* See try_merge_free_space() comment. */ 2521 if (!btrfs_free_space_trimmed(bitmap)) 2522 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2523 2524 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2525 2526 if (!bitmap->bytes) 2527 free_bitmap(ctl, bitmap); 2528 2529 return true; 2530 } 2531 2532 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2533 struct btrfs_free_space *info, 2534 bool update_stat) 2535 { 2536 struct btrfs_free_space *bitmap; 2537 u64 bitmap_offset; 2538 unsigned long i; 2539 unsigned long j; 2540 unsigned long prev_j; 2541 u64 bytes; 2542 2543 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2544 /* If we're on a boundary, try the previous logical bitmap. */ 2545 if (bitmap_offset == info->offset) { 2546 if (info->offset == 0) 2547 return false; 2548 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2549 } 2550 2551 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2552 if (!bitmap) 2553 return false; 2554 2555 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2556 j = 0; 2557 prev_j = (unsigned long)-1; 2558 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2559 if (j > i) 2560 break; 2561 prev_j = j; 2562 } 2563 if (prev_j == i) 2564 return false; 2565 2566 if (prev_j == (unsigned long)-1) 2567 bytes = (i + 1) * ctl->unit; 2568 else 2569 bytes = (i - prev_j) * ctl->unit; 2570 2571 info->offset -= bytes; 2572 info->bytes += bytes; 2573 2574 /* See try_merge_free_space() comment. */ 2575 if (!btrfs_free_space_trimmed(bitmap)) 2576 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2577 2578 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2579 2580 if (!bitmap->bytes) 2581 free_bitmap(ctl, bitmap); 2582 2583 return true; 2584 } 2585 2586 /* 2587 * We prefer always to allocate from extent entries, both for clustered and 2588 * non-clustered allocation requests. So when attempting to add a new extent 2589 * entry, try to see if there's adjacent free space in bitmap entries, and if 2590 * there is, migrate that space from the bitmaps to the extent. 2591 * Like this we get better chances of satisfying space allocation requests 2592 * because we attempt to satisfy them based on a single cache entry, and never 2593 * on 2 or more entries - even if the entries represent a contiguous free space 2594 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2595 * ends). 2596 */ 2597 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2598 struct btrfs_free_space *info, 2599 bool update_stat) 2600 { 2601 /* 2602 * Only work with disconnected entries, as we can change their offset, 2603 * and must be extent entries. 2604 */ 2605 ASSERT(!info->bitmap); 2606 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2607 2608 if (ctl->total_bitmaps > 0) { 2609 bool stole_end; 2610 bool stole_front = false; 2611 2612 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2613 if (ctl->total_bitmaps > 0) 2614 stole_front = steal_from_bitmap_to_front(ctl, info, 2615 update_stat); 2616 2617 if (stole_end || stole_front) 2618 try_merge_free_space(ctl, info, update_stat); 2619 } 2620 } 2621 2622 static int __btrfs_add_free_space(struct btrfs_block_group *block_group, 2623 u64 offset, u64 bytes, 2624 enum btrfs_trim_state trim_state) 2625 { 2626 struct btrfs_fs_info *fs_info = block_group->fs_info; 2627 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2628 struct btrfs_free_space *info; 2629 int ret = 0; 2630 u64 filter_bytes = bytes; 2631 2632 ASSERT(!btrfs_is_zoned(fs_info)); 2633 2634 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2635 if (!info) 2636 return -ENOMEM; 2637 2638 info->offset = offset; 2639 info->bytes = bytes; 2640 info->trim_state = trim_state; 2641 RB_CLEAR_NODE(&info->offset_index); 2642 RB_CLEAR_NODE(&info->bytes_index); 2643 2644 spin_lock(&ctl->tree_lock); 2645 2646 if (try_merge_free_space(ctl, info, true)) 2647 goto link; 2648 2649 /* 2650 * There was no extent directly to the left or right of this new 2651 * extent then we know we're going to have to allocate a new extent, so 2652 * before we do that see if we need to drop this into a bitmap 2653 */ 2654 ret = insert_into_bitmap(ctl, info); 2655 if (ret < 0) { 2656 goto out; 2657 } else if (ret) { 2658 ret = 0; 2659 goto out; 2660 } 2661 link: 2662 /* 2663 * Only steal free space from adjacent bitmaps if we're sure we're not 2664 * going to add the new free space to existing bitmap entries - because 2665 * that would mean unnecessary work that would be reverted. Therefore 2666 * attempt to steal space from bitmaps if we're adding an extent entry. 2667 */ 2668 steal_from_bitmap(ctl, info, true); 2669 2670 filter_bytes = max(filter_bytes, info->bytes); 2671 2672 ret = link_free_space(ctl, info); 2673 if (ret) 2674 kmem_cache_free(btrfs_free_space_cachep, info); 2675 out: 2676 btrfs_discard_update_discardable(block_group); 2677 spin_unlock(&ctl->tree_lock); 2678 2679 if (ret) { 2680 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2681 ASSERT(ret != -EEXIST); 2682 } 2683 2684 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2685 btrfs_discard_check_filter(block_group, filter_bytes); 2686 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2687 } 2688 2689 return ret; 2690 } 2691 2692 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2693 u64 bytenr, u64 size, bool used) 2694 { 2695 struct btrfs_space_info *sinfo = block_group->space_info; 2696 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2697 u64 offset = bytenr - block_group->start; 2698 u64 to_free, to_unusable; 2699 int bg_reclaim_threshold = 0; 2700 bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); 2701 u64 reclaimable_unusable; 2702 2703 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2704 2705 if (!initial) 2706 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2707 2708 spin_lock(&ctl->tree_lock); 2709 if (!used) 2710 to_free = size; 2711 else if (initial) 2712 to_free = block_group->zone_capacity; 2713 else if (offset >= block_group->alloc_offset) 2714 to_free = size; 2715 else if (offset + size <= block_group->alloc_offset) 2716 to_free = 0; 2717 else 2718 to_free = offset + size - block_group->alloc_offset; 2719 to_unusable = size - to_free; 2720 2721 ctl->free_space += to_free; 2722 /* 2723 * If the block group is read-only, we should account freed space into 2724 * bytes_readonly. 2725 */ 2726 if (!block_group->ro) 2727 block_group->zone_unusable += to_unusable; 2728 spin_unlock(&ctl->tree_lock); 2729 if (!used) { 2730 spin_lock(&block_group->lock); 2731 block_group->alloc_offset -= size; 2732 spin_unlock(&block_group->lock); 2733 } 2734 2735 reclaimable_unusable = block_group->zone_unusable - 2736 (block_group->length - block_group->zone_capacity); 2737 /* All the region is now unusable. Mark it as unused and reclaim */ 2738 if (block_group->zone_unusable == block_group->length) { 2739 btrfs_mark_bg_unused(block_group); 2740 } else if (bg_reclaim_threshold && 2741 reclaimable_unusable >= 2742 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2743 btrfs_mark_bg_to_reclaim(block_group); 2744 } 2745 2746 return 0; 2747 } 2748 2749 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2750 u64 bytenr, u64 size) 2751 { 2752 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2753 2754 if (btrfs_is_zoned(block_group->fs_info)) 2755 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2756 true); 2757 2758 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2759 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2760 2761 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2762 } 2763 2764 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2765 u64 bytenr, u64 size) 2766 { 2767 if (btrfs_is_zoned(block_group->fs_info)) 2768 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2769 false); 2770 2771 return btrfs_add_free_space(block_group, bytenr, size); 2772 } 2773 2774 /* 2775 * This is a subtle distinction because when adding free space back in general, 2776 * we want it to be added as untrimmed for async. But in the case where we add 2777 * it on loading of a block group, we want to consider it trimmed. 2778 */ 2779 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2780 u64 bytenr, u64 size) 2781 { 2782 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2783 2784 if (btrfs_is_zoned(block_group->fs_info)) 2785 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2786 true); 2787 2788 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2789 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2790 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2791 2792 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2793 } 2794 2795 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2796 u64 offset, u64 bytes) 2797 { 2798 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2799 struct btrfs_free_space *info; 2800 int ret; 2801 bool re_search = false; 2802 2803 if (btrfs_is_zoned(block_group->fs_info)) { 2804 /* 2805 * This can happen with conventional zones when replaying log. 2806 * Since the allocation info of tree-log nodes are not recorded 2807 * to the extent-tree, calculate_alloc_pointer() failed to 2808 * advance the allocation pointer after last allocated tree log 2809 * node blocks. 2810 * 2811 * This function is called from 2812 * btrfs_pin_extent_for_log_replay() when replaying the log. 2813 * Advance the pointer not to overwrite the tree-log nodes. 2814 */ 2815 if (block_group->start + block_group->alloc_offset < 2816 offset + bytes) { 2817 block_group->alloc_offset = 2818 offset + bytes - block_group->start; 2819 } 2820 return 0; 2821 } 2822 2823 spin_lock(&ctl->tree_lock); 2824 2825 again: 2826 ret = 0; 2827 if (!bytes) 2828 goto out_lock; 2829 2830 info = tree_search_offset(ctl, offset, 0, 0); 2831 if (!info) { 2832 /* 2833 * oops didn't find an extent that matched the space we wanted 2834 * to remove, look for a bitmap instead 2835 */ 2836 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2837 1, 0); 2838 if (!info) { 2839 /* 2840 * If we found a partial bit of our free space in a 2841 * bitmap but then couldn't find the other part this may 2842 * be a problem, so WARN about it. 2843 */ 2844 WARN_ON(re_search); 2845 goto out_lock; 2846 } 2847 } 2848 2849 re_search = false; 2850 if (!info->bitmap) { 2851 unlink_free_space(ctl, info, true); 2852 if (offset == info->offset) { 2853 u64 to_free = min(bytes, info->bytes); 2854 2855 info->bytes -= to_free; 2856 info->offset += to_free; 2857 if (info->bytes) { 2858 ret = link_free_space(ctl, info); 2859 WARN_ON(ret); 2860 } else { 2861 kmem_cache_free(btrfs_free_space_cachep, info); 2862 } 2863 2864 offset += to_free; 2865 bytes -= to_free; 2866 goto again; 2867 } else { 2868 u64 old_end = info->bytes + info->offset; 2869 2870 info->bytes = offset - info->offset; 2871 ret = link_free_space(ctl, info); 2872 WARN_ON(ret); 2873 if (ret) 2874 goto out_lock; 2875 2876 /* Not enough bytes in this entry to satisfy us */ 2877 if (old_end < offset + bytes) { 2878 bytes -= old_end - offset; 2879 offset = old_end; 2880 goto again; 2881 } else if (old_end == offset + bytes) { 2882 /* all done */ 2883 goto out_lock; 2884 } 2885 spin_unlock(&ctl->tree_lock); 2886 2887 ret = __btrfs_add_free_space(block_group, 2888 offset + bytes, 2889 old_end - (offset + bytes), 2890 info->trim_state); 2891 WARN_ON(ret); 2892 goto out; 2893 } 2894 } 2895 2896 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2897 if (ret == -EAGAIN) { 2898 re_search = true; 2899 goto again; 2900 } 2901 out_lock: 2902 btrfs_discard_update_discardable(block_group); 2903 spin_unlock(&ctl->tree_lock); 2904 out: 2905 return ret; 2906 } 2907 2908 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2909 u64 bytes) 2910 { 2911 struct btrfs_fs_info *fs_info = block_group->fs_info; 2912 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2913 struct btrfs_free_space *info; 2914 struct rb_node *n; 2915 int count = 0; 2916 2917 /* 2918 * Zoned btrfs does not use free space tree and cluster. Just print 2919 * out the free space after the allocation offset. 2920 */ 2921 if (btrfs_is_zoned(fs_info)) { 2922 btrfs_info(fs_info, "free space %llu active %d", 2923 block_group->zone_capacity - block_group->alloc_offset, 2924 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2925 &block_group->runtime_flags)); 2926 return; 2927 } 2928 2929 spin_lock(&ctl->tree_lock); 2930 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2931 info = rb_entry(n, struct btrfs_free_space, offset_index); 2932 if (info->bytes >= bytes && !block_group->ro) 2933 count++; 2934 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2935 info->offset, info->bytes, 2936 (info->bitmap) ? "yes" : "no"); 2937 } 2938 spin_unlock(&ctl->tree_lock); 2939 btrfs_info(fs_info, "block group has cluster?: %s", 2940 list_empty(&block_group->cluster_list) ? "no" : "yes"); 2941 btrfs_info(fs_info, 2942 "%d free space entries at or bigger than %llu bytes", 2943 count, bytes); 2944 } 2945 2946 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2947 struct btrfs_free_space_ctl *ctl) 2948 { 2949 struct btrfs_fs_info *fs_info = block_group->fs_info; 2950 2951 spin_lock_init(&ctl->tree_lock); 2952 ctl->unit = fs_info->sectorsize; 2953 ctl->start = block_group->start; 2954 ctl->block_group = block_group; 2955 ctl->op = &free_space_op; 2956 ctl->free_space_bytes = RB_ROOT_CACHED; 2957 INIT_LIST_HEAD(&ctl->trimming_ranges); 2958 mutex_init(&ctl->cache_writeout_mutex); 2959 2960 /* 2961 * we only want to have 32k of ram per block group for keeping 2962 * track of free space, and if we pass 1/2 of that we want to 2963 * start converting things over to using bitmaps 2964 */ 2965 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2966 } 2967 2968 /* 2969 * for a given cluster, put all of its extents back into the free 2970 * space cache. If the block group passed doesn't match the block group 2971 * pointed to by the cluster, someone else raced in and freed the 2972 * cluster already. In that case, we just return without changing anything 2973 */ 2974 static void __btrfs_return_cluster_to_free_space( 2975 struct btrfs_block_group *block_group, 2976 struct btrfs_free_cluster *cluster) 2977 { 2978 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2979 struct rb_node *node; 2980 2981 lockdep_assert_held(&ctl->tree_lock); 2982 2983 spin_lock(&cluster->lock); 2984 if (cluster->block_group != block_group) { 2985 spin_unlock(&cluster->lock); 2986 return; 2987 } 2988 2989 cluster->block_group = NULL; 2990 cluster->window_start = 0; 2991 list_del_init(&cluster->block_group_list); 2992 2993 node = rb_first(&cluster->root); 2994 while (node) { 2995 struct btrfs_free_space *entry; 2996 2997 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2998 node = rb_next(&entry->offset_index); 2999 rb_erase(&entry->offset_index, &cluster->root); 3000 RB_CLEAR_NODE(&entry->offset_index); 3001 3002 if (!entry->bitmap) { 3003 /* Merging treats extents as if they were new */ 3004 if (!btrfs_free_space_trimmed(entry)) { 3005 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3006 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 3007 entry->bytes; 3008 } 3009 3010 try_merge_free_space(ctl, entry, false); 3011 steal_from_bitmap(ctl, entry, false); 3012 3013 /* As we insert directly, update these statistics */ 3014 if (!btrfs_free_space_trimmed(entry)) { 3015 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3016 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3017 entry->bytes; 3018 } 3019 } 3020 tree_insert_offset(ctl, NULL, entry); 3021 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3022 entry_less); 3023 } 3024 cluster->root = RB_ROOT; 3025 spin_unlock(&cluster->lock); 3026 btrfs_put_block_group(block_group); 3027 } 3028 3029 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3030 { 3031 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3032 struct btrfs_free_cluster *cluster; 3033 struct list_head *head; 3034 3035 spin_lock(&ctl->tree_lock); 3036 while ((head = block_group->cluster_list.next) != 3037 &block_group->cluster_list) { 3038 cluster = list_entry(head, struct btrfs_free_cluster, 3039 block_group_list); 3040 3041 WARN_ON(cluster->block_group != block_group); 3042 __btrfs_return_cluster_to_free_space(block_group, cluster); 3043 3044 cond_resched_lock(&ctl->tree_lock); 3045 } 3046 __btrfs_remove_free_space_cache(ctl); 3047 btrfs_discard_update_discardable(block_group); 3048 spin_unlock(&ctl->tree_lock); 3049 3050 } 3051 3052 /* 3053 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 3054 */ 3055 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3056 { 3057 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3058 struct btrfs_free_space *info; 3059 struct rb_node *node; 3060 bool ret = true; 3061 3062 spin_lock(&ctl->tree_lock); 3063 node = rb_first(&ctl->free_space_offset); 3064 3065 while (node) { 3066 info = rb_entry(node, struct btrfs_free_space, offset_index); 3067 3068 if (!btrfs_free_space_trimmed(info)) { 3069 ret = false; 3070 break; 3071 } 3072 3073 node = rb_next(node); 3074 } 3075 3076 spin_unlock(&ctl->tree_lock); 3077 return ret; 3078 } 3079 3080 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3081 u64 offset, u64 bytes, u64 empty_size, 3082 u64 *max_extent_size) 3083 { 3084 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3085 struct btrfs_discard_ctl *discard_ctl = 3086 &block_group->fs_info->discard_ctl; 3087 struct btrfs_free_space *entry = NULL; 3088 u64 bytes_search = bytes + empty_size; 3089 u64 ret = 0; 3090 u64 align_gap = 0; 3091 u64 align_gap_len = 0; 3092 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3093 bool use_bytes_index = (offset == block_group->start); 3094 3095 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3096 3097 spin_lock(&ctl->tree_lock); 3098 entry = find_free_space(ctl, &offset, &bytes_search, 3099 block_group->full_stripe_len, max_extent_size, 3100 use_bytes_index); 3101 if (!entry) 3102 goto out; 3103 3104 ret = offset; 3105 if (entry->bitmap) { 3106 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3107 3108 if (!btrfs_free_space_trimmed(entry)) 3109 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3110 3111 if (!entry->bytes) 3112 free_bitmap(ctl, entry); 3113 } else { 3114 unlink_free_space(ctl, entry, true); 3115 align_gap_len = offset - entry->offset; 3116 align_gap = entry->offset; 3117 align_gap_trim_state = entry->trim_state; 3118 3119 if (!btrfs_free_space_trimmed(entry)) 3120 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3121 3122 entry->offset = offset + bytes; 3123 WARN_ON(entry->bytes < bytes + align_gap_len); 3124 3125 entry->bytes -= bytes + align_gap_len; 3126 if (!entry->bytes) 3127 kmem_cache_free(btrfs_free_space_cachep, entry); 3128 else 3129 link_free_space(ctl, entry); 3130 } 3131 out: 3132 btrfs_discard_update_discardable(block_group); 3133 spin_unlock(&ctl->tree_lock); 3134 3135 if (align_gap_len) 3136 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3137 align_gap_trim_state); 3138 return ret; 3139 } 3140 3141 /* 3142 * given a cluster, put all of its extents back into the free space 3143 * cache. If a block group is passed, this function will only free 3144 * a cluster that belongs to the passed block group. 3145 * 3146 * Otherwise, it'll get a reference on the block group pointed to by the 3147 * cluster and remove the cluster from it. 3148 */ 3149 void btrfs_return_cluster_to_free_space( 3150 struct btrfs_block_group *block_group, 3151 struct btrfs_free_cluster *cluster) 3152 { 3153 struct btrfs_free_space_ctl *ctl; 3154 3155 /* first, get a safe pointer to the block group */ 3156 spin_lock(&cluster->lock); 3157 if (!block_group) { 3158 block_group = cluster->block_group; 3159 if (!block_group) { 3160 spin_unlock(&cluster->lock); 3161 return; 3162 } 3163 } else if (cluster->block_group != block_group) { 3164 /* someone else has already freed it don't redo their work */ 3165 spin_unlock(&cluster->lock); 3166 return; 3167 } 3168 btrfs_get_block_group(block_group); 3169 spin_unlock(&cluster->lock); 3170 3171 ctl = block_group->free_space_ctl; 3172 3173 /* now return any extents the cluster had on it */ 3174 spin_lock(&ctl->tree_lock); 3175 __btrfs_return_cluster_to_free_space(block_group, cluster); 3176 spin_unlock(&ctl->tree_lock); 3177 3178 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3179 3180 /* finally drop our ref */ 3181 btrfs_put_block_group(block_group); 3182 } 3183 3184 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3185 struct btrfs_free_cluster *cluster, 3186 struct btrfs_free_space *entry, 3187 u64 bytes, u64 min_start, 3188 u64 *max_extent_size) 3189 { 3190 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3191 int err; 3192 u64 search_start = cluster->window_start; 3193 u64 search_bytes = bytes; 3194 u64 ret = 0; 3195 3196 search_start = min_start; 3197 search_bytes = bytes; 3198 3199 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3200 if (err) { 3201 *max_extent_size = max(get_max_extent_size(entry), 3202 *max_extent_size); 3203 return 0; 3204 } 3205 3206 ret = search_start; 3207 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3208 3209 return ret; 3210 } 3211 3212 /* 3213 * given a cluster, try to allocate 'bytes' from it, returns 0 3214 * if it couldn't find anything suitably large, or a logical disk offset 3215 * if things worked out 3216 */ 3217 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3218 struct btrfs_free_cluster *cluster, u64 bytes, 3219 u64 min_start, u64 *max_extent_size) 3220 { 3221 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3222 struct btrfs_discard_ctl *discard_ctl = 3223 &block_group->fs_info->discard_ctl; 3224 struct btrfs_free_space *entry = NULL; 3225 struct rb_node *node; 3226 u64 ret = 0; 3227 3228 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3229 3230 spin_lock(&cluster->lock); 3231 if (bytes > cluster->max_size) 3232 goto out; 3233 3234 if (cluster->block_group != block_group) 3235 goto out; 3236 3237 node = rb_first(&cluster->root); 3238 if (!node) 3239 goto out; 3240 3241 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3242 while (1) { 3243 if (entry->bytes < bytes) 3244 *max_extent_size = max(get_max_extent_size(entry), 3245 *max_extent_size); 3246 3247 if (entry->bytes < bytes || 3248 (!entry->bitmap && entry->offset < min_start)) { 3249 node = rb_next(&entry->offset_index); 3250 if (!node) 3251 break; 3252 entry = rb_entry(node, struct btrfs_free_space, 3253 offset_index); 3254 continue; 3255 } 3256 3257 if (entry->bitmap) { 3258 ret = btrfs_alloc_from_bitmap(block_group, 3259 cluster, entry, bytes, 3260 cluster->window_start, 3261 max_extent_size); 3262 if (ret == 0) { 3263 node = rb_next(&entry->offset_index); 3264 if (!node) 3265 break; 3266 entry = rb_entry(node, struct btrfs_free_space, 3267 offset_index); 3268 continue; 3269 } 3270 cluster->window_start += bytes; 3271 } else { 3272 ret = entry->offset; 3273 3274 entry->offset += bytes; 3275 entry->bytes -= bytes; 3276 } 3277 3278 break; 3279 } 3280 out: 3281 spin_unlock(&cluster->lock); 3282 3283 if (!ret) 3284 return 0; 3285 3286 spin_lock(&ctl->tree_lock); 3287 3288 if (!btrfs_free_space_trimmed(entry)) 3289 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3290 3291 ctl->free_space -= bytes; 3292 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3293 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3294 3295 spin_lock(&cluster->lock); 3296 if (entry->bytes == 0) { 3297 rb_erase(&entry->offset_index, &cluster->root); 3298 ctl->free_extents--; 3299 if (entry->bitmap) { 3300 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3301 entry->bitmap); 3302 ctl->total_bitmaps--; 3303 recalculate_thresholds(ctl); 3304 } else if (!btrfs_free_space_trimmed(entry)) { 3305 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3306 } 3307 kmem_cache_free(btrfs_free_space_cachep, entry); 3308 } 3309 3310 spin_unlock(&cluster->lock); 3311 spin_unlock(&ctl->tree_lock); 3312 3313 return ret; 3314 } 3315 3316 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3317 struct btrfs_free_space *entry, 3318 struct btrfs_free_cluster *cluster, 3319 u64 offset, u64 bytes, 3320 u64 cont1_bytes, u64 min_bytes) 3321 { 3322 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3323 unsigned long next_zero; 3324 unsigned long i; 3325 unsigned long want_bits; 3326 unsigned long min_bits; 3327 unsigned long found_bits; 3328 unsigned long max_bits = 0; 3329 unsigned long start = 0; 3330 unsigned long total_found = 0; 3331 int ret; 3332 3333 lockdep_assert_held(&ctl->tree_lock); 3334 3335 i = offset_to_bit(entry->offset, ctl->unit, 3336 max_t(u64, offset, entry->offset)); 3337 want_bits = bytes_to_bits(bytes, ctl->unit); 3338 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3339 3340 /* 3341 * Don't bother looking for a cluster in this bitmap if it's heavily 3342 * fragmented. 3343 */ 3344 if (entry->max_extent_size && 3345 entry->max_extent_size < cont1_bytes) 3346 return -ENOSPC; 3347 again: 3348 found_bits = 0; 3349 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3350 next_zero = find_next_zero_bit(entry->bitmap, 3351 BITS_PER_BITMAP, i); 3352 if (next_zero - i >= min_bits) { 3353 found_bits = next_zero - i; 3354 if (found_bits > max_bits) 3355 max_bits = found_bits; 3356 break; 3357 } 3358 if (next_zero - i > max_bits) 3359 max_bits = next_zero - i; 3360 i = next_zero; 3361 } 3362 3363 if (!found_bits) { 3364 entry->max_extent_size = (u64)max_bits * ctl->unit; 3365 return -ENOSPC; 3366 } 3367 3368 if (!total_found) { 3369 start = i; 3370 cluster->max_size = 0; 3371 } 3372 3373 total_found += found_bits; 3374 3375 if (cluster->max_size < found_bits * ctl->unit) 3376 cluster->max_size = found_bits * ctl->unit; 3377 3378 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3379 i = next_zero + 1; 3380 goto again; 3381 } 3382 3383 cluster->window_start = start * ctl->unit + entry->offset; 3384 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3385 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3386 3387 /* 3388 * We need to know if we're currently on the normal space index when we 3389 * manipulate the bitmap so that we know we need to remove and re-insert 3390 * it into the space_index tree. Clear the bytes_index node here so the 3391 * bitmap manipulation helpers know not to mess with the space_index 3392 * until this bitmap entry is added back into the normal cache. 3393 */ 3394 RB_CLEAR_NODE(&entry->bytes_index); 3395 3396 ret = tree_insert_offset(ctl, cluster, entry); 3397 ASSERT(!ret); /* -EEXIST; Logic error */ 3398 3399 trace_btrfs_setup_cluster(block_group, cluster, 3400 total_found * ctl->unit, 1); 3401 return 0; 3402 } 3403 3404 /* 3405 * This searches the block group for just extents to fill the cluster with. 3406 * Try to find a cluster with at least bytes total bytes, at least one 3407 * extent of cont1_bytes, and other clusters of at least min_bytes. 3408 */ 3409 static noinline int 3410 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3411 struct btrfs_free_cluster *cluster, 3412 struct list_head *bitmaps, u64 offset, u64 bytes, 3413 u64 cont1_bytes, u64 min_bytes) 3414 { 3415 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3416 struct btrfs_free_space *first = NULL; 3417 struct btrfs_free_space *entry = NULL; 3418 struct btrfs_free_space *last; 3419 struct rb_node *node; 3420 u64 window_free; 3421 u64 max_extent; 3422 u64 total_size = 0; 3423 3424 lockdep_assert_held(&ctl->tree_lock); 3425 3426 entry = tree_search_offset(ctl, offset, 0, 1); 3427 if (!entry) 3428 return -ENOSPC; 3429 3430 /* 3431 * We don't want bitmaps, so just move along until we find a normal 3432 * extent entry. 3433 */ 3434 while (entry->bitmap || entry->bytes < min_bytes) { 3435 if (entry->bitmap && list_empty(&entry->list)) 3436 list_add_tail(&entry->list, bitmaps); 3437 node = rb_next(&entry->offset_index); 3438 if (!node) 3439 return -ENOSPC; 3440 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3441 } 3442 3443 window_free = entry->bytes; 3444 max_extent = entry->bytes; 3445 first = entry; 3446 last = entry; 3447 3448 for (node = rb_next(&entry->offset_index); node; 3449 node = rb_next(&entry->offset_index)) { 3450 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3451 3452 if (entry->bitmap) { 3453 if (list_empty(&entry->list)) 3454 list_add_tail(&entry->list, bitmaps); 3455 continue; 3456 } 3457 3458 if (entry->bytes < min_bytes) 3459 continue; 3460 3461 last = entry; 3462 window_free += entry->bytes; 3463 if (entry->bytes > max_extent) 3464 max_extent = entry->bytes; 3465 } 3466 3467 if (window_free < bytes || max_extent < cont1_bytes) 3468 return -ENOSPC; 3469 3470 cluster->window_start = first->offset; 3471 3472 node = &first->offset_index; 3473 3474 /* 3475 * now we've found our entries, pull them out of the free space 3476 * cache and put them into the cluster rbtree 3477 */ 3478 do { 3479 int ret; 3480 3481 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3482 node = rb_next(&entry->offset_index); 3483 if (entry->bitmap || entry->bytes < min_bytes) 3484 continue; 3485 3486 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3487 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3488 ret = tree_insert_offset(ctl, cluster, entry); 3489 total_size += entry->bytes; 3490 ASSERT(!ret); /* -EEXIST; Logic error */ 3491 } while (node && entry != last); 3492 3493 cluster->max_size = max_extent; 3494 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3495 return 0; 3496 } 3497 3498 /* 3499 * This specifically looks for bitmaps that may work in the cluster, we assume 3500 * that we have already failed to find extents that will work. 3501 */ 3502 static noinline int 3503 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3504 struct btrfs_free_cluster *cluster, 3505 struct list_head *bitmaps, u64 offset, u64 bytes, 3506 u64 cont1_bytes, u64 min_bytes) 3507 { 3508 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3509 struct btrfs_free_space *entry = NULL; 3510 int ret = -ENOSPC; 3511 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3512 3513 if (ctl->total_bitmaps == 0) 3514 return -ENOSPC; 3515 3516 /* 3517 * The bitmap that covers offset won't be in the list unless offset 3518 * is just its start offset. 3519 */ 3520 if (!list_empty(bitmaps)) 3521 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3522 3523 if (!entry || entry->offset != bitmap_offset) { 3524 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3525 if (entry && list_empty(&entry->list)) 3526 list_add(&entry->list, bitmaps); 3527 } 3528 3529 list_for_each_entry(entry, bitmaps, list) { 3530 if (entry->bytes < bytes) 3531 continue; 3532 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3533 bytes, cont1_bytes, min_bytes); 3534 if (!ret) 3535 return 0; 3536 } 3537 3538 /* 3539 * The bitmaps list has all the bitmaps that record free space 3540 * starting after offset, so no more search is required. 3541 */ 3542 return -ENOSPC; 3543 } 3544 3545 /* 3546 * here we try to find a cluster of blocks in a block group. The goal 3547 * is to find at least bytes+empty_size. 3548 * We might not find them all in one contiguous area. 3549 * 3550 * returns zero and sets up cluster if things worked out, otherwise 3551 * it returns -enospc 3552 */ 3553 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3554 struct btrfs_free_cluster *cluster, 3555 u64 offset, u64 bytes, u64 empty_size) 3556 { 3557 struct btrfs_fs_info *fs_info = block_group->fs_info; 3558 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3559 struct btrfs_free_space *entry, *tmp; 3560 LIST_HEAD(bitmaps); 3561 u64 min_bytes; 3562 u64 cont1_bytes; 3563 int ret; 3564 3565 /* 3566 * Choose the minimum extent size we'll require for this 3567 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3568 * For metadata, allow allocates with smaller extents. For 3569 * data, keep it dense. 3570 */ 3571 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3572 cont1_bytes = bytes + empty_size; 3573 min_bytes = cont1_bytes; 3574 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3575 cont1_bytes = bytes; 3576 min_bytes = fs_info->sectorsize; 3577 } else { 3578 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3579 min_bytes = fs_info->sectorsize; 3580 } 3581 3582 spin_lock(&ctl->tree_lock); 3583 3584 /* 3585 * If we know we don't have enough space to make a cluster don't even 3586 * bother doing all the work to try and find one. 3587 */ 3588 if (ctl->free_space < bytes) { 3589 spin_unlock(&ctl->tree_lock); 3590 return -ENOSPC; 3591 } 3592 3593 spin_lock(&cluster->lock); 3594 3595 /* someone already found a cluster, hooray */ 3596 if (cluster->block_group) { 3597 ret = 0; 3598 goto out; 3599 } 3600 3601 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3602 min_bytes); 3603 3604 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3605 bytes + empty_size, 3606 cont1_bytes, min_bytes); 3607 if (ret) 3608 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3609 offset, bytes + empty_size, 3610 cont1_bytes, min_bytes); 3611 3612 /* Clear our temporary list */ 3613 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3614 list_del_init(&entry->list); 3615 3616 if (!ret) { 3617 btrfs_get_block_group(block_group); 3618 list_add_tail(&cluster->block_group_list, 3619 &block_group->cluster_list); 3620 cluster->block_group = block_group; 3621 } else { 3622 trace_btrfs_failed_cluster_setup(block_group); 3623 } 3624 out: 3625 spin_unlock(&cluster->lock); 3626 spin_unlock(&ctl->tree_lock); 3627 3628 return ret; 3629 } 3630 3631 /* 3632 * simple code to zero out a cluster 3633 */ 3634 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3635 { 3636 spin_lock_init(&cluster->lock); 3637 spin_lock_init(&cluster->refill_lock); 3638 cluster->root = RB_ROOT; 3639 cluster->max_size = 0; 3640 cluster->fragmented = false; 3641 INIT_LIST_HEAD(&cluster->block_group_list); 3642 cluster->block_group = NULL; 3643 } 3644 3645 static int do_trimming(struct btrfs_block_group *block_group, 3646 u64 *total_trimmed, u64 start, u64 bytes, 3647 u64 reserved_start, u64 reserved_bytes, 3648 enum btrfs_trim_state reserved_trim_state, 3649 struct btrfs_trim_range *trim_entry) 3650 { 3651 struct btrfs_space_info *space_info = block_group->space_info; 3652 struct btrfs_fs_info *fs_info = block_group->fs_info; 3653 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3654 int ret; 3655 int update = 0; 3656 const u64 end = start + bytes; 3657 const u64 reserved_end = reserved_start + reserved_bytes; 3658 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3659 u64 trimmed = 0; 3660 3661 spin_lock(&space_info->lock); 3662 spin_lock(&block_group->lock); 3663 if (!block_group->ro) { 3664 block_group->reserved += reserved_bytes; 3665 space_info->bytes_reserved += reserved_bytes; 3666 update = 1; 3667 } 3668 spin_unlock(&block_group->lock); 3669 spin_unlock(&space_info->lock); 3670 3671 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3672 if (!ret) { 3673 *total_trimmed += trimmed; 3674 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3675 } 3676 3677 mutex_lock(&ctl->cache_writeout_mutex); 3678 if (reserved_start < start) 3679 __btrfs_add_free_space(block_group, reserved_start, 3680 start - reserved_start, 3681 reserved_trim_state); 3682 if (end < reserved_end) 3683 __btrfs_add_free_space(block_group, end, reserved_end - end, 3684 reserved_trim_state); 3685 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3686 list_del(&trim_entry->list); 3687 mutex_unlock(&ctl->cache_writeout_mutex); 3688 3689 if (update) { 3690 spin_lock(&space_info->lock); 3691 spin_lock(&block_group->lock); 3692 if (block_group->ro) 3693 space_info->bytes_readonly += reserved_bytes; 3694 block_group->reserved -= reserved_bytes; 3695 space_info->bytes_reserved -= reserved_bytes; 3696 spin_unlock(&block_group->lock); 3697 spin_unlock(&space_info->lock); 3698 } 3699 3700 return ret; 3701 } 3702 3703 /* 3704 * If @async is set, then we will trim 1 region and return. 3705 */ 3706 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3707 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3708 bool async) 3709 { 3710 struct btrfs_discard_ctl *discard_ctl = 3711 &block_group->fs_info->discard_ctl; 3712 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3713 struct btrfs_free_space *entry; 3714 struct rb_node *node; 3715 int ret = 0; 3716 u64 extent_start; 3717 u64 extent_bytes; 3718 enum btrfs_trim_state extent_trim_state; 3719 u64 bytes; 3720 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3721 3722 while (start < end) { 3723 struct btrfs_trim_range trim_entry; 3724 3725 mutex_lock(&ctl->cache_writeout_mutex); 3726 spin_lock(&ctl->tree_lock); 3727 3728 if (ctl->free_space < minlen) 3729 goto out_unlock; 3730 3731 entry = tree_search_offset(ctl, start, 0, 1); 3732 if (!entry) 3733 goto out_unlock; 3734 3735 /* Skip bitmaps and if async, already trimmed entries */ 3736 while (entry->bitmap || 3737 (async && btrfs_free_space_trimmed(entry))) { 3738 node = rb_next(&entry->offset_index); 3739 if (!node) 3740 goto out_unlock; 3741 entry = rb_entry(node, struct btrfs_free_space, 3742 offset_index); 3743 } 3744 3745 if (entry->offset >= end) 3746 goto out_unlock; 3747 3748 extent_start = entry->offset; 3749 extent_bytes = entry->bytes; 3750 extent_trim_state = entry->trim_state; 3751 if (async) { 3752 start = entry->offset; 3753 bytes = entry->bytes; 3754 if (bytes < minlen) { 3755 spin_unlock(&ctl->tree_lock); 3756 mutex_unlock(&ctl->cache_writeout_mutex); 3757 goto next; 3758 } 3759 unlink_free_space(ctl, entry, true); 3760 /* 3761 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3762 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3763 * X when we come back around. So trim it now. 3764 */ 3765 if (max_discard_size && 3766 bytes >= (max_discard_size + 3767 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3768 bytes = max_discard_size; 3769 extent_bytes = max_discard_size; 3770 entry->offset += max_discard_size; 3771 entry->bytes -= max_discard_size; 3772 link_free_space(ctl, entry); 3773 } else { 3774 kmem_cache_free(btrfs_free_space_cachep, entry); 3775 } 3776 } else { 3777 start = max(start, extent_start); 3778 bytes = min(extent_start + extent_bytes, end) - start; 3779 if (bytes < minlen) { 3780 spin_unlock(&ctl->tree_lock); 3781 mutex_unlock(&ctl->cache_writeout_mutex); 3782 goto next; 3783 } 3784 3785 unlink_free_space(ctl, entry, true); 3786 kmem_cache_free(btrfs_free_space_cachep, entry); 3787 } 3788 3789 spin_unlock(&ctl->tree_lock); 3790 trim_entry.start = extent_start; 3791 trim_entry.bytes = extent_bytes; 3792 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3793 mutex_unlock(&ctl->cache_writeout_mutex); 3794 3795 ret = do_trimming(block_group, total_trimmed, start, bytes, 3796 extent_start, extent_bytes, extent_trim_state, 3797 &trim_entry); 3798 if (ret) { 3799 block_group->discard_cursor = start + bytes; 3800 break; 3801 } 3802 next: 3803 start += bytes; 3804 block_group->discard_cursor = start; 3805 if (async && *total_trimmed) 3806 break; 3807 3808 if (fatal_signal_pending(current)) { 3809 ret = -ERESTARTSYS; 3810 break; 3811 } 3812 3813 cond_resched(); 3814 } 3815 3816 return ret; 3817 3818 out_unlock: 3819 block_group->discard_cursor = btrfs_block_group_end(block_group); 3820 spin_unlock(&ctl->tree_lock); 3821 mutex_unlock(&ctl->cache_writeout_mutex); 3822 3823 return ret; 3824 } 3825 3826 /* 3827 * If we break out of trimming a bitmap prematurely, we should reset the 3828 * trimming bit. In a rather contrieved case, it's possible to race here so 3829 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3830 * 3831 * start = start of bitmap 3832 * end = near end of bitmap 3833 * 3834 * Thread 1: Thread 2: 3835 * trim_bitmaps(start) 3836 * trim_bitmaps(end) 3837 * end_trimming_bitmap() 3838 * reset_trimming_bitmap() 3839 */ 3840 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3841 { 3842 struct btrfs_free_space *entry; 3843 3844 spin_lock(&ctl->tree_lock); 3845 entry = tree_search_offset(ctl, offset, 1, 0); 3846 if (entry) { 3847 if (btrfs_free_space_trimmed(entry)) { 3848 ctl->discardable_extents[BTRFS_STAT_CURR] += 3849 entry->bitmap_extents; 3850 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3851 } 3852 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3853 } 3854 3855 spin_unlock(&ctl->tree_lock); 3856 } 3857 3858 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3859 struct btrfs_free_space *entry) 3860 { 3861 if (btrfs_free_space_trimming_bitmap(entry)) { 3862 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3863 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3864 entry->bitmap_extents; 3865 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3866 } 3867 } 3868 3869 /* 3870 * If @async is set, then we will trim 1 region and return. 3871 */ 3872 static int trim_bitmaps(struct btrfs_block_group *block_group, 3873 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3874 u64 maxlen, bool async) 3875 { 3876 struct btrfs_discard_ctl *discard_ctl = 3877 &block_group->fs_info->discard_ctl; 3878 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3879 struct btrfs_free_space *entry; 3880 int ret = 0; 3881 int ret2; 3882 u64 bytes; 3883 u64 offset = offset_to_bitmap(ctl, start); 3884 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3885 3886 while (offset < end) { 3887 bool next_bitmap = false; 3888 struct btrfs_trim_range trim_entry; 3889 3890 mutex_lock(&ctl->cache_writeout_mutex); 3891 spin_lock(&ctl->tree_lock); 3892 3893 if (ctl->free_space < minlen) { 3894 block_group->discard_cursor = 3895 btrfs_block_group_end(block_group); 3896 spin_unlock(&ctl->tree_lock); 3897 mutex_unlock(&ctl->cache_writeout_mutex); 3898 break; 3899 } 3900 3901 entry = tree_search_offset(ctl, offset, 1, 0); 3902 /* 3903 * Bitmaps are marked trimmed lossily now to prevent constant 3904 * discarding of the same bitmap (the reason why we are bound 3905 * by the filters). So, retrim the block group bitmaps when we 3906 * are preparing to punt to the unused_bgs list. This uses 3907 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED 3908 * which is the only discard index which sets minlen to 0. 3909 */ 3910 if (!entry || (async && minlen && start == offset && 3911 btrfs_free_space_trimmed(entry))) { 3912 spin_unlock(&ctl->tree_lock); 3913 mutex_unlock(&ctl->cache_writeout_mutex); 3914 next_bitmap = true; 3915 goto next; 3916 } 3917 3918 /* 3919 * Async discard bitmap trimming begins at by setting the start 3920 * to be key.objectid and the offset_to_bitmap() aligns to the 3921 * start of the bitmap. This lets us know we are fully 3922 * scanning the bitmap rather than only some portion of it. 3923 */ 3924 if (start == offset) 3925 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; 3926 3927 bytes = minlen; 3928 ret2 = search_bitmap(ctl, entry, &start, &bytes, false); 3929 if (ret2 || start >= end) { 3930 /* 3931 * We lossily consider a bitmap trimmed if we only skip 3932 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. 3933 */ 3934 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) 3935 end_trimming_bitmap(ctl, entry); 3936 else 3937 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3938 spin_unlock(&ctl->tree_lock); 3939 mutex_unlock(&ctl->cache_writeout_mutex); 3940 next_bitmap = true; 3941 goto next; 3942 } 3943 3944 /* 3945 * We already trimmed a region, but are using the locking above 3946 * to reset the trim_state. 3947 */ 3948 if (async && *total_trimmed) { 3949 spin_unlock(&ctl->tree_lock); 3950 mutex_unlock(&ctl->cache_writeout_mutex); 3951 goto out; 3952 } 3953 3954 bytes = min(bytes, end - start); 3955 if (bytes < minlen || (async && maxlen && bytes > maxlen)) { 3956 spin_unlock(&ctl->tree_lock); 3957 mutex_unlock(&ctl->cache_writeout_mutex); 3958 goto next; 3959 } 3960 3961 /* 3962 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3963 * If X < @minlen, we won't trim X when we come back around. 3964 * So trim it now. We differ here from trimming extents as we 3965 * don't keep individual state per bit. 3966 */ 3967 if (async && 3968 max_discard_size && 3969 bytes > (max_discard_size + minlen)) 3970 bytes = max_discard_size; 3971 3972 bitmap_clear_bits(ctl, entry, start, bytes, true); 3973 if (entry->bytes == 0) 3974 free_bitmap(ctl, entry); 3975 3976 spin_unlock(&ctl->tree_lock); 3977 trim_entry.start = start; 3978 trim_entry.bytes = bytes; 3979 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3980 mutex_unlock(&ctl->cache_writeout_mutex); 3981 3982 ret = do_trimming(block_group, total_trimmed, start, bytes, 3983 start, bytes, 0, &trim_entry); 3984 if (ret) { 3985 reset_trimming_bitmap(ctl, offset); 3986 block_group->discard_cursor = 3987 btrfs_block_group_end(block_group); 3988 break; 3989 } 3990 next: 3991 if (next_bitmap) { 3992 offset += BITS_PER_BITMAP * ctl->unit; 3993 start = offset; 3994 } else { 3995 start += bytes; 3996 } 3997 block_group->discard_cursor = start; 3998 3999 if (fatal_signal_pending(current)) { 4000 if (start != offset) 4001 reset_trimming_bitmap(ctl, offset); 4002 ret = -ERESTARTSYS; 4003 break; 4004 } 4005 4006 cond_resched(); 4007 } 4008 4009 if (offset >= end) 4010 block_group->discard_cursor = end; 4011 4012 out: 4013 return ret; 4014 } 4015 4016 int btrfs_trim_block_group(struct btrfs_block_group *block_group, 4017 u64 *trimmed, u64 start, u64 end, u64 minlen) 4018 { 4019 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 4020 int ret; 4021 u64 rem = 0; 4022 4023 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 4024 4025 *trimmed = 0; 4026 4027 spin_lock(&block_group->lock); 4028 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4029 spin_unlock(&block_group->lock); 4030 return 0; 4031 } 4032 btrfs_freeze_block_group(block_group); 4033 spin_unlock(&block_group->lock); 4034 4035 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); 4036 if (ret) 4037 goto out; 4038 4039 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); 4040 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); 4041 /* If we ended in the middle of a bitmap, reset the trimming flag */ 4042 if (rem) 4043 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); 4044 out: 4045 btrfs_unfreeze_block_group(block_group); 4046 return ret; 4047 } 4048 4049 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, 4050 u64 *trimmed, u64 start, u64 end, u64 minlen, 4051 bool async) 4052 { 4053 int ret; 4054 4055 *trimmed = 0; 4056 4057 spin_lock(&block_group->lock); 4058 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4059 spin_unlock(&block_group->lock); 4060 return 0; 4061 } 4062 btrfs_freeze_block_group(block_group); 4063 spin_unlock(&block_group->lock); 4064 4065 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); 4066 btrfs_unfreeze_block_group(block_group); 4067 4068 return ret; 4069 } 4070 4071 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, 4072 u64 *trimmed, u64 start, u64 end, u64 minlen, 4073 u64 maxlen, bool async) 4074 { 4075 int ret; 4076 4077 *trimmed = 0; 4078 4079 spin_lock(&block_group->lock); 4080 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) { 4081 spin_unlock(&block_group->lock); 4082 return 0; 4083 } 4084 btrfs_freeze_block_group(block_group); 4085 spin_unlock(&block_group->lock); 4086 4087 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, 4088 async); 4089 4090 btrfs_unfreeze_block_group(block_group); 4091 4092 return ret; 4093 } 4094 4095 bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info) 4096 { 4097 return btrfs_super_cache_generation(fs_info->super_copy); 4098 } 4099 4100 static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, 4101 struct btrfs_trans_handle *trans) 4102 { 4103 struct btrfs_block_group *block_group; 4104 struct rb_node *node; 4105 int ret = 0; 4106 4107 btrfs_info(fs_info, "cleaning free space cache v1"); 4108 4109 node = rb_first_cached(&fs_info->block_group_cache_tree); 4110 while (node) { 4111 block_group = rb_entry(node, struct btrfs_block_group, cache_node); 4112 ret = btrfs_remove_free_space_inode(trans, NULL, block_group); 4113 if (ret) 4114 goto out; 4115 node = rb_next(node); 4116 } 4117 out: 4118 return ret; 4119 } 4120 4121 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active) 4122 { 4123 struct btrfs_trans_handle *trans; 4124 int ret; 4125 4126 /* 4127 * update_super_roots will appropriately set or unset 4128 * super_copy->cache_generation based on SPACE_CACHE and 4129 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a 4130 * transaction commit whether we are enabling space cache v1 and don't 4131 * have any other work to do, or are disabling it and removing free 4132 * space inodes. 4133 */ 4134 trans = btrfs_start_transaction(fs_info->tree_root, 0); 4135 if (IS_ERR(trans)) 4136 return PTR_ERR(trans); 4137 4138 if (!active) { 4139 set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 4140 ret = cleanup_free_space_cache_v1(fs_info, trans); 4141 if (ret) { 4142 btrfs_abort_transaction(trans, ret); 4143 btrfs_end_transaction(trans); 4144 goto out; 4145 } 4146 } 4147 4148 ret = btrfs_commit_transaction(trans); 4149 out: 4150 clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags); 4151 4152 return ret; 4153 } 4154 4155 int __init btrfs_free_space_init(void) 4156 { 4157 btrfs_free_space_cachep = KMEM_CACHE(btrfs_free_space, 0); 4158 if (!btrfs_free_space_cachep) 4159 return -ENOMEM; 4160 4161 btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", 4162 PAGE_SIZE, PAGE_SIZE, 4163 0, NULL); 4164 if (!btrfs_free_space_bitmap_cachep) { 4165 kmem_cache_destroy(btrfs_free_space_cachep); 4166 return -ENOMEM; 4167 } 4168 4169 return 0; 4170 } 4171 4172 void __cold btrfs_free_space_exit(void) 4173 { 4174 kmem_cache_destroy(btrfs_free_space_cachep); 4175 kmem_cache_destroy(btrfs_free_space_bitmap_cachep); 4176 } 4177 4178 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4179 /* 4180 * Use this if you need to make a bitmap or extent entry specifically, it 4181 * doesn't do any of the merging that add_free_space does, this acts a lot like 4182 * how the free space cache loading stuff works, so you can get really weird 4183 * configurations. 4184 */ 4185 int test_add_free_space_entry(struct btrfs_block_group *cache, 4186 u64 offset, u64 bytes, bool bitmap) 4187 { 4188 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 4189 struct btrfs_free_space *info = NULL, *bitmap_info; 4190 void *map = NULL; 4191 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; 4192 u64 bytes_added; 4193 int ret; 4194 4195 again: 4196 if (!info) { 4197 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 4198 if (!info) 4199 return -ENOMEM; 4200 } 4201 4202 if (!bitmap) { 4203 spin_lock(&ctl->tree_lock); 4204 info->offset = offset; 4205 info->bytes = bytes; 4206 info->max_extent_size = 0; 4207 ret = link_free_space(ctl, info); 4208 spin_unlock(&ctl->tree_lock); 4209 if (ret) 4210 kmem_cache_free(btrfs_free_space_cachep, info); 4211 return ret; 4212 } 4213 4214 if (!map) { 4215 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); 4216 if (!map) { 4217 kmem_cache_free(btrfs_free_space_cachep, info); 4218 return -ENOMEM; 4219 } 4220 } 4221 4222 spin_lock(&ctl->tree_lock); 4223 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 4224 1, 0); 4225 if (!bitmap_info) { 4226 info->bitmap = map; 4227 map = NULL; 4228 add_new_bitmap(ctl, info, offset); 4229 bitmap_info = info; 4230 info = NULL; 4231 } 4232 4233 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 4234 trim_state); 4235 4236 bytes -= bytes_added; 4237 offset += bytes_added; 4238 spin_unlock(&ctl->tree_lock); 4239 4240 if (bytes) 4241 goto again; 4242 4243 if (info) 4244 kmem_cache_free(btrfs_free_space_cachep, info); 4245 if (map) 4246 kmem_cache_free(btrfs_free_space_bitmap_cachep, map); 4247 return 0; 4248 } 4249 4250 /* 4251 * Checks to see if the given range is in the free space cache. This is really 4252 * just used to check the absence of space, so if there is free space in the 4253 * range at all we will return 1. 4254 */ 4255 int test_check_exists(struct btrfs_block_group *cache, 4256 u64 offset, u64 bytes) 4257 { 4258 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 4259 struct btrfs_free_space *info; 4260 int ret = 0; 4261 4262 spin_lock(&ctl->tree_lock); 4263 info = tree_search_offset(ctl, offset, 0, 0); 4264 if (!info) { 4265 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 4266 1, 0); 4267 if (!info) 4268 goto out; 4269 } 4270 4271 have_info: 4272 if (info->bitmap) { 4273 u64 bit_off, bit_bytes; 4274 struct rb_node *n; 4275 struct btrfs_free_space *tmp; 4276 4277 bit_off = offset; 4278 bit_bytes = ctl->unit; 4279 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); 4280 if (!ret) { 4281 if (bit_off == offset) { 4282 ret = 1; 4283 goto out; 4284 } else if (bit_off > offset && 4285 offset + bytes > bit_off) { 4286 ret = 1; 4287 goto out; 4288 } 4289 } 4290 4291 n = rb_prev(&info->offset_index); 4292 while (n) { 4293 tmp = rb_entry(n, struct btrfs_free_space, 4294 offset_index); 4295 if (tmp->offset + tmp->bytes < offset) 4296 break; 4297 if (offset + bytes < tmp->offset) { 4298 n = rb_prev(&tmp->offset_index); 4299 continue; 4300 } 4301 info = tmp; 4302 goto have_info; 4303 } 4304 4305 n = rb_next(&info->offset_index); 4306 while (n) { 4307 tmp = rb_entry(n, struct btrfs_free_space, 4308 offset_index); 4309 if (offset + bytes < tmp->offset) 4310 break; 4311 if (tmp->offset + tmp->bytes < offset) { 4312 n = rb_next(&tmp->offset_index); 4313 continue; 4314 } 4315 info = tmp; 4316 goto have_info; 4317 } 4318 4319 ret = 0; 4320 goto out; 4321 } 4322 4323 if (info->offset == offset) { 4324 ret = 1; 4325 goto out; 4326 } 4327 4328 if (offset > info->offset && offset < info->offset + info->bytes) 4329 ret = 1; 4330 out: 4331 spin_unlock(&ctl->tree_lock); 4332 return ret; 4333 } 4334 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 4335