// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "volumes.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "subpage.h"
#include "inode-item.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "super.h"

#define BITS_PER_BITMAP		(PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG	SZ_64K
#define FORCE_EXTENT_THRESHOLD	SZ_1M

static struct kmem_cache *btrfs_free_space_cachep;
static struct kmem_cache *btrfs_free_space_bitmap_cachep;

struct btrfs_trim_range {
	u64 start;
	u64 bytes;
	struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, bool update_stat);
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes, bool for_alloc);
static void free_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info);
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes, bool update_stats);

static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_free_space *info;
	struct rb_node *node;

	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		if (!info->bitmap) {
			unlink_free_space(ctl, info, true);
			kmem_cache_free(btrfs_free_space_cachep, info);
		} else {
			free_bitmap(ctl, info);
		}

		cond_resched_lock(&ctl->tree_lock);
	}
}

static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
					       struct btrfs_path *path,
					       u64 offset)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	unsigned nofs_flag;
	int ret;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(path);

	/*
	 * We are often under a trans handle at this point, so we need to make
	 * sure NOFS is set to keep us from deadlocking.
	 */
	nofs_flag = memalloc_nofs_save();
	inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
	btrfs_release_path(path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(inode))
		return inode;

	mapping_set_gfp_mask(inode->i_mapping,
			mapping_gfp_constraint(inode->i_mapping,
			~(__GFP_FS | __GFP_HIGHMEM)));

	return inode;
}

struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
		struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct inode *inode = NULL;
	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(fs_info->tree_root, path,
					  block_group->start);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&block_group->lock);
	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
		btrfs_info(fs_info, "Old style space inode found, converting.");
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
			BTRFS_INODE_NODATACOW;
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
	}

	if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags))
		block_group->inode = igrab(inode);
	spin_unlock(&block_group->lock);

	return inode;
}

static int __create_free_space_inode(struct btrfs_root *root,
				     struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     u64 ino, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	/* We inline CRCs for the free disk space cache */
	const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
			  BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
	int ret;

	ret = btrfs_insert_empty_inode(trans, root, path, ino);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memzero_extent_buffer(leaf, (unsigned long)inode_item,
			      sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, flags);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item, offset);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = offset;
	key.type = 0;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	return 0;
}

int
create_free_space_inode(struct btrfs_trans_handle *trans,
			struct btrfs_block_group *block_group,
			struct btrfs_path *path)
{
	int ret;
	u64 ino;

	ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
	if (ret < 0)
		return ret;

	return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
					 ino, block_group->start);
}

/*
 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 * handles lookup, otherwise it takes ownership and iputs the inode.
 * Don't reuse an inode pointer after passing it into this function.
 */
int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
				  struct inode *inode,
				  struct btrfs_block_group *block_group)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!inode)
		inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		if (PTR_ERR(inode) != -ENOENT)
			ret = PTR_ERR(inode);
		goto out;
	}
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret) {
		btrfs_add_delayed_iput(BTRFS_I(inode));
		goto out;
	}
	clear_nlink(inode);
	/* One for the block group's ref */
	spin_lock(&block_group->lock);
	if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) {
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		iput(inode);
	} else {
		spin_unlock(&block_group->lock);
	}
	/* One for the lookup ref */
	btrfs_add_delayed_iput(BTRFS_I(inode));

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.type = 0;
	key.offset = block_group->start;
	ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
				-1, 1);
	if (ret) {
		if (ret > 0)
			ret = 0;
		goto out;
	}
	ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
				       struct btrfs_block_rsv *rsv)
{
	u64 needed_bytes;
	int ret;

	/* 1 for slack space, 1 for updating the inode */
	needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
		       btrfs_calc_metadata_size(fs_info, 1);

	spin_lock(&rsv->lock);
	if (rsv->reserved < needed_bytes)
		ret = -ENOSPC;
	else
		ret = 0;
	spin_unlock(&rsv->lock);
	return ret;
}

int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
				    struct btrfs_block_group *block_group,
				    struct inode *vfs_inode)
{
	struct btrfs_truncate_control control = {
		.inode = BTRFS_I(vfs_inode),
		.new_size = 0,
		.ino = btrfs_ino(BTRFS_I(vfs_inode)),
		.min_type = BTRFS_EXTENT_DATA_KEY,
		.clear_extent_range = true,
	};
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_root *root = inode->root;
	struct extent_state *cached_state = NULL;
	int ret = 0;
	bool locked = false;

	if (block_group) {
		struct btrfs_path *path = btrfs_alloc_path();

		if (!path) {
			ret = -ENOMEM;
			goto fail;
		}
		locked = true;
		mutex_lock(&trans->transaction->cache_write_mutex);
		if (!list_empty(&block_group->io_list)) {
			list_del_init(&block_group->io_list);

			btrfs_wait_cache_io(trans, block_group, path);
			btrfs_put_block_group(block_group);
		}

		/*
		 * Now that we've truncated the cache away, it's no longer
		 * set up or written.
		 */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
	}

	btrfs_i_size_write(inode, 0);
	truncate_pagecache(vfs_inode, 0);

	lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);

	/*
	 * We skip the throttling logic for free space cache inodes, so we don't
	 * need to check for -EAGAIN.
	 */
	ret = btrfs_truncate_inode_items(trans, root, &control);

	inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
	btrfs_inode_safe_disk_i_size_write(inode, control.last_size);

	unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
	if (ret)
		goto fail;

	ret = btrfs_update_inode(trans, root, inode);

fail:
	if (locked)
		mutex_unlock(&trans->transaction->cache_write_mutex);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	return ret;
}

static void readahead_cache(struct inode *inode)
{
	struct file_ra_state ra;
	unsigned long last_index;

	file_ra_state_init(&ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
}

static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
		       int write)
{
	int num_pages;

	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	/* Make sure we can fit our crcs and generation into the first page */
	if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
		return -ENOSPC;

	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

	io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
	if (!io_ctl->pages)
		return -ENOMEM;

	io_ctl->num_pages = num_pages;
	io_ctl->fs_info = btrfs_sb(inode->i_sb);
	io_ctl->inode = inode;

	return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
	kfree(io_ctl->pages);
	io_ctl->pages = NULL;
}

static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
	if (io_ctl->cur) {
		io_ctl->cur = NULL;
		io_ctl->orig = NULL;
	}
}

static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
	ASSERT(io_ctl->index < io_ctl->num_pages);
	io_ctl->page = io_ctl->pages[io_ctl->index++];
	io_ctl->cur = page_address(io_ctl->page);
	io_ctl->orig = io_ctl->cur;
	io_ctl->size = PAGE_SIZE;
	if (clear)
		clear_page(io_ctl->cur);
}

static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
	int i;

	io_ctl_unmap_page(io_ctl);

	for (i = 0; i < io_ctl->num_pages; i++) {
		if (io_ctl->pages[i]) {
			btrfs_page_clear_checked(io_ctl->fs_info,
					io_ctl->pages[i],
					page_offset(io_ctl->pages[i]),
					PAGE_SIZE);
			unlock_page(io_ctl->pages[i]);
			put_page(io_ctl->pages[i]);
		}
	}
}

static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
	struct page *page;
	struct inode *inode = io_ctl->inode;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	int i;

	for (i = 0; i < io_ctl->num_pages; i++) {
		int ret;

		page = find_or_create_page(inode->i_mapping, i, mask);
		if (!page) {
			io_ctl_drop_pages(io_ctl);
			return -ENOMEM;
		}

		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
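			/*
			 * io_ctl->pages[i] was not set for this page, so
			 * io_ctl_drop_pages() below will not release it;
			 * drop our reference on it explicitly before bailing.
			 */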
			put_page(page);
			io_ctl_drop_pages(io_ctl);
			return ret;
		}

		io_ctl->pages[i] = page;
		if (uptodate && !PageUptodate(page)) {
			btrfs_read_folio(NULL, page_folio(page));
			lock_page(page);
			if (page->mapping != inode->i_mapping) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "free space cache page truncated");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
			if (!PageUptodate(page)) {
				btrfs_err(BTRFS_I(inode)->root->fs_info,
					  "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
			}
		}
	}

	for (i = 0; i < io_ctl->num_pages; i++)
		clear_page_dirty_for_io(io_ctl->pages[i]);

	return 0;
}

static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	io_ctl_map_page(io_ctl, 1);

	/*
	 * Skip the csum areas. If we don't check crcs then we just have a
	 * 64bit chunk at the front of the first page.
	 */
	io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	put_unaligned_le64(generation, io_ctl->cur);
	io_ctl->cur += sizeof(u64);
}

static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
	u64 cache_gen;

	/*
	 * Skip the crc area. If we don't check crcs then we just have a 64bit
	 * chunk at the front of the first page.
	 */
	io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
	io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);

	cache_gen = get_unaligned_le64(io_ctl->cur);
	if (cache_gen != generation) {
		btrfs_err_rl(io_ctl->fs_info,
			"space cache generation (%llu) does not match inode (%llu)",
			cache_gen, generation);
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}
	io_ctl->cur += sizeof(u64);
	return 0;
}

static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	io_ctl_unmap_page(io_ctl);
	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	*tmp = crc;
}

static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
	u32 *tmp, val;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	tmp = page_address(io_ctl->pages[0]);
	tmp += index;
	val = *tmp;

	io_ctl_map_page(io_ctl, 0);
	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	btrfs_crc32c_final(crc, (u8 *)&crc);
	if (val != crc) {
		btrfs_err_rl(io_ctl->fs_info,
			"csum mismatch on free space cache");
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}

	return 0;
}

static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
			    void *bitmap)
{
	struct btrfs_free_space_entry *entry;

	if (!io_ctl->cur)
		return -ENOSPC;

	entry = io_ctl->cur;
	put_unaligned_le64(offset, &entry->offset);
	put_unaligned_le64(bytes, &entry->bytes);
	entry->type = (bitmap) ?
		BTRFS_FREE_SPACE_BITMAP :
		BTRFS_FREE_SPACE_EXTENT;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_set_crc(io_ctl, io_ctl->index - 1);

	/* No more pages to map */
	if (io_ctl->index >= io_ctl->num_pages)
		return 0;

	/* map the next page */
	io_ctl_map_page(io_ctl, 1);
	return 0;
}

static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
	if (!io_ctl->cur)
		return -ENOSPC;

	/*
	 * If we aren't at the start of the current page, unmap this one and
	 * map the next one if there is any left.
	 */
	if (io_ctl->cur != io_ctl->orig) {
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
		if (io_ctl->index >= io_ctl->num_pages)
			return -ENOSPC;
		io_ctl_map_page(io_ctl, 0);
	}

	copy_page(io_ctl->cur, bitmap);
	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	if (io_ctl->index < io_ctl->num_pages)
		io_ctl_map_page(io_ctl, 0);
	return 0;
}

static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
	/*
	 * If we're not on the boundary we know we've modified the page and we
	 * need to crc the page.
	 */
	if (io_ctl->cur != io_ctl->orig)
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	else
		io_ctl_unmap_page(io_ctl);

	while (io_ctl->index < io_ctl->num_pages) {
		io_ctl_map_page(io_ctl, 1);
		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
	}
}

static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
			     struct btrfs_free_space *entry, u8 *type)
{
	struct btrfs_free_space_entry *e;
	int ret;

	if (!io_ctl->cur) {
		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
		if (ret)
			return ret;
	}

	e = io_ctl->cur;
	entry->offset = get_unaligned_le64(&e->offset);
	entry->bytes = get_unaligned_le64(&e->bytes);
	*type = e->type;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_unmap_page(io_ctl);

	return 0;
}

static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
			      struct btrfs_free_space *entry)
{
	int ret;

	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
	if (ret)
		return ret;

	copy_page(entry->bitmap, io_ctl->cur);
	io_ctl_unmap_page(io_ctl);

	return 0;
}

static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group *block_group = ctl->block_group;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->length;
	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
	u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	max_bitmaps = max_t(u64, max_bitmaps, 1);

	if (ctl->total_bitmaps > max_bitmaps)
		btrfs_err(block_group->fs_info,
"invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu",
			  block_group->start, block_group->length,
			  ctl->total_bitmaps, ctl->unit, max_bitmaps,
			  bytes_per_bg);
	ASSERT(ctl->total_bitmaps <= max_bitmaps);

	/*
	 * We are trying to keep the total amount of memory used per 1GiB of
	 * space to be MAX_CACHE_BYTES_PER_GIG.
However, with a reclamation 731 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of 732 * bitmaps, we may end up using more memory than this. 733 */ 734 if (size < SZ_1G) 735 max_bytes = MAX_CACHE_BYTES_PER_GIG; 736 else 737 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); 738 739 bitmap_bytes = ctl->total_bitmaps * ctl->unit; 740 741 /* 742 * we want the extent entry threshold to always be at most 1/2 the max 743 * bytes we can have, or whatever is less than that. 744 */ 745 extent_bytes = max_bytes - bitmap_bytes; 746 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); 747 748 ctl->extents_thresh = 749 div_u64(extent_bytes, sizeof(struct btrfs_free_space)); 750 } 751 752 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 753 struct btrfs_free_space_ctl *ctl, 754 struct btrfs_path *path, u64 offset) 755 { 756 struct btrfs_fs_info *fs_info = root->fs_info; 757 struct btrfs_free_space_header *header; 758 struct extent_buffer *leaf; 759 struct btrfs_io_ctl io_ctl; 760 struct btrfs_key key; 761 struct btrfs_free_space *e, *n; 762 LIST_HEAD(bitmaps); 763 u64 num_entries; 764 u64 num_bitmaps; 765 u64 generation; 766 u8 type; 767 int ret = 0; 768 769 /* Nothing in the space cache, goodbye */ 770 if (!i_size_read(inode)) 771 return 0; 772 773 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 774 key.offset = offset; 775 key.type = 0; 776 777 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 778 if (ret < 0) 779 return 0; 780 else if (ret > 0) { 781 btrfs_release_path(path); 782 return 0; 783 } 784 785 ret = -1; 786 787 leaf = path->nodes[0]; 788 header = btrfs_item_ptr(leaf, path->slots[0], 789 struct btrfs_free_space_header); 790 num_entries = btrfs_free_space_entries(leaf, header); 791 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 792 generation = btrfs_free_space_generation(leaf, header); 793 btrfs_release_path(path); 794 795 if (!BTRFS_I(inode)->generation) { 796 btrfs_info(fs_info, 797 "the free space cache file (%llu) is invalid, skip it", 798 offset); 799 return 0; 800 } 801 802 if (BTRFS_I(inode)->generation != generation) { 803 btrfs_err(fs_info, 804 "free space inode generation (%llu) did not match free space cache generation (%llu)", 805 BTRFS_I(inode)->generation, generation); 806 return 0; 807 } 808 809 if (!num_entries) 810 return 0; 811 812 ret = io_ctl_init(&io_ctl, inode, 0); 813 if (ret) 814 return ret; 815 816 readahead_cache(inode); 817 818 ret = io_ctl_prepare_pages(&io_ctl, true); 819 if (ret) 820 goto out; 821 822 ret = io_ctl_check_crc(&io_ctl, 0); 823 if (ret) 824 goto free_cache; 825 826 ret = io_ctl_check_generation(&io_ctl, generation); 827 if (ret) 828 goto free_cache; 829 830 while (num_entries) { 831 e = kmem_cache_zalloc(btrfs_free_space_cachep, 832 GFP_NOFS); 833 if (!e) { 834 ret = -ENOMEM; 835 goto free_cache; 836 } 837 838 ret = io_ctl_read_entry(&io_ctl, e, &type); 839 if (ret) { 840 kmem_cache_free(btrfs_free_space_cachep, e); 841 goto free_cache; 842 } 843 844 if (!e->bytes) { 845 ret = -1; 846 kmem_cache_free(btrfs_free_space_cachep, e); 847 goto free_cache; 848 } 849 850 if (type == BTRFS_FREE_SPACE_EXTENT) { 851 spin_lock(&ctl->tree_lock); 852 ret = link_free_space(ctl, e); 853 spin_unlock(&ctl->tree_lock); 854 if (ret) { 855 btrfs_err(fs_info, 856 "Duplicate entries in free space cache, dumping"); 857 kmem_cache_free(btrfs_free_space_cachep, e); 858 goto free_cache; 859 } 860 } else { 861 ASSERT(num_bitmaps); 862 num_bitmaps--; 863 e->bitmap = kmem_cache_zalloc( 864 
btrfs_free_space_bitmap_cachep, GFP_NOFS); 865 if (!e->bitmap) { 866 ret = -ENOMEM; 867 kmem_cache_free( 868 btrfs_free_space_cachep, e); 869 goto free_cache; 870 } 871 spin_lock(&ctl->tree_lock); 872 ret = link_free_space(ctl, e); 873 ctl->total_bitmaps++; 874 recalculate_thresholds(ctl); 875 spin_unlock(&ctl->tree_lock); 876 if (ret) { 877 btrfs_err(fs_info, 878 "Duplicate entries in free space cache, dumping"); 879 kmem_cache_free(btrfs_free_space_cachep, e); 880 goto free_cache; 881 } 882 list_add_tail(&e->list, &bitmaps); 883 } 884 885 num_entries--; 886 } 887 888 io_ctl_unmap_page(&io_ctl); 889 890 /* 891 * We add the bitmaps at the end of the entries in order that 892 * the bitmap entries are added to the cache. 893 */ 894 list_for_each_entry_safe(e, n, &bitmaps, list) { 895 list_del_init(&e->list); 896 ret = io_ctl_read_bitmap(&io_ctl, e); 897 if (ret) 898 goto free_cache; 899 } 900 901 io_ctl_drop_pages(&io_ctl); 902 ret = 1; 903 out: 904 io_ctl_free(&io_ctl); 905 return ret; 906 free_cache: 907 io_ctl_drop_pages(&io_ctl); 908 909 spin_lock(&ctl->tree_lock); 910 __btrfs_remove_free_space_cache(ctl); 911 spin_unlock(&ctl->tree_lock); 912 goto out; 913 } 914 915 static int copy_free_space_cache(struct btrfs_block_group *block_group, 916 struct btrfs_free_space_ctl *ctl) 917 { 918 struct btrfs_free_space *info; 919 struct rb_node *n; 920 int ret = 0; 921 922 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { 923 info = rb_entry(n, struct btrfs_free_space, offset_index); 924 if (!info->bitmap) { 925 unlink_free_space(ctl, info, true); 926 ret = btrfs_add_free_space(block_group, info->offset, 927 info->bytes); 928 kmem_cache_free(btrfs_free_space_cachep, info); 929 } else { 930 u64 offset = info->offset; 931 u64 bytes = ctl->unit; 932 933 while (search_bitmap(ctl, info, &offset, &bytes, 934 false) == 0) { 935 ret = btrfs_add_free_space(block_group, offset, 936 bytes); 937 if (ret) 938 break; 939 bitmap_clear_bits(ctl, info, offset, bytes, true); 940 offset = info->offset; 941 bytes = ctl->unit; 942 } 943 free_bitmap(ctl, info); 944 } 945 cond_resched(); 946 } 947 return ret; 948 } 949 950 static struct lock_class_key btrfs_free_space_inode_key; 951 952 int load_free_space_cache(struct btrfs_block_group *block_group) 953 { 954 struct btrfs_fs_info *fs_info = block_group->fs_info; 955 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 956 struct btrfs_free_space_ctl tmp_ctl = {}; 957 struct inode *inode; 958 struct btrfs_path *path; 959 int ret = 0; 960 bool matched; 961 u64 used = block_group->used; 962 963 /* 964 * Because we could potentially discard our loaded free space, we want 965 * to load everything into a temporary structure first, and then if it's 966 * valid copy it all into the actual free space ctl. 967 */ 968 btrfs_init_free_space_ctl(block_group, &tmp_ctl); 969 970 /* 971 * If this block group has been marked to be cleared for one reason or 972 * another then we can't trust the on disk cache, so just return. 973 */ 974 spin_lock(&block_group->lock); 975 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 976 spin_unlock(&block_group->lock); 977 return 0; 978 } 979 spin_unlock(&block_group->lock); 980 981 path = btrfs_alloc_path(); 982 if (!path) 983 return 0; 984 path->search_commit_root = 1; 985 path->skip_locking = 1; 986 987 /* 988 * We must pass a path with search_commit_root set to btrfs_iget in 989 * order to avoid a deadlock when allocating extents for the tree root. 
	 *
	 * When we are COWing an extent buffer from the tree root, when looking
	 * for a free extent, at extent-tree.c:find_free_extent(), we can find
	 * a block group without its free space cache loaded. When we find one
	 * we must load its space cache which requires reading its free space
	 * cache's inode item from the root tree. If this inode item is located
	 * in the same leaf that we started COWing before, then we end up in a
	 * deadlock on the extent buffer (trying to read lock it when we
	 * previously write locked it).
	 *
	 * It's safe to read the inode item using the commit root because
	 * block groups, once loaded, stay in memory forever (until they are
	 * removed) as well as their space caches once loaded. New block groups
	 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
	 * we will never try to read their inode item while the fs is mounted.
	 */
	inode = lookup_free_space_inode(block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

	/* We may have converted the inode and made the cache invalid. */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		btrfs_free_path(path);
		goto out;
	}
	spin_unlock(&block_group->lock);

	/*
	 * Reinitialize the class of struct inode's mapping->invalidate_lock for
	 * free space inodes to prevent false positives related to locks for normal
	 * inodes.
	 */
	lockdep_set_class(&(&inode->i_data)->invalidate_lock,
			  &btrfs_free_space_inode_key);

	ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
				      path, block_group->start);
	btrfs_free_path(path);
	if (ret <= 0)
		goto out;

	matched = (tmp_ctl.free_space == (block_group->length - used -
					  block_group->bytes_super));

	if (matched) {
		ret = copy_free_space_cache(block_group, &tmp_ctl);
		/*
		 * ret == 1 means we successfully loaded the free space cache,
		 * so we need to re-set it here.
		 */
		if (ret == 0)
			ret = 1;
	} else {
		/*
		 * We need to call the _locked variant so we don't try to update
		 * the discard counters.
1050 */ 1051 spin_lock(&tmp_ctl.tree_lock); 1052 __btrfs_remove_free_space_cache(&tmp_ctl); 1053 spin_unlock(&tmp_ctl.tree_lock); 1054 btrfs_warn(fs_info, 1055 "block group %llu has wrong amount of free space", 1056 block_group->start); 1057 ret = -1; 1058 } 1059 out: 1060 if (ret < 0) { 1061 /* This cache is bogus, make sure it gets cleared */ 1062 spin_lock(&block_group->lock); 1063 block_group->disk_cache_state = BTRFS_DC_CLEAR; 1064 spin_unlock(&block_group->lock); 1065 ret = 0; 1066 1067 btrfs_warn(fs_info, 1068 "failed to load free space cache for block group %llu, rebuilding it now", 1069 block_group->start); 1070 } 1071 1072 spin_lock(&ctl->tree_lock); 1073 btrfs_discard_update_discardable(block_group); 1074 spin_unlock(&ctl->tree_lock); 1075 iput(inode); 1076 return ret; 1077 } 1078 1079 static noinline_for_stack 1080 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, 1081 struct btrfs_free_space_ctl *ctl, 1082 struct btrfs_block_group *block_group, 1083 int *entries, int *bitmaps, 1084 struct list_head *bitmap_list) 1085 { 1086 int ret; 1087 struct btrfs_free_cluster *cluster = NULL; 1088 struct btrfs_free_cluster *cluster_locked = NULL; 1089 struct rb_node *node = rb_first(&ctl->free_space_offset); 1090 struct btrfs_trim_range *trim_entry; 1091 1092 /* Get the cluster for this block_group if it exists */ 1093 if (block_group && !list_empty(&block_group->cluster_list)) { 1094 cluster = list_entry(block_group->cluster_list.next, 1095 struct btrfs_free_cluster, 1096 block_group_list); 1097 } 1098 1099 if (!node && cluster) { 1100 cluster_locked = cluster; 1101 spin_lock(&cluster_locked->lock); 1102 node = rb_first(&cluster->root); 1103 cluster = NULL; 1104 } 1105 1106 /* Write out the extent entries */ 1107 while (node) { 1108 struct btrfs_free_space *e; 1109 1110 e = rb_entry(node, struct btrfs_free_space, offset_index); 1111 *entries += 1; 1112 1113 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 1114 e->bitmap); 1115 if (ret) 1116 goto fail; 1117 1118 if (e->bitmap) { 1119 list_add_tail(&e->list, bitmap_list); 1120 *bitmaps += 1; 1121 } 1122 node = rb_next(node); 1123 if (!node && cluster) { 1124 node = rb_first(&cluster->root); 1125 cluster_locked = cluster; 1126 spin_lock(&cluster_locked->lock); 1127 cluster = NULL; 1128 } 1129 } 1130 if (cluster_locked) { 1131 spin_unlock(&cluster_locked->lock); 1132 cluster_locked = NULL; 1133 } 1134 1135 /* 1136 * Make sure we don't miss any range that was removed from our rbtree 1137 * because trimming is running. Otherwise after a umount+mount (or crash 1138 * after committing the transaction) we would leak free space and get 1139 * an inconsistent free space cache report from fsck. 
1140 */ 1141 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 1142 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 1143 trim_entry->bytes, NULL); 1144 if (ret) 1145 goto fail; 1146 *entries += 1; 1147 } 1148 1149 return 0; 1150 fail: 1151 if (cluster_locked) 1152 spin_unlock(&cluster_locked->lock); 1153 return -ENOSPC; 1154 } 1155 1156 static noinline_for_stack int 1157 update_cache_item(struct btrfs_trans_handle *trans, 1158 struct btrfs_root *root, 1159 struct inode *inode, 1160 struct btrfs_path *path, u64 offset, 1161 int entries, int bitmaps) 1162 { 1163 struct btrfs_key key; 1164 struct btrfs_free_space_header *header; 1165 struct extent_buffer *leaf; 1166 int ret; 1167 1168 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1169 key.offset = offset; 1170 key.type = 0; 1171 1172 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1173 if (ret < 0) { 1174 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1175 EXTENT_DELALLOC, NULL); 1176 goto fail; 1177 } 1178 leaf = path->nodes[0]; 1179 if (ret > 0) { 1180 struct btrfs_key found_key; 1181 ASSERT(path->slots[0]); 1182 path->slots[0]--; 1183 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1184 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1185 found_key.offset != offset) { 1186 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 1187 inode->i_size - 1, EXTENT_DELALLOC, 1188 NULL); 1189 btrfs_release_path(path); 1190 goto fail; 1191 } 1192 } 1193 1194 BTRFS_I(inode)->generation = trans->transid; 1195 header = btrfs_item_ptr(leaf, path->slots[0], 1196 struct btrfs_free_space_header); 1197 btrfs_set_free_space_entries(leaf, header, entries); 1198 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1199 btrfs_set_free_space_generation(leaf, header, trans->transid); 1200 btrfs_mark_buffer_dirty(leaf); 1201 btrfs_release_path(path); 1202 1203 return 0; 1204 1205 fail: 1206 return -1; 1207 } 1208 1209 static noinline_for_stack int write_pinned_extent_entries( 1210 struct btrfs_trans_handle *trans, 1211 struct btrfs_block_group *block_group, 1212 struct btrfs_io_ctl *io_ctl, 1213 int *entries) 1214 { 1215 u64 start, extent_start, extent_end, len; 1216 struct extent_io_tree *unpin = NULL; 1217 int ret; 1218 1219 if (!block_group) 1220 return 0; 1221 1222 /* 1223 * We want to add any pinned extents to our free space cache 1224 * so we don't leak the space 1225 * 1226 * We shouldn't have switched the pinned extents yet so this is the 1227 * right one 1228 */ 1229 unpin = &trans->transaction->pinned_extents; 1230 1231 start = block_group->start; 1232 1233 while (start < block_group->start + block_group->length) { 1234 ret = find_first_extent_bit(unpin, start, 1235 &extent_start, &extent_end, 1236 EXTENT_DIRTY, NULL); 1237 if (ret) 1238 return 0; 1239 1240 /* This pinned extent is out of our range */ 1241 if (extent_start >= block_group->start + block_group->length) 1242 return 0; 1243 1244 extent_start = max(extent_start, start); 1245 extent_end = min(block_group->start + block_group->length, 1246 extent_end + 1); 1247 len = extent_end - extent_start; 1248 1249 *entries += 1; 1250 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1251 if (ret) 1252 return -ENOSPC; 1253 1254 start = extent_end; 1255 } 1256 1257 return 0; 1258 } 1259 1260 static noinline_for_stack int 1261 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) 1262 { 1263 struct btrfs_free_space *entry, *next; 1264 int ret; 1265 1266 /* Write out the bitmaps */ 1267 list_for_each_entry_safe(entry, next, 
bitmap_list, list) { 1268 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1269 if (ret) 1270 return -ENOSPC; 1271 list_del_init(&entry->list); 1272 } 1273 1274 return 0; 1275 } 1276 1277 static int flush_dirty_cache(struct inode *inode) 1278 { 1279 int ret; 1280 1281 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1282 if (ret) 1283 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1284 EXTENT_DELALLOC, NULL); 1285 1286 return ret; 1287 } 1288 1289 static void noinline_for_stack 1290 cleanup_bitmap_list(struct list_head *bitmap_list) 1291 { 1292 struct btrfs_free_space *entry, *next; 1293 1294 list_for_each_entry_safe(entry, next, bitmap_list, list) 1295 list_del_init(&entry->list); 1296 } 1297 1298 static void noinline_for_stack 1299 cleanup_write_cache_enospc(struct inode *inode, 1300 struct btrfs_io_ctl *io_ctl, 1301 struct extent_state **cached_state) 1302 { 1303 io_ctl_drop_pages(io_ctl); 1304 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1305 cached_state); 1306 } 1307 1308 static int __btrfs_wait_cache_io(struct btrfs_root *root, 1309 struct btrfs_trans_handle *trans, 1310 struct btrfs_block_group *block_group, 1311 struct btrfs_io_ctl *io_ctl, 1312 struct btrfs_path *path, u64 offset) 1313 { 1314 int ret; 1315 struct inode *inode = io_ctl->inode; 1316 1317 if (!inode) 1318 return 0; 1319 1320 /* Flush the dirty pages in the cache file. */ 1321 ret = flush_dirty_cache(inode); 1322 if (ret) 1323 goto out; 1324 1325 /* Update the cache item to tell everyone this cache file is valid. */ 1326 ret = update_cache_item(trans, root, inode, path, offset, 1327 io_ctl->entries, io_ctl->bitmaps); 1328 out: 1329 if (ret) { 1330 invalidate_inode_pages2(inode->i_mapping); 1331 BTRFS_I(inode)->generation = 0; 1332 if (block_group) 1333 btrfs_debug(root->fs_info, 1334 "failed to write free space cache for block group %llu error %d", 1335 block_group->start, ret); 1336 } 1337 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1338 1339 if (block_group) { 1340 /* the dirty list is protected by the dirty_bgs_lock */ 1341 spin_lock(&trans->transaction->dirty_bgs_lock); 1342 1343 /* the disk_cache_state is protected by the block group lock */ 1344 spin_lock(&block_group->lock); 1345 1346 /* 1347 * only mark this as written if we didn't get put back on 1348 * the dirty list while waiting for IO. Otherwise our 1349 * cache state won't be right, and we won't get written again 1350 */ 1351 if (!ret && list_empty(&block_group->dirty_list)) 1352 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1353 else if (ret) 1354 block_group->disk_cache_state = BTRFS_DC_ERROR; 1355 1356 spin_unlock(&block_group->lock); 1357 spin_unlock(&trans->transaction->dirty_bgs_lock); 1358 io_ctl->inode = NULL; 1359 iput(inode); 1360 } 1361 1362 return ret; 1363 1364 } 1365 1366 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 1367 struct btrfs_block_group *block_group, 1368 struct btrfs_path *path) 1369 { 1370 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, 1371 block_group, &block_group->io_ctl, 1372 path, block_group->start); 1373 } 1374 1375 /* 1376 * Write out cached info to an inode. 
1377 * 1378 * @root: root the inode belongs to 1379 * @inode: freespace inode we are writing out 1380 * @ctl: free space cache we are going to write out 1381 * @block_group: block_group for this cache if it belongs to a block_group 1382 * @io_ctl: holds context for the io 1383 * @trans: the trans handle 1384 * 1385 * This function writes out a free space cache struct to disk for quick recovery 1386 * on mount. This will return 0 if it was successful in writing the cache out, 1387 * or an errno if it was not. 1388 */ 1389 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 1390 struct btrfs_free_space_ctl *ctl, 1391 struct btrfs_block_group *block_group, 1392 struct btrfs_io_ctl *io_ctl, 1393 struct btrfs_trans_handle *trans) 1394 { 1395 struct extent_state *cached_state = NULL; 1396 LIST_HEAD(bitmap_list); 1397 int entries = 0; 1398 int bitmaps = 0; 1399 int ret; 1400 int must_iput = 0; 1401 1402 if (!i_size_read(inode)) 1403 return -EIO; 1404 1405 WARN_ON(io_ctl->pages); 1406 ret = io_ctl_init(io_ctl, inode, 1); 1407 if (ret) 1408 return ret; 1409 1410 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { 1411 down_write(&block_group->data_rwsem); 1412 spin_lock(&block_group->lock); 1413 if (block_group->delalloc_bytes) { 1414 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1415 spin_unlock(&block_group->lock); 1416 up_write(&block_group->data_rwsem); 1417 BTRFS_I(inode)->generation = 0; 1418 ret = 0; 1419 must_iput = 1; 1420 goto out; 1421 } 1422 spin_unlock(&block_group->lock); 1423 } 1424 1425 /* Lock all pages first so we can lock the extent safely. */ 1426 ret = io_ctl_prepare_pages(io_ctl, false); 1427 if (ret) 1428 goto out_unlock; 1429 1430 lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1431 &cached_state); 1432 1433 io_ctl_set_generation(io_ctl, trans->transid); 1434 1435 mutex_lock(&ctl->cache_writeout_mutex); 1436 /* Write out the extent entries in the free space cache */ 1437 spin_lock(&ctl->tree_lock); 1438 ret = write_cache_extent_entries(io_ctl, ctl, 1439 block_group, &entries, &bitmaps, 1440 &bitmap_list); 1441 if (ret) 1442 goto out_nospc_locked; 1443 1444 /* 1445 * Some spaces that are freed in the current transaction are pinned, 1446 * they will be added into free space cache after the transaction is 1447 * committed, we shouldn't lose them. 1448 * 1449 * If this changes while we are working we'll get added back to 1450 * the dirty list and redo it. No locking needed 1451 */ 1452 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); 1453 if (ret) 1454 goto out_nospc_locked; 1455 1456 /* 1457 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1458 * locked while doing it because a concurrent trim can be manipulating 1459 * or freeing the bitmap. 1460 */ 1461 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1462 spin_unlock(&ctl->tree_lock); 1463 mutex_unlock(&ctl->cache_writeout_mutex); 1464 if (ret) 1465 goto out_nospc; 1466 1467 /* Zero out the rest of the pages just to make sure */ 1468 io_ctl_zero_remaining_pages(io_ctl); 1469 1470 /* Everything is written out, now we dirty the pages in the file. 
*/ 1471 ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 1472 io_ctl->num_pages, 0, i_size_read(inode), 1473 &cached_state, false); 1474 if (ret) 1475 goto out_nospc; 1476 1477 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1478 up_write(&block_group->data_rwsem); 1479 /* 1480 * Release the pages and unlock the extent, we will flush 1481 * them out later 1482 */ 1483 io_ctl_drop_pages(io_ctl); 1484 io_ctl_free(io_ctl); 1485 1486 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1487 &cached_state); 1488 1489 /* 1490 * at this point the pages are under IO and we're happy, 1491 * The caller is responsible for waiting on them and updating 1492 * the cache and the inode 1493 */ 1494 io_ctl->entries = entries; 1495 io_ctl->bitmaps = bitmaps; 1496 1497 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); 1498 if (ret) 1499 goto out; 1500 1501 return 0; 1502 1503 out_nospc_locked: 1504 cleanup_bitmap_list(&bitmap_list); 1505 spin_unlock(&ctl->tree_lock); 1506 mutex_unlock(&ctl->cache_writeout_mutex); 1507 1508 out_nospc: 1509 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1510 1511 out_unlock: 1512 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1513 up_write(&block_group->data_rwsem); 1514 1515 out: 1516 io_ctl->inode = NULL; 1517 io_ctl_free(io_ctl); 1518 if (ret) { 1519 invalidate_inode_pages2(inode->i_mapping); 1520 BTRFS_I(inode)->generation = 0; 1521 } 1522 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1523 if (must_iput) 1524 iput(inode); 1525 return ret; 1526 } 1527 1528 int btrfs_write_out_cache(struct btrfs_trans_handle *trans, 1529 struct btrfs_block_group *block_group, 1530 struct btrfs_path *path) 1531 { 1532 struct btrfs_fs_info *fs_info = trans->fs_info; 1533 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1534 struct inode *inode; 1535 int ret = 0; 1536 1537 spin_lock(&block_group->lock); 1538 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1539 spin_unlock(&block_group->lock); 1540 return 0; 1541 } 1542 spin_unlock(&block_group->lock); 1543 1544 inode = lookup_free_space_inode(block_group, path); 1545 if (IS_ERR(inode)) 1546 return 0; 1547 1548 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, 1549 block_group, &block_group->io_ctl, trans); 1550 if (ret) { 1551 btrfs_debug(fs_info, 1552 "failed to write free space cache for block group %llu error %d", 1553 block_group->start, ret); 1554 spin_lock(&block_group->lock); 1555 block_group->disk_cache_state = BTRFS_DC_ERROR; 1556 spin_unlock(&block_group->lock); 1557 1558 block_group->io_ctl.inode = NULL; 1559 iput(inode); 1560 } 1561 1562 /* 1563 * if ret == 0 the caller is expected to call btrfs_wait_cache_io 1564 * to wait for IO and put the inode 1565 */ 1566 1567 return ret; 1568 } 1569 1570 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1571 u64 offset) 1572 { 1573 ASSERT(offset >= bitmap_start); 1574 offset -= bitmap_start; 1575 return (unsigned long)(div_u64(offset, unit)); 1576 } 1577 1578 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1579 { 1580 return (unsigned long)(div_u64(bytes, unit)); 1581 } 1582 1583 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1584 u64 offset) 1585 { 1586 u64 bitmap_start; 1587 u64 bytes_per_bitmap; 1588 1589 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1590 bitmap_start = offset - ctl->start; 1591 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1592 bitmap_start *= bytes_per_bitmap; 1593 bitmap_start += ctl->start; 1594 
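	/*
	 * Worked example (assuming 4 KiB pages and ctl->unit == 4096): each
	 * bitmap then covers BITS_PER_BITMAP * 4096 = 128 MiB, so an offset
	 * 200 MiB past ctl->start rounds down to the bitmap starting at
	 * ctl->start + 128 MiB.
	 */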
	return bitmap_start;
}

static int tree_insert_offset(struct rb_root *root, u64 offset,
			      struct rb_node *node, int bitmap)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset. If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap. So
			 * if we're inserting a bitmap and we find an entry at
			 * this offset, we want to go right, or after this entry
			 * logically. If we are inserting an extent and we've
			 * found a bitmap, we want to go left, or before
			 * logically.
			 */
			if (bitmap) {
				if (info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_right;
			} else {
				if (!info->bitmap) {
					WARN_ON_ONCE(1);
					return -EEXIST;
				}
				p = &(*p)->rb_left;
			}
		}
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);

	return 0;
}

/*
 * This is a little subtle. We *only* have ->max_extent_size set if we actually
 * searched through the bitmap and figured out the largest ->max_extent_size,
 * otherwise it's 0. In the case that it's 0 we don't want to tell the
 * allocator the wrong thing, we want to use the actual real max_extent_size
 * we've found already if it's larger, or we want to use ->bytes.
 *
 * This matters because find_free_space() will skip entries whose ->bytes is
 * less than the required bytes. So if we didn't search down this bitmap, we
 * may pick some previous entry that has a smaller ->max_extent_size than we
 * have. For example, assume we have two entries, one that has
 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set
 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will
 * call into find_free_space(), and return with max_extent_size == 4K, because
 * that first bitmap entry had ->max_extent_size set, but the second one did
 * not. If instead we returned 8K we'd come in searching for 8K, and find the
 * 8K contiguous range.
 *
 * Consider the other case, we have 2 8K chunks in that second entry and still
 * don't have ->max_extent_size set. We'll return 16K, and the next time the
 * allocator comes in it'll fully search our second bitmap, and this time it'll
 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the
 * right allocation the next loop through.
 */
static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
{
	if (entry->bitmap && entry->max_extent_size)
		return entry->max_extent_size;
	return entry->bytes;
}

/*
 * We want the largest entry to be leftmost, so this is inverted from what you'd
 * normally expect.
1683 */ 1684 static bool entry_less(struct rb_node *node, const struct rb_node *parent) 1685 { 1686 const struct btrfs_free_space *entry, *exist; 1687 1688 entry = rb_entry(node, struct btrfs_free_space, bytes_index); 1689 exist = rb_entry(parent, struct btrfs_free_space, bytes_index); 1690 return get_max_extent_size(exist) < get_max_extent_size(entry); 1691 } 1692 1693 /* 1694 * searches the tree for the given offset. 1695 * 1696 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1697 * want a section that has at least bytes size and comes at or after the given 1698 * offset. 1699 */ 1700 static struct btrfs_free_space * 1701 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1702 u64 offset, int bitmap_only, int fuzzy) 1703 { 1704 struct rb_node *n = ctl->free_space_offset.rb_node; 1705 struct btrfs_free_space *entry = NULL, *prev = NULL; 1706 1707 /* find entry that is closest to the 'offset' */ 1708 while (n) { 1709 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1710 prev = entry; 1711 1712 if (offset < entry->offset) 1713 n = n->rb_left; 1714 else if (offset > entry->offset) 1715 n = n->rb_right; 1716 else 1717 break; 1718 1719 entry = NULL; 1720 } 1721 1722 if (bitmap_only) { 1723 if (!entry) 1724 return NULL; 1725 if (entry->bitmap) 1726 return entry; 1727 1728 /* 1729 * bitmap entry and extent entry may share same offset, 1730 * in that case, bitmap entry comes after extent entry. 1731 */ 1732 n = rb_next(n); 1733 if (!n) 1734 return NULL; 1735 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1736 if (entry->offset != offset) 1737 return NULL; 1738 1739 WARN_ON(!entry->bitmap); 1740 return entry; 1741 } else if (entry) { 1742 if (entry->bitmap) { 1743 /* 1744 * if previous extent entry covers the offset, 1745 * we should return it instead of the bitmap entry 1746 */ 1747 n = rb_prev(&entry->offset_index); 1748 if (n) { 1749 prev = rb_entry(n, struct btrfs_free_space, 1750 offset_index); 1751 if (!prev->bitmap && 1752 prev->offset + prev->bytes > offset) 1753 entry = prev; 1754 } 1755 } 1756 return entry; 1757 } 1758 1759 if (!prev) 1760 return NULL; 1761 1762 /* find last entry before the 'offset' */ 1763 entry = prev; 1764 if (entry->offset > offset) { 1765 n = rb_prev(&entry->offset_index); 1766 if (n) { 1767 entry = rb_entry(n, struct btrfs_free_space, 1768 offset_index); 1769 ASSERT(entry->offset <= offset); 1770 } else { 1771 if (fuzzy) 1772 return entry; 1773 else 1774 return NULL; 1775 } 1776 } 1777 1778 if (entry->bitmap) { 1779 n = rb_prev(&entry->offset_index); 1780 if (n) { 1781 prev = rb_entry(n, struct btrfs_free_space, 1782 offset_index); 1783 if (!prev->bitmap && 1784 prev->offset + prev->bytes > offset) 1785 return prev; 1786 } 1787 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1788 return entry; 1789 } else if (entry->offset + entry->bytes > offset) 1790 return entry; 1791 1792 if (!fuzzy) 1793 return NULL; 1794 1795 while (1) { 1796 n = rb_next(&entry->offset_index); 1797 if (!n) 1798 return NULL; 1799 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1800 if (entry->bitmap) { 1801 if (entry->offset + BITS_PER_BITMAP * 1802 ctl->unit > offset) 1803 break; 1804 } else { 1805 if (entry->offset + entry->bytes > offset) 1806 break; 1807 } 1808 } 1809 return entry; 1810 } 1811 1812 static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1813 struct btrfs_free_space *info, 1814 bool update_stat) 1815 { 1816 rb_erase(&info->offset_index, &ctl->free_space_offset); 1817 
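	/*
	 * Each entry is indexed twice: by start offset in free_space_offset
	 * and by largest contiguous size in the cached free_space_bytes tree,
	 * so unlinking has to remove it from both.
	 */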
rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1818 ctl->free_extents--; 1819 1820 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1821 ctl->discardable_extents[BTRFS_STAT_CURR]--; 1822 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; 1823 } 1824 1825 if (update_stat) 1826 ctl->free_space -= info->bytes; 1827 } 1828 1829 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1830 struct btrfs_free_space *info) 1831 { 1832 int ret = 0; 1833 1834 ASSERT(info->bytes || info->bitmap); 1835 ret = tree_insert_offset(&ctl->free_space_offset, info->offset, 1836 &info->offset_index, (info->bitmap != NULL)); 1837 if (ret) 1838 return ret; 1839 1840 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1841 1842 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1843 ctl->discardable_extents[BTRFS_STAT_CURR]++; 1844 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 1845 } 1846 1847 ctl->free_space += info->bytes; 1848 ctl->free_extents++; 1849 return ret; 1850 } 1851 1852 static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, 1853 struct btrfs_free_space *info) 1854 { 1855 ASSERT(info->bitmap); 1856 1857 /* 1858 * If our entry is empty it's because we're on a cluster and we don't 1859 * want to re-link it into our ctl bytes index. 1860 */ 1861 if (RB_EMPTY_NODE(&info->bytes_index)) 1862 return; 1863 1864 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1865 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1866 } 1867 1868 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1869 struct btrfs_free_space *info, 1870 u64 offset, u64 bytes, bool update_stat) 1871 { 1872 unsigned long start, count, end; 1873 int extent_delta = -1; 1874 1875 start = offset_to_bit(info->offset, ctl->unit, offset); 1876 count = bytes_to_bits(bytes, ctl->unit); 1877 end = start + count; 1878 ASSERT(end <= BITS_PER_BITMAP); 1879 1880 bitmap_clear(info->bitmap, start, count); 1881 1882 info->bytes -= bytes; 1883 if (info->max_extent_size > ctl->unit) 1884 info->max_extent_size = 0; 1885 1886 relink_bitmap_entry(ctl, info); 1887 1888 if (start && test_bit(start - 1, info->bitmap)) 1889 extent_delta++; 1890 1891 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1892 extent_delta++; 1893 1894 info->bitmap_extents += extent_delta; 1895 if (!btrfs_free_space_trimmed(info)) { 1896 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1897 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 1898 } 1899 1900 if (update_stat) 1901 ctl->free_space -= bytes; 1902 } 1903 1904 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1905 struct btrfs_free_space *info, u64 offset, 1906 u64 bytes) 1907 { 1908 unsigned long start, count, end; 1909 int extent_delta = 1; 1910 1911 start = offset_to_bit(info->offset, ctl->unit, offset); 1912 count = bytes_to_bits(bytes, ctl->unit); 1913 end = start + count; 1914 ASSERT(end <= BITS_PER_BITMAP); 1915 1916 bitmap_set(info->bitmap, start, count); 1917 1918 /* 1919 * We set some bytes, we have no idea what the max extent size is 1920 * anymore. 
1921 */ 1922 info->max_extent_size = 0; 1923 info->bytes += bytes; 1924 ctl->free_space += bytes; 1925 1926 relink_bitmap_entry(ctl, info); 1927 1928 if (start && test_bit(start - 1, info->bitmap)) 1929 extent_delta--; 1930 1931 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1932 extent_delta--; 1933 1934 info->bitmap_extents += extent_delta; 1935 if (!btrfs_free_space_trimmed(info)) { 1936 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1937 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; 1938 } 1939 } 1940 1941 /* 1942 * If we can not find suitable extent, we will use bytes to record 1943 * the size of the max extent. 1944 */ 1945 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1946 struct btrfs_free_space *bitmap_info, u64 *offset, 1947 u64 *bytes, bool for_alloc) 1948 { 1949 unsigned long found_bits = 0; 1950 unsigned long max_bits = 0; 1951 unsigned long bits, i; 1952 unsigned long next_zero; 1953 unsigned long extent_bits; 1954 1955 /* 1956 * Skip searching the bitmap if we don't have a contiguous section that 1957 * is large enough for this allocation. 1958 */ 1959 if (for_alloc && 1960 bitmap_info->max_extent_size && 1961 bitmap_info->max_extent_size < *bytes) { 1962 *bytes = bitmap_info->max_extent_size; 1963 return -1; 1964 } 1965 1966 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1967 max_t(u64, *offset, bitmap_info->offset)); 1968 bits = bytes_to_bits(*bytes, ctl->unit); 1969 1970 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1971 if (for_alloc && bits == 1) { 1972 found_bits = 1; 1973 break; 1974 } 1975 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1976 BITS_PER_BITMAP, i); 1977 extent_bits = next_zero - i; 1978 if (extent_bits >= bits) { 1979 found_bits = extent_bits; 1980 break; 1981 } else if (extent_bits > max_bits) { 1982 max_bits = extent_bits; 1983 } 1984 i = next_zero; 1985 } 1986 1987 if (found_bits) { 1988 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1989 *bytes = (u64)(found_bits) * ctl->unit; 1990 return 0; 1991 } 1992 1993 *bytes = (u64)(max_bits) * ctl->unit; 1994 bitmap_info->max_extent_size = *bytes; 1995 relink_bitmap_entry(ctl, bitmap_info); 1996 return -1; 1997 } 1998 1999 /* Cache the size of the max extent in bytes */ 2000 static struct btrfs_free_space * 2001 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2002 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2003 { 2004 struct btrfs_free_space *entry; 2005 struct rb_node *node; 2006 u64 tmp; 2007 u64 align_off; 2008 int ret; 2009 2010 if (!ctl->free_space_offset.rb_node) 2011 goto out; 2012 again: 2013 if (use_bytes_index) { 2014 node = rb_first_cached(&ctl->free_space_bytes); 2015 } else { 2016 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2017 0, 1); 2018 if (!entry) 2019 goto out; 2020 node = &entry->offset_index; 2021 } 2022 2023 for (; node; node = rb_next(node)) { 2024 if (use_bytes_index) 2025 entry = rb_entry(node, struct btrfs_free_space, 2026 bytes_index); 2027 else 2028 entry = rb_entry(node, struct btrfs_free_space, 2029 offset_index); 2030 2031 /* 2032 * If we are using the bytes index then all subsequent entries 2033 * in this tree are going to be < bytes, so simply set the max 2034 * extent size and exit the loop. 2035 * 2036 * If we're using the offset index then we need to keep going 2037 * through the rest of the tree. 
2038 */ 2039 if (entry->bytes < *bytes) { 2040 *max_extent_size = max(get_max_extent_size(entry), 2041 *max_extent_size); 2042 if (use_bytes_index) 2043 break; 2044 continue; 2045 } 2046 2047 /* make sure the space returned is big enough 2048 * to match our requested alignment 2049 */ 2050 if (*bytes >= align) { 2051 tmp = entry->offset - ctl->start + align - 1; 2052 tmp = div64_u64(tmp, align); 2053 tmp = tmp * align + ctl->start; 2054 align_off = tmp - entry->offset; 2055 } else { 2056 align_off = 0; 2057 tmp = entry->offset; 2058 } 2059 2060 /* 2061 * We don't break here if we're using the bytes index because we 2062 * may have another entry that has the correct alignment that is 2063 * the right size, so we don't want to miss that possibility. 2064 * At worst this adds another loop through the logic, but if we 2065 * broke here we could prematurely ENOSPC. 2066 */ 2067 if (entry->bytes < *bytes + align_off) { 2068 *max_extent_size = max(get_max_extent_size(entry), 2069 *max_extent_size); 2070 continue; 2071 } 2072 2073 if (entry->bitmap) { 2074 struct rb_node *old_next = rb_next(node); 2075 u64 size = *bytes; 2076 2077 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2078 if (!ret) { 2079 *offset = tmp; 2080 *bytes = size; 2081 return entry; 2082 } else { 2083 *max_extent_size = 2084 max(get_max_extent_size(entry), 2085 *max_extent_size); 2086 } 2087 2088 /* 2089 * The bitmap may have gotten re-arranged in the space 2090 * index here because the max_extent_size may have been 2091 * updated. Start from the beginning again if this 2092 * happened. 2093 */ 2094 if (use_bytes_index && old_next != rb_next(node)) 2095 goto again; 2096 continue; 2097 } 2098 2099 *offset = tmp; 2100 *bytes = entry->bytes - align_off; 2101 return entry; 2102 } 2103 out: 2104 return NULL; 2105 } 2106 2107 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2108 struct btrfs_free_space *info, u64 offset) 2109 { 2110 info->offset = offset_to_bitmap(ctl, offset); 2111 info->bytes = 0; 2112 info->bitmap_extents = 0; 2113 INIT_LIST_HEAD(&info->list); 2114 link_free_space(ctl, info); 2115 ctl->total_bitmaps++; 2116 recalculate_thresholds(ctl); 2117 } 2118 2119 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2120 struct btrfs_free_space *bitmap_info) 2121 { 2122 /* 2123 * Normally when this is called, the bitmap is completely empty. However, 2124 * if we are blowing up the free space cache for one reason or another 2125 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2126 * we may leave stats on the table. 2127 */ 2128 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2129 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2130 bitmap_info->bitmap_extents; 2131 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2132 2133 } 2134 unlink_free_space(ctl, bitmap_info, true); 2135 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2136 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2137 ctl->total_bitmaps--; 2138 recalculate_thresholds(ctl); 2139 } 2140 2141 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2142 struct btrfs_free_space *bitmap_info, 2143 u64 *offset, u64 *bytes) 2144 { 2145 u64 end; 2146 u64 search_start, search_bytes; 2147 int ret; 2148 2149 again: 2150 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 2151 2152 /* 2153 * We need to search for bits in this bitmap. 
We could only cover some 2154 * of the extent in this bitmap thanks to how we add space, so we need 2155 * to search for as much as it as we can and clear that amount, and then 2156 * go searching for the next bit. 2157 */ 2158 search_start = *offset; 2159 search_bytes = ctl->unit; 2160 search_bytes = min(search_bytes, end - search_start + 1); 2161 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2162 false); 2163 if (ret < 0 || search_start != *offset) 2164 return -EINVAL; 2165 2166 /* We may have found more bits than what we need */ 2167 search_bytes = min(search_bytes, *bytes); 2168 2169 /* Cannot clear past the end of the bitmap */ 2170 search_bytes = min(search_bytes, end - search_start + 1); 2171 2172 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2173 *offset += search_bytes; 2174 *bytes -= search_bytes; 2175 2176 if (*bytes) { 2177 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2178 if (!bitmap_info->bytes) 2179 free_bitmap(ctl, bitmap_info); 2180 2181 /* 2182 * no entry after this bitmap, but we still have bytes to 2183 * remove, so something has gone wrong. 2184 */ 2185 if (!next) 2186 return -EINVAL; 2187 2188 bitmap_info = rb_entry(next, struct btrfs_free_space, 2189 offset_index); 2190 2191 /* 2192 * if the next entry isn't a bitmap we need to return to let the 2193 * extent stuff do its work. 2194 */ 2195 if (!bitmap_info->bitmap) 2196 return -EAGAIN; 2197 2198 /* 2199 * Ok the next item is a bitmap, but it may not actually hold 2200 * the information for the rest of this free space stuff, so 2201 * look for it, and if we don't find it return so we can try 2202 * everything over again. 2203 */ 2204 search_start = *offset; 2205 search_bytes = ctl->unit; 2206 ret = search_bitmap(ctl, bitmap_info, &search_start, 2207 &search_bytes, false); 2208 if (ret < 0 || search_start != *offset) 2209 return -EAGAIN; 2210 2211 goto again; 2212 } else if (!bitmap_info->bytes) 2213 free_bitmap(ctl, bitmap_info); 2214 2215 return 0; 2216 } 2217 2218 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2219 struct btrfs_free_space *info, u64 offset, 2220 u64 bytes, enum btrfs_trim_state trim_state) 2221 { 2222 u64 bytes_to_set = 0; 2223 u64 end; 2224 2225 /* 2226 * This is a tradeoff to make bitmap trim state minimal. We mark the 2227 * whole bitmap untrimmed if at any point we add untrimmed regions. 2228 */ 2229 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2230 if (btrfs_free_space_trimmed(info)) { 2231 ctl->discardable_extents[BTRFS_STAT_CURR] += 2232 info->bitmap_extents; 2233 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2234 } 2235 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2236 } 2237 2238 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2239 2240 bytes_to_set = min(end - offset, bytes); 2241 2242 bitmap_set_bits(ctl, info, offset, bytes_to_set); 2243 2244 return bytes_to_set; 2245 2246 } 2247 2248 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2249 struct btrfs_free_space *info) 2250 { 2251 struct btrfs_block_group *block_group = ctl->block_group; 2252 struct btrfs_fs_info *fs_info = block_group->fs_info; 2253 bool forced = false; 2254 2255 #ifdef CONFIG_BTRFS_DEBUG 2256 if (btrfs_should_fragment_free_space(block_group)) 2257 forced = true; 2258 #endif 2259 2260 /* This is a way to reclaim large regions from the bitmaps. 
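 * Extents of at least FORCE_EXTENT_THRESHOLD always go in as extent
 * entries (unless debug fragmentation forces bitmaps), even when we are
 * over the extents threshold.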
 */
	if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
		return false;

	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent, and don't have to deal with the bitmap.
	 */
	if (!forced && ctl->free_extents < ctl->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them for larger extents, however if we have plenty
		 * of cache left then go ahead and add them, no sense in adding
		 * the overhead of a bitmap if we don't have to.
		 */
		if (info->bytes <= fs_info->sectorsize * 8) {
			if (ctl->free_extents * 3 <= ctl->extents_thresh)
				return false;
		} else {
			return false;
		}
	}

	/*
	 * The original block groups from mkfs can be really small, like 8
	 * megabytes, so don't bother with a bitmap for those entries. However
	 * some block groups can be smaller than what a bitmap would cover but
	 * are still large enough that they could overflow the 32k memory
	 * limit, so still allow those block groups to have a bitmap entry.
	 */
	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
		return false;

	return true;
}

static const struct btrfs_free_space_op free_space_op = {
	.use_bitmap = use_bitmap,
};

static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	struct btrfs_free_space *bitmap_info;
	struct btrfs_block_group *block_group = NULL;
	int added = 0;
	u64 bytes, offset, bytes_added;
	enum btrfs_trim_state trim_state;
	int ret;

	bytes = info->bytes;
	offset = info->offset;
	trim_state = info->trim_state;

	if (!ctl->op->use_bitmap(ctl, info))
		return 0;

	if (ctl->op == &free_space_op)
		block_group = ctl->block_group;
again:
	/*
	 * Since we link bitmaps right into the cluster we need to see if we
	 * have a cluster here, and if so and it has our bitmap we need to add
	 * the free space to that bitmap.
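	 * (Entries owned by a cluster have been pulled out of the ctl
	 * rb-trees, so the plain tree_search_offset() lookup further down
	 * would never find the cluster's bitmap.)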
2326 */ 2327 if (block_group && !list_empty(&block_group->cluster_list)) { 2328 struct btrfs_free_cluster *cluster; 2329 struct rb_node *node; 2330 struct btrfs_free_space *entry; 2331 2332 cluster = list_entry(block_group->cluster_list.next, 2333 struct btrfs_free_cluster, 2334 block_group_list); 2335 spin_lock(&cluster->lock); 2336 node = rb_first(&cluster->root); 2337 if (!node) { 2338 spin_unlock(&cluster->lock); 2339 goto no_cluster_bitmap; 2340 } 2341 2342 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2343 if (!entry->bitmap) { 2344 spin_unlock(&cluster->lock); 2345 goto no_cluster_bitmap; 2346 } 2347 2348 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2349 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2350 bytes, trim_state); 2351 bytes -= bytes_added; 2352 offset += bytes_added; 2353 } 2354 spin_unlock(&cluster->lock); 2355 if (!bytes) { 2356 ret = 1; 2357 goto out; 2358 } 2359 } 2360 2361 no_cluster_bitmap: 2362 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2363 1, 0); 2364 if (!bitmap_info) { 2365 ASSERT(added == 0); 2366 goto new_bitmap; 2367 } 2368 2369 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2370 trim_state); 2371 bytes -= bytes_added; 2372 offset += bytes_added; 2373 added = 0; 2374 2375 if (!bytes) { 2376 ret = 1; 2377 goto out; 2378 } else 2379 goto again; 2380 2381 new_bitmap: 2382 if (info && info->bitmap) { 2383 add_new_bitmap(ctl, info, offset); 2384 added = 1; 2385 info = NULL; 2386 goto again; 2387 } else { 2388 spin_unlock(&ctl->tree_lock); 2389 2390 /* no pre-allocated info, allocate a new one */ 2391 if (!info) { 2392 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2393 GFP_NOFS); 2394 if (!info) { 2395 spin_lock(&ctl->tree_lock); 2396 ret = -ENOMEM; 2397 goto out; 2398 } 2399 } 2400 2401 /* allocate the bitmap */ 2402 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2403 GFP_NOFS); 2404 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2405 spin_lock(&ctl->tree_lock); 2406 if (!info->bitmap) { 2407 ret = -ENOMEM; 2408 goto out; 2409 } 2410 goto again; 2411 } 2412 2413 out: 2414 if (info) { 2415 if (info->bitmap) 2416 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2417 info->bitmap); 2418 kmem_cache_free(btrfs_free_space_cachep, info); 2419 } 2420 2421 return ret; 2422 } 2423 2424 /* 2425 * Free space merging rules: 2426 * 1) Merge trimmed areas together 2427 * 2) Let untrimmed areas coalesce with trimmed areas 2428 * 3) Always pull neighboring regions from bitmaps 2429 * 2430 * The above rules are for when we merge free space based on btrfs_trim_state. 2431 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2432 * same reason: to promote larger extent regions which makes life easier for 2433 * find_free_extent(). Rule 2 enables coalescing based on the common path 2434 * being returning free space from btrfs_finish_extent_commit(). So when free 2435 * space is trimmed, it will prevent aggregating trimmed new region and 2436 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2437 * and provide find_free_extent() with the largest extents possible hoping for 2438 * the reuse path. 
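 *
 * For example, freeing an untrimmed region directly in front of a
 * trimmed extent entry merges the two into a single, now untrimmed,
 * entry (rule 2), while freeing a trimmed region next to an untrimmed
 * entry leaves two separate entries: only untrimmed space may absorb a
 * neighbour with a different trim state.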
2439 */ 2440 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2441 struct btrfs_free_space *info, bool update_stat) 2442 { 2443 struct btrfs_free_space *left_info = NULL; 2444 struct btrfs_free_space *right_info; 2445 bool merged = false; 2446 u64 offset = info->offset; 2447 u64 bytes = info->bytes; 2448 const bool is_trimmed = btrfs_free_space_trimmed(info); 2449 2450 /* 2451 * first we want to see if there is free space adjacent to the range we 2452 * are adding, if there is remove that struct and add a new one to 2453 * cover the entire range 2454 */ 2455 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2456 if (right_info && rb_prev(&right_info->offset_index)) 2457 left_info = rb_entry(rb_prev(&right_info->offset_index), 2458 struct btrfs_free_space, offset_index); 2459 else if (!right_info) 2460 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2461 2462 /* See try_merge_free_space() comment. */ 2463 if (right_info && !right_info->bitmap && 2464 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2465 unlink_free_space(ctl, right_info, update_stat); 2466 info->bytes += right_info->bytes; 2467 kmem_cache_free(btrfs_free_space_cachep, right_info); 2468 merged = true; 2469 } 2470 2471 /* See try_merge_free_space() comment. */ 2472 if (left_info && !left_info->bitmap && 2473 left_info->offset + left_info->bytes == offset && 2474 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2475 unlink_free_space(ctl, left_info, update_stat); 2476 info->offset = left_info->offset; 2477 info->bytes += left_info->bytes; 2478 kmem_cache_free(btrfs_free_space_cachep, left_info); 2479 merged = true; 2480 } 2481 2482 return merged; 2483 } 2484 2485 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2486 struct btrfs_free_space *info, 2487 bool update_stat) 2488 { 2489 struct btrfs_free_space *bitmap; 2490 unsigned long i; 2491 unsigned long j; 2492 const u64 end = info->offset + info->bytes; 2493 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2494 u64 bytes; 2495 2496 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2497 if (!bitmap) 2498 return false; 2499 2500 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2501 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2502 if (j == i) 2503 return false; 2504 bytes = (j - i) * ctl->unit; 2505 info->bytes += bytes; 2506 2507 /* See try_merge_free_space() comment. */ 2508 if (!btrfs_free_space_trimmed(bitmap)) 2509 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2510 2511 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2512 2513 if (!bitmap->bytes) 2514 free_bitmap(ctl, bitmap); 2515 2516 return true; 2517 } 2518 2519 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2520 struct btrfs_free_space *info, 2521 bool update_stat) 2522 { 2523 struct btrfs_free_space *bitmap; 2524 u64 bitmap_offset; 2525 unsigned long i; 2526 unsigned long j; 2527 unsigned long prev_j; 2528 u64 bytes; 2529 2530 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2531 /* If we're on a boundary, try the previous logical bitmap. 
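 * A bitmap only spans BITS_PER_BITMAP * ctl->unit bytes, so free space
 * that ends exactly at info->offset can only be recorded in the bitmap
 * that ends there, i.e. the previous one.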
*/ 2532 if (bitmap_offset == info->offset) { 2533 if (info->offset == 0) 2534 return false; 2535 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2536 } 2537 2538 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2539 if (!bitmap) 2540 return false; 2541 2542 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2543 j = 0; 2544 prev_j = (unsigned long)-1; 2545 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2546 if (j > i) 2547 break; 2548 prev_j = j; 2549 } 2550 if (prev_j == i) 2551 return false; 2552 2553 if (prev_j == (unsigned long)-1) 2554 bytes = (i + 1) * ctl->unit; 2555 else 2556 bytes = (i - prev_j) * ctl->unit; 2557 2558 info->offset -= bytes; 2559 info->bytes += bytes; 2560 2561 /* See try_merge_free_space() comment. */ 2562 if (!btrfs_free_space_trimmed(bitmap)) 2563 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2564 2565 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2566 2567 if (!bitmap->bytes) 2568 free_bitmap(ctl, bitmap); 2569 2570 return true; 2571 } 2572 2573 /* 2574 * We prefer always to allocate from extent entries, both for clustered and 2575 * non-clustered allocation requests. So when attempting to add a new extent 2576 * entry, try to see if there's adjacent free space in bitmap entries, and if 2577 * there is, migrate that space from the bitmaps to the extent. 2578 * Like this we get better chances of satisfying space allocation requests 2579 * because we attempt to satisfy them based on a single cache entry, and never 2580 * on 2 or more entries - even if the entries represent a contiguous free space 2581 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2582 * ends). 2583 */ 2584 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2585 struct btrfs_free_space *info, 2586 bool update_stat) 2587 { 2588 /* 2589 * Only work with disconnected entries, as we can change their offset, 2590 * and must be extent entries. 
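 * (Stealing to the front moves info->offset backwards, which would
 * invalidate the entry's position in the offset-sorted rb-tree if it
 * were still linked.)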
 */
	ASSERT(!info->bitmap);
	ASSERT(RB_EMPTY_NODE(&info->offset_index));

	if (ctl->total_bitmaps > 0) {
		bool stole_end;
		bool stole_front = false;

		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
		if (ctl->total_bitmaps > 0)
			stole_front = steal_from_bitmap_to_front(ctl, info,
								 update_stat);

		if (stole_end || stole_front)
			try_merge_free_space(ctl, info, update_stat);
	}
}

int __btrfs_add_free_space(struct btrfs_block_group *block_group,
			   u64 offset, u64 bytes,
			   enum btrfs_trim_state trim_state)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;
	u64 filter_bytes = bytes;

	ASSERT(!btrfs_is_zoned(fs_info));

	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;
	info->trim_state = trim_state;
	RB_CLEAR_NODE(&info->offset_index);
	RB_CLEAR_NODE(&info->bytes_index);

	spin_lock(&ctl->tree_lock);

	if (try_merge_free_space(ctl, info, true))
		goto link;

	/*
	 * There was no extent directly to the left or right of this new
	 * extent, so we know we're going to have to allocate a new extent;
	 * before we do that, see if we need to drop this into a bitmap.
	 */
	ret = insert_into_bitmap(ctl, info);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		ret = 0;
		goto out;
	}
link:
	/*
	 * Only steal free space from adjacent bitmaps if we're sure we're not
	 * going to add the new free space to existing bitmap entries - because
	 * that would mean unnecessary work that would be reverted. Therefore
	 * attempt to steal space from bitmaps if we're adding an extent entry.
2654 */ 2655 steal_from_bitmap(ctl, info, true); 2656 2657 filter_bytes = max(filter_bytes, info->bytes); 2658 2659 ret = link_free_space(ctl, info); 2660 if (ret) 2661 kmem_cache_free(btrfs_free_space_cachep, info); 2662 out: 2663 btrfs_discard_update_discardable(block_group); 2664 spin_unlock(&ctl->tree_lock); 2665 2666 if (ret) { 2667 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2668 ASSERT(ret != -EEXIST); 2669 } 2670 2671 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2672 btrfs_discard_check_filter(block_group, filter_bytes); 2673 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2674 } 2675 2676 return ret; 2677 } 2678 2679 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2680 u64 bytenr, u64 size, bool used) 2681 { 2682 struct btrfs_space_info *sinfo = block_group->space_info; 2683 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2684 u64 offset = bytenr - block_group->start; 2685 u64 to_free, to_unusable; 2686 int bg_reclaim_threshold = 0; 2687 bool initial = (size == block_group->length); 2688 u64 reclaimable_unusable; 2689 2690 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2691 2692 if (!initial) 2693 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2694 2695 spin_lock(&ctl->tree_lock); 2696 if (!used) 2697 to_free = size; 2698 else if (initial) 2699 to_free = block_group->zone_capacity; 2700 else if (offset >= block_group->alloc_offset) 2701 to_free = size; 2702 else if (offset + size <= block_group->alloc_offset) 2703 to_free = 0; 2704 else 2705 to_free = offset + size - block_group->alloc_offset; 2706 to_unusable = size - to_free; 2707 2708 ctl->free_space += to_free; 2709 /* 2710 * If the block group is read-only, we should account freed space into 2711 * bytes_readonly. 2712 */ 2713 if (!block_group->ro) 2714 block_group->zone_unusable += to_unusable; 2715 spin_unlock(&ctl->tree_lock); 2716 if (!used) { 2717 spin_lock(&block_group->lock); 2718 block_group->alloc_offset -= size; 2719 spin_unlock(&block_group->lock); 2720 } 2721 2722 reclaimable_unusable = block_group->zone_unusable - 2723 (block_group->length - block_group->zone_capacity); 2724 /* All the region is now unusable. Mark it as unused and reclaim */ 2725 if (block_group->zone_unusable == block_group->length) { 2726 btrfs_mark_bg_unused(block_group); 2727 } else if (bg_reclaim_threshold && 2728 reclaimable_unusable >= 2729 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2730 btrfs_mark_bg_to_reclaim(block_group); 2731 } 2732 2733 return 0; 2734 } 2735 2736 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2737 u64 bytenr, u64 size) 2738 { 2739 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2740 2741 if (btrfs_is_zoned(block_group->fs_info)) 2742 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2743 true); 2744 2745 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2746 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2747 2748 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2749 } 2750 2751 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2752 u64 bytenr, u64 size) 2753 { 2754 if (btrfs_is_zoned(block_group->fs_info)) 2755 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2756 false); 2757 2758 return btrfs_add_free_space(block_group, bytenr, size); 2759 } 2760 2761 /* 2762 * This is a subtle distinction because when adding free space back in general, 2763 * we want it to be added as untrimmed for async. 
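 * That way the async discard machinery gets a chance to discard the
 * range before it is handed out again.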
But in the case where we add 2764 * it on loading of a block group, we want to consider it trimmed. 2765 */ 2766 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2767 u64 bytenr, u64 size) 2768 { 2769 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2770 2771 if (btrfs_is_zoned(block_group->fs_info)) 2772 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2773 true); 2774 2775 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2776 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2777 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2778 2779 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2780 } 2781 2782 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2783 u64 offset, u64 bytes) 2784 { 2785 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2786 struct btrfs_free_space *info; 2787 int ret; 2788 bool re_search = false; 2789 2790 if (btrfs_is_zoned(block_group->fs_info)) { 2791 /* 2792 * This can happen with conventional zones when replaying log. 2793 * Since the allocation info of tree-log nodes are not recorded 2794 * to the extent-tree, calculate_alloc_pointer() failed to 2795 * advance the allocation pointer after last allocated tree log 2796 * node blocks. 2797 * 2798 * This function is called from 2799 * btrfs_pin_extent_for_log_replay() when replaying the log. 2800 * Advance the pointer not to overwrite the tree-log nodes. 2801 */ 2802 if (block_group->start + block_group->alloc_offset < 2803 offset + bytes) { 2804 block_group->alloc_offset = 2805 offset + bytes - block_group->start; 2806 } 2807 return 0; 2808 } 2809 2810 spin_lock(&ctl->tree_lock); 2811 2812 again: 2813 ret = 0; 2814 if (!bytes) 2815 goto out_lock; 2816 2817 info = tree_search_offset(ctl, offset, 0, 0); 2818 if (!info) { 2819 /* 2820 * oops didn't find an extent that matched the space we wanted 2821 * to remove, look for a bitmap instead 2822 */ 2823 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2824 1, 0); 2825 if (!info) { 2826 /* 2827 * If we found a partial bit of our free space in a 2828 * bitmap but then couldn't find the other part this may 2829 * be a problem, so WARN about it. 
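 * (re_search is only set after remove_from_bitmap() returned -EAGAIN,
 * i.e. part of the requested range has already been cleared.)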
2830 */ 2831 WARN_ON(re_search); 2832 goto out_lock; 2833 } 2834 } 2835 2836 re_search = false; 2837 if (!info->bitmap) { 2838 unlink_free_space(ctl, info, true); 2839 if (offset == info->offset) { 2840 u64 to_free = min(bytes, info->bytes); 2841 2842 info->bytes -= to_free; 2843 info->offset += to_free; 2844 if (info->bytes) { 2845 ret = link_free_space(ctl, info); 2846 WARN_ON(ret); 2847 } else { 2848 kmem_cache_free(btrfs_free_space_cachep, info); 2849 } 2850 2851 offset += to_free; 2852 bytes -= to_free; 2853 goto again; 2854 } else { 2855 u64 old_end = info->bytes + info->offset; 2856 2857 info->bytes = offset - info->offset; 2858 ret = link_free_space(ctl, info); 2859 WARN_ON(ret); 2860 if (ret) 2861 goto out_lock; 2862 2863 /* Not enough bytes in this entry to satisfy us */ 2864 if (old_end < offset + bytes) { 2865 bytes -= old_end - offset; 2866 offset = old_end; 2867 goto again; 2868 } else if (old_end == offset + bytes) { 2869 /* all done */ 2870 goto out_lock; 2871 } 2872 spin_unlock(&ctl->tree_lock); 2873 2874 ret = __btrfs_add_free_space(block_group, 2875 offset + bytes, 2876 old_end - (offset + bytes), 2877 info->trim_state); 2878 WARN_ON(ret); 2879 goto out; 2880 } 2881 } 2882 2883 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2884 if (ret == -EAGAIN) { 2885 re_search = true; 2886 goto again; 2887 } 2888 out_lock: 2889 btrfs_discard_update_discardable(block_group); 2890 spin_unlock(&ctl->tree_lock); 2891 out: 2892 return ret; 2893 } 2894 2895 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2896 u64 bytes) 2897 { 2898 struct btrfs_fs_info *fs_info = block_group->fs_info; 2899 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2900 struct btrfs_free_space *info; 2901 struct rb_node *n; 2902 int count = 0; 2903 2904 /* 2905 * Zoned btrfs does not use free space tree and cluster. Just print 2906 * out the free space after the allocation offset. 2907 */ 2908 if (btrfs_is_zoned(fs_info)) { 2909 btrfs_info(fs_info, "free space %llu active %d", 2910 block_group->zone_capacity - block_group->alloc_offset, 2911 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2912 &block_group->runtime_flags)); 2913 return; 2914 } 2915 2916 spin_lock(&ctl->tree_lock); 2917 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2918 info = rb_entry(n, struct btrfs_free_space, offset_index); 2919 if (info->bytes >= bytes && !block_group->ro) 2920 count++; 2921 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2922 info->offset, info->bytes, 2923 (info->bitmap) ? "yes" : "no"); 2924 } 2925 spin_unlock(&ctl->tree_lock); 2926 btrfs_info(fs_info, "block group has cluster?: %s", 2927 list_empty(&block_group->cluster_list) ? 
"no" : "yes"); 2928 btrfs_info(fs_info, 2929 "%d blocks of free space at or bigger than bytes is", count); 2930 } 2931 2932 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2933 struct btrfs_free_space_ctl *ctl) 2934 { 2935 struct btrfs_fs_info *fs_info = block_group->fs_info; 2936 2937 spin_lock_init(&ctl->tree_lock); 2938 ctl->unit = fs_info->sectorsize; 2939 ctl->start = block_group->start; 2940 ctl->block_group = block_group; 2941 ctl->op = &free_space_op; 2942 ctl->free_space_bytes = RB_ROOT_CACHED; 2943 INIT_LIST_HEAD(&ctl->trimming_ranges); 2944 mutex_init(&ctl->cache_writeout_mutex); 2945 2946 /* 2947 * we only want to have 32k of ram per block group for keeping 2948 * track of free space, and if we pass 1/2 of that we want to 2949 * start converting things over to using bitmaps 2950 */ 2951 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2952 } 2953 2954 /* 2955 * for a given cluster, put all of its extents back into the free 2956 * space cache. If the block group passed doesn't match the block group 2957 * pointed to by the cluster, someone else raced in and freed the 2958 * cluster already. In that case, we just return without changing anything 2959 */ 2960 static void __btrfs_return_cluster_to_free_space( 2961 struct btrfs_block_group *block_group, 2962 struct btrfs_free_cluster *cluster) 2963 { 2964 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2965 struct btrfs_free_space *entry; 2966 struct rb_node *node; 2967 2968 spin_lock(&cluster->lock); 2969 if (cluster->block_group != block_group) { 2970 spin_unlock(&cluster->lock); 2971 return; 2972 } 2973 2974 cluster->block_group = NULL; 2975 cluster->window_start = 0; 2976 list_del_init(&cluster->block_group_list); 2977 2978 node = rb_first(&cluster->root); 2979 while (node) { 2980 bool bitmap; 2981 2982 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2983 node = rb_next(&entry->offset_index); 2984 rb_erase(&entry->offset_index, &cluster->root); 2985 RB_CLEAR_NODE(&entry->offset_index); 2986 2987 bitmap = (entry->bitmap != NULL); 2988 if (!bitmap) { 2989 /* Merging treats extents as if they were new */ 2990 if (!btrfs_free_space_trimmed(entry)) { 2991 ctl->discardable_extents[BTRFS_STAT_CURR]--; 2992 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 2993 entry->bytes; 2994 } 2995 2996 try_merge_free_space(ctl, entry, false); 2997 steal_from_bitmap(ctl, entry, false); 2998 2999 /* As we insert directly, update these statistics */ 3000 if (!btrfs_free_space_trimmed(entry)) { 3001 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3002 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3003 entry->bytes; 3004 } 3005 } 3006 tree_insert_offset(&ctl->free_space_offset, 3007 entry->offset, &entry->offset_index, bitmap); 3008 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3009 entry_less); 3010 } 3011 cluster->root = RB_ROOT; 3012 spin_unlock(&cluster->lock); 3013 btrfs_put_block_group(block_group); 3014 } 3015 3016 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3017 { 3018 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3019 struct btrfs_free_cluster *cluster; 3020 struct list_head *head; 3021 3022 spin_lock(&ctl->tree_lock); 3023 while ((head = block_group->cluster_list.next) != 3024 &block_group->cluster_list) { 3025 cluster = list_entry(head, struct btrfs_free_cluster, 3026 block_group_list); 3027 3028 WARN_ON(cluster->block_group != block_group); 3029 __btrfs_return_cluster_to_free_space(block_group, cluster); 3030 3031 
cond_resched_lock(&ctl->tree_lock); 3032 } 3033 __btrfs_remove_free_space_cache(ctl); 3034 btrfs_discard_update_discardable(block_group); 3035 spin_unlock(&ctl->tree_lock); 3036 3037 } 3038 3039 /* 3040 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 3041 */ 3042 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3043 { 3044 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3045 struct btrfs_free_space *info; 3046 struct rb_node *node; 3047 bool ret = true; 3048 3049 spin_lock(&ctl->tree_lock); 3050 node = rb_first(&ctl->free_space_offset); 3051 3052 while (node) { 3053 info = rb_entry(node, struct btrfs_free_space, offset_index); 3054 3055 if (!btrfs_free_space_trimmed(info)) { 3056 ret = false; 3057 break; 3058 } 3059 3060 node = rb_next(node); 3061 } 3062 3063 spin_unlock(&ctl->tree_lock); 3064 return ret; 3065 } 3066 3067 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3068 u64 offset, u64 bytes, u64 empty_size, 3069 u64 *max_extent_size) 3070 { 3071 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3072 struct btrfs_discard_ctl *discard_ctl = 3073 &block_group->fs_info->discard_ctl; 3074 struct btrfs_free_space *entry = NULL; 3075 u64 bytes_search = bytes + empty_size; 3076 u64 ret = 0; 3077 u64 align_gap = 0; 3078 u64 align_gap_len = 0; 3079 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3080 bool use_bytes_index = (offset == block_group->start); 3081 3082 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3083 3084 spin_lock(&ctl->tree_lock); 3085 entry = find_free_space(ctl, &offset, &bytes_search, 3086 block_group->full_stripe_len, max_extent_size, 3087 use_bytes_index); 3088 if (!entry) 3089 goto out; 3090 3091 ret = offset; 3092 if (entry->bitmap) { 3093 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3094 3095 if (!btrfs_free_space_trimmed(entry)) 3096 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3097 3098 if (!entry->bytes) 3099 free_bitmap(ctl, entry); 3100 } else { 3101 unlink_free_space(ctl, entry, true); 3102 align_gap_len = offset - entry->offset; 3103 align_gap = entry->offset; 3104 align_gap_trim_state = entry->trim_state; 3105 3106 if (!btrfs_free_space_trimmed(entry)) 3107 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3108 3109 entry->offset = offset + bytes; 3110 WARN_ON(entry->bytes < bytes + align_gap_len); 3111 3112 entry->bytes -= bytes + align_gap_len; 3113 if (!entry->bytes) 3114 kmem_cache_free(btrfs_free_space_cachep, entry); 3115 else 3116 link_free_space(ctl, entry); 3117 } 3118 out: 3119 btrfs_discard_update_discardable(block_group); 3120 spin_unlock(&ctl->tree_lock); 3121 3122 if (align_gap_len) 3123 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3124 align_gap_trim_state); 3125 return ret; 3126 } 3127 3128 /* 3129 * given a cluster, put all of its extents back into the free space 3130 * cache. If a block group is passed, this function will only free 3131 * a cluster that belongs to the passed block group. 3132 * 3133 * Otherwise, it'll get a reference on the block group pointed to by the 3134 * cluster and remove the cluster from it. 
3135 */ 3136 void btrfs_return_cluster_to_free_space( 3137 struct btrfs_block_group *block_group, 3138 struct btrfs_free_cluster *cluster) 3139 { 3140 struct btrfs_free_space_ctl *ctl; 3141 3142 /* first, get a safe pointer to the block group */ 3143 spin_lock(&cluster->lock); 3144 if (!block_group) { 3145 block_group = cluster->block_group; 3146 if (!block_group) { 3147 spin_unlock(&cluster->lock); 3148 return; 3149 } 3150 } else if (cluster->block_group != block_group) { 3151 /* someone else has already freed it don't redo their work */ 3152 spin_unlock(&cluster->lock); 3153 return; 3154 } 3155 btrfs_get_block_group(block_group); 3156 spin_unlock(&cluster->lock); 3157 3158 ctl = block_group->free_space_ctl; 3159 3160 /* now return any extents the cluster had on it */ 3161 spin_lock(&ctl->tree_lock); 3162 __btrfs_return_cluster_to_free_space(block_group, cluster); 3163 spin_unlock(&ctl->tree_lock); 3164 3165 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3166 3167 /* finally drop our ref */ 3168 btrfs_put_block_group(block_group); 3169 } 3170 3171 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3172 struct btrfs_free_cluster *cluster, 3173 struct btrfs_free_space *entry, 3174 u64 bytes, u64 min_start, 3175 u64 *max_extent_size) 3176 { 3177 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3178 int err; 3179 u64 search_start = cluster->window_start; 3180 u64 search_bytes = bytes; 3181 u64 ret = 0; 3182 3183 search_start = min_start; 3184 search_bytes = bytes; 3185 3186 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3187 if (err) { 3188 *max_extent_size = max(get_max_extent_size(entry), 3189 *max_extent_size); 3190 return 0; 3191 } 3192 3193 ret = search_start; 3194 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3195 3196 return ret; 3197 } 3198 3199 /* 3200 * given a cluster, try to allocate 'bytes' from it, returns 0 3201 * if it couldn't find anything suitably large, or a logical disk offset 3202 * if things worked out 3203 */ 3204 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3205 struct btrfs_free_cluster *cluster, u64 bytes, 3206 u64 min_start, u64 *max_extent_size) 3207 { 3208 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3209 struct btrfs_discard_ctl *discard_ctl = 3210 &block_group->fs_info->discard_ctl; 3211 struct btrfs_free_space *entry = NULL; 3212 struct rb_node *node; 3213 u64 ret = 0; 3214 3215 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3216 3217 spin_lock(&cluster->lock); 3218 if (bytes > cluster->max_size) 3219 goto out; 3220 3221 if (cluster->block_group != block_group) 3222 goto out; 3223 3224 node = rb_first(&cluster->root); 3225 if (!node) 3226 goto out; 3227 3228 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3229 while (1) { 3230 if (entry->bytes < bytes) 3231 *max_extent_size = max(get_max_extent_size(entry), 3232 *max_extent_size); 3233 3234 if (entry->bytes < bytes || 3235 (!entry->bitmap && entry->offset < min_start)) { 3236 node = rb_next(&entry->offset_index); 3237 if (!node) 3238 break; 3239 entry = rb_entry(node, struct btrfs_free_space, 3240 offset_index); 3241 continue; 3242 } 3243 3244 if (entry->bitmap) { 3245 ret = btrfs_alloc_from_bitmap(block_group, 3246 cluster, entry, bytes, 3247 cluster->window_start, 3248 max_extent_size); 3249 if (ret == 0) { 3250 node = rb_next(&entry->offset_index); 3251 if (!node) 3252 break; 3253 entry = rb_entry(node, struct btrfs_free_space, 3254 offset_index); 3255 
continue; 3256 } 3257 cluster->window_start += bytes; 3258 } else { 3259 ret = entry->offset; 3260 3261 entry->offset += bytes; 3262 entry->bytes -= bytes; 3263 } 3264 3265 break; 3266 } 3267 out: 3268 spin_unlock(&cluster->lock); 3269 3270 if (!ret) 3271 return 0; 3272 3273 spin_lock(&ctl->tree_lock); 3274 3275 if (!btrfs_free_space_trimmed(entry)) 3276 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3277 3278 ctl->free_space -= bytes; 3279 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3280 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3281 3282 spin_lock(&cluster->lock); 3283 if (entry->bytes == 0) { 3284 rb_erase(&entry->offset_index, &cluster->root); 3285 ctl->free_extents--; 3286 if (entry->bitmap) { 3287 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3288 entry->bitmap); 3289 ctl->total_bitmaps--; 3290 recalculate_thresholds(ctl); 3291 } else if (!btrfs_free_space_trimmed(entry)) { 3292 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3293 } 3294 kmem_cache_free(btrfs_free_space_cachep, entry); 3295 } 3296 3297 spin_unlock(&cluster->lock); 3298 spin_unlock(&ctl->tree_lock); 3299 3300 return ret; 3301 } 3302 3303 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3304 struct btrfs_free_space *entry, 3305 struct btrfs_free_cluster *cluster, 3306 u64 offset, u64 bytes, 3307 u64 cont1_bytes, u64 min_bytes) 3308 { 3309 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3310 unsigned long next_zero; 3311 unsigned long i; 3312 unsigned long want_bits; 3313 unsigned long min_bits; 3314 unsigned long found_bits; 3315 unsigned long max_bits = 0; 3316 unsigned long start = 0; 3317 unsigned long total_found = 0; 3318 int ret; 3319 3320 i = offset_to_bit(entry->offset, ctl->unit, 3321 max_t(u64, offset, entry->offset)); 3322 want_bits = bytes_to_bits(bytes, ctl->unit); 3323 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3324 3325 /* 3326 * Don't bother looking for a cluster in this bitmap if it's heavily 3327 * fragmented. 3328 */ 3329 if (entry->max_extent_size && 3330 entry->max_extent_size < cont1_bytes) 3331 return -ENOSPC; 3332 again: 3333 found_bits = 0; 3334 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3335 next_zero = find_next_zero_bit(entry->bitmap, 3336 BITS_PER_BITMAP, i); 3337 if (next_zero - i >= min_bits) { 3338 found_bits = next_zero - i; 3339 if (found_bits > max_bits) 3340 max_bits = found_bits; 3341 break; 3342 } 3343 if (next_zero - i > max_bits) 3344 max_bits = next_zero - i; 3345 i = next_zero; 3346 } 3347 3348 if (!found_bits) { 3349 entry->max_extent_size = (u64)max_bits * ctl->unit; 3350 return -ENOSPC; 3351 } 3352 3353 if (!total_found) { 3354 start = i; 3355 cluster->max_size = 0; 3356 } 3357 3358 total_found += found_bits; 3359 3360 if (cluster->max_size < found_bits * ctl->unit) 3361 cluster->max_size = found_bits * ctl->unit; 3362 3363 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3364 i = next_zero + 1; 3365 goto again; 3366 } 3367 3368 cluster->window_start = start * ctl->unit + entry->offset; 3369 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3370 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3371 3372 /* 3373 * We need to know if we're currently on the normal space index when we 3374 * manipulate the bitmap so that we know we need to remove and re-insert 3375 * it into the space_index tree. 
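 * (relink_bitmap_entry() skips entries whose bytes_index node is empty
 * for exactly this reason.)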
Clear the bytes_index node here so the 3376 * bitmap manipulation helpers know not to mess with the space_index 3377 * until this bitmap entry is added back into the normal cache. 3378 */ 3379 RB_CLEAR_NODE(&entry->bytes_index); 3380 3381 ret = tree_insert_offset(&cluster->root, entry->offset, 3382 &entry->offset_index, 1); 3383 ASSERT(!ret); /* -EEXIST; Logic error */ 3384 3385 trace_btrfs_setup_cluster(block_group, cluster, 3386 total_found * ctl->unit, 1); 3387 return 0; 3388 } 3389 3390 /* 3391 * This searches the block group for just extents to fill the cluster with. 3392 * Try to find a cluster with at least bytes total bytes, at least one 3393 * extent of cont1_bytes, and other clusters of at least min_bytes. 3394 */ 3395 static noinline int 3396 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3397 struct btrfs_free_cluster *cluster, 3398 struct list_head *bitmaps, u64 offset, u64 bytes, 3399 u64 cont1_bytes, u64 min_bytes) 3400 { 3401 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3402 struct btrfs_free_space *first = NULL; 3403 struct btrfs_free_space *entry = NULL; 3404 struct btrfs_free_space *last; 3405 struct rb_node *node; 3406 u64 window_free; 3407 u64 max_extent; 3408 u64 total_size = 0; 3409 3410 entry = tree_search_offset(ctl, offset, 0, 1); 3411 if (!entry) 3412 return -ENOSPC; 3413 3414 /* 3415 * We don't want bitmaps, so just move along until we find a normal 3416 * extent entry. 3417 */ 3418 while (entry->bitmap || entry->bytes < min_bytes) { 3419 if (entry->bitmap && list_empty(&entry->list)) 3420 list_add_tail(&entry->list, bitmaps); 3421 node = rb_next(&entry->offset_index); 3422 if (!node) 3423 return -ENOSPC; 3424 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3425 } 3426 3427 window_free = entry->bytes; 3428 max_extent = entry->bytes; 3429 first = entry; 3430 last = entry; 3431 3432 for (node = rb_next(&entry->offset_index); node; 3433 node = rb_next(&entry->offset_index)) { 3434 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3435 3436 if (entry->bitmap) { 3437 if (list_empty(&entry->list)) 3438 list_add_tail(&entry->list, bitmaps); 3439 continue; 3440 } 3441 3442 if (entry->bytes < min_bytes) 3443 continue; 3444 3445 last = entry; 3446 window_free += entry->bytes; 3447 if (entry->bytes > max_extent) 3448 max_extent = entry->bytes; 3449 } 3450 3451 if (window_free < bytes || max_extent < cont1_bytes) 3452 return -ENOSPC; 3453 3454 cluster->window_start = first->offset; 3455 3456 node = &first->offset_index; 3457 3458 /* 3459 * now we've found our entries, pull them out of the free space 3460 * cache and put them into the cluster rbtree 3461 */ 3462 do { 3463 int ret; 3464 3465 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3466 node = rb_next(&entry->offset_index); 3467 if (entry->bitmap || entry->bytes < min_bytes) 3468 continue; 3469 3470 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3471 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3472 ret = tree_insert_offset(&cluster->root, entry->offset, 3473 &entry->offset_index, 0); 3474 total_size += entry->bytes; 3475 ASSERT(!ret); /* -EEXIST; Logic error */ 3476 } while (node && entry != last); 3477 3478 cluster->max_size = max_extent; 3479 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3480 return 0; 3481 } 3482 3483 /* 3484 * This specifically looks for bitmaps that may work in the cluster, we assume 3485 * that we have already failed to find extents that will work. 
3486 */ 3487 static noinline int 3488 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3489 struct btrfs_free_cluster *cluster, 3490 struct list_head *bitmaps, u64 offset, u64 bytes, 3491 u64 cont1_bytes, u64 min_bytes) 3492 { 3493 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3494 struct btrfs_free_space *entry = NULL; 3495 int ret = -ENOSPC; 3496 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3497 3498 if (ctl->total_bitmaps == 0) 3499 return -ENOSPC; 3500 3501 /* 3502 * The bitmap that covers offset won't be in the list unless offset 3503 * is just its start offset. 3504 */ 3505 if (!list_empty(bitmaps)) 3506 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3507 3508 if (!entry || entry->offset != bitmap_offset) { 3509 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3510 if (entry && list_empty(&entry->list)) 3511 list_add(&entry->list, bitmaps); 3512 } 3513 3514 list_for_each_entry(entry, bitmaps, list) { 3515 if (entry->bytes < bytes) 3516 continue; 3517 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3518 bytes, cont1_bytes, min_bytes); 3519 if (!ret) 3520 return 0; 3521 } 3522 3523 /* 3524 * The bitmaps list has all the bitmaps that record free space 3525 * starting after offset, so no more search is required. 3526 */ 3527 return -ENOSPC; 3528 } 3529 3530 /* 3531 * here we try to find a cluster of blocks in a block group. The goal 3532 * is to find at least bytes+empty_size. 3533 * We might not find them all in one contiguous area. 3534 * 3535 * returns zero and sets up cluster if things worked out, otherwise 3536 * it returns -enospc 3537 */ 3538 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3539 struct btrfs_free_cluster *cluster, 3540 u64 offset, u64 bytes, u64 empty_size) 3541 { 3542 struct btrfs_fs_info *fs_info = block_group->fs_info; 3543 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3544 struct btrfs_free_space *entry, *tmp; 3545 LIST_HEAD(bitmaps); 3546 u64 min_bytes; 3547 u64 cont1_bytes; 3548 int ret; 3549 3550 /* 3551 * Choose the minimum extent size we'll require for this 3552 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3553 * For metadata, allow allocates with smaller extents. For 3554 * data, keep it dense. 3555 */ 3556 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3557 cont1_bytes = bytes + empty_size; 3558 min_bytes = cont1_bytes; 3559 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3560 cont1_bytes = bytes; 3561 min_bytes = fs_info->sectorsize; 3562 } else { 3563 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3564 min_bytes = fs_info->sectorsize; 3565 } 3566 3567 spin_lock(&ctl->tree_lock); 3568 3569 /* 3570 * If we know we don't have enough space to make a cluster don't even 3571 * bother doing all the work to try and find one. 
3572 */ 3573 if (ctl->free_space < bytes) { 3574 spin_unlock(&ctl->tree_lock); 3575 return -ENOSPC; 3576 } 3577 3578 spin_lock(&cluster->lock); 3579 3580 /* someone already found a cluster, hooray */ 3581 if (cluster->block_group) { 3582 ret = 0; 3583 goto out; 3584 } 3585 3586 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3587 min_bytes); 3588 3589 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3590 bytes + empty_size, 3591 cont1_bytes, min_bytes); 3592 if (ret) 3593 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3594 offset, bytes + empty_size, 3595 cont1_bytes, min_bytes); 3596 3597 /* Clear our temporary list */ 3598 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3599 list_del_init(&entry->list); 3600 3601 if (!ret) { 3602 btrfs_get_block_group(block_group); 3603 list_add_tail(&cluster->block_group_list, 3604 &block_group->cluster_list); 3605 cluster->block_group = block_group; 3606 } else { 3607 trace_btrfs_failed_cluster_setup(block_group); 3608 } 3609 out: 3610 spin_unlock(&cluster->lock); 3611 spin_unlock(&ctl->tree_lock); 3612 3613 return ret; 3614 } 3615 3616 /* 3617 * simple code to zero out a cluster 3618 */ 3619 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3620 { 3621 spin_lock_init(&cluster->lock); 3622 spin_lock_init(&cluster->refill_lock); 3623 cluster->root = RB_ROOT; 3624 cluster->max_size = 0; 3625 cluster->fragmented = false; 3626 INIT_LIST_HEAD(&cluster->block_group_list); 3627 cluster->block_group = NULL; 3628 } 3629 3630 static int do_trimming(struct btrfs_block_group *block_group, 3631 u64 *total_trimmed, u64 start, u64 bytes, 3632 u64 reserved_start, u64 reserved_bytes, 3633 enum btrfs_trim_state reserved_trim_state, 3634 struct btrfs_trim_range *trim_entry) 3635 { 3636 struct btrfs_space_info *space_info = block_group->space_info; 3637 struct btrfs_fs_info *fs_info = block_group->fs_info; 3638 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3639 int ret; 3640 int update = 0; 3641 const u64 end = start + bytes; 3642 const u64 reserved_end = reserved_start + reserved_bytes; 3643 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3644 u64 trimmed = 0; 3645 3646 spin_lock(&space_info->lock); 3647 spin_lock(&block_group->lock); 3648 if (!block_group->ro) { 3649 block_group->reserved += reserved_bytes; 3650 space_info->bytes_reserved += reserved_bytes; 3651 update = 1; 3652 } 3653 spin_unlock(&block_group->lock); 3654 spin_unlock(&space_info->lock); 3655 3656 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3657 if (!ret) { 3658 *total_trimmed += trimmed; 3659 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3660 } 3661 3662 mutex_lock(&ctl->cache_writeout_mutex); 3663 if (reserved_start < start) 3664 __btrfs_add_free_space(block_group, reserved_start, 3665 start - reserved_start, 3666 reserved_trim_state); 3667 if (start + bytes < reserved_start + reserved_bytes) 3668 __btrfs_add_free_space(block_group, end, reserved_end - end, 3669 reserved_trim_state); 3670 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3671 list_del(&trim_entry->list); 3672 mutex_unlock(&ctl->cache_writeout_mutex); 3673 3674 if (update) { 3675 spin_lock(&space_info->lock); 3676 spin_lock(&block_group->lock); 3677 if (block_group->ro) 3678 space_info->bytes_readonly += reserved_bytes; 3679 block_group->reserved -= reserved_bytes; 3680 space_info->bytes_reserved -= reserved_bytes; 3681 spin_unlock(&block_group->lock); 3682 spin_unlock(&space_info->lock); 3683 } 3684 3685 return 
ret; 3686 } 3687 3688 /* 3689 * If @async is set, then we will trim 1 region and return. 3690 */ 3691 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3692 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3693 bool async) 3694 { 3695 struct btrfs_discard_ctl *discard_ctl = 3696 &block_group->fs_info->discard_ctl; 3697 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3698 struct btrfs_free_space *entry; 3699 struct rb_node *node; 3700 int ret = 0; 3701 u64 extent_start; 3702 u64 extent_bytes; 3703 enum btrfs_trim_state extent_trim_state; 3704 u64 bytes; 3705 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3706 3707 while (start < end) { 3708 struct btrfs_trim_range trim_entry; 3709 3710 mutex_lock(&ctl->cache_writeout_mutex); 3711 spin_lock(&ctl->tree_lock); 3712 3713 if (ctl->free_space < minlen) 3714 goto out_unlock; 3715 3716 entry = tree_search_offset(ctl, start, 0, 1); 3717 if (!entry) 3718 goto out_unlock; 3719 3720 /* Skip bitmaps and if async, already trimmed entries */ 3721 while (entry->bitmap || 3722 (async && btrfs_free_space_trimmed(entry))) { 3723 node = rb_next(&entry->offset_index); 3724 if (!node) 3725 goto out_unlock; 3726 entry = rb_entry(node, struct btrfs_free_space, 3727 offset_index); 3728 } 3729 3730 if (entry->offset >= end) 3731 goto out_unlock; 3732 3733 extent_start = entry->offset; 3734 extent_bytes = entry->bytes; 3735 extent_trim_state = entry->trim_state; 3736 if (async) { 3737 start = entry->offset; 3738 bytes = entry->bytes; 3739 if (bytes < minlen) { 3740 spin_unlock(&ctl->tree_lock); 3741 mutex_unlock(&ctl->cache_writeout_mutex); 3742 goto next; 3743 } 3744 unlink_free_space(ctl, entry, true); 3745 /* 3746 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3747 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3748 * X when we come back around. So trim it now. 
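 * (Hence the split below only happens when at least
 * BTRFS_ASYNC_DISCARD_MIN_FILTER bytes would be left over; otherwise
 * the whole extent is discarded in one go.)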
3749 */ 3750 if (max_discard_size && 3751 bytes >= (max_discard_size + 3752 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3753 bytes = max_discard_size; 3754 extent_bytes = max_discard_size; 3755 entry->offset += max_discard_size; 3756 entry->bytes -= max_discard_size; 3757 link_free_space(ctl, entry); 3758 } else { 3759 kmem_cache_free(btrfs_free_space_cachep, entry); 3760 } 3761 } else { 3762 start = max(start, extent_start); 3763 bytes = min(extent_start + extent_bytes, end) - start; 3764 if (bytes < minlen) { 3765 spin_unlock(&ctl->tree_lock); 3766 mutex_unlock(&ctl->cache_writeout_mutex); 3767 goto next; 3768 } 3769 3770 unlink_free_space(ctl, entry, true); 3771 kmem_cache_free(btrfs_free_space_cachep, entry); 3772 } 3773 3774 spin_unlock(&ctl->tree_lock); 3775 trim_entry.start = extent_start; 3776 trim_entry.bytes = extent_bytes; 3777 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3778 mutex_unlock(&ctl->cache_writeout_mutex); 3779 3780 ret = do_trimming(block_group, total_trimmed, start, bytes, 3781 extent_start, extent_bytes, extent_trim_state, 3782 &trim_entry); 3783 if (ret) { 3784 block_group->discard_cursor = start + bytes; 3785 break; 3786 } 3787 next: 3788 start += bytes; 3789 block_group->discard_cursor = start; 3790 if (async && *total_trimmed) 3791 break; 3792 3793 if (fatal_signal_pending(current)) { 3794 ret = -ERESTARTSYS; 3795 break; 3796 } 3797 3798 cond_resched(); 3799 } 3800 3801 return ret; 3802 3803 out_unlock: 3804 block_group->discard_cursor = btrfs_block_group_end(block_group); 3805 spin_unlock(&ctl->tree_lock); 3806 mutex_unlock(&ctl->cache_writeout_mutex); 3807 3808 return ret; 3809 } 3810 3811 /* 3812 * If we break out of trimming a bitmap prematurely, we should reset the 3813 * trimming bit. In a rather contrieved case, it's possible to race here so 3814 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3815 * 3816 * start = start of bitmap 3817 * end = near end of bitmap 3818 * 3819 * Thread 1: Thread 2: 3820 * trim_bitmaps(start) 3821 * trim_bitmaps(end) 3822 * end_trimming_bitmap() 3823 * reset_trimming_bitmap() 3824 */ 3825 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3826 { 3827 struct btrfs_free_space *entry; 3828 3829 spin_lock(&ctl->tree_lock); 3830 entry = tree_search_offset(ctl, offset, 1, 0); 3831 if (entry) { 3832 if (btrfs_free_space_trimmed(entry)) { 3833 ctl->discardable_extents[BTRFS_STAT_CURR] += 3834 entry->bitmap_extents; 3835 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3836 } 3837 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3838 } 3839 3840 spin_unlock(&ctl->tree_lock); 3841 } 3842 3843 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3844 struct btrfs_free_space *entry) 3845 { 3846 if (btrfs_free_space_trimming_bitmap(entry)) { 3847 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3848 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3849 entry->bitmap_extents; 3850 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3851 } 3852 } 3853 3854 /* 3855 * If @async is set, then we will trim 1 region and return. 
3856 */ 3857 static int trim_bitmaps(struct btrfs_block_group *block_group, 3858 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3859 u64 maxlen, bool async) 3860 { 3861 struct btrfs_discard_ctl *discard_ctl = 3862 &block_group->fs_info->discard_ctl; 3863 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3864 struct btrfs_free_space *entry; 3865 int ret = 0; 3866 int ret2; 3867 u64 bytes; 3868 u64 offset = offset_to_bitmap(ctl, start); 3869 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3870 3871 while (offset < end) { 3872 bool next_bitmap = false; 3873 struct btrfs_trim_range trim_entry; 3874 3875 mutex_lock(&ctl->cache_writeout_mutex); 3876 spin_lock(&ctl->tree_lock); 3877 3878 if (ctl->free_space < minlen) { 3879 block_group->discard_cursor = 3880 btrfs_block_group_end(block_group); 3881 spin_unlock(&ctl->tree_lock); 3882 mutex_unlock(&ctl->cache_writeout_mutex); 3883 break; 3884 } 3885 3886 entry = tree_search_offset(ctl, offset, 1, 0); 3887 /* 3888 * Bitmaps are marked trimmed lossily now to prevent constant 3889 * discarding of the same bitmap (the reason why we are bound 3890 * by the filters). So, retrim the block group bitmaps when we 3891 * are preparing to punt to the unused_bgs list. This uses 3892 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED 3893 * which is the only discard index which sets minlen to 0. 3894 */ 3895 if (!entry || (async && minlen && start == offset && 3896 btrfs_free_space_trimmed(entry))) { 3897 spin_unlock(&ctl->tree_lock); 3898 mutex_unlock(&ctl->cache_writeout_mutex); 3899 next_bitmap = true; 3900 goto next; 3901 } 3902 3903 /* 3904 * Async discard bitmap trimming begins at by setting the start 3905 * to be key.objectid and the offset_to_bitmap() aligns to the 3906 * start of the bitmap. This lets us know we are fully 3907 * scanning the bitmap rather than only some portion of it. 3908 */ 3909 if (start == offset) 3910 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; 3911 3912 bytes = minlen; 3913 ret2 = search_bitmap(ctl, entry, &start, &bytes, false); 3914 if (ret2 || start >= end) { 3915 /* 3916 * We lossily consider a bitmap trimmed if we only skip 3917 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. 3918 */ 3919 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) 3920 end_trimming_bitmap(ctl, entry); 3921 else 3922 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3923 spin_unlock(&ctl->tree_lock); 3924 mutex_unlock(&ctl->cache_writeout_mutex); 3925 next_bitmap = true; 3926 goto next; 3927 } 3928 3929 /* 3930 * We already trimmed a region, but are using the locking above 3931 * to reset the trim_state. 3932 */ 3933 if (async && *total_trimmed) { 3934 spin_unlock(&ctl->tree_lock); 3935 mutex_unlock(&ctl->cache_writeout_mutex); 3936 goto out; 3937 } 3938 3939 bytes = min(bytes, end - start); 3940 if (bytes < minlen || (async && maxlen && bytes > maxlen)) { 3941 spin_unlock(&ctl->tree_lock); 3942 mutex_unlock(&ctl->cache_writeout_mutex); 3943 goto next; 3944 } 3945 3946 /* 3947 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3948 * If X < @minlen, we won't trim X when we come back around. 3949 * So trim it now. We differ here from trimming extents as we 3950 * don't keep individual state per bit. 
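 * (The bits we skip here keep no trim state of their own; trim state
 * only exists per bitmap entry, which is why the bitmap is marked
 * trimmed lossily.)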
		if (async &&
		    max_discard_size &&
		    bytes > (max_discard_size + minlen))
			bytes = max_discard_size;

		bitmap_clear_bits(ctl, entry, start, bytes, true);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = start;
		trim_entry.bytes = bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  start, bytes, 0, &trim_entry);
		if (ret) {
			reset_trimming_bitmap(ctl, offset);
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			break;
		}
next:
		if (next_bitmap) {
			offset += BITS_PER_BITMAP * ctl->unit;
			start = offset;
		} else {
			start += bytes;
		}
		block_group->discard_cursor = start;

		if (fatal_signal_pending(current)) {
			if (start != offset)
				reset_trimming_bitmap(ctl, offset);
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	if (offset >= end)
		block_group->discard_cursor = end;

out:
	return ret;
}

int btrfs_trim_block_group(struct btrfs_block_group *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int ret;
	u64 rem = 0;

	ASSERT(!btrfs_is_zoned(block_group->fs_info));

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
	if (ret)
		goto out;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
	/* If we ended in the middle of a bitmap, reset the trimming flag */
	if (rem)
		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
	btrfs_unfreeze_block_group(block_group);
	return ret;
}

int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
	btrfs_unfreeze_block_group(block_group);

	return ret;
}

int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   u64 maxlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
			   async);

	btrfs_unfreeze_block_group(block_group);

	return ret;
}
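
/*
 * Illustrative sketch (not from the upstream file): the three entry points
 * above share the REMOVED-check plus freeze/unfreeze pattern and differ only
 * in whether they walk extents, bitmaps, or both.  The hypothetical wrapper
 * below only shows how the async entry points compose; the synchronous
 * FITRIM path should use btrfs_trim_block_group() above, which additionally
 * resets the trimming flag when @end is not aligned to a bitmap boundary
 * (one bitmap covers BITS_PER_BITMAP * ctl->unit bytes, i.e. 128M with 4K
 * pages and a 4K sectorsize).  The function name is an assumption and does
 * not exist upstream.
 */
static int __maybe_unused trim_block_group_sketch(struct btrfs_block_group *bg,
						  u64 *trimmed, u64 start,
						  u64 end, u64 minlen)
{
	u64 trimmed_extents = 0;
	u64 trimmed_bitmaps = 0;
	int ret;

	/* Trim plain free extents first. */
	ret = btrfs_trim_block_group_extents(bg, &trimmed_extents, start, end,
					     minlen, false);
	if (ret)
		return ret;

	/* Then trim bitmapped free space; maxlen == 0 disables the upper filter. */
	ret = btrfs_trim_block_group_bitmaps(bg, &trimmed_bitmaps, start, end,
					     minlen, 0, false);

	*trimmed = trimmed_extents + trimmed_bitmaps;
	return ret;
}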

bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}

static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first_cached(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1.  For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}
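
/*
 * Illustrative sketch (not from the upstream file): a hypothetical caller
 * that turns the v1 cache off only when the superblock still advertises it,
 * using the two helpers above.  The function name is an assumption; the real
 * decision about enabling or disabling the v1 cache is made by the
 * mount/remount code.
 */
static int __maybe_unused maybe_disable_v1_cache(struct btrfs_fs_info *fs_info)
{
	/* Nothing to clean up if no v1 cache generation is recorded. */
	if (!btrfs_free_space_cache_v1_active(fs_info))
		return 0;

	return btrfs_set_free_space_cache_v1_active(fs_info, false);
}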

int __init btrfs_free_space_init(void)
{
	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
			sizeof(struct btrfs_free_space), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_cachep)
		return -ENOMEM;

	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
							PAGE_SIZE, PAGE_SIZE,
							SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_bitmap_cachep) {
		kmem_cache_destroy(btrfs_free_space_cachep);
		return -ENOMEM;
	}

	return 0;
}

void __cold btrfs_free_space_exit(void)
{
	kmem_cache_destroy(btrfs_free_space_cachep);
	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Use this if you need to make a bitmap or extent entry specifically.  It
 * doesn't do any of the merging that add_free_space does; this acts a lot
 * like how the free space cache loading works, so you can get really weird
 * configurations.
 */
int test_add_free_space_entry(struct btrfs_block_group *cache,
			      u64 offset, u64 bytes, bool bitmap)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info = NULL, *bitmap_info;
	void *map = NULL;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
	u64 bytes_added;
	int ret;

again:
	if (!info) {
		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
		if (!info)
			return -ENOMEM;
	}

	if (!bitmap) {
		spin_lock(&ctl->tree_lock);
		info->offset = offset;
		info->bytes = bytes;
		info->max_extent_size = 0;
		ret = link_free_space(ctl, info);
		spin_unlock(&ctl->tree_lock);
		if (ret)
			kmem_cache_free(btrfs_free_space_cachep, info);
		return ret;
	}

	if (!map) {
		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
		if (!map) {
			kmem_cache_free(btrfs_free_space_cachep, info);
			return -ENOMEM;
		}
	}

	spin_lock(&ctl->tree_lock);
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		info->bitmap = map;
		map = NULL;
		add_new_bitmap(ctl, info, offset);
		bitmap_info = info;
		info = NULL;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);

	bytes -= bytes_added;
	offset += bytes_added;
	spin_unlock(&ctl->tree_lock);

	if (bytes)
		goto again;

	if (info)
		kmem_cache_free(btrfs_free_space_cachep, info);
	if (map)
		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
	return 0;
}

/*
 * Checks to see if the given range is in the free space cache.  This is
 * really just used to check the absence of space, so if there is free space
 * in the range at all we will return 1.
 */
int test_check_exists(struct btrfs_block_group *cache,
		      u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;

	spin_lock(&ctl->tree_lock);
	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info)
			goto out;
	}

have_info:
	if (info->bitmap) {
		u64 bit_off, bit_bytes;
		struct rb_node *n;
		struct btrfs_free_space *tmp;

		bit_off = offset;
		bit_bytes = ctl->unit;
		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
		if (!ret) {
			if (bit_off == offset) {
				ret = 1;
				goto out;
			} else if (bit_off > offset &&
				   offset + bytes > bit_off) {
				ret = 1;
				goto out;
			}
		}

		n = rb_prev(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (tmp->offset + tmp->bytes < offset)
				break;
			if (offset + bytes < tmp->offset) {
				n = rb_prev(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		n = rb_next(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (offset + bytes < tmp->offset)
				break;
			if (tmp->offset + tmp->bytes < offset) {
				n = rb_next(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		ret = 0;
		goto out;
	}

	if (info->offset == offset) {
		ret = 1;
		goto out;
	}

	if (offset > info->offset && offset < info->offset + info->bytes)
		ret = 1;
out:
	spin_unlock(&ctl->tree_lock);
	return ret;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
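
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Illustrative sketch (not from the upstream file): how a sanity test might
 * pair the two test helpers above.  The offsets and sizes are arbitrary and
 * the function name is an assumption; the real users of these helpers live
 * in the self-tests under tests/.
 */
static int __maybe_unused demo_free_space_helpers(struct btrfs_block_group *cache)
{
	int ret;

	/* Insert a plain 4M extent entry starting at offset 0 (no bitmap). */
	ret = test_add_free_space_entry(cache, 0, SZ_4M, false);
	if (ret)
		return ret;

	/* The first 1M of that extent is free, so this must return 1. */
	if (!test_check_exists(cache, 0, SZ_1M))
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */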