1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Red Hat. All rights reserved. 4 */ 5 6 #include <linux/pagemap.h> 7 #include <linux/sched.h> 8 #include <linux/sched/signal.h> 9 #include <linux/slab.h> 10 #include <linux/math64.h> 11 #include <linux/ratelimit.h> 12 #include <linux/error-injection.h> 13 #include <linux/sched/mm.h> 14 #include "ctree.h" 15 #include "fs.h" 16 #include "messages.h" 17 #include "misc.h" 18 #include "free-space-cache.h" 19 #include "transaction.h" 20 #include "disk-io.h" 21 #include "extent_io.h" 22 #include "volumes.h" 23 #include "space-info.h" 24 #include "delalloc-space.h" 25 #include "block-group.h" 26 #include "discard.h" 27 #include "subpage.h" 28 #include "inode-item.h" 29 #include "accessors.h" 30 #include "file-item.h" 31 #include "file.h" 32 #include "super.h" 33 34 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 35 #define MAX_CACHE_BYTES_PER_GIG SZ_64K 36 #define FORCE_EXTENT_THRESHOLD SZ_1M 37 38 static struct kmem_cache *btrfs_free_space_cachep; 39 static struct kmem_cache *btrfs_free_space_bitmap_cachep; 40 41 struct btrfs_trim_range { 42 u64 start; 43 u64 bytes; 44 struct list_head list; 45 }; 46 47 static int link_free_space(struct btrfs_free_space_ctl *ctl, 48 struct btrfs_free_space *info); 49 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 50 struct btrfs_free_space *info, bool update_stat); 51 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 52 struct btrfs_free_space *bitmap_info, u64 *offset, 53 u64 *bytes, bool for_alloc); 54 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 55 struct btrfs_free_space *bitmap_info); 56 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 57 struct btrfs_free_space *info, u64 offset, 58 u64 bytes, bool update_stats); 59 60 static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 61 { 62 struct btrfs_free_space *info; 63 struct rb_node *node; 64 65 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 66 info = rb_entry(node, struct btrfs_free_space, offset_index); 67 if (!info->bitmap) { 68 unlink_free_space(ctl, info, true); 69 kmem_cache_free(btrfs_free_space_cachep, info); 70 } else { 71 free_bitmap(ctl, info); 72 } 73 74 cond_resched_lock(&ctl->tree_lock); 75 } 76 } 77 78 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 79 struct btrfs_path *path, 80 u64 offset) 81 { 82 struct btrfs_fs_info *fs_info = root->fs_info; 83 struct btrfs_key key; 84 struct btrfs_key location; 85 struct btrfs_disk_key disk_key; 86 struct btrfs_free_space_header *header; 87 struct extent_buffer *leaf; 88 struct inode *inode = NULL; 89 unsigned nofs_flag; 90 int ret; 91 92 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 93 key.offset = offset; 94 key.type = 0; 95 96 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 97 if (ret < 0) 98 return ERR_PTR(ret); 99 if (ret > 0) { 100 btrfs_release_path(path); 101 return ERR_PTR(-ENOENT); 102 } 103 104 leaf = path->nodes[0]; 105 header = btrfs_item_ptr(leaf, path->slots[0], 106 struct btrfs_free_space_header); 107 btrfs_free_space_key(leaf, header, &disk_key); 108 btrfs_disk_key_to_cpu(&location, &disk_key); 109 btrfs_release_path(path); 110 111 /* 112 * We are often under a trans handle at this point, so we need to make 113 * sure NOFS is set to keep us from deadlocking. 
114 */ 115 nofs_flag = memalloc_nofs_save(); 116 inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); 117 btrfs_release_path(path); 118 memalloc_nofs_restore(nofs_flag); 119 if (IS_ERR(inode)) 120 return inode; 121 122 mapping_set_gfp_mask(inode->i_mapping, 123 mapping_gfp_constraint(inode->i_mapping, 124 ~(__GFP_FS | __GFP_HIGHMEM))); 125 126 return inode; 127 } 128 129 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, 130 struct btrfs_path *path) 131 { 132 struct btrfs_fs_info *fs_info = block_group->fs_info; 133 struct inode *inode = NULL; 134 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 135 136 spin_lock(&block_group->lock); 137 if (block_group->inode) 138 inode = igrab(block_group->inode); 139 spin_unlock(&block_group->lock); 140 if (inode) 141 return inode; 142 143 inode = __lookup_free_space_inode(fs_info->tree_root, path, 144 block_group->start); 145 if (IS_ERR(inode)) 146 return inode; 147 148 spin_lock(&block_group->lock); 149 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 150 btrfs_info(fs_info, "Old style space inode found, converting."); 151 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 152 BTRFS_INODE_NODATACOW; 153 block_group->disk_cache_state = BTRFS_DC_CLEAR; 154 } 155 156 if (!test_and_set_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) 157 block_group->inode = igrab(inode); 158 spin_unlock(&block_group->lock); 159 160 return inode; 161 } 162 163 static int __create_free_space_inode(struct btrfs_root *root, 164 struct btrfs_trans_handle *trans, 165 struct btrfs_path *path, 166 u64 ino, u64 offset) 167 { 168 struct btrfs_key key; 169 struct btrfs_disk_key disk_key; 170 struct btrfs_free_space_header *header; 171 struct btrfs_inode_item *inode_item; 172 struct extent_buffer *leaf; 173 /* We inline CRCs for the free disk space cache */ 174 const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC | 175 BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 176 int ret; 177 178 ret = btrfs_insert_empty_inode(trans, root, path, ino); 179 if (ret) 180 return ret; 181 182 leaf = path->nodes[0]; 183 inode_item = btrfs_item_ptr(leaf, path->slots[0], 184 struct btrfs_inode_item); 185 btrfs_item_key(leaf, &disk_key, path->slots[0]); 186 memzero_extent_buffer(leaf, (unsigned long)inode_item, 187 sizeof(*inode_item)); 188 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 189 btrfs_set_inode_size(leaf, inode_item, 0); 190 btrfs_set_inode_nbytes(leaf, inode_item, 0); 191 btrfs_set_inode_uid(leaf, inode_item, 0); 192 btrfs_set_inode_gid(leaf, inode_item, 0); 193 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 194 btrfs_set_inode_flags(leaf, inode_item, flags); 195 btrfs_set_inode_nlink(leaf, inode_item, 1); 196 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 197 btrfs_set_inode_block_group(leaf, inode_item, offset); 198 btrfs_mark_buffer_dirty(leaf); 199 btrfs_release_path(path); 200 201 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 202 key.offset = offset; 203 key.type = 0; 204 ret = btrfs_insert_empty_item(trans, root, path, &key, 205 sizeof(struct btrfs_free_space_header)); 206 if (ret < 0) { 207 btrfs_release_path(path); 208 return ret; 209 } 210 211 leaf = path->nodes[0]; 212 header = btrfs_item_ptr(leaf, path->slots[0], 213 struct btrfs_free_space_header); 214 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); 215 btrfs_set_free_space_key(leaf, header, &disk_key); 216 btrfs_mark_buffer_dirty(leaf); 217 btrfs_release_path(path); 218 219 return 0; 220 } 221 222 int 
create_free_space_inode(struct btrfs_trans_handle *trans, 223 struct btrfs_block_group *block_group, 224 struct btrfs_path *path) 225 { 226 int ret; 227 u64 ino; 228 229 ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); 230 if (ret < 0) 231 return ret; 232 233 return __create_free_space_inode(trans->fs_info->tree_root, trans, path, 234 ino, block_group->start); 235 } 236 237 /* 238 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode 239 * handles lookup, otherwise it takes ownership and iputs the inode. 240 * Don't reuse an inode pointer after passing it into this function. 241 */ 242 int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, 243 struct inode *inode, 244 struct btrfs_block_group *block_group) 245 { 246 struct btrfs_path *path; 247 struct btrfs_key key; 248 int ret = 0; 249 250 path = btrfs_alloc_path(); 251 if (!path) 252 return -ENOMEM; 253 254 if (!inode) 255 inode = lookup_free_space_inode(block_group, path); 256 if (IS_ERR(inode)) { 257 if (PTR_ERR(inode) != -ENOENT) 258 ret = PTR_ERR(inode); 259 goto out; 260 } 261 ret = btrfs_orphan_add(trans, BTRFS_I(inode)); 262 if (ret) { 263 btrfs_add_delayed_iput(BTRFS_I(inode)); 264 goto out; 265 } 266 clear_nlink(inode); 267 /* One for the block groups ref */ 268 spin_lock(&block_group->lock); 269 if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF, &block_group->runtime_flags)) { 270 block_group->inode = NULL; 271 spin_unlock(&block_group->lock); 272 iput(inode); 273 } else { 274 spin_unlock(&block_group->lock); 275 } 276 /* One for the lookup ref */ 277 btrfs_add_delayed_iput(BTRFS_I(inode)); 278 279 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 280 key.type = 0; 281 key.offset = block_group->start; 282 ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path, 283 -1, 1); 284 if (ret) { 285 if (ret > 0) 286 ret = 0; 287 goto out; 288 } 289 ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); 290 out: 291 btrfs_free_path(path); 292 return ret; 293 } 294 295 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, 296 struct btrfs_block_rsv *rsv) 297 { 298 u64 needed_bytes; 299 int ret; 300 301 /* 1 for slack space, 1 for updating the inode */ 302 needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + 303 btrfs_calc_metadata_size(fs_info, 1); 304 305 spin_lock(&rsv->lock); 306 if (rsv->reserved < needed_bytes) 307 ret = -ENOSPC; 308 else 309 ret = 0; 310 spin_unlock(&rsv->lock); 311 return ret; 312 } 313 314 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, 315 struct btrfs_block_group *block_group, 316 struct inode *vfs_inode) 317 { 318 struct btrfs_truncate_control control = { 319 .inode = BTRFS_I(vfs_inode), 320 .new_size = 0, 321 .ino = btrfs_ino(BTRFS_I(vfs_inode)), 322 .min_type = BTRFS_EXTENT_DATA_KEY, 323 .clear_extent_range = true, 324 }; 325 struct btrfs_inode *inode = BTRFS_I(vfs_inode); 326 struct btrfs_root *root = inode->root; 327 struct extent_state *cached_state = NULL; 328 int ret = 0; 329 bool locked = false; 330 331 if (block_group) { 332 struct btrfs_path *path = btrfs_alloc_path(); 333 334 if (!path) { 335 ret = -ENOMEM; 336 goto fail; 337 } 338 locked = true; 339 mutex_lock(&trans->transaction->cache_write_mutex); 340 if (!list_empty(&block_group->io_list)) { 341 list_del_init(&block_group->io_list); 342 343 btrfs_wait_cache_io(trans, block_group, path); 344 btrfs_put_block_group(block_group); 345 } 346 347 /* 348 * now that we've truncated the cache away, its no longer 349 * setup or written 350 */ 351 
spin_lock(&block_group->lock); 352 block_group->disk_cache_state = BTRFS_DC_CLEAR; 353 spin_unlock(&block_group->lock); 354 btrfs_free_path(path); 355 } 356 357 btrfs_i_size_write(inode, 0); 358 truncate_pagecache(vfs_inode, 0); 359 360 lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 361 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); 362 363 /* 364 * We skip the throttling logic for free space cache inodes, so we don't 365 * need to check for -EAGAIN. 366 */ 367 ret = btrfs_truncate_inode_items(trans, root, &control); 368 369 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); 370 btrfs_inode_safe_disk_i_size_write(inode, control.last_size); 371 372 unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); 373 if (ret) 374 goto fail; 375 376 ret = btrfs_update_inode(trans, root, inode); 377 378 fail: 379 if (locked) 380 mutex_unlock(&trans->transaction->cache_write_mutex); 381 if (ret) 382 btrfs_abort_transaction(trans, ret); 383 384 return ret; 385 } 386 387 static void readahead_cache(struct inode *inode) 388 { 389 struct file_ra_state ra; 390 unsigned long last_index; 391 392 file_ra_state_init(&ra, inode->i_mapping); 393 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 394 395 page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); 396 } 397 398 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, 399 int write) 400 { 401 int num_pages; 402 403 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 404 405 /* Make sure we can fit our crcs and generation into the first page */ 406 if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) 407 return -ENOSPC; 408 409 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); 410 411 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); 412 if (!io_ctl->pages) 413 return -ENOMEM; 414 415 io_ctl->num_pages = num_pages; 416 io_ctl->fs_info = btrfs_sb(inode->i_sb); 417 io_ctl->inode = inode; 418 419 return 0; 420 } 421 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); 422 423 static void io_ctl_free(struct btrfs_io_ctl *io_ctl) 424 { 425 kfree(io_ctl->pages); 426 io_ctl->pages = NULL; 427 } 428 429 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) 430 { 431 if (io_ctl->cur) { 432 io_ctl->cur = NULL; 433 io_ctl->orig = NULL; 434 } 435 } 436 437 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) 438 { 439 ASSERT(io_ctl->index < io_ctl->num_pages); 440 io_ctl->page = io_ctl->pages[io_ctl->index++]; 441 io_ctl->cur = page_address(io_ctl->page); 442 io_ctl->orig = io_ctl->cur; 443 io_ctl->size = PAGE_SIZE; 444 if (clear) 445 clear_page(io_ctl->cur); 446 } 447 448 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) 449 { 450 int i; 451 452 io_ctl_unmap_page(io_ctl); 453 454 for (i = 0; i < io_ctl->num_pages; i++) { 455 if (io_ctl->pages[i]) { 456 btrfs_page_clear_checked(io_ctl->fs_info, 457 io_ctl->pages[i], 458 page_offset(io_ctl->pages[i]), 459 PAGE_SIZE); 460 unlock_page(io_ctl->pages[i]); 461 put_page(io_ctl->pages[i]); 462 } 463 } 464 } 465 466 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) 467 { 468 struct page *page; 469 struct inode *inode = io_ctl->inode; 470 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 471 int i; 472 473 for (i = 0; i < io_ctl->num_pages; i++) { 474 int ret; 475 476 page = find_or_create_page(inode->i_mapping, i, mask); 477 if (!page) { 478 io_ctl_drop_pages(io_ctl); 479 return -ENOMEM; 480 } 481 482 ret = set_page_extent_mapped(page); 483 if (ret < 0) { 484 unlock_page(page); 485 
put_page(page); 486 io_ctl_drop_pages(io_ctl); 487 return ret; 488 } 489 490 io_ctl->pages[i] = page; 491 if (uptodate && !PageUptodate(page)) { 492 btrfs_read_folio(NULL, page_folio(page)); 493 lock_page(page); 494 if (page->mapping != inode->i_mapping) { 495 btrfs_err(BTRFS_I(inode)->root->fs_info, 496 "free space cache page truncated"); 497 io_ctl_drop_pages(io_ctl); 498 return -EIO; 499 } 500 if (!PageUptodate(page)) { 501 btrfs_err(BTRFS_I(inode)->root->fs_info, 502 "error reading free space cache"); 503 io_ctl_drop_pages(io_ctl); 504 return -EIO; 505 } 506 } 507 } 508 509 for (i = 0; i < io_ctl->num_pages; i++) 510 clear_page_dirty_for_io(io_ctl->pages[i]); 511 512 return 0; 513 } 514 515 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 516 { 517 io_ctl_map_page(io_ctl, 1); 518 519 /* 520 * Skip the csum areas. If we don't check crcs then we just have a 521 * 64bit chunk at the front of the first page. 522 */ 523 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 524 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 525 526 put_unaligned_le64(generation, io_ctl->cur); 527 io_ctl->cur += sizeof(u64); 528 } 529 530 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 531 { 532 u64 cache_gen; 533 534 /* 535 * Skip the crc area. If we don't check crcs then we just have a 64bit 536 * chunk at the front of the first page. 537 */ 538 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 539 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 540 541 cache_gen = get_unaligned_le64(io_ctl->cur); 542 if (cache_gen != generation) { 543 btrfs_err_rl(io_ctl->fs_info, 544 "space cache generation (%llu) does not match inode (%llu)", 545 cache_gen, generation); 546 io_ctl_unmap_page(io_ctl); 547 return -EIO; 548 } 549 io_ctl->cur += sizeof(u64); 550 return 0; 551 } 552 553 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) 554 { 555 u32 *tmp; 556 u32 crc = ~(u32)0; 557 unsigned offset = 0; 558 559 if (index == 0) 560 offset = sizeof(u32) * io_ctl->num_pages; 561 562 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 563 btrfs_crc32c_final(crc, (u8 *)&crc); 564 io_ctl_unmap_page(io_ctl); 565 tmp = page_address(io_ctl->pages[0]); 566 tmp += index; 567 *tmp = crc; 568 } 569 570 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) 571 { 572 u32 *tmp, val; 573 u32 crc = ~(u32)0; 574 unsigned offset = 0; 575 576 if (index == 0) 577 offset = sizeof(u32) * io_ctl->num_pages; 578 579 tmp = page_address(io_ctl->pages[0]); 580 tmp += index; 581 val = *tmp; 582 583 io_ctl_map_page(io_ctl, 0); 584 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 585 btrfs_crc32c_final(crc, (u8 *)&crc); 586 if (val != crc) { 587 btrfs_err_rl(io_ctl->fs_info, 588 "csum mismatch on free space cache"); 589 io_ctl_unmap_page(io_ctl); 590 return -EIO; 591 } 592 593 return 0; 594 } 595 596 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, 597 void *bitmap) 598 { 599 struct btrfs_free_space_entry *entry; 600 601 if (!io_ctl->cur) 602 return -ENOSPC; 603 604 entry = io_ctl->cur; 605 put_unaligned_le64(offset, &entry->offset); 606 put_unaligned_le64(bytes, &entry->bytes); 607 entry->type = (bitmap) ? 
BTRFS_FREE_SPACE_BITMAP : 608 BTRFS_FREE_SPACE_EXTENT; 609 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 610 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 611 612 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 613 return 0; 614 615 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 616 617 /* No more pages to map */ 618 if (io_ctl->index >= io_ctl->num_pages) 619 return 0; 620 621 /* map the next page */ 622 io_ctl_map_page(io_ctl, 1); 623 return 0; 624 } 625 626 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) 627 { 628 if (!io_ctl->cur) 629 return -ENOSPC; 630 631 /* 632 * If we aren't at the start of the current page, unmap this one and 633 * map the next one if there is any left. 634 */ 635 if (io_ctl->cur != io_ctl->orig) { 636 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 637 if (io_ctl->index >= io_ctl->num_pages) 638 return -ENOSPC; 639 io_ctl_map_page(io_ctl, 0); 640 } 641 642 copy_page(io_ctl->cur, bitmap); 643 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 644 if (io_ctl->index < io_ctl->num_pages) 645 io_ctl_map_page(io_ctl, 0); 646 return 0; 647 } 648 649 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) 650 { 651 /* 652 * If we're not on the boundary we know we've modified the page and we 653 * need to crc the page. 654 */ 655 if (io_ctl->cur != io_ctl->orig) 656 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 657 else 658 io_ctl_unmap_page(io_ctl); 659 660 while (io_ctl->index < io_ctl->num_pages) { 661 io_ctl_map_page(io_ctl, 1); 662 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 663 } 664 } 665 666 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, 667 struct btrfs_free_space *entry, u8 *type) 668 { 669 struct btrfs_free_space_entry *e; 670 int ret; 671 672 if (!io_ctl->cur) { 673 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 674 if (ret) 675 return ret; 676 } 677 678 e = io_ctl->cur; 679 entry->offset = get_unaligned_le64(&e->offset); 680 entry->bytes = get_unaligned_le64(&e->bytes); 681 *type = e->type; 682 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 683 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 684 685 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 686 return 0; 687 688 io_ctl_unmap_page(io_ctl); 689 690 return 0; 691 } 692 693 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, 694 struct btrfs_free_space *entry) 695 { 696 int ret; 697 698 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 699 if (ret) 700 return ret; 701 702 copy_page(entry->bitmap, io_ctl->cur); 703 io_ctl_unmap_page(io_ctl); 704 705 return 0; 706 } 707 708 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) 709 { 710 struct btrfs_block_group *block_group = ctl->block_group; 711 u64 max_bytes; 712 u64 bitmap_bytes; 713 u64 extent_bytes; 714 u64 size = block_group->length; 715 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 716 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 717 718 max_bitmaps = max_t(u64, max_bitmaps, 1); 719 720 if (ctl->total_bitmaps > max_bitmaps) 721 btrfs_err(block_group->fs_info, 722 "invalid free space control: bg start=%llu len=%llu total_bitmaps=%u unit=%u max_bitmaps=%llu bytes_per_bg=%llu", 723 block_group->start, block_group->length, 724 ctl->total_bitmaps, ctl->unit, max_bitmaps, 725 bytes_per_bg); 726 ASSERT(ctl->total_bitmaps <= max_bitmaps); 727 728 /* 729 * We are trying to keep the total amount of memory used per 1GiB of 730 * space to be MAX_CACHE_BYTES_PER_GIG. 
However, with a reclamation 731 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of 732 * bitmaps, we may end up using more memory than this. 733 */ 734 if (size < SZ_1G) 735 max_bytes = MAX_CACHE_BYTES_PER_GIG; 736 else 737 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); 738 739 bitmap_bytes = ctl->total_bitmaps * ctl->unit; 740 741 /* 742 * we want the extent entry threshold to always be at most 1/2 the max 743 * bytes we can have, or whatever is less than that. 744 */ 745 extent_bytes = max_bytes - bitmap_bytes; 746 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); 747 748 ctl->extents_thresh = 749 div_u64(extent_bytes, sizeof(struct btrfs_free_space)); 750 } 751 752 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 753 struct btrfs_free_space_ctl *ctl, 754 struct btrfs_path *path, u64 offset) 755 { 756 struct btrfs_fs_info *fs_info = root->fs_info; 757 struct btrfs_free_space_header *header; 758 struct extent_buffer *leaf; 759 struct btrfs_io_ctl io_ctl; 760 struct btrfs_key key; 761 struct btrfs_free_space *e, *n; 762 LIST_HEAD(bitmaps); 763 u64 num_entries; 764 u64 num_bitmaps; 765 u64 generation; 766 u8 type; 767 int ret = 0; 768 769 /* Nothing in the space cache, goodbye */ 770 if (!i_size_read(inode)) 771 return 0; 772 773 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 774 key.offset = offset; 775 key.type = 0; 776 777 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 778 if (ret < 0) 779 return 0; 780 else if (ret > 0) { 781 btrfs_release_path(path); 782 return 0; 783 } 784 785 ret = -1; 786 787 leaf = path->nodes[0]; 788 header = btrfs_item_ptr(leaf, path->slots[0], 789 struct btrfs_free_space_header); 790 num_entries = btrfs_free_space_entries(leaf, header); 791 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 792 generation = btrfs_free_space_generation(leaf, header); 793 btrfs_release_path(path); 794 795 if (!BTRFS_I(inode)->generation) { 796 btrfs_info(fs_info, 797 "the free space cache file (%llu) is invalid, skip it", 798 offset); 799 return 0; 800 } 801 802 if (BTRFS_I(inode)->generation != generation) { 803 btrfs_err(fs_info, 804 "free space inode generation (%llu) did not match free space cache generation (%llu)", 805 BTRFS_I(inode)->generation, generation); 806 return 0; 807 } 808 809 if (!num_entries) 810 return 0; 811 812 ret = io_ctl_init(&io_ctl, inode, 0); 813 if (ret) 814 return ret; 815 816 readahead_cache(inode); 817 818 ret = io_ctl_prepare_pages(&io_ctl, true); 819 if (ret) 820 goto out; 821 822 ret = io_ctl_check_crc(&io_ctl, 0); 823 if (ret) 824 goto free_cache; 825 826 ret = io_ctl_check_generation(&io_ctl, generation); 827 if (ret) 828 goto free_cache; 829 830 while (num_entries) { 831 e = kmem_cache_zalloc(btrfs_free_space_cachep, 832 GFP_NOFS); 833 if (!e) { 834 ret = -ENOMEM; 835 goto free_cache; 836 } 837 838 ret = io_ctl_read_entry(&io_ctl, e, &type); 839 if (ret) { 840 kmem_cache_free(btrfs_free_space_cachep, e); 841 goto free_cache; 842 } 843 844 if (!e->bytes) { 845 ret = -1; 846 kmem_cache_free(btrfs_free_space_cachep, e); 847 goto free_cache; 848 } 849 850 if (type == BTRFS_FREE_SPACE_EXTENT) { 851 spin_lock(&ctl->tree_lock); 852 ret = link_free_space(ctl, e); 853 spin_unlock(&ctl->tree_lock); 854 if (ret) { 855 btrfs_err(fs_info, 856 "Duplicate entries in free space cache, dumping"); 857 kmem_cache_free(btrfs_free_space_cachep, e); 858 goto free_cache; 859 } 860 } else { 861 ASSERT(num_bitmaps); 862 num_bitmaps--; 863 e->bitmap = kmem_cache_zalloc( 864 
btrfs_free_space_bitmap_cachep, GFP_NOFS); 865 if (!e->bitmap) { 866 ret = -ENOMEM; 867 kmem_cache_free( 868 btrfs_free_space_cachep, e); 869 goto free_cache; 870 } 871 spin_lock(&ctl->tree_lock); 872 ret = link_free_space(ctl, e); 873 ctl->total_bitmaps++; 874 recalculate_thresholds(ctl); 875 spin_unlock(&ctl->tree_lock); 876 if (ret) { 877 btrfs_err(fs_info, 878 "Duplicate entries in free space cache, dumping"); 879 kmem_cache_free(btrfs_free_space_cachep, e); 880 goto free_cache; 881 } 882 list_add_tail(&e->list, &bitmaps); 883 } 884 885 num_entries--; 886 } 887 888 io_ctl_unmap_page(&io_ctl); 889 890 /* 891 * We add the bitmaps at the end of the entries in order that 892 * the bitmap entries are added to the cache. 893 */ 894 list_for_each_entry_safe(e, n, &bitmaps, list) { 895 list_del_init(&e->list); 896 ret = io_ctl_read_bitmap(&io_ctl, e); 897 if (ret) 898 goto free_cache; 899 } 900 901 io_ctl_drop_pages(&io_ctl); 902 ret = 1; 903 out: 904 io_ctl_free(&io_ctl); 905 return ret; 906 free_cache: 907 io_ctl_drop_pages(&io_ctl); 908 909 spin_lock(&ctl->tree_lock); 910 __btrfs_remove_free_space_cache(ctl); 911 spin_unlock(&ctl->tree_lock); 912 goto out; 913 } 914 915 static int copy_free_space_cache(struct btrfs_block_group *block_group, 916 struct btrfs_free_space_ctl *ctl) 917 { 918 struct btrfs_free_space *info; 919 struct rb_node *n; 920 int ret = 0; 921 922 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { 923 info = rb_entry(n, struct btrfs_free_space, offset_index); 924 if (!info->bitmap) { 925 unlink_free_space(ctl, info, true); 926 ret = btrfs_add_free_space(block_group, info->offset, 927 info->bytes); 928 kmem_cache_free(btrfs_free_space_cachep, info); 929 } else { 930 u64 offset = info->offset; 931 u64 bytes = ctl->unit; 932 933 while (search_bitmap(ctl, info, &offset, &bytes, 934 false) == 0) { 935 ret = btrfs_add_free_space(block_group, offset, 936 bytes); 937 if (ret) 938 break; 939 bitmap_clear_bits(ctl, info, offset, bytes, true); 940 offset = info->offset; 941 bytes = ctl->unit; 942 } 943 free_bitmap(ctl, info); 944 } 945 cond_resched(); 946 } 947 return ret; 948 } 949 950 static struct lock_class_key btrfs_free_space_inode_key; 951 952 int load_free_space_cache(struct btrfs_block_group *block_group) 953 { 954 struct btrfs_fs_info *fs_info = block_group->fs_info; 955 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 956 struct btrfs_free_space_ctl tmp_ctl = {}; 957 struct inode *inode; 958 struct btrfs_path *path; 959 int ret = 0; 960 bool matched; 961 u64 used = block_group->used; 962 963 /* 964 * Because we could potentially discard our loaded free space, we want 965 * to load everything into a temporary structure first, and then if it's 966 * valid copy it all into the actual free space ctl. 967 */ 968 btrfs_init_free_space_ctl(block_group, &tmp_ctl); 969 970 /* 971 * If this block group has been marked to be cleared for one reason or 972 * another then we can't trust the on disk cache, so just return. 973 */ 974 spin_lock(&block_group->lock); 975 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 976 spin_unlock(&block_group->lock); 977 return 0; 978 } 979 spin_unlock(&block_group->lock); 980 981 path = btrfs_alloc_path(); 982 if (!path) 983 return 0; 984 path->search_commit_root = 1; 985 path->skip_locking = 1; 986 987 /* 988 * We must pass a path with search_commit_root set to btrfs_iget in 989 * order to avoid a deadlock when allocating extents for the tree root. 
990 * 991 * When we are COWing an extent buffer from the tree root, when looking 992 * for a free extent, at extent-tree.c:find_free_extent(), we can find 993 * block group without its free space cache loaded. When we find one 994 * we must load its space cache which requires reading its free space 995 * cache's inode item from the root tree. If this inode item is located 996 * in the same leaf that we started COWing before, then we end up in 997 * deadlock on the extent buffer (trying to read lock it when we 998 * previously write locked it). 999 * 1000 * It's safe to read the inode item using the commit root because 1001 * block groups, once loaded, stay in memory forever (until they are 1002 * removed) as well as their space caches once loaded. New block groups 1003 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 1004 * we will never try to read their inode item while the fs is mounted. 1005 */ 1006 inode = lookup_free_space_inode(block_group, path); 1007 if (IS_ERR(inode)) { 1008 btrfs_free_path(path); 1009 return 0; 1010 } 1011 1012 /* We may have converted the inode and made the cache invalid. */ 1013 spin_lock(&block_group->lock); 1014 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 1015 spin_unlock(&block_group->lock); 1016 btrfs_free_path(path); 1017 goto out; 1018 } 1019 spin_unlock(&block_group->lock); 1020 1021 /* 1022 * Reinitialize the class of struct inode's mapping->invalidate_lock for 1023 * free space inodes to prevent false positives related to locks for normal 1024 * inodes. 1025 */ 1026 lockdep_set_class(&(&inode->i_data)->invalidate_lock, 1027 &btrfs_free_space_inode_key); 1028 1029 ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, 1030 path, block_group->start); 1031 btrfs_free_path(path); 1032 if (ret <= 0) 1033 goto out; 1034 1035 matched = (tmp_ctl.free_space == (block_group->length - used - 1036 block_group->bytes_super)); 1037 1038 if (matched) { 1039 ret = copy_free_space_cache(block_group, &tmp_ctl); 1040 /* 1041 * ret == 1 means we successfully loaded the free space cache, 1042 * so we need to re-set it here. 1043 */ 1044 if (ret == 0) 1045 ret = 1; 1046 } else { 1047 /* 1048 * We need to call the _locked variant so we don't try to update 1049 * the discard counters. 
1050 */ 1051 spin_lock(&tmp_ctl.tree_lock); 1052 __btrfs_remove_free_space_cache(&tmp_ctl); 1053 spin_unlock(&tmp_ctl.tree_lock); 1054 btrfs_warn(fs_info, 1055 "block group %llu has wrong amount of free space", 1056 block_group->start); 1057 ret = -1; 1058 } 1059 out: 1060 if (ret < 0) { 1061 /* This cache is bogus, make sure it gets cleared */ 1062 spin_lock(&block_group->lock); 1063 block_group->disk_cache_state = BTRFS_DC_CLEAR; 1064 spin_unlock(&block_group->lock); 1065 ret = 0; 1066 1067 btrfs_warn(fs_info, 1068 "failed to load free space cache for block group %llu, rebuilding it now", 1069 block_group->start); 1070 } 1071 1072 spin_lock(&ctl->tree_lock); 1073 btrfs_discard_update_discardable(block_group); 1074 spin_unlock(&ctl->tree_lock); 1075 iput(inode); 1076 return ret; 1077 } 1078 1079 static noinline_for_stack 1080 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, 1081 struct btrfs_free_space_ctl *ctl, 1082 struct btrfs_block_group *block_group, 1083 int *entries, int *bitmaps, 1084 struct list_head *bitmap_list) 1085 { 1086 int ret; 1087 struct btrfs_free_cluster *cluster = NULL; 1088 struct btrfs_free_cluster *cluster_locked = NULL; 1089 struct rb_node *node = rb_first(&ctl->free_space_offset); 1090 struct btrfs_trim_range *trim_entry; 1091 1092 /* Get the cluster for this block_group if it exists */ 1093 if (block_group && !list_empty(&block_group->cluster_list)) { 1094 cluster = list_entry(block_group->cluster_list.next, 1095 struct btrfs_free_cluster, 1096 block_group_list); 1097 } 1098 1099 if (!node && cluster) { 1100 cluster_locked = cluster; 1101 spin_lock(&cluster_locked->lock); 1102 node = rb_first(&cluster->root); 1103 cluster = NULL; 1104 } 1105 1106 /* Write out the extent entries */ 1107 while (node) { 1108 struct btrfs_free_space *e; 1109 1110 e = rb_entry(node, struct btrfs_free_space, offset_index); 1111 *entries += 1; 1112 1113 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 1114 e->bitmap); 1115 if (ret) 1116 goto fail; 1117 1118 if (e->bitmap) { 1119 list_add_tail(&e->list, bitmap_list); 1120 *bitmaps += 1; 1121 } 1122 node = rb_next(node); 1123 if (!node && cluster) { 1124 node = rb_first(&cluster->root); 1125 cluster_locked = cluster; 1126 spin_lock(&cluster_locked->lock); 1127 cluster = NULL; 1128 } 1129 } 1130 if (cluster_locked) { 1131 spin_unlock(&cluster_locked->lock); 1132 cluster_locked = NULL; 1133 } 1134 1135 /* 1136 * Make sure we don't miss any range that was removed from our rbtree 1137 * because trimming is running. Otherwise after a umount+mount (or crash 1138 * after committing the transaction) we would leak free space and get 1139 * an inconsistent free space cache report from fsck. 
1140 */ 1141 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 1142 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 1143 trim_entry->bytes, NULL); 1144 if (ret) 1145 goto fail; 1146 *entries += 1; 1147 } 1148 1149 return 0; 1150 fail: 1151 if (cluster_locked) 1152 spin_unlock(&cluster_locked->lock); 1153 return -ENOSPC; 1154 } 1155 1156 static noinline_for_stack int 1157 update_cache_item(struct btrfs_trans_handle *trans, 1158 struct btrfs_root *root, 1159 struct inode *inode, 1160 struct btrfs_path *path, u64 offset, 1161 int entries, int bitmaps) 1162 { 1163 struct btrfs_key key; 1164 struct btrfs_free_space_header *header; 1165 struct extent_buffer *leaf; 1166 int ret; 1167 1168 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1169 key.offset = offset; 1170 key.type = 0; 1171 1172 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1173 if (ret < 0) { 1174 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1175 EXTENT_DELALLOC, NULL); 1176 goto fail; 1177 } 1178 leaf = path->nodes[0]; 1179 if (ret > 0) { 1180 struct btrfs_key found_key; 1181 ASSERT(path->slots[0]); 1182 path->slots[0]--; 1183 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1184 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1185 found_key.offset != offset) { 1186 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 1187 inode->i_size - 1, EXTENT_DELALLOC, 1188 NULL); 1189 btrfs_release_path(path); 1190 goto fail; 1191 } 1192 } 1193 1194 BTRFS_I(inode)->generation = trans->transid; 1195 header = btrfs_item_ptr(leaf, path->slots[0], 1196 struct btrfs_free_space_header); 1197 btrfs_set_free_space_entries(leaf, header, entries); 1198 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1199 btrfs_set_free_space_generation(leaf, header, trans->transid); 1200 btrfs_mark_buffer_dirty(leaf); 1201 btrfs_release_path(path); 1202 1203 return 0; 1204 1205 fail: 1206 return -1; 1207 } 1208 1209 static noinline_for_stack int write_pinned_extent_entries( 1210 struct btrfs_trans_handle *trans, 1211 struct btrfs_block_group *block_group, 1212 struct btrfs_io_ctl *io_ctl, 1213 int *entries) 1214 { 1215 u64 start, extent_start, extent_end, len; 1216 struct extent_io_tree *unpin = NULL; 1217 int ret; 1218 1219 if (!block_group) 1220 return 0; 1221 1222 /* 1223 * We want to add any pinned extents to our free space cache 1224 * so we don't leak the space 1225 * 1226 * We shouldn't have switched the pinned extents yet so this is the 1227 * right one 1228 */ 1229 unpin = &trans->transaction->pinned_extents; 1230 1231 start = block_group->start; 1232 1233 while (start < block_group->start + block_group->length) { 1234 ret = find_first_extent_bit(unpin, start, 1235 &extent_start, &extent_end, 1236 EXTENT_DIRTY, NULL); 1237 if (ret) 1238 return 0; 1239 1240 /* This pinned extent is out of our range */ 1241 if (extent_start >= block_group->start + block_group->length) 1242 return 0; 1243 1244 extent_start = max(extent_start, start); 1245 extent_end = min(block_group->start + block_group->length, 1246 extent_end + 1); 1247 len = extent_end - extent_start; 1248 1249 *entries += 1; 1250 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1251 if (ret) 1252 return -ENOSPC; 1253 1254 start = extent_end; 1255 } 1256 1257 return 0; 1258 } 1259 1260 static noinline_for_stack int 1261 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) 1262 { 1263 struct btrfs_free_space *entry, *next; 1264 int ret; 1265 1266 /* Write out the bitmaps */ 1267 list_for_each_entry_safe(entry, next, 
bitmap_list, list) { 1268 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1269 if (ret) 1270 return -ENOSPC; 1271 list_del_init(&entry->list); 1272 } 1273 1274 return 0; 1275 } 1276 1277 static int flush_dirty_cache(struct inode *inode) 1278 { 1279 int ret; 1280 1281 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1282 if (ret) 1283 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1284 EXTENT_DELALLOC, NULL); 1285 1286 return ret; 1287 } 1288 1289 static void noinline_for_stack 1290 cleanup_bitmap_list(struct list_head *bitmap_list) 1291 { 1292 struct btrfs_free_space *entry, *next; 1293 1294 list_for_each_entry_safe(entry, next, bitmap_list, list) 1295 list_del_init(&entry->list); 1296 } 1297 1298 static void noinline_for_stack 1299 cleanup_write_cache_enospc(struct inode *inode, 1300 struct btrfs_io_ctl *io_ctl, 1301 struct extent_state **cached_state) 1302 { 1303 io_ctl_drop_pages(io_ctl); 1304 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1305 cached_state); 1306 } 1307 1308 static int __btrfs_wait_cache_io(struct btrfs_root *root, 1309 struct btrfs_trans_handle *trans, 1310 struct btrfs_block_group *block_group, 1311 struct btrfs_io_ctl *io_ctl, 1312 struct btrfs_path *path, u64 offset) 1313 { 1314 int ret; 1315 struct inode *inode = io_ctl->inode; 1316 1317 if (!inode) 1318 return 0; 1319 1320 /* Flush the dirty pages in the cache file. */ 1321 ret = flush_dirty_cache(inode); 1322 if (ret) 1323 goto out; 1324 1325 /* Update the cache item to tell everyone this cache file is valid. */ 1326 ret = update_cache_item(trans, root, inode, path, offset, 1327 io_ctl->entries, io_ctl->bitmaps); 1328 out: 1329 if (ret) { 1330 invalidate_inode_pages2(inode->i_mapping); 1331 BTRFS_I(inode)->generation = 0; 1332 if (block_group) 1333 btrfs_debug(root->fs_info, 1334 "failed to write free space cache for block group %llu error %d", 1335 block_group->start, ret); 1336 } 1337 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1338 1339 if (block_group) { 1340 /* the dirty list is protected by the dirty_bgs_lock */ 1341 spin_lock(&trans->transaction->dirty_bgs_lock); 1342 1343 /* the disk_cache_state is protected by the block group lock */ 1344 spin_lock(&block_group->lock); 1345 1346 /* 1347 * only mark this as written if we didn't get put back on 1348 * the dirty list while waiting for IO. Otherwise our 1349 * cache state won't be right, and we won't get written again 1350 */ 1351 if (!ret && list_empty(&block_group->dirty_list)) 1352 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1353 else if (ret) 1354 block_group->disk_cache_state = BTRFS_DC_ERROR; 1355 1356 spin_unlock(&block_group->lock); 1357 spin_unlock(&trans->transaction->dirty_bgs_lock); 1358 io_ctl->inode = NULL; 1359 iput(inode); 1360 } 1361 1362 return ret; 1363 1364 } 1365 1366 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 1367 struct btrfs_block_group *block_group, 1368 struct btrfs_path *path) 1369 { 1370 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, 1371 block_group, &block_group->io_ctl, 1372 path, block_group->start); 1373 } 1374 1375 /* 1376 * Write out cached info to an inode. 
1377 * 1378 * @root: root the inode belongs to 1379 * @inode: freespace inode we are writing out 1380 * @ctl: free space cache we are going to write out 1381 * @block_group: block_group for this cache if it belongs to a block_group 1382 * @io_ctl: holds context for the io 1383 * @trans: the trans handle 1384 * 1385 * This function writes out a free space cache struct to disk for quick recovery 1386 * on mount. This will return 0 if it was successful in writing the cache out, 1387 * or an errno if it was not. 1388 */ 1389 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 1390 struct btrfs_free_space_ctl *ctl, 1391 struct btrfs_block_group *block_group, 1392 struct btrfs_io_ctl *io_ctl, 1393 struct btrfs_trans_handle *trans) 1394 { 1395 struct extent_state *cached_state = NULL; 1396 LIST_HEAD(bitmap_list); 1397 int entries = 0; 1398 int bitmaps = 0; 1399 int ret; 1400 int must_iput = 0; 1401 1402 if (!i_size_read(inode)) 1403 return -EIO; 1404 1405 WARN_ON(io_ctl->pages); 1406 ret = io_ctl_init(io_ctl, inode, 1); 1407 if (ret) 1408 return ret; 1409 1410 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { 1411 down_write(&block_group->data_rwsem); 1412 spin_lock(&block_group->lock); 1413 if (block_group->delalloc_bytes) { 1414 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1415 spin_unlock(&block_group->lock); 1416 up_write(&block_group->data_rwsem); 1417 BTRFS_I(inode)->generation = 0; 1418 ret = 0; 1419 must_iput = 1; 1420 goto out; 1421 } 1422 spin_unlock(&block_group->lock); 1423 } 1424 1425 /* Lock all pages first so we can lock the extent safely. */ 1426 ret = io_ctl_prepare_pages(io_ctl, false); 1427 if (ret) 1428 goto out_unlock; 1429 1430 lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1431 &cached_state); 1432 1433 io_ctl_set_generation(io_ctl, trans->transid); 1434 1435 mutex_lock(&ctl->cache_writeout_mutex); 1436 /* Write out the extent entries in the free space cache */ 1437 spin_lock(&ctl->tree_lock); 1438 ret = write_cache_extent_entries(io_ctl, ctl, 1439 block_group, &entries, &bitmaps, 1440 &bitmap_list); 1441 if (ret) 1442 goto out_nospc_locked; 1443 1444 /* 1445 * Some spaces that are freed in the current transaction are pinned, 1446 * they will be added into free space cache after the transaction is 1447 * committed, we shouldn't lose them. 1448 * 1449 * If this changes while we are working we'll get added back to 1450 * the dirty list and redo it. No locking needed 1451 */ 1452 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); 1453 if (ret) 1454 goto out_nospc_locked; 1455 1456 /* 1457 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1458 * locked while doing it because a concurrent trim can be manipulating 1459 * or freeing the bitmap. 1460 */ 1461 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1462 spin_unlock(&ctl->tree_lock); 1463 mutex_unlock(&ctl->cache_writeout_mutex); 1464 if (ret) 1465 goto out_nospc; 1466 1467 /* Zero out the rest of the pages just to make sure */ 1468 io_ctl_zero_remaining_pages(io_ctl); 1469 1470 /* Everything is written out, now we dirty the pages in the file. 
*/ 1471 ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 1472 io_ctl->num_pages, 0, i_size_read(inode), 1473 &cached_state, false); 1474 if (ret) 1475 goto out_nospc; 1476 1477 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1478 up_write(&block_group->data_rwsem); 1479 /* 1480 * Release the pages and unlock the extent, we will flush 1481 * them out later 1482 */ 1483 io_ctl_drop_pages(io_ctl); 1484 io_ctl_free(io_ctl); 1485 1486 unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1487 &cached_state); 1488 1489 /* 1490 * at this point the pages are under IO and we're happy, 1491 * The caller is responsible for waiting on them and updating 1492 * the cache and the inode 1493 */ 1494 io_ctl->entries = entries; 1495 io_ctl->bitmaps = bitmaps; 1496 1497 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); 1498 if (ret) 1499 goto out; 1500 1501 return 0; 1502 1503 out_nospc_locked: 1504 cleanup_bitmap_list(&bitmap_list); 1505 spin_unlock(&ctl->tree_lock); 1506 mutex_unlock(&ctl->cache_writeout_mutex); 1507 1508 out_nospc: 1509 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1510 1511 out_unlock: 1512 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1513 up_write(&block_group->data_rwsem); 1514 1515 out: 1516 io_ctl->inode = NULL; 1517 io_ctl_free(io_ctl); 1518 if (ret) { 1519 invalidate_inode_pages2(inode->i_mapping); 1520 BTRFS_I(inode)->generation = 0; 1521 } 1522 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1523 if (must_iput) 1524 iput(inode); 1525 return ret; 1526 } 1527 1528 int btrfs_write_out_cache(struct btrfs_trans_handle *trans, 1529 struct btrfs_block_group *block_group, 1530 struct btrfs_path *path) 1531 { 1532 struct btrfs_fs_info *fs_info = trans->fs_info; 1533 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1534 struct inode *inode; 1535 int ret = 0; 1536 1537 spin_lock(&block_group->lock); 1538 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1539 spin_unlock(&block_group->lock); 1540 return 0; 1541 } 1542 spin_unlock(&block_group->lock); 1543 1544 inode = lookup_free_space_inode(block_group, path); 1545 if (IS_ERR(inode)) 1546 return 0; 1547 1548 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, 1549 block_group, &block_group->io_ctl, trans); 1550 if (ret) { 1551 btrfs_debug(fs_info, 1552 "failed to write free space cache for block group %llu error %d", 1553 block_group->start, ret); 1554 spin_lock(&block_group->lock); 1555 block_group->disk_cache_state = BTRFS_DC_ERROR; 1556 spin_unlock(&block_group->lock); 1557 1558 block_group->io_ctl.inode = NULL; 1559 iput(inode); 1560 } 1561 1562 /* 1563 * if ret == 0 the caller is expected to call btrfs_wait_cache_io 1564 * to wait for IO and put the inode 1565 */ 1566 1567 return ret; 1568 } 1569 1570 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1571 u64 offset) 1572 { 1573 ASSERT(offset >= bitmap_start); 1574 offset -= bitmap_start; 1575 return (unsigned long)(div_u64(offset, unit)); 1576 } 1577 1578 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1579 { 1580 return (unsigned long)(div_u64(bytes, unit)); 1581 } 1582 1583 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1584 u64 offset) 1585 { 1586 u64 bitmap_start; 1587 u64 bytes_per_bitmap; 1588 1589 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1590 bitmap_start = offset - ctl->start; 1591 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1592 bitmap_start *= bytes_per_bitmap; 1593 bitmap_start += ctl->start; 1594 
1595 return bitmap_start; 1596 } 1597 1598 static int tree_insert_offset(struct rb_root *root, u64 offset, 1599 struct rb_node *node, int bitmap) 1600 { 1601 struct rb_node **p = &root->rb_node; 1602 struct rb_node *parent = NULL; 1603 struct btrfs_free_space *info; 1604 1605 while (*p) { 1606 parent = *p; 1607 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1608 1609 if (offset < info->offset) { 1610 p = &(*p)->rb_left; 1611 } else if (offset > info->offset) { 1612 p = &(*p)->rb_right; 1613 } else { 1614 /* 1615 * we could have a bitmap entry and an extent entry 1616 * share the same offset. If this is the case, we want 1617 * the extent entry to always be found first if we do a 1618 * linear search through the tree, since we want to have 1619 * the quickest allocation time, and allocating from an 1620 * extent is faster than allocating from a bitmap. So 1621 * if we're inserting a bitmap and we find an entry at 1622 * this offset, we want to go right, or after this entry 1623 * logically. If we are inserting an extent and we've 1624 * found a bitmap, we want to go left, or before 1625 * logically. 1626 */ 1627 if (bitmap) { 1628 if (info->bitmap) { 1629 WARN_ON_ONCE(1); 1630 return -EEXIST; 1631 } 1632 p = &(*p)->rb_right; 1633 } else { 1634 if (!info->bitmap) { 1635 WARN_ON_ONCE(1); 1636 return -EEXIST; 1637 } 1638 p = &(*p)->rb_left; 1639 } 1640 } 1641 } 1642 1643 rb_link_node(node, parent, p); 1644 rb_insert_color(node, root); 1645 1646 return 0; 1647 } 1648 1649 /* 1650 * This is a little subtle. We *only* have ->max_extent_size set if we actually 1651 * searched through the bitmap and figured out the largest ->max_extent_size, 1652 * otherwise it's 0. In the case that it's 0 we don't want to tell the 1653 * allocator the wrong thing, we want to use the actual real max_extent_size 1654 * we've found already if it's larger, or we want to use ->bytes. 1655 * 1656 * This matters because find_free_space() will skip entries who's ->bytes is 1657 * less than the required bytes. So if we didn't search down this bitmap, we 1658 * may pick some previous entry that has a smaller ->max_extent_size than we 1659 * have. For example, assume we have two entries, one that has 1660 * ->max_extent_size set to 4K and ->bytes set to 1M. A second entry hasn't set 1661 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous. We will 1662 * call into find_free_space(), and return with max_extent_size == 4K, because 1663 * that first bitmap entry had ->max_extent_size set, but the second one did 1664 * not. If instead we returned 8K we'd come in searching for 8K, and find the 1665 * 8K contiguous range. 1666 * 1667 * Consider the other case, we have 2 8K chunks in that second entry and still 1668 * don't have ->max_extent_size set. We'll return 16K, and the next time the 1669 * allocator comes in it'll fully search our second bitmap, and this time it'll 1670 * get an uptodate value of 8K as the maximum chunk size. Then we'll get the 1671 * right allocation the next loop through. 1672 */ 1673 static inline u64 get_max_extent_size(const struct btrfs_free_space *entry) 1674 { 1675 if (entry->bitmap && entry->max_extent_size) 1676 return entry->max_extent_size; 1677 return entry->bytes; 1678 } 1679 1680 /* 1681 * We want the largest entry to be leftmost, so this is inverted from what you'd 1682 * normally expect. 
1683 */ 1684 static bool entry_less(struct rb_node *node, const struct rb_node *parent) 1685 { 1686 const struct btrfs_free_space *entry, *exist; 1687 1688 entry = rb_entry(node, struct btrfs_free_space, bytes_index); 1689 exist = rb_entry(parent, struct btrfs_free_space, bytes_index); 1690 return get_max_extent_size(exist) < get_max_extent_size(entry); 1691 } 1692 1693 /* 1694 * searches the tree for the given offset. 1695 * 1696 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1697 * want a section that has at least bytes size and comes at or after the given 1698 * offset. 1699 */ 1700 static struct btrfs_free_space * 1701 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1702 u64 offset, int bitmap_only, int fuzzy) 1703 { 1704 struct rb_node *n = ctl->free_space_offset.rb_node; 1705 struct btrfs_free_space *entry = NULL, *prev = NULL; 1706 1707 /* find entry that is closest to the 'offset' */ 1708 while (n) { 1709 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1710 prev = entry; 1711 1712 if (offset < entry->offset) 1713 n = n->rb_left; 1714 else if (offset > entry->offset) 1715 n = n->rb_right; 1716 else 1717 break; 1718 1719 entry = NULL; 1720 } 1721 1722 if (bitmap_only) { 1723 if (!entry) 1724 return NULL; 1725 if (entry->bitmap) 1726 return entry; 1727 1728 /* 1729 * bitmap entry and extent entry may share same offset, 1730 * in that case, bitmap entry comes after extent entry. 1731 */ 1732 n = rb_next(n); 1733 if (!n) 1734 return NULL; 1735 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1736 if (entry->offset != offset) 1737 return NULL; 1738 1739 WARN_ON(!entry->bitmap); 1740 return entry; 1741 } else if (entry) { 1742 if (entry->bitmap) { 1743 /* 1744 * if previous extent entry covers the offset, 1745 * we should return it instead of the bitmap entry 1746 */ 1747 n = rb_prev(&entry->offset_index); 1748 if (n) { 1749 prev = rb_entry(n, struct btrfs_free_space, 1750 offset_index); 1751 if (!prev->bitmap && 1752 prev->offset + prev->bytes > offset) 1753 entry = prev; 1754 } 1755 } 1756 return entry; 1757 } 1758 1759 if (!prev) 1760 return NULL; 1761 1762 /* find last entry before the 'offset' */ 1763 entry = prev; 1764 if (entry->offset > offset) { 1765 n = rb_prev(&entry->offset_index); 1766 if (n) { 1767 entry = rb_entry(n, struct btrfs_free_space, 1768 offset_index); 1769 ASSERT(entry->offset <= offset); 1770 } else { 1771 if (fuzzy) 1772 return entry; 1773 else 1774 return NULL; 1775 } 1776 } 1777 1778 if (entry->bitmap) { 1779 n = rb_prev(&entry->offset_index); 1780 if (n) { 1781 prev = rb_entry(n, struct btrfs_free_space, 1782 offset_index); 1783 if (!prev->bitmap && 1784 prev->offset + prev->bytes > offset) 1785 return prev; 1786 } 1787 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1788 return entry; 1789 } else if (entry->offset + entry->bytes > offset) 1790 return entry; 1791 1792 if (!fuzzy) 1793 return NULL; 1794 1795 while (1) { 1796 n = rb_next(&entry->offset_index); 1797 if (!n) 1798 return NULL; 1799 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1800 if (entry->bitmap) { 1801 if (entry->offset + BITS_PER_BITMAP * 1802 ctl->unit > offset) 1803 break; 1804 } else { 1805 if (entry->offset + entry->bytes > offset) 1806 break; 1807 } 1808 } 1809 return entry; 1810 } 1811 1812 static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1813 struct btrfs_free_space *info, 1814 bool update_stat) 1815 { 1816 rb_erase(&info->offset_index, &ctl->free_space_offset); 1817 
rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1818 ctl->free_extents--; 1819 1820 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1821 ctl->discardable_extents[BTRFS_STAT_CURR]--; 1822 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; 1823 } 1824 1825 if (update_stat) 1826 ctl->free_space -= info->bytes; 1827 } 1828 1829 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1830 struct btrfs_free_space *info) 1831 { 1832 int ret = 0; 1833 1834 ASSERT(info->bytes || info->bitmap); 1835 ret = tree_insert_offset(&ctl->free_space_offset, info->offset, 1836 &info->offset_index, (info->bitmap != NULL)); 1837 if (ret) 1838 return ret; 1839 1840 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1841 1842 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1843 ctl->discardable_extents[BTRFS_STAT_CURR]++; 1844 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 1845 } 1846 1847 ctl->free_space += info->bytes; 1848 ctl->free_extents++; 1849 return ret; 1850 } 1851 1852 static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, 1853 struct btrfs_free_space *info) 1854 { 1855 ASSERT(info->bitmap); 1856 1857 /* 1858 * If our entry is empty it's because we're on a cluster and we don't 1859 * want to re-link it into our ctl bytes index. 1860 */ 1861 if (RB_EMPTY_NODE(&info->bytes_index)) 1862 return; 1863 1864 rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); 1865 rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); 1866 } 1867 1868 static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1869 struct btrfs_free_space *info, 1870 u64 offset, u64 bytes, bool update_stat) 1871 { 1872 unsigned long start, count, end; 1873 int extent_delta = -1; 1874 1875 start = offset_to_bit(info->offset, ctl->unit, offset); 1876 count = bytes_to_bits(bytes, ctl->unit); 1877 end = start + count; 1878 ASSERT(end <= BITS_PER_BITMAP); 1879 1880 bitmap_clear(info->bitmap, start, count); 1881 1882 info->bytes -= bytes; 1883 if (info->max_extent_size > ctl->unit) 1884 info->max_extent_size = 0; 1885 1886 relink_bitmap_entry(ctl, info); 1887 1888 if (start && test_bit(start - 1, info->bitmap)) 1889 extent_delta++; 1890 1891 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1892 extent_delta++; 1893 1894 info->bitmap_extents += extent_delta; 1895 if (!btrfs_free_space_trimmed(info)) { 1896 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1897 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 1898 } 1899 1900 if (update_stat) 1901 ctl->free_space -= bytes; 1902 } 1903 1904 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1905 struct btrfs_free_space *info, u64 offset, 1906 u64 bytes) 1907 { 1908 unsigned long start, count, end; 1909 int extent_delta = 1; 1910 1911 start = offset_to_bit(info->offset, ctl->unit, offset); 1912 count = bytes_to_bits(bytes, ctl->unit); 1913 end = start + count; 1914 ASSERT(end <= BITS_PER_BITMAP); 1915 1916 bitmap_set(info->bitmap, start, count); 1917 1918 /* 1919 * We set some bytes, we have no idea what the max extent size is 1920 * anymore. 
1921 */ 1922 info->max_extent_size = 0; 1923 info->bytes += bytes; 1924 ctl->free_space += bytes; 1925 1926 relink_bitmap_entry(ctl, info); 1927 1928 if (start && test_bit(start - 1, info->bitmap)) 1929 extent_delta--; 1930 1931 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1932 extent_delta--; 1933 1934 info->bitmap_extents += extent_delta; 1935 if (!btrfs_free_space_trimmed(info)) { 1936 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1937 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; 1938 } 1939 } 1940 1941 /* 1942 * If we can not find suitable extent, we will use bytes to record 1943 * the size of the max extent. 1944 */ 1945 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1946 struct btrfs_free_space *bitmap_info, u64 *offset, 1947 u64 *bytes, bool for_alloc) 1948 { 1949 unsigned long found_bits = 0; 1950 unsigned long max_bits = 0; 1951 unsigned long bits, i; 1952 unsigned long next_zero; 1953 unsigned long extent_bits; 1954 1955 /* 1956 * Skip searching the bitmap if we don't have a contiguous section that 1957 * is large enough for this allocation. 1958 */ 1959 if (for_alloc && 1960 bitmap_info->max_extent_size && 1961 bitmap_info->max_extent_size < *bytes) { 1962 *bytes = bitmap_info->max_extent_size; 1963 return -1; 1964 } 1965 1966 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1967 max_t(u64, *offset, bitmap_info->offset)); 1968 bits = bytes_to_bits(*bytes, ctl->unit); 1969 1970 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1971 if (for_alloc && bits == 1) { 1972 found_bits = 1; 1973 break; 1974 } 1975 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1976 BITS_PER_BITMAP, i); 1977 extent_bits = next_zero - i; 1978 if (extent_bits >= bits) { 1979 found_bits = extent_bits; 1980 break; 1981 } else if (extent_bits > max_bits) { 1982 max_bits = extent_bits; 1983 } 1984 i = next_zero; 1985 } 1986 1987 if (found_bits) { 1988 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1989 *bytes = (u64)(found_bits) * ctl->unit; 1990 return 0; 1991 } 1992 1993 *bytes = (u64)(max_bits) * ctl->unit; 1994 bitmap_info->max_extent_size = *bytes; 1995 relink_bitmap_entry(ctl, bitmap_info); 1996 return -1; 1997 } 1998 1999 /* Cache the size of the max extent in bytes */ 2000 static struct btrfs_free_space * 2001 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 2002 unsigned long align, u64 *max_extent_size, bool use_bytes_index) 2003 { 2004 struct btrfs_free_space *entry; 2005 struct rb_node *node; 2006 u64 tmp; 2007 u64 align_off; 2008 int ret; 2009 2010 if (!ctl->free_space_offset.rb_node) 2011 goto out; 2012 again: 2013 if (use_bytes_index) { 2014 node = rb_first_cached(&ctl->free_space_bytes); 2015 } else { 2016 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2017 0, 1); 2018 if (!entry) 2019 goto out; 2020 node = &entry->offset_index; 2021 } 2022 2023 for (; node; node = rb_next(node)) { 2024 if (use_bytes_index) 2025 entry = rb_entry(node, struct btrfs_free_space, 2026 bytes_index); 2027 else 2028 entry = rb_entry(node, struct btrfs_free_space, 2029 offset_index); 2030 2031 /* 2032 * If we are using the bytes index then all subsequent entries 2033 * in this tree are going to be < bytes, so simply set the max 2034 * extent size and exit the loop. 2035 * 2036 * If we're using the offset index then we need to keep going 2037 * through the rest of the tree. 
2038 */ 2039 if (entry->bytes < *bytes) { 2040 *max_extent_size = max(get_max_extent_size(entry), 2041 *max_extent_size); 2042 if (use_bytes_index) 2043 break; 2044 continue; 2045 } 2046 2047 /* make sure the space returned is big enough 2048 * to match our requested alignment 2049 */ 2050 if (*bytes >= align) { 2051 tmp = entry->offset - ctl->start + align - 1; 2052 tmp = div64_u64(tmp, align); 2053 tmp = tmp * align + ctl->start; 2054 align_off = tmp - entry->offset; 2055 } else { 2056 align_off = 0; 2057 tmp = entry->offset; 2058 } 2059 2060 /* 2061 * We don't break here if we're using the bytes index because we 2062 * may have another entry that has the correct alignment that is 2063 * the right size, so we don't want to miss that possibility. 2064 * At worst this adds another loop through the logic, but if we 2065 * broke here we could prematurely ENOSPC. 2066 */ 2067 if (entry->bytes < *bytes + align_off) { 2068 *max_extent_size = max(get_max_extent_size(entry), 2069 *max_extent_size); 2070 continue; 2071 } 2072 2073 if (entry->bitmap) { 2074 struct rb_node *old_next = rb_next(node); 2075 u64 size = *bytes; 2076 2077 ret = search_bitmap(ctl, entry, &tmp, &size, true); 2078 if (!ret) { 2079 *offset = tmp; 2080 *bytes = size; 2081 return entry; 2082 } else { 2083 *max_extent_size = 2084 max(get_max_extent_size(entry), 2085 *max_extent_size); 2086 } 2087 2088 /* 2089 * The bitmap may have gotten re-arranged in the space 2090 * index here because the max_extent_size may have been 2091 * updated. Start from the beginning again if this 2092 * happened. 2093 */ 2094 if (use_bytes_index && old_next != rb_next(node)) 2095 goto again; 2096 continue; 2097 } 2098 2099 *offset = tmp; 2100 *bytes = entry->bytes - align_off; 2101 return entry; 2102 } 2103 out: 2104 return NULL; 2105 } 2106 2107 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 2108 struct btrfs_free_space *info, u64 offset) 2109 { 2110 info->offset = offset_to_bitmap(ctl, offset); 2111 info->bytes = 0; 2112 info->bitmap_extents = 0; 2113 INIT_LIST_HEAD(&info->list); 2114 link_free_space(ctl, info); 2115 ctl->total_bitmaps++; 2116 recalculate_thresholds(ctl); 2117 } 2118 2119 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 2120 struct btrfs_free_space *bitmap_info) 2121 { 2122 /* 2123 * Normally when this is called, the bitmap is completely empty. However, 2124 * if we are blowing up the free space cache for one reason or another 2125 * via __btrfs_remove_free_space_cache(), then it may not be freed and 2126 * we may leave stats on the table. 2127 */ 2128 if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { 2129 ctl->discardable_extents[BTRFS_STAT_CURR] -= 2130 bitmap_info->bitmap_extents; 2131 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; 2132 2133 } 2134 unlink_free_space(ctl, bitmap_info, true); 2135 kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); 2136 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 2137 ctl->total_bitmaps--; 2138 recalculate_thresholds(ctl); 2139 } 2140 2141 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 2142 struct btrfs_free_space *bitmap_info, 2143 u64 *offset, u64 *bytes) 2144 { 2145 u64 end; 2146 u64 search_start, search_bytes; 2147 int ret; 2148 2149 again: 2150 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 2151 2152 /* 2153 * We need to search for bits in this bitmap. 
We could only cover some 2154 * of the extent in this bitmap thanks to how we add space, so we need 2155 * to search for as much of it as we can and clear that amount, and then 2156 * go searching for the next bit. 2157 */ 2158 search_start = *offset; 2159 search_bytes = ctl->unit; 2160 search_bytes = min(search_bytes, end - search_start + 1); 2161 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, 2162 false); 2163 if (ret < 0 || search_start != *offset) 2164 return -EINVAL; 2165 2166 /* We may have found more bits than what we need */ 2167 search_bytes = min(search_bytes, *bytes); 2168 2169 /* Cannot clear past the end of the bitmap */ 2170 search_bytes = min(search_bytes, end - search_start + 1); 2171 2172 bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true); 2173 *offset += search_bytes; 2174 *bytes -= search_bytes; 2175 2176 if (*bytes) { 2177 struct rb_node *next = rb_next(&bitmap_info->offset_index); 2178 if (!bitmap_info->bytes) 2179 free_bitmap(ctl, bitmap_info); 2180 2181 /* 2182 * no entry after this bitmap, but we still have bytes to 2183 * remove, so something has gone wrong. 2184 */ 2185 if (!next) 2186 return -EINVAL; 2187 2188 bitmap_info = rb_entry(next, struct btrfs_free_space, 2189 offset_index); 2190 2191 /* 2192 * if the next entry isn't a bitmap we need to return to let the 2193 * extent stuff do its work. 2194 */ 2195 if (!bitmap_info->bitmap) 2196 return -EAGAIN; 2197 2198 /* 2199 * Ok the next item is a bitmap, but it may not actually hold 2200 * the information for the rest of this free space stuff, so 2201 * look for it, and if we don't find it return so we can try 2202 * everything over again. 2203 */ 2204 search_start = *offset; 2205 search_bytes = ctl->unit; 2206 ret = search_bitmap(ctl, bitmap_info, &search_start, 2207 &search_bytes, false); 2208 if (ret < 0 || search_start != *offset) 2209 return -EAGAIN; 2210 2211 goto again; 2212 } else if (!bitmap_info->bytes) 2213 free_bitmap(ctl, bitmap_info); 2214 2215 return 0; 2216 } 2217 2218 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, 2219 struct btrfs_free_space *info, u64 offset, 2220 u64 bytes, enum btrfs_trim_state trim_state) 2221 { 2222 u64 bytes_to_set = 0; 2223 u64 end; 2224 2225 /* 2226 * This is a tradeoff to make bitmap trim state minimal. We mark the 2227 * whole bitmap untrimmed if at any point we add untrimmed regions. 2228 */ 2229 if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { 2230 if (btrfs_free_space_trimmed(info)) { 2231 ctl->discardable_extents[BTRFS_STAT_CURR] += 2232 info->bitmap_extents; 2233 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 2234 } 2235 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2236 } 2237 2238 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); 2239 2240 bytes_to_set = min(end - offset, bytes); 2241 2242 bitmap_set_bits(ctl, info, offset, bytes_to_set); 2243 2244 return bytes_to_set; 2245 2246 } 2247 2248 static bool use_bitmap(struct btrfs_free_space_ctl *ctl, 2249 struct btrfs_free_space *info) 2250 { 2251 struct btrfs_block_group *block_group = ctl->block_group; 2252 struct btrfs_fs_info *fs_info = block_group->fs_info; 2253 bool forced = false; 2254 2255 #ifdef CONFIG_BTRFS_DEBUG 2256 if (btrfs_should_fragment_free_space(block_group)) 2257 forced = true; 2258 #endif 2259 2260 /* This is a way to reclaim large regions from the bitmaps.
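* Extents of at least FORCE_EXTENT_THRESHOLD are always kept as extent entries here, even when we are over the extents threshold (unless debug fragmentation is forcing bitmaps).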
*/ 2261 if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) 2262 return false; 2263 2264 /* 2265 * If we are below the extents threshold then we can add this as an 2266 * extent, and don't have to deal with the bitmap. 2267 */ 2268 if (!forced && ctl->free_extents < ctl->extents_thresh) { 2269 /* 2270 * If this block group has some small extents we don't want to 2271 * use up all of our free slots in the cache with them, we want 2272 * to reserve them for larger extents, however if we have plenty 2273 * of cache left then go ahead and add them, no sense in adding 2274 * the overhead of a bitmap if we don't have to. 2275 */ 2276 if (info->bytes <= fs_info->sectorsize * 8) { 2277 if (ctl->free_extents * 3 <= ctl->extents_thresh) 2278 return false; 2279 } else { 2280 return false; 2281 } 2282 } 2283 2284 /* 2285 * The original block groups from mkfs can be really small, like 8 2286 * megabytes, so don't bother with a bitmap for those entries. However 2287 * some block groups can be smaller than what a bitmap would cover but 2288 * are still large enough that they could overflow the 32k memory limit, 2289 * so still allow those block groups to have a bitmap 2290 * entry. 2291 */ 2292 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length) 2293 return false; 2294 2295 return true; 2296 } 2297 2298 static const struct btrfs_free_space_op free_space_op = { 2299 .use_bitmap = use_bitmap, 2300 }; 2301 2302 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 2303 struct btrfs_free_space *info) 2304 { 2305 struct btrfs_free_space *bitmap_info; 2306 struct btrfs_block_group *block_group = NULL; 2307 int added = 0; 2308 u64 bytes, offset, bytes_added; 2309 enum btrfs_trim_state trim_state; 2310 int ret; 2311 2312 bytes = info->bytes; 2313 offset = info->offset; 2314 trim_state = info->trim_state; 2315 2316 if (!ctl->op->use_bitmap(ctl, info)) 2317 return 0; 2318 2319 if (ctl->op == &free_space_op) 2320 block_group = ctl->block_group; 2321 again: 2322 /* 2323 * Since we link bitmaps right into the cluster we need to see if we 2324 * have a cluster here, and if so and it has our bitmap we need to add 2325 * the free space to that bitmap.
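* A bitmap owned by a cluster lives in the cluster's rbtree rather than in ctl->free_space_offset, so the regular tree_search_offset() lookup below would not find it.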
2326 */ 2327 if (block_group && !list_empty(&block_group->cluster_list)) { 2328 struct btrfs_free_cluster *cluster; 2329 struct rb_node *node; 2330 struct btrfs_free_space *entry; 2331 2332 cluster = list_entry(block_group->cluster_list.next, 2333 struct btrfs_free_cluster, 2334 block_group_list); 2335 spin_lock(&cluster->lock); 2336 node = rb_first(&cluster->root); 2337 if (!node) { 2338 spin_unlock(&cluster->lock); 2339 goto no_cluster_bitmap; 2340 } 2341 2342 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2343 if (!entry->bitmap) { 2344 spin_unlock(&cluster->lock); 2345 goto no_cluster_bitmap; 2346 } 2347 2348 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2349 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2350 bytes, trim_state); 2351 bytes -= bytes_added; 2352 offset += bytes_added; 2353 } 2354 spin_unlock(&cluster->lock); 2355 if (!bytes) { 2356 ret = 1; 2357 goto out; 2358 } 2359 } 2360 2361 no_cluster_bitmap: 2362 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2363 1, 0); 2364 if (!bitmap_info) { 2365 ASSERT(added == 0); 2366 goto new_bitmap; 2367 } 2368 2369 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2370 trim_state); 2371 bytes -= bytes_added; 2372 offset += bytes_added; 2373 added = 0; 2374 2375 if (!bytes) { 2376 ret = 1; 2377 goto out; 2378 } else 2379 goto again; 2380 2381 new_bitmap: 2382 if (info && info->bitmap) { 2383 add_new_bitmap(ctl, info, offset); 2384 added = 1; 2385 info = NULL; 2386 goto again; 2387 } else { 2388 spin_unlock(&ctl->tree_lock); 2389 2390 /* no pre-allocated info, allocate a new one */ 2391 if (!info) { 2392 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2393 GFP_NOFS); 2394 if (!info) { 2395 spin_lock(&ctl->tree_lock); 2396 ret = -ENOMEM; 2397 goto out; 2398 } 2399 } 2400 2401 /* allocate the bitmap */ 2402 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2403 GFP_NOFS); 2404 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2405 spin_lock(&ctl->tree_lock); 2406 if (!info->bitmap) { 2407 ret = -ENOMEM; 2408 goto out; 2409 } 2410 goto again; 2411 } 2412 2413 out: 2414 if (info) { 2415 if (info->bitmap) 2416 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2417 info->bitmap); 2418 kmem_cache_free(btrfs_free_space_cachep, info); 2419 } 2420 2421 return ret; 2422 } 2423 2424 /* 2425 * Free space merging rules: 2426 * 1) Merge trimmed areas together 2427 * 2) Let untrimmed areas coalesce with trimmed areas 2428 * 3) Always pull neighboring regions from bitmaps 2429 * 2430 * The above rules are for when we merge free space based on btrfs_trim_state. 2431 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2432 * same reason: to promote larger extent regions which makes life easier for 2433 * find_free_extent(). Rule 2 enables coalescing based on the common path 2434 * being returning free space from btrfs_finish_extent_commit(). So when free 2435 * space is trimmed, it will prevent aggregating trimmed new region and 2436 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2437 * and provide find_free_extent() with the largest extents possible hoping for 2438 * the reuse path. 
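* For example, under rule 2 an untrimmed region freed back via btrfs_finish_extent_commit() can merge with an adjacent trimmed extent, and the merged extent is then treated as untrimmed.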
2439 */ 2440 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2441 struct btrfs_free_space *info, bool update_stat) 2442 { 2443 struct btrfs_free_space *left_info = NULL; 2444 struct btrfs_free_space *right_info; 2445 bool merged = false; 2446 u64 offset = info->offset; 2447 u64 bytes = info->bytes; 2448 const bool is_trimmed = btrfs_free_space_trimmed(info); 2449 2450 /* 2451 * first we want to see if there is free space adjacent to the range we 2452 * are adding, if there is remove that struct and add a new one to 2453 * cover the entire range 2454 */ 2455 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2456 if (right_info && rb_prev(&right_info->offset_index)) 2457 left_info = rb_entry(rb_prev(&right_info->offset_index), 2458 struct btrfs_free_space, offset_index); 2459 else if (!right_info) 2460 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2461 2462 /* See try_merge_free_space() comment. */ 2463 if (right_info && !right_info->bitmap && 2464 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2465 unlink_free_space(ctl, right_info, update_stat); 2466 info->bytes += right_info->bytes; 2467 kmem_cache_free(btrfs_free_space_cachep, right_info); 2468 merged = true; 2469 } 2470 2471 /* See try_merge_free_space() comment. */ 2472 if (left_info && !left_info->bitmap && 2473 left_info->offset + left_info->bytes == offset && 2474 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2475 unlink_free_space(ctl, left_info, update_stat); 2476 info->offset = left_info->offset; 2477 info->bytes += left_info->bytes; 2478 kmem_cache_free(btrfs_free_space_cachep, left_info); 2479 merged = true; 2480 } 2481 2482 return merged; 2483 } 2484 2485 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2486 struct btrfs_free_space *info, 2487 bool update_stat) 2488 { 2489 struct btrfs_free_space *bitmap; 2490 unsigned long i; 2491 unsigned long j; 2492 const u64 end = info->offset + info->bytes; 2493 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2494 u64 bytes; 2495 2496 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2497 if (!bitmap) 2498 return false; 2499 2500 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2501 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2502 if (j == i) 2503 return false; 2504 bytes = (j - i) * ctl->unit; 2505 info->bytes += bytes; 2506 2507 /* See try_merge_free_space() comment. */ 2508 if (!btrfs_free_space_trimmed(bitmap)) 2509 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2510 2511 bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat); 2512 2513 if (!bitmap->bytes) 2514 free_bitmap(ctl, bitmap); 2515 2516 return true; 2517 } 2518 2519 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2520 struct btrfs_free_space *info, 2521 bool update_stat) 2522 { 2523 struct btrfs_free_space *bitmap; 2524 u64 bitmap_offset; 2525 unsigned long i; 2526 unsigned long j; 2527 unsigned long prev_j; 2528 u64 bytes; 2529 2530 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2531 /* If we're on a boundary, try the previous logical bitmap. 
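* (if info->offset is exactly the start of a bitmap, any free space immediately in front of it is tracked by the bitmap covering info->offset - 1)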
*/ 2532 if (bitmap_offset == info->offset) { 2533 if (info->offset == 0) 2534 return false; 2535 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2536 } 2537 2538 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2539 if (!bitmap) 2540 return false; 2541 2542 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2543 j = 0; 2544 prev_j = (unsigned long)-1; 2545 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2546 if (j > i) 2547 break; 2548 prev_j = j; 2549 } 2550 if (prev_j == i) 2551 return false; 2552 2553 if (prev_j == (unsigned long)-1) 2554 bytes = (i + 1) * ctl->unit; 2555 else 2556 bytes = (i - prev_j) * ctl->unit; 2557 2558 info->offset -= bytes; 2559 info->bytes += bytes; 2560 2561 /* See try_merge_free_space() comment. */ 2562 if (!btrfs_free_space_trimmed(bitmap)) 2563 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2564 2565 bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat); 2566 2567 if (!bitmap->bytes) 2568 free_bitmap(ctl, bitmap); 2569 2570 return true; 2571 } 2572 2573 /* 2574 * We prefer always to allocate from extent entries, both for clustered and 2575 * non-clustered allocation requests. So when attempting to add a new extent 2576 * entry, try to see if there's adjacent free space in bitmap entries, and if 2577 * there is, migrate that space from the bitmaps to the extent. 2578 * Like this we get better chances of satisfying space allocation requests 2579 * because we attempt to satisfy them based on a single cache entry, and never 2580 * on 2 or more entries - even if the entries represent a contiguous free space 2581 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2582 * ends). 2583 */ 2584 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2585 struct btrfs_free_space *info, 2586 bool update_stat) 2587 { 2588 /* 2589 * Only work with disconnected entries, as we can change their offset, 2590 * and must be extent entries. 
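* The offset is the key in the offset rbtree, so an entry whose offset may still change (steal_from_bitmap_to_front() moves it backwards) must not be linked in that tree.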
2591 */ 2592 ASSERT(!info->bitmap); 2593 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2594 2595 if (ctl->total_bitmaps > 0) { 2596 bool stole_end; 2597 bool stole_front = false; 2598 2599 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2600 if (ctl->total_bitmaps > 0) 2601 stole_front = steal_from_bitmap_to_front(ctl, info, 2602 update_stat); 2603 2604 if (stole_end || stole_front) 2605 try_merge_free_space(ctl, info, update_stat); 2606 } 2607 } 2608 2609 int __btrfs_add_free_space(struct btrfs_block_group *block_group, 2610 u64 offset, u64 bytes, 2611 enum btrfs_trim_state trim_state) 2612 { 2613 struct btrfs_fs_info *fs_info = block_group->fs_info; 2614 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2615 struct btrfs_free_space *info; 2616 int ret = 0; 2617 u64 filter_bytes = bytes; 2618 2619 ASSERT(!btrfs_is_zoned(fs_info)); 2620 2621 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2622 if (!info) 2623 return -ENOMEM; 2624 2625 info->offset = offset; 2626 info->bytes = bytes; 2627 info->trim_state = trim_state; 2628 RB_CLEAR_NODE(&info->offset_index); 2629 RB_CLEAR_NODE(&info->bytes_index); 2630 2631 spin_lock(&ctl->tree_lock); 2632 2633 if (try_merge_free_space(ctl, info, true)) 2634 goto link; 2635 2636 /* 2637 * There was no extent directly to the left or right of this new 2638 * extent then we know we're going to have to allocate a new extent, so 2639 * before we do that see if we need to drop this into a bitmap 2640 */ 2641 ret = insert_into_bitmap(ctl, info); 2642 if (ret < 0) { 2643 goto out; 2644 } else if (ret) { 2645 ret = 0; 2646 goto out; 2647 } 2648 link: 2649 /* 2650 * Only steal free space from adjacent bitmaps if we're sure we're not 2651 * going to add the new free space to existing bitmap entries - because 2652 * that would mean unnecessary work that would be reverted. Therefore 2653 * attempt to steal space from bitmaps if we're adding an extent entry. 2654 */ 2655 steal_from_bitmap(ctl, info, true); 2656 2657 filter_bytes = max(filter_bytes, info->bytes); 2658 2659 ret = link_free_space(ctl, info); 2660 if (ret) 2661 kmem_cache_free(btrfs_free_space_cachep, info); 2662 out: 2663 btrfs_discard_update_discardable(block_group); 2664 spin_unlock(&ctl->tree_lock); 2665 2666 if (ret) { 2667 btrfs_crit(fs_info, "unable to add free space :%d", ret); 2668 ASSERT(ret != -EEXIST); 2669 } 2670 2671 if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { 2672 btrfs_discard_check_filter(block_group, filter_bytes); 2673 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); 2674 } 2675 2676 return ret; 2677 } 2678 2679 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, 2680 u64 bytenr, u64 size, bool used) 2681 { 2682 struct btrfs_space_info *sinfo = block_group->space_info; 2683 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2684 u64 offset = bytenr - block_group->start; 2685 u64 to_free, to_unusable; 2686 int bg_reclaim_threshold = 0; 2687 bool initial = (size == block_group->length); 2688 u64 reclaimable_unusable; 2689 2690 WARN_ON(!initial && offset + size > block_group->zone_capacity); 2691 2692 if (!initial) 2693 bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); 2694 2695 spin_lock(&ctl->tree_lock); 2696 /* Count initial region as zone_unusable until it gets activated. 
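* That is the initial && active-zone-tracking && metadata/system case below, which sets to_free to 0 so the whole region is accounted as zone_unusable.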
*/ 2697 if (!used) 2698 to_free = size; 2699 else if (initial && 2700 test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) && 2701 (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) 2702 to_free = 0; 2703 else if (initial) 2704 to_free = block_group->zone_capacity; 2705 else if (offset >= block_group->alloc_offset) 2706 to_free = size; 2707 else if (offset + size <= block_group->alloc_offset) 2708 to_free = 0; 2709 else 2710 to_free = offset + size - block_group->alloc_offset; 2711 to_unusable = size - to_free; 2712 2713 ctl->free_space += to_free; 2714 /* 2715 * If the block group is read-only, we should account freed space into 2716 * bytes_readonly. 2717 */ 2718 if (!block_group->ro) 2719 block_group->zone_unusable += to_unusable; 2720 spin_unlock(&ctl->tree_lock); 2721 if (!used) { 2722 spin_lock(&block_group->lock); 2723 block_group->alloc_offset -= size; 2724 spin_unlock(&block_group->lock); 2725 } 2726 2727 reclaimable_unusable = block_group->zone_unusable - 2728 (block_group->length - block_group->zone_capacity); 2729 /* All the region is now unusable. Mark it as unused and reclaim */ 2730 if (block_group->zone_unusable == block_group->length && 2731 block_group->alloc_offset) { 2732 btrfs_mark_bg_unused(block_group); 2733 } else if (bg_reclaim_threshold && 2734 reclaimable_unusable >= 2735 mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { 2736 btrfs_mark_bg_to_reclaim(block_group); 2737 } 2738 2739 return 0; 2740 } 2741 2742 int btrfs_add_free_space(struct btrfs_block_group *block_group, 2743 u64 bytenr, u64 size) 2744 { 2745 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2746 2747 if (btrfs_is_zoned(block_group->fs_info)) 2748 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2749 true); 2750 2751 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) 2752 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2753 2754 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2755 } 2756 2757 int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, 2758 u64 bytenr, u64 size) 2759 { 2760 if (btrfs_is_zoned(block_group->fs_info)) 2761 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2762 false); 2763 2764 return btrfs_add_free_space(block_group, bytenr, size); 2765 } 2766 2767 /* 2768 * This is a subtle distinction because when adding free space back in general, 2769 * we want it to be added as untrimmed for async. But in the case where we add 2770 * it on loading of a block group, we want to consider it trimmed. 2771 */ 2772 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2773 u64 bytenr, u64 size) 2774 { 2775 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2776 2777 if (btrfs_is_zoned(block_group->fs_info)) 2778 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2779 true); 2780 2781 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2782 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2783 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2784 2785 return __btrfs_add_free_space(block_group, bytenr, size, trim_state); 2786 } 2787 2788 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2789 u64 offset, u64 bytes) 2790 { 2791 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2792 struct btrfs_free_space *info; 2793 int ret; 2794 bool re_search = false; 2795 2796 if (btrfs_is_zoned(block_group->fs_info)) { 2797 /* 2798 * This can happen with conventional zones when replaying log. 
2799 * Since the allocation info of tree-log nodes are not recorded 2800 * to the extent-tree, calculate_alloc_pointer() failed to 2801 * advance the allocation pointer after last allocated tree log 2802 * node blocks. 2803 * 2804 * This function is called from 2805 * btrfs_pin_extent_for_log_replay() when replaying the log. 2806 * Advance the pointer not to overwrite the tree-log nodes. 2807 */ 2808 if (block_group->start + block_group->alloc_offset < 2809 offset + bytes) { 2810 block_group->alloc_offset = 2811 offset + bytes - block_group->start; 2812 } 2813 return 0; 2814 } 2815 2816 spin_lock(&ctl->tree_lock); 2817 2818 again: 2819 ret = 0; 2820 if (!bytes) 2821 goto out_lock; 2822 2823 info = tree_search_offset(ctl, offset, 0, 0); 2824 if (!info) { 2825 /* 2826 * oops didn't find an extent that matched the space we wanted 2827 * to remove, look for a bitmap instead 2828 */ 2829 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2830 1, 0); 2831 if (!info) { 2832 /* 2833 * If we found a partial bit of our free space in a 2834 * bitmap but then couldn't find the other part this may 2835 * be a problem, so WARN about it. 2836 */ 2837 WARN_ON(re_search); 2838 goto out_lock; 2839 } 2840 } 2841 2842 re_search = false; 2843 if (!info->bitmap) { 2844 unlink_free_space(ctl, info, true); 2845 if (offset == info->offset) { 2846 u64 to_free = min(bytes, info->bytes); 2847 2848 info->bytes -= to_free; 2849 info->offset += to_free; 2850 if (info->bytes) { 2851 ret = link_free_space(ctl, info); 2852 WARN_ON(ret); 2853 } else { 2854 kmem_cache_free(btrfs_free_space_cachep, info); 2855 } 2856 2857 offset += to_free; 2858 bytes -= to_free; 2859 goto again; 2860 } else { 2861 u64 old_end = info->bytes + info->offset; 2862 2863 info->bytes = offset - info->offset; 2864 ret = link_free_space(ctl, info); 2865 WARN_ON(ret); 2866 if (ret) 2867 goto out_lock; 2868 2869 /* Not enough bytes in this entry to satisfy us */ 2870 if (old_end < offset + bytes) { 2871 bytes -= old_end - offset; 2872 offset = old_end; 2873 goto again; 2874 } else if (old_end == offset + bytes) { 2875 /* all done */ 2876 goto out_lock; 2877 } 2878 spin_unlock(&ctl->tree_lock); 2879 2880 ret = __btrfs_add_free_space(block_group, 2881 offset + bytes, 2882 old_end - (offset + bytes), 2883 info->trim_state); 2884 WARN_ON(ret); 2885 goto out; 2886 } 2887 } 2888 2889 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2890 if (ret == -EAGAIN) { 2891 re_search = true; 2892 goto again; 2893 } 2894 out_lock: 2895 btrfs_discard_update_discardable(block_group); 2896 spin_unlock(&ctl->tree_lock); 2897 out: 2898 return ret; 2899 } 2900 2901 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2902 u64 bytes) 2903 { 2904 struct btrfs_fs_info *fs_info = block_group->fs_info; 2905 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2906 struct btrfs_free_space *info; 2907 struct rb_node *n; 2908 int count = 0; 2909 2910 /* 2911 * Zoned btrfs does not use free space tree and cluster. Just print 2912 * out the free space after the allocation offset. 
*/ 2914 if (btrfs_is_zoned(fs_info)) { 2915 btrfs_info(fs_info, "free space %llu active %d", 2916 block_group->zone_capacity - block_group->alloc_offset, 2917 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2918 &block_group->runtime_flags)); 2919 return; 2920 } 2921 2922 spin_lock(&ctl->tree_lock); 2923 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2924 info = rb_entry(n, struct btrfs_free_space, offset_index); 2925 if (info->bytes >= bytes && !block_group->ro) 2926 count++; 2927 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2928 info->offset, info->bytes, 2929 (info->bitmap) ? "yes" : "no"); 2930 } 2931 spin_unlock(&ctl->tree_lock); 2932 btrfs_info(fs_info, "block group has cluster?: %s", 2933 list_empty(&block_group->cluster_list) ? "no" : "yes"); 2934 btrfs_info(fs_info, 2935 "%d free space entries at or bigger than %llu bytes", count, bytes); 2936 } 2937 2938 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2939 struct btrfs_free_space_ctl *ctl) 2940 { 2941 struct btrfs_fs_info *fs_info = block_group->fs_info; 2942 2943 spin_lock_init(&ctl->tree_lock); 2944 ctl->unit = fs_info->sectorsize; 2945 ctl->start = block_group->start; 2946 ctl->block_group = block_group; 2947 ctl->op = &free_space_op; 2948 ctl->free_space_bytes = RB_ROOT_CACHED; 2949 INIT_LIST_HEAD(&ctl->trimming_ranges); 2950 mutex_init(&ctl->cache_writeout_mutex); 2951 2952 /* 2953 * We only want to have 32k of RAM per block group for keeping 2954 * track of free space. If we pass 1/2 of that we want to 2955 * start converting things over to using bitmaps. 2956 */ 2957 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2958 } 2959 2960 /* 2961 * for a given cluster, put all of its extents back into the free 2962 * space cache. If the block group passed doesn't match the block group 2963 * pointed to by the cluster, someone else raced in and freed the 2964 * cluster already.
In that case, we just return without changing anything 2965 */ 2966 static void __btrfs_return_cluster_to_free_space( 2967 struct btrfs_block_group *block_group, 2968 struct btrfs_free_cluster *cluster) 2969 { 2970 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2971 struct btrfs_free_space *entry; 2972 struct rb_node *node; 2973 2974 spin_lock(&cluster->lock); 2975 if (cluster->block_group != block_group) { 2976 spin_unlock(&cluster->lock); 2977 return; 2978 } 2979 2980 cluster->block_group = NULL; 2981 cluster->window_start = 0; 2982 list_del_init(&cluster->block_group_list); 2983 2984 node = rb_first(&cluster->root); 2985 while (node) { 2986 bool bitmap; 2987 2988 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2989 node = rb_next(&entry->offset_index); 2990 rb_erase(&entry->offset_index, &cluster->root); 2991 RB_CLEAR_NODE(&entry->offset_index); 2992 2993 bitmap = (entry->bitmap != NULL); 2994 if (!bitmap) { 2995 /* Merging treats extents as if they were new */ 2996 if (!btrfs_free_space_trimmed(entry)) { 2997 ctl->discardable_extents[BTRFS_STAT_CURR]--; 2998 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 2999 entry->bytes; 3000 } 3001 3002 try_merge_free_space(ctl, entry, false); 3003 steal_from_bitmap(ctl, entry, false); 3004 3005 /* As we insert directly, update these statistics */ 3006 if (!btrfs_free_space_trimmed(entry)) { 3007 ctl->discardable_extents[BTRFS_STAT_CURR]++; 3008 ctl->discardable_bytes[BTRFS_STAT_CURR] += 3009 entry->bytes; 3010 } 3011 } 3012 tree_insert_offset(&ctl->free_space_offset, 3013 entry->offset, &entry->offset_index, bitmap); 3014 rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, 3015 entry_less); 3016 } 3017 cluster->root = RB_ROOT; 3018 spin_unlock(&cluster->lock); 3019 btrfs_put_block_group(block_group); 3020 } 3021 3022 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 3023 { 3024 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3025 struct btrfs_free_cluster *cluster; 3026 struct list_head *head; 3027 3028 spin_lock(&ctl->tree_lock); 3029 while ((head = block_group->cluster_list.next) != 3030 &block_group->cluster_list) { 3031 cluster = list_entry(head, struct btrfs_free_cluster, 3032 block_group_list); 3033 3034 WARN_ON(cluster->block_group != block_group); 3035 __btrfs_return_cluster_to_free_space(block_group, cluster); 3036 3037 cond_resched_lock(&ctl->tree_lock); 3038 } 3039 __btrfs_remove_free_space_cache(ctl); 3040 btrfs_discard_update_discardable(block_group); 3041 spin_unlock(&ctl->tree_lock); 3042 3043 } 3044 3045 /* 3046 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 
3047 */ 3048 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 3049 { 3050 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3051 struct btrfs_free_space *info; 3052 struct rb_node *node; 3053 bool ret = true; 3054 3055 spin_lock(&ctl->tree_lock); 3056 node = rb_first(&ctl->free_space_offset); 3057 3058 while (node) { 3059 info = rb_entry(node, struct btrfs_free_space, offset_index); 3060 3061 if (!btrfs_free_space_trimmed(info)) { 3062 ret = false; 3063 break; 3064 } 3065 3066 node = rb_next(node); 3067 } 3068 3069 spin_unlock(&ctl->tree_lock); 3070 return ret; 3071 } 3072 3073 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 3074 u64 offset, u64 bytes, u64 empty_size, 3075 u64 *max_extent_size) 3076 { 3077 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3078 struct btrfs_discard_ctl *discard_ctl = 3079 &block_group->fs_info->discard_ctl; 3080 struct btrfs_free_space *entry = NULL; 3081 u64 bytes_search = bytes + empty_size; 3082 u64 ret = 0; 3083 u64 align_gap = 0; 3084 u64 align_gap_len = 0; 3085 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3086 bool use_bytes_index = (offset == block_group->start); 3087 3088 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3089 3090 spin_lock(&ctl->tree_lock); 3091 entry = find_free_space(ctl, &offset, &bytes_search, 3092 block_group->full_stripe_len, max_extent_size, 3093 use_bytes_index); 3094 if (!entry) 3095 goto out; 3096 3097 ret = offset; 3098 if (entry->bitmap) { 3099 bitmap_clear_bits(ctl, entry, offset, bytes, true); 3100 3101 if (!btrfs_free_space_trimmed(entry)) 3102 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3103 3104 if (!entry->bytes) 3105 free_bitmap(ctl, entry); 3106 } else { 3107 unlink_free_space(ctl, entry, true); 3108 align_gap_len = offset - entry->offset; 3109 align_gap = entry->offset; 3110 align_gap_trim_state = entry->trim_state; 3111 3112 if (!btrfs_free_space_trimmed(entry)) 3113 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3114 3115 entry->offset = offset + bytes; 3116 WARN_ON(entry->bytes < bytes + align_gap_len); 3117 3118 entry->bytes -= bytes + align_gap_len; 3119 if (!entry->bytes) 3120 kmem_cache_free(btrfs_free_space_cachep, entry); 3121 else 3122 link_free_space(ctl, entry); 3123 } 3124 out: 3125 btrfs_discard_update_discardable(block_group); 3126 spin_unlock(&ctl->tree_lock); 3127 3128 if (align_gap_len) 3129 __btrfs_add_free_space(block_group, align_gap, align_gap_len, 3130 align_gap_trim_state); 3131 return ret; 3132 } 3133 3134 /* 3135 * given a cluster, put all of its extents back into the free space 3136 * cache. If a block group is passed, this function will only free 3137 * a cluster that belongs to the passed block group. 3138 * 3139 * Otherwise, it'll get a reference on the block group pointed to by the 3140 * cluster and remove the cluster from it. 
3141 */ 3142 void btrfs_return_cluster_to_free_space( 3143 struct btrfs_block_group *block_group, 3144 struct btrfs_free_cluster *cluster) 3145 { 3146 struct btrfs_free_space_ctl *ctl; 3147 3148 /* first, get a safe pointer to the block group */ 3149 spin_lock(&cluster->lock); 3150 if (!block_group) { 3151 block_group = cluster->block_group; 3152 if (!block_group) { 3153 spin_unlock(&cluster->lock); 3154 return; 3155 } 3156 } else if (cluster->block_group != block_group) { 3157 /* someone else has already freed it don't redo their work */ 3158 spin_unlock(&cluster->lock); 3159 return; 3160 } 3161 btrfs_get_block_group(block_group); 3162 spin_unlock(&cluster->lock); 3163 3164 ctl = block_group->free_space_ctl; 3165 3166 /* now return any extents the cluster had on it */ 3167 spin_lock(&ctl->tree_lock); 3168 __btrfs_return_cluster_to_free_space(block_group, cluster); 3169 spin_unlock(&ctl->tree_lock); 3170 3171 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 3172 3173 /* finally drop our ref */ 3174 btrfs_put_block_group(block_group); 3175 } 3176 3177 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 3178 struct btrfs_free_cluster *cluster, 3179 struct btrfs_free_space *entry, 3180 u64 bytes, u64 min_start, 3181 u64 *max_extent_size) 3182 { 3183 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3184 int err; 3185 u64 search_start = cluster->window_start; 3186 u64 search_bytes = bytes; 3187 u64 ret = 0; 3188 3189 search_start = min_start; 3190 search_bytes = bytes; 3191 3192 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 3193 if (err) { 3194 *max_extent_size = max(get_max_extent_size(entry), 3195 *max_extent_size); 3196 return 0; 3197 } 3198 3199 ret = search_start; 3200 bitmap_clear_bits(ctl, entry, ret, bytes, false); 3201 3202 return ret; 3203 } 3204 3205 /* 3206 * given a cluster, try to allocate 'bytes' from it, returns 0 3207 * if it couldn't find anything suitably large, or a logical disk offset 3208 * if things worked out 3209 */ 3210 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 3211 struct btrfs_free_cluster *cluster, u64 bytes, 3212 u64 min_start, u64 *max_extent_size) 3213 { 3214 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3215 struct btrfs_discard_ctl *discard_ctl = 3216 &block_group->fs_info->discard_ctl; 3217 struct btrfs_free_space *entry = NULL; 3218 struct rb_node *node; 3219 u64 ret = 0; 3220 3221 ASSERT(!btrfs_is_zoned(block_group->fs_info)); 3222 3223 spin_lock(&cluster->lock); 3224 if (bytes > cluster->max_size) 3225 goto out; 3226 3227 if (cluster->block_group != block_group) 3228 goto out; 3229 3230 node = rb_first(&cluster->root); 3231 if (!node) 3232 goto out; 3233 3234 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3235 while (1) { 3236 if (entry->bytes < bytes) 3237 *max_extent_size = max(get_max_extent_size(entry), 3238 *max_extent_size); 3239 3240 if (entry->bytes < bytes || 3241 (!entry->bitmap && entry->offset < min_start)) { 3242 node = rb_next(&entry->offset_index); 3243 if (!node) 3244 break; 3245 entry = rb_entry(node, struct btrfs_free_space, 3246 offset_index); 3247 continue; 3248 } 3249 3250 if (entry->bitmap) { 3251 ret = btrfs_alloc_from_bitmap(block_group, 3252 cluster, entry, bytes, 3253 cluster->window_start, 3254 max_extent_size); 3255 if (ret == 0) { 3256 node = rb_next(&entry->offset_index); 3257 if (!node) 3258 break; 3259 entry = rb_entry(node, struct btrfs_free_space, 3260 offset_index); 3261 
continue; 3262 } 3263 cluster->window_start += bytes; 3264 } else { 3265 ret = entry->offset; 3266 3267 entry->offset += bytes; 3268 entry->bytes -= bytes; 3269 } 3270 3271 break; 3272 } 3273 out: 3274 spin_unlock(&cluster->lock); 3275 3276 if (!ret) 3277 return 0; 3278 3279 spin_lock(&ctl->tree_lock); 3280 3281 if (!btrfs_free_space_trimmed(entry)) 3282 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3283 3284 ctl->free_space -= bytes; 3285 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3286 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3287 3288 spin_lock(&cluster->lock); 3289 if (entry->bytes == 0) { 3290 rb_erase(&entry->offset_index, &cluster->root); 3291 ctl->free_extents--; 3292 if (entry->bitmap) { 3293 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3294 entry->bitmap); 3295 ctl->total_bitmaps--; 3296 recalculate_thresholds(ctl); 3297 } else if (!btrfs_free_space_trimmed(entry)) { 3298 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3299 } 3300 kmem_cache_free(btrfs_free_space_cachep, entry); 3301 } 3302 3303 spin_unlock(&cluster->lock); 3304 spin_unlock(&ctl->tree_lock); 3305 3306 return ret; 3307 } 3308 3309 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3310 struct btrfs_free_space *entry, 3311 struct btrfs_free_cluster *cluster, 3312 u64 offset, u64 bytes, 3313 u64 cont1_bytes, u64 min_bytes) 3314 { 3315 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3316 unsigned long next_zero; 3317 unsigned long i; 3318 unsigned long want_bits; 3319 unsigned long min_bits; 3320 unsigned long found_bits; 3321 unsigned long max_bits = 0; 3322 unsigned long start = 0; 3323 unsigned long total_found = 0; 3324 int ret; 3325 3326 i = offset_to_bit(entry->offset, ctl->unit, 3327 max_t(u64, offset, entry->offset)); 3328 want_bits = bytes_to_bits(bytes, ctl->unit); 3329 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3330 3331 /* 3332 * Don't bother looking for a cluster in this bitmap if it's heavily 3333 * fragmented. 3334 */ 3335 if (entry->max_extent_size && 3336 entry->max_extent_size < cont1_bytes) 3337 return -ENOSPC; 3338 again: 3339 found_bits = 0; 3340 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3341 next_zero = find_next_zero_bit(entry->bitmap, 3342 BITS_PER_BITMAP, i); 3343 if (next_zero - i >= min_bits) { 3344 found_bits = next_zero - i; 3345 if (found_bits > max_bits) 3346 max_bits = found_bits; 3347 break; 3348 } 3349 if (next_zero - i > max_bits) 3350 max_bits = next_zero - i; 3351 i = next_zero; 3352 } 3353 3354 if (!found_bits) { 3355 entry->max_extent_size = (u64)max_bits * ctl->unit; 3356 return -ENOSPC; 3357 } 3358 3359 if (!total_found) { 3360 start = i; 3361 cluster->max_size = 0; 3362 } 3363 3364 total_found += found_bits; 3365 3366 if (cluster->max_size < found_bits * ctl->unit) 3367 cluster->max_size = found_bits * ctl->unit; 3368 3369 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3370 i = next_zero + 1; 3371 goto again; 3372 } 3373 3374 cluster->window_start = start * ctl->unit + entry->offset; 3375 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3376 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3377 3378 /* 3379 * We need to know if we're currently on the normal space index when we 3380 * manipulate the bitmap so that we know we need to remove and re-insert 3381 * it into the space_index tree. 
Clear the bytes_index node here so the 3382 * bitmap manipulation helpers know not to mess with the space_index 3383 * until this bitmap entry is added back into the normal cache. 3384 */ 3385 RB_CLEAR_NODE(&entry->bytes_index); 3386 3387 ret = tree_insert_offset(&cluster->root, entry->offset, 3388 &entry->offset_index, 1); 3389 ASSERT(!ret); /* -EEXIST; Logic error */ 3390 3391 trace_btrfs_setup_cluster(block_group, cluster, 3392 total_found * ctl->unit, 1); 3393 return 0; 3394 } 3395 3396 /* 3397 * This searches the block group for just extents to fill the cluster with. 3398 * Try to find a cluster with at least bytes total bytes, at least one 3399 * extent of cont1_bytes, and other clusters of at least min_bytes. 3400 */ 3401 static noinline int 3402 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3403 struct btrfs_free_cluster *cluster, 3404 struct list_head *bitmaps, u64 offset, u64 bytes, 3405 u64 cont1_bytes, u64 min_bytes) 3406 { 3407 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3408 struct btrfs_free_space *first = NULL; 3409 struct btrfs_free_space *entry = NULL; 3410 struct btrfs_free_space *last; 3411 struct rb_node *node; 3412 u64 window_free; 3413 u64 max_extent; 3414 u64 total_size = 0; 3415 3416 entry = tree_search_offset(ctl, offset, 0, 1); 3417 if (!entry) 3418 return -ENOSPC; 3419 3420 /* 3421 * We don't want bitmaps, so just move along until we find a normal 3422 * extent entry. 3423 */ 3424 while (entry->bitmap || entry->bytes < min_bytes) { 3425 if (entry->bitmap && list_empty(&entry->list)) 3426 list_add_tail(&entry->list, bitmaps); 3427 node = rb_next(&entry->offset_index); 3428 if (!node) 3429 return -ENOSPC; 3430 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3431 } 3432 3433 window_free = entry->bytes; 3434 max_extent = entry->bytes; 3435 first = entry; 3436 last = entry; 3437 3438 for (node = rb_next(&entry->offset_index); node; 3439 node = rb_next(&entry->offset_index)) { 3440 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3441 3442 if (entry->bitmap) { 3443 if (list_empty(&entry->list)) 3444 list_add_tail(&entry->list, bitmaps); 3445 continue; 3446 } 3447 3448 if (entry->bytes < min_bytes) 3449 continue; 3450 3451 last = entry; 3452 window_free += entry->bytes; 3453 if (entry->bytes > max_extent) 3454 max_extent = entry->bytes; 3455 } 3456 3457 if (window_free < bytes || max_extent < cont1_bytes) 3458 return -ENOSPC; 3459 3460 cluster->window_start = first->offset; 3461 3462 node = &first->offset_index; 3463 3464 /* 3465 * now we've found our entries, pull them out of the free space 3466 * cache and put them into the cluster rbtree 3467 */ 3468 do { 3469 int ret; 3470 3471 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3472 node = rb_next(&entry->offset_index); 3473 if (entry->bitmap || entry->bytes < min_bytes) 3474 continue; 3475 3476 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3477 rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); 3478 ret = tree_insert_offset(&cluster->root, entry->offset, 3479 &entry->offset_index, 0); 3480 total_size += entry->bytes; 3481 ASSERT(!ret); /* -EEXIST; Logic error */ 3482 } while (node && entry != last); 3483 3484 cluster->max_size = max_extent; 3485 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3486 return 0; 3487 } 3488 3489 /* 3490 * This specifically looks for bitmaps that may work in the cluster, we assume 3491 * that we have already failed to find extents that will work. 
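* setup_cluster_no_bitmap() has already queued the candidate bitmaps it skipped onto @bitmaps for us.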
*/ 3493 static noinline int 3494 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3495 struct btrfs_free_cluster *cluster, 3496 struct list_head *bitmaps, u64 offset, u64 bytes, 3497 u64 cont1_bytes, u64 min_bytes) 3498 { 3499 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3500 struct btrfs_free_space *entry = NULL; 3501 int ret = -ENOSPC; 3502 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3503 3504 if (ctl->total_bitmaps == 0) 3505 return -ENOSPC; 3506 3507 /* 3508 * The bitmap that covers offset won't be in the list unless offset 3509 * is just its start offset. 3510 */ 3511 if (!list_empty(bitmaps)) 3512 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3513 3514 if (!entry || entry->offset != bitmap_offset) { 3515 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3516 if (entry && list_empty(&entry->list)) 3517 list_add(&entry->list, bitmaps); 3518 } 3519 3520 list_for_each_entry(entry, bitmaps, list) { 3521 if (entry->bytes < bytes) 3522 continue; 3523 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3524 bytes, cont1_bytes, min_bytes); 3525 if (!ret) 3526 return 0; 3527 } 3528 3529 /* 3530 * The bitmaps list has all the bitmaps that record free space 3531 * starting after offset, so no more search is required. 3532 */ 3533 return -ENOSPC; 3534 } 3535 3536 /* 3537 * Here we try to find a cluster of blocks in a block group. The goal 3538 * is to find at least bytes+empty_size. 3539 * We might not find them all in one contiguous area. 3540 * 3541 * Returns zero and sets up the cluster if things worked out, otherwise 3542 * it returns -ENOSPC. 3543 */ 3544 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3545 struct btrfs_free_cluster *cluster, 3546 u64 offset, u64 bytes, u64 empty_size) 3547 { 3548 struct btrfs_fs_info *fs_info = block_group->fs_info; 3549 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3550 struct btrfs_free_space *entry, *tmp; 3551 LIST_HEAD(bitmaps); 3552 u64 min_bytes; 3553 u64 cont1_bytes; 3554 int ret; 3555 3556 /* 3557 * Choose the minimum extent size we'll require for this 3558 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3559 * For metadata, allow allocations with smaller extents. For 3560 * data, keep it dense. 3561 */ 3562 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3563 cont1_bytes = bytes + empty_size; 3564 min_bytes = cont1_bytes; 3565 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3566 cont1_bytes = bytes; 3567 min_bytes = fs_info->sectorsize; 3568 } else { 3569 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3570 min_bytes = fs_info->sectorsize; 3571 } 3572 3573 spin_lock(&ctl->tree_lock); 3574 3575 /* 3576 * If we know we don't have enough space to make a cluster don't even 3577 * bother doing all the work to try and find one.
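* This is only a cheap check under tree_lock; the cluster setup below can still fail with -ENOSPC.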
3578 */ 3579 if (ctl->free_space < bytes) { 3580 spin_unlock(&ctl->tree_lock); 3581 return -ENOSPC; 3582 } 3583 3584 spin_lock(&cluster->lock); 3585 3586 /* someone already found a cluster, hooray */ 3587 if (cluster->block_group) { 3588 ret = 0; 3589 goto out; 3590 } 3591 3592 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3593 min_bytes); 3594 3595 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3596 bytes + empty_size, 3597 cont1_bytes, min_bytes); 3598 if (ret) 3599 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3600 offset, bytes + empty_size, 3601 cont1_bytes, min_bytes); 3602 3603 /* Clear our temporary list */ 3604 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3605 list_del_init(&entry->list); 3606 3607 if (!ret) { 3608 btrfs_get_block_group(block_group); 3609 list_add_tail(&cluster->block_group_list, 3610 &block_group->cluster_list); 3611 cluster->block_group = block_group; 3612 } else { 3613 trace_btrfs_failed_cluster_setup(block_group); 3614 } 3615 out: 3616 spin_unlock(&cluster->lock); 3617 spin_unlock(&ctl->tree_lock); 3618 3619 return ret; 3620 } 3621 3622 /* 3623 * simple code to zero out a cluster 3624 */ 3625 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3626 { 3627 spin_lock_init(&cluster->lock); 3628 spin_lock_init(&cluster->refill_lock); 3629 cluster->root = RB_ROOT; 3630 cluster->max_size = 0; 3631 cluster->fragmented = false; 3632 INIT_LIST_HEAD(&cluster->block_group_list); 3633 cluster->block_group = NULL; 3634 } 3635 3636 static int do_trimming(struct btrfs_block_group *block_group, 3637 u64 *total_trimmed, u64 start, u64 bytes, 3638 u64 reserved_start, u64 reserved_bytes, 3639 enum btrfs_trim_state reserved_trim_state, 3640 struct btrfs_trim_range *trim_entry) 3641 { 3642 struct btrfs_space_info *space_info = block_group->space_info; 3643 struct btrfs_fs_info *fs_info = block_group->fs_info; 3644 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3645 int ret; 3646 int update = 0; 3647 const u64 end = start + bytes; 3648 const u64 reserved_end = reserved_start + reserved_bytes; 3649 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3650 u64 trimmed = 0; 3651 3652 spin_lock(&space_info->lock); 3653 spin_lock(&block_group->lock); 3654 if (!block_group->ro) { 3655 block_group->reserved += reserved_bytes; 3656 space_info->bytes_reserved += reserved_bytes; 3657 update = 1; 3658 } 3659 spin_unlock(&block_group->lock); 3660 spin_unlock(&space_info->lock); 3661 3662 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3663 if (!ret) { 3664 *total_trimmed += trimmed; 3665 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3666 } 3667 3668 mutex_lock(&ctl->cache_writeout_mutex); 3669 if (reserved_start < start) 3670 __btrfs_add_free_space(block_group, reserved_start, 3671 start - reserved_start, 3672 reserved_trim_state); 3673 if (start + bytes < reserved_start + reserved_bytes) 3674 __btrfs_add_free_space(block_group, end, reserved_end - end, 3675 reserved_trim_state); 3676 __btrfs_add_free_space(block_group, start, bytes, trim_state); 3677 list_del(&trim_entry->list); 3678 mutex_unlock(&ctl->cache_writeout_mutex); 3679 3680 if (update) { 3681 spin_lock(&space_info->lock); 3682 spin_lock(&block_group->lock); 3683 if (block_group->ro) 3684 space_info->bytes_readonly += reserved_bytes; 3685 block_group->reserved -= reserved_bytes; 3686 space_info->bytes_reserved -= reserved_bytes; 3687 spin_unlock(&block_group->lock); 3688 spin_unlock(&space_info->lock); 3689 } 3690 3691 return 
ret; 3692 } 3693 3694 /* 3695 * If @async is set, then we will trim 1 region and return. 3696 */ 3697 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3698 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3699 bool async) 3700 { 3701 struct btrfs_discard_ctl *discard_ctl = 3702 &block_group->fs_info->discard_ctl; 3703 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3704 struct btrfs_free_space *entry; 3705 struct rb_node *node; 3706 int ret = 0; 3707 u64 extent_start; 3708 u64 extent_bytes; 3709 enum btrfs_trim_state extent_trim_state; 3710 u64 bytes; 3711 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3712 3713 while (start < end) { 3714 struct btrfs_trim_range trim_entry; 3715 3716 mutex_lock(&ctl->cache_writeout_mutex); 3717 spin_lock(&ctl->tree_lock); 3718 3719 if (ctl->free_space < minlen) 3720 goto out_unlock; 3721 3722 entry = tree_search_offset(ctl, start, 0, 1); 3723 if (!entry) 3724 goto out_unlock; 3725 3726 /* Skip bitmaps and if async, already trimmed entries */ 3727 while (entry->bitmap || 3728 (async && btrfs_free_space_trimmed(entry))) { 3729 node = rb_next(&entry->offset_index); 3730 if (!node) 3731 goto out_unlock; 3732 entry = rb_entry(node, struct btrfs_free_space, 3733 offset_index); 3734 } 3735 3736 if (entry->offset >= end) 3737 goto out_unlock; 3738 3739 extent_start = entry->offset; 3740 extent_bytes = entry->bytes; 3741 extent_trim_state = entry->trim_state; 3742 if (async) { 3743 start = entry->offset; 3744 bytes = entry->bytes; 3745 if (bytes < minlen) { 3746 spin_unlock(&ctl->tree_lock); 3747 mutex_unlock(&ctl->cache_writeout_mutex); 3748 goto next; 3749 } 3750 unlink_free_space(ctl, entry, true); 3751 /* 3752 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3753 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3754 * X when we come back around. So trim it now. 
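* (in other words, if splitting would leave a remainder smaller than the filter, trim the whole extent in one go instead of splitting it)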
*/ 3756 if (max_discard_size && 3757 bytes >= (max_discard_size + 3758 BTRFS_ASYNC_DISCARD_MIN_FILTER)) { 3759 bytes = max_discard_size; 3760 extent_bytes = max_discard_size; 3761 entry->offset += max_discard_size; 3762 entry->bytes -= max_discard_size; 3763 link_free_space(ctl, entry); 3764 } else { 3765 kmem_cache_free(btrfs_free_space_cachep, entry); 3766 } 3767 } else { 3768 start = max(start, extent_start); 3769 bytes = min(extent_start + extent_bytes, end) - start; 3770 if (bytes < minlen) { 3771 spin_unlock(&ctl->tree_lock); 3772 mutex_unlock(&ctl->cache_writeout_mutex); 3773 goto next; 3774 } 3775 3776 unlink_free_space(ctl, entry, true); 3777 kmem_cache_free(btrfs_free_space_cachep, entry); 3778 } 3779 3780 spin_unlock(&ctl->tree_lock); 3781 trim_entry.start = extent_start; 3782 trim_entry.bytes = extent_bytes; 3783 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3784 mutex_unlock(&ctl->cache_writeout_mutex); 3785 3786 ret = do_trimming(block_group, total_trimmed, start, bytes, 3787 extent_start, extent_bytes, extent_trim_state, 3788 &trim_entry); 3789 if (ret) { 3790 block_group->discard_cursor = start + bytes; 3791 break; 3792 } 3793 next: 3794 start += bytes; 3795 block_group->discard_cursor = start; 3796 if (async && *total_trimmed) 3797 break; 3798 3799 if (fatal_signal_pending(current)) { 3800 ret = -ERESTARTSYS; 3801 break; 3802 } 3803 3804 cond_resched(); 3805 } 3806 3807 return ret; 3808 3809 out_unlock: 3810 block_group->discard_cursor = btrfs_block_group_end(block_group); 3811 spin_unlock(&ctl->tree_lock); 3812 mutex_unlock(&ctl->cache_writeout_mutex); 3813 3814 return ret; 3815 } 3816 3817 /* 3818 * If we break out of trimming a bitmap prematurely, we should reset the 3819 * trimming bit. In a rather contrived case, it's possible to race here, so 3820 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. 3821 * 3822 * start = start of bitmap 3823 * end = near end of bitmap 3824 * 3825 * Thread 1: Thread 2: 3826 * trim_bitmaps(start) 3827 * trim_bitmaps(end) 3828 * end_trimming_bitmap() 3829 * reset_trimming_bitmap() 3830 */ 3831 static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) 3832 { 3833 struct btrfs_free_space *entry; 3834 3835 spin_lock(&ctl->tree_lock); 3836 entry = tree_search_offset(ctl, offset, 1, 0); 3837 if (entry) { 3838 if (btrfs_free_space_trimmed(entry)) { 3839 ctl->discardable_extents[BTRFS_STAT_CURR] += 3840 entry->bitmap_extents; 3841 ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; 3842 } 3843 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3844 } 3845 3846 spin_unlock(&ctl->tree_lock); 3847 } 3848 3849 static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, 3850 struct btrfs_free_space *entry) 3851 { 3852 if (btrfs_free_space_trimming_bitmap(entry)) { 3853 entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; 3854 ctl->discardable_extents[BTRFS_STAT_CURR] -= 3855 entry->bitmap_extents; 3856 ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; 3857 } 3858 } 3859 3860 /* 3861 * If @async is set, then we will trim 1 region and return.
*/ 3863 static int trim_bitmaps(struct btrfs_block_group *block_group, 3864 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3865 u64 maxlen, bool async) 3866 { 3867 struct btrfs_discard_ctl *discard_ctl = 3868 &block_group->fs_info->discard_ctl; 3869 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3870 struct btrfs_free_space *entry; 3871 int ret = 0; 3872 int ret2; 3873 u64 bytes; 3874 u64 offset = offset_to_bitmap(ctl, start); 3875 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3876 3877 while (offset < end) { 3878 bool next_bitmap = false; 3879 struct btrfs_trim_range trim_entry; 3880 3881 mutex_lock(&ctl->cache_writeout_mutex); 3882 spin_lock(&ctl->tree_lock); 3883 3884 if (ctl->free_space < minlen) { 3885 block_group->discard_cursor = 3886 btrfs_block_group_end(block_group); 3887 spin_unlock(&ctl->tree_lock); 3888 mutex_unlock(&ctl->cache_writeout_mutex); 3889 break; 3890 } 3891 3892 entry = tree_search_offset(ctl, offset, 1, 0); 3893 /* 3894 * Bitmaps are marked trimmed lossily now to prevent constant 3895 * discarding of the same bitmap (the reason why we are bound 3896 * by the filters). So, retrim the block group bitmaps when we 3897 * are preparing to punt to the unused_bgs list. This uses 3898 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED 3899 * which is the only discard index which sets minlen to 0. 3900 */ 3901 if (!entry || (async && minlen && start == offset && 3902 btrfs_free_space_trimmed(entry))) { 3903 spin_unlock(&ctl->tree_lock); 3904 mutex_unlock(&ctl->cache_writeout_mutex); 3905 next_bitmap = true; 3906 goto next; 3907 } 3908 3909 /* 3910 * Async discard bitmap trimming begins by setting the start 3911 * to key.objectid, and offset_to_bitmap() aligns to the 3912 * start of the bitmap. This lets us know we are fully 3913 * scanning the bitmap rather than only some portion of it. 3914 */ 3915 if (start == offset) 3916 entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; 3917 3918 bytes = minlen; 3919 ret2 = search_bitmap(ctl, entry, &start, &bytes, false); 3920 if (ret2 || start >= end) { 3921 /* 3922 * We lossily consider a bitmap trimmed if we only skip 3923 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. 3924 */ 3925 if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) 3926 end_trimming_bitmap(ctl, entry); 3927 else 3928 entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3929 spin_unlock(&ctl->tree_lock); 3930 mutex_unlock(&ctl->cache_writeout_mutex); 3931 next_bitmap = true; 3932 goto next; 3933 } 3934 3935 /* 3936 * We already trimmed a region, but are using the locking above 3937 * to reset the trim_state. 3938 */ 3939 if (async && *total_trimmed) { 3940 spin_unlock(&ctl->tree_lock); 3941 mutex_unlock(&ctl->cache_writeout_mutex); 3942 goto out; 3943 } 3944 3945 bytes = min(bytes, end - start); 3946 if (bytes < minlen || (async && maxlen && bytes > maxlen)) { 3947 spin_unlock(&ctl->tree_lock); 3948 mutex_unlock(&ctl->cache_writeout_mutex); 3949 goto next; 3950 } 3951 3952 /* 3953 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3954 * If X < @minlen, we won't trim X when we come back around. 3955 * So trim it now. We differ here from trimming extents as we 3956 * don't keep individual state per bit.
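* (a bitmap has no per-bit trim state, so a leftover smaller than @minlen would simply never get discarded on its own)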
		if (async &&
		    max_discard_size &&
		    bytes > (max_discard_size + minlen))
			bytes = max_discard_size;

		bitmap_clear_bits(ctl, entry, start, bytes, true);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = start;
		trim_entry.bytes = bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  start, bytes, 0, &trim_entry);
		if (ret) {
			reset_trimming_bitmap(ctl, offset);
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			break;
		}
next:
		if (next_bitmap) {
			offset += BITS_PER_BITMAP * ctl->unit;
			start = offset;
		} else {
			start += bytes;
		}
		block_group->discard_cursor = start;

		if (fatal_signal_pending(current)) {
			if (start != offset)
				reset_trimming_bitmap(ctl, offset);
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	if (offset >= end)
		block_group->discard_cursor = end;

out:
	return ret;
}

int btrfs_trim_block_group(struct btrfs_block_group *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int ret;
	u64 rem = 0;

	ASSERT(!btrfs_is_zoned(block_group->fs_info));

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
	if (ret)
		goto out;

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
	div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
	/* If we ended in the middle of a bitmap, reset the trimming flag */
	if (rem)
		reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
out:
	btrfs_unfreeze_block_group(block_group);
	return ret;
}

int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
	btrfs_unfreeze_block_group(block_group);

	return ret;
}

int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
				   u64 *trimmed, u64 start, u64 end, u64 minlen,
				   u64 maxlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_freeze_block_group(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
			   async);

	btrfs_unfreeze_block_group(block_group);

	return ret;
}
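/*
 * Caller overview (see the respective files for details):
 * btrfs_trim_block_group() backs the synchronous, range-based FITRIM path
 * (btrfs_trim_fs() in extent-tree.c), while btrfs_trim_block_group_extents()
 * and btrfs_trim_block_group_bitmaps() are driven by the async discard worker
 * in discard.c, which passes @async = true along with per-pass @minlen and
 * @maxlen filters.
 */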
bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
{
	return btrfs_super_cache_generation(fs_info->super_copy);
}

static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
				       struct btrfs_trans_handle *trans)
{
	struct btrfs_block_group *block_group;
	struct rb_node *node;
	int ret = 0;

	btrfs_info(fs_info, "cleaning free space cache v1");

	node = rb_first_cached(&fs_info->block_group_cache_tree);
	while (node) {
		block_group = rb_entry(node, struct btrfs_block_group, cache_node);
		ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
		if (ret)
			goto out;
		node = rb_next(node);
	}
out:
	return ret;
}

int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * update_super_roots will appropriately set or unset
	 * super_copy->cache_generation based on SPACE_CACHE and
	 * BTRFS_FS_CLEANUP_SPACE_CACHE_V1.  For this reason, we need a
	 * transaction commit whether we are enabling space cache v1 and don't
	 * have any other work to do, or are disabling it and removing free
	 * space inodes.
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (!active) {
		set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
		ret = cleanup_free_space_cache_v1(fs_info, trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}

	ret = btrfs_commit_transaction(trans);
out:
	clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);

	return ret;
}

int __init btrfs_free_space_init(void)
{
	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
			sizeof(struct btrfs_free_space), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_cachep)
		return -ENOMEM;

	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
			PAGE_SIZE, PAGE_SIZE,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_bitmap_cachep) {
		kmem_cache_destroy(btrfs_free_space_cachep);
		return -ENOMEM;
	}

	return 0;
}

void __cold btrfs_free_space_exit(void)
{
	kmem_cache_destroy(btrfs_free_space_cachep);
	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}
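/*
 * Everything below is built only for the btrfs self-tests
 * (CONFIG_BTRFS_FS_RUN_SANITY_TESTS); the callers live under fs/btrfs/tests/.
 */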
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Use this if you need to make a bitmap or extent entry specifically.  It
 * doesn't do any of the merging that add_free_space does; it acts a lot like
 * how the free space cache loading works, so you can get really weird
 * configurations.
 */
int test_add_free_space_entry(struct btrfs_block_group *cache,
			      u64 offset, u64 bytes, bool bitmap)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info = NULL, *bitmap_info;
	void *map = NULL;
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
	u64 bytes_added;
	int ret;

again:
	if (!info) {
		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
		if (!info)
			return -ENOMEM;
	}

	if (!bitmap) {
		spin_lock(&ctl->tree_lock);
		info->offset = offset;
		info->bytes = bytes;
		info->max_extent_size = 0;
		ret = link_free_space(ctl, info);
		spin_unlock(&ctl->tree_lock);
		if (ret)
			kmem_cache_free(btrfs_free_space_cachep, info);
		return ret;
	}

	if (!map) {
		map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
		if (!map) {
			kmem_cache_free(btrfs_free_space_cachep, info);
			return -ENOMEM;
		}
	}

	spin_lock(&ctl->tree_lock);
	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					 1, 0);
	if (!bitmap_info) {
		info->bitmap = map;
		map = NULL;
		add_new_bitmap(ctl, info, offset);
		bitmap_info = info;
		info = NULL;
	}

	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
					  trim_state);

	bytes -= bytes_added;
	offset += bytes_added;
	spin_unlock(&ctl->tree_lock);

	if (bytes)
		goto again;

	if (info)
		kmem_cache_free(btrfs_free_space_cachep, info);
	if (map)
		kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
	return 0;
}
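/*
 * Note on the helper above: when @bitmap is true and the requested range
 * crosses a bitmap boundary, the "again" loop allocates a fresh bitmap page
 * for each subsequent bitmap and spills the remaining bytes into it, so a
 * single call may end up creating several bitmap entries.
 */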
/*
 * Checks to see if the given range is in the free space cache.  This is really
 * just used to check the absence of space, so if there is free space in the
 * range at all we will return 1.
 */
int test_check_exists(struct btrfs_block_group *cache,
		      u64 offset, u64 bytes)
{
	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
	struct btrfs_free_space *info;
	int ret = 0;

	spin_lock(&ctl->tree_lock);
	info = tree_search_offset(ctl, offset, 0, 0);
	if (!info) {
		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
					  1, 0);
		if (!info)
			goto out;
	}

have_info:
	if (info->bitmap) {
		u64 bit_off, bit_bytes;
		struct rb_node *n;
		struct btrfs_free_space *tmp;

		bit_off = offset;
		bit_bytes = ctl->unit;
		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
		if (!ret) {
			if (bit_off == offset) {
				ret = 1;
				goto out;
			} else if (bit_off > offset &&
				   offset + bytes > bit_off) {
				ret = 1;
				goto out;
			}
		}

		n = rb_prev(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (tmp->offset + tmp->bytes < offset)
				break;
			if (offset + bytes < tmp->offset) {
				n = rb_prev(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		n = rb_next(&info->offset_index);
		while (n) {
			tmp = rb_entry(n, struct btrfs_free_space,
				       offset_index);
			if (offset + bytes < tmp->offset)
				break;
			if (tmp->offset + tmp->bytes < offset) {
				n = rb_next(&tmp->offset_index);
				continue;
			}
			info = tmp;
			goto have_info;
		}

		ret = 0;
		goto out;
	}

	if (info->offset == offset) {
		ret = 1;
		goto out;
	}

	if (offset > info->offset && offset < info->offset + info->bytes)
		ret = 1;
out:
	spin_unlock(&ctl->tree_lock);
	return ret;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
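/*
 * Purely illustrative sketch of how the two self-test helpers above are meant
 * to be combined; it is not wired into any test.  The block group is assumed
 * to come from the self-test harness (e.g. btrfs_alloc_dummy_block_group() in
 * fs/btrfs/tests/), which is outside the scope of this file.
 */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
static int __maybe_unused example_free_space_roundtrip(struct btrfs_block_group *cache)
{
	int ret;

	/* Seed a plain 1MiB extent entry at offset 0 (no bitmap). */
	ret = test_add_free_space_entry(cache, 0, SZ_1M, false);
	if (ret)
		return ret;

	/* The whole range must now be visible as free space. */
	if (!test_check_exists(cache, 0, SZ_1M))
		return -EINVAL;

	/* Remove it again; the range must no longer show up as free. */
	ret = btrfs_remove_free_space(cache, 0, SZ_1M);
	if (ret)
		return ret;

	if (test_check_exists(cache, 0, SZ_1M))
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */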