1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Red Hat. All rights reserved. 4 */ 5 6 #include <linux/pagemap.h> 7 #include <linux/sched.h> 8 #include <linux/sched/signal.h> 9 #include <linux/slab.h> 10 #include <linux/math64.h> 11 #include <linux/ratelimit.h> 12 #include <linux/error-injection.h> 13 #include <linux/sched/mm.h> 14 #include "ctree.h" 15 #include "free-space-cache.h" 16 #include "transaction.h" 17 #include "disk-io.h" 18 #include "extent_io.h" 19 #include "volumes.h" 20 #include "space-info.h" 21 #include "delalloc-space.h" 22 #include "block-group.h" 23 #include "discard.h" 24 25 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 26 #define MAX_CACHE_BYTES_PER_GIG SZ_64K 27 #define FORCE_EXTENT_THRESHOLD SZ_1M 28 29 struct btrfs_trim_range { 30 u64 start; 31 u64 bytes; 32 struct list_head list; 33 }; 34 35 static int link_free_space(struct btrfs_free_space_ctl *ctl, 36 struct btrfs_free_space *info); 37 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 38 struct btrfs_free_space *info); 39 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 40 struct btrfs_free_space *bitmap_info, u64 *offset, 41 u64 *bytes, bool for_alloc); 42 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 43 struct btrfs_free_space *bitmap_info); 44 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 45 struct btrfs_free_space *info, u64 offset, 46 u64 bytes); 47 48 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 49 struct btrfs_path *path, 50 u64 offset) 51 { 52 struct btrfs_fs_info *fs_info = root->fs_info; 53 struct btrfs_key key; 54 struct btrfs_key location; 55 struct btrfs_disk_key disk_key; 56 struct btrfs_free_space_header *header; 57 struct extent_buffer *leaf; 58 struct inode *inode = NULL; 59 unsigned nofs_flag; 60 int ret; 61 62 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 63 key.offset = offset; 64 key.type = 0; 65 66 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 67 if (ret < 0) 68 return ERR_PTR(ret); 69 if (ret > 0) { 70 btrfs_release_path(path); 71 return ERR_PTR(-ENOENT); 72 } 73 74 leaf = path->nodes[0]; 75 header = btrfs_item_ptr(leaf, path->slots[0], 76 struct btrfs_free_space_header); 77 btrfs_free_space_key(leaf, header, &disk_key); 78 btrfs_disk_key_to_cpu(&location, &disk_key); 79 btrfs_release_path(path); 80 81 /* 82 * We are often under a trans handle at this point, so we need to make 83 * sure NOFS is set to keep us from deadlocking. 
84 */ 85 nofs_flag = memalloc_nofs_save(); 86 inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); 87 btrfs_release_path(path); 88 memalloc_nofs_restore(nofs_flag); 89 if (IS_ERR(inode)) 90 return inode; 91 92 mapping_set_gfp_mask(inode->i_mapping, 93 mapping_gfp_constraint(inode->i_mapping, 94 ~(__GFP_FS | __GFP_HIGHMEM))); 95 96 return inode; 97 } 98 99 struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, 100 struct btrfs_path *path) 101 { 102 struct btrfs_fs_info *fs_info = block_group->fs_info; 103 struct inode *inode = NULL; 104 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 105 106 spin_lock(&block_group->lock); 107 if (block_group->inode) 108 inode = igrab(block_group->inode); 109 spin_unlock(&block_group->lock); 110 if (inode) 111 return inode; 112 113 inode = __lookup_free_space_inode(fs_info->tree_root, path, 114 block_group->start); 115 if (IS_ERR(inode)) 116 return inode; 117 118 spin_lock(&block_group->lock); 119 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 120 btrfs_info(fs_info, "Old style space inode found, converting."); 121 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 122 BTRFS_INODE_NODATACOW; 123 block_group->disk_cache_state = BTRFS_DC_CLEAR; 124 } 125 126 if (!block_group->iref) { 127 block_group->inode = igrab(inode); 128 block_group->iref = 1; 129 } 130 spin_unlock(&block_group->lock); 131 132 return inode; 133 } 134 135 static int __create_free_space_inode(struct btrfs_root *root, 136 struct btrfs_trans_handle *trans, 137 struct btrfs_path *path, 138 u64 ino, u64 offset) 139 { 140 struct btrfs_key key; 141 struct btrfs_disk_key disk_key; 142 struct btrfs_free_space_header *header; 143 struct btrfs_inode_item *inode_item; 144 struct extent_buffer *leaf; 145 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; 146 int ret; 147 148 ret = btrfs_insert_empty_inode(trans, root, path, ino); 149 if (ret) 150 return ret; 151 152 /* We inline crc's for the free disk space cache */ 153 if (ino != BTRFS_FREE_INO_OBJECTID) 154 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 155 156 leaf = path->nodes[0]; 157 inode_item = btrfs_item_ptr(leaf, path->slots[0], 158 struct btrfs_inode_item); 159 btrfs_item_key(leaf, &disk_key, path->slots[0]); 160 memzero_extent_buffer(leaf, (unsigned long)inode_item, 161 sizeof(*inode_item)); 162 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 163 btrfs_set_inode_size(leaf, inode_item, 0); 164 btrfs_set_inode_nbytes(leaf, inode_item, 0); 165 btrfs_set_inode_uid(leaf, inode_item, 0); 166 btrfs_set_inode_gid(leaf, inode_item, 0); 167 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 168 btrfs_set_inode_flags(leaf, inode_item, flags); 169 btrfs_set_inode_nlink(leaf, inode_item, 1); 170 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 171 btrfs_set_inode_block_group(leaf, inode_item, offset); 172 btrfs_mark_buffer_dirty(leaf); 173 btrfs_release_path(path); 174 175 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 176 key.offset = offset; 177 key.type = 0; 178 ret = btrfs_insert_empty_item(trans, root, path, &key, 179 sizeof(struct btrfs_free_space_header)); 180 if (ret < 0) { 181 btrfs_release_path(path); 182 return ret; 183 } 184 185 leaf = path->nodes[0]; 186 header = btrfs_item_ptr(leaf, path->slots[0], 187 struct btrfs_free_space_header); 188 memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); 189 btrfs_set_free_space_key(leaf, header, &disk_key); 190 btrfs_mark_buffer_dirty(leaf); 191 btrfs_release_path(path); 192 193 return 0; 194 } 
195 196 int create_free_space_inode(struct btrfs_trans_handle *trans, 197 struct btrfs_block_group *block_group, 198 struct btrfs_path *path) 199 { 200 int ret; 201 u64 ino; 202 203 ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino); 204 if (ret < 0) 205 return ret; 206 207 return __create_free_space_inode(trans->fs_info->tree_root, trans, path, 208 ino, block_group->start); 209 } 210 211 int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, 212 struct btrfs_block_rsv *rsv) 213 { 214 u64 needed_bytes; 215 int ret; 216 217 /* 1 for slack space, 1 for updating the inode */ 218 needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + 219 btrfs_calc_metadata_size(fs_info, 1); 220 221 spin_lock(&rsv->lock); 222 if (rsv->reserved < needed_bytes) 223 ret = -ENOSPC; 224 else 225 ret = 0; 226 spin_unlock(&rsv->lock); 227 return ret; 228 } 229 230 int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, 231 struct btrfs_block_group *block_group, 232 struct inode *inode) 233 { 234 struct btrfs_root *root = BTRFS_I(inode)->root; 235 int ret = 0; 236 bool locked = false; 237 238 if (block_group) { 239 struct btrfs_path *path = btrfs_alloc_path(); 240 241 if (!path) { 242 ret = -ENOMEM; 243 goto fail; 244 } 245 locked = true; 246 mutex_lock(&trans->transaction->cache_write_mutex); 247 if (!list_empty(&block_group->io_list)) { 248 list_del_init(&block_group->io_list); 249 250 btrfs_wait_cache_io(trans, block_group, path); 251 btrfs_put_block_group(block_group); 252 } 253 254 /* 255 * now that we've truncated the cache away, its no longer 256 * setup or written 257 */ 258 spin_lock(&block_group->lock); 259 block_group->disk_cache_state = BTRFS_DC_CLEAR; 260 spin_unlock(&block_group->lock); 261 btrfs_free_path(path); 262 } 263 264 btrfs_i_size_write(BTRFS_I(inode), 0); 265 truncate_pagecache(inode, 0); 266 267 /* 268 * We skip the throttling logic for free space cache inodes, so we don't 269 * need to check for -EAGAIN. 
270 */ 271 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 272 0, BTRFS_EXTENT_DATA_KEY); 273 if (ret) 274 goto fail; 275 276 ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 277 278 fail: 279 if (locked) 280 mutex_unlock(&trans->transaction->cache_write_mutex); 281 if (ret) 282 btrfs_abort_transaction(trans, ret); 283 284 return ret; 285 } 286 287 static void readahead_cache(struct inode *inode) 288 { 289 struct file_ra_state *ra; 290 unsigned long last_index; 291 292 ra = kzalloc(sizeof(*ra), GFP_NOFS); 293 if (!ra) 294 return; 295 296 file_ra_state_init(ra, inode->i_mapping); 297 last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 298 299 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); 300 301 kfree(ra); 302 } 303 304 static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, 305 int write) 306 { 307 int num_pages; 308 int check_crcs = 0; 309 310 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 311 312 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID) 313 check_crcs = 1; 314 315 /* Make sure we can fit our crcs and generation into the first page */ 316 if (write && check_crcs && 317 (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE) 318 return -ENOSPC; 319 320 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); 321 322 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); 323 if (!io_ctl->pages) 324 return -ENOMEM; 325 326 io_ctl->num_pages = num_pages; 327 io_ctl->fs_info = btrfs_sb(inode->i_sb); 328 io_ctl->check_crcs = check_crcs; 329 io_ctl->inode = inode; 330 331 return 0; 332 } 333 ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO); 334 335 static void io_ctl_free(struct btrfs_io_ctl *io_ctl) 336 { 337 kfree(io_ctl->pages); 338 io_ctl->pages = NULL; 339 } 340 341 static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) 342 { 343 if (io_ctl->cur) { 344 io_ctl->cur = NULL; 345 io_ctl->orig = NULL; 346 } 347 } 348 349 static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) 350 { 351 ASSERT(io_ctl->index < io_ctl->num_pages); 352 io_ctl->page = io_ctl->pages[io_ctl->index++]; 353 io_ctl->cur = page_address(io_ctl->page); 354 io_ctl->orig = io_ctl->cur; 355 io_ctl->size = PAGE_SIZE; 356 if (clear) 357 clear_page(io_ctl->cur); 358 } 359 360 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) 361 { 362 int i; 363 364 io_ctl_unmap_page(io_ctl); 365 366 for (i = 0; i < io_ctl->num_pages; i++) { 367 if (io_ctl->pages[i]) { 368 ClearPageChecked(io_ctl->pages[i]); 369 unlock_page(io_ctl->pages[i]); 370 put_page(io_ctl->pages[i]); 371 } 372 } 373 } 374 375 static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) 376 { 377 struct page *page; 378 struct inode *inode = io_ctl->inode; 379 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 380 int i; 381 382 for (i = 0; i < io_ctl->num_pages; i++) { 383 page = find_or_create_page(inode->i_mapping, i, mask); 384 if (!page) { 385 io_ctl_drop_pages(io_ctl); 386 return -ENOMEM; 387 } 388 io_ctl->pages[i] = page; 389 if (uptodate && !PageUptodate(page)) { 390 btrfs_readpage(NULL, page); 391 lock_page(page); 392 if (page->mapping != inode->i_mapping) { 393 btrfs_err(BTRFS_I(inode)->root->fs_info, 394 "free space cache page truncated"); 395 io_ctl_drop_pages(io_ctl); 396 return -EIO; 397 } 398 if (!PageUptodate(page)) { 399 btrfs_err(BTRFS_I(inode)->root->fs_info, 400 "error reading free space cache"); 401 io_ctl_drop_pages(io_ctl); 402 return -EIO; 403 } 404 } 405 } 406 407 for (i = 0; i < io_ctl->num_pages; i++) { 408 
clear_page_dirty_for_io(io_ctl->pages[i]); 409 set_page_extent_mapped(io_ctl->pages[i]); 410 } 411 412 return 0; 413 } 414 415 static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 416 { 417 io_ctl_map_page(io_ctl, 1); 418 419 /* 420 * Skip the csum areas. If we don't check crcs then we just have a 421 * 64bit chunk at the front of the first page. 422 */ 423 if (io_ctl->check_crcs) { 424 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 425 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 426 } else { 427 io_ctl->cur += sizeof(u64); 428 io_ctl->size -= sizeof(u64) * 2; 429 } 430 431 put_unaligned_le64(generation, io_ctl->cur); 432 io_ctl->cur += sizeof(u64); 433 } 434 435 static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) 436 { 437 u64 cache_gen; 438 439 /* 440 * Skip the crc area. If we don't check crcs then we just have a 64bit 441 * chunk at the front of the first page. 442 */ 443 if (io_ctl->check_crcs) { 444 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 445 io_ctl->size -= sizeof(u64) + 446 (sizeof(u32) * io_ctl->num_pages); 447 } else { 448 io_ctl->cur += sizeof(u64); 449 io_ctl->size -= sizeof(u64) * 2; 450 } 451 452 cache_gen = get_unaligned_le64(io_ctl->cur); 453 if (cache_gen != generation) { 454 btrfs_err_rl(io_ctl->fs_info, 455 "space cache generation (%llu) does not match inode (%llu)", 456 cache_gen, generation); 457 io_ctl_unmap_page(io_ctl); 458 return -EIO; 459 } 460 io_ctl->cur += sizeof(u64); 461 return 0; 462 } 463 464 static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) 465 { 466 u32 *tmp; 467 u32 crc = ~(u32)0; 468 unsigned offset = 0; 469 470 if (!io_ctl->check_crcs) { 471 io_ctl_unmap_page(io_ctl); 472 return; 473 } 474 475 if (index == 0) 476 offset = sizeof(u32) * io_ctl->num_pages; 477 478 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 479 btrfs_crc32c_final(crc, (u8 *)&crc); 480 io_ctl_unmap_page(io_ctl); 481 tmp = page_address(io_ctl->pages[0]); 482 tmp += index; 483 *tmp = crc; 484 } 485 486 static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) 487 { 488 u32 *tmp, val; 489 u32 crc = ~(u32)0; 490 unsigned offset = 0; 491 492 if (!io_ctl->check_crcs) { 493 io_ctl_map_page(io_ctl, 0); 494 return 0; 495 } 496 497 if (index == 0) 498 offset = sizeof(u32) * io_ctl->num_pages; 499 500 tmp = page_address(io_ctl->pages[0]); 501 tmp += index; 502 val = *tmp; 503 504 io_ctl_map_page(io_ctl, 0); 505 crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); 506 btrfs_crc32c_final(crc, (u8 *)&crc); 507 if (val != crc) { 508 btrfs_err_rl(io_ctl->fs_info, 509 "csum mismatch on free space cache"); 510 io_ctl_unmap_page(io_ctl); 511 return -EIO; 512 } 513 514 return 0; 515 } 516 517 static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, 518 void *bitmap) 519 { 520 struct btrfs_free_space_entry *entry; 521 522 if (!io_ctl->cur) 523 return -ENOSPC; 524 525 entry = io_ctl->cur; 526 put_unaligned_le64(offset, &entry->offset); 527 put_unaligned_le64(bytes, &entry->bytes); 528 entry->type = (bitmap) ? 
BTRFS_FREE_SPACE_BITMAP : 529 BTRFS_FREE_SPACE_EXTENT; 530 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 531 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 532 533 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 534 return 0; 535 536 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 537 538 /* No more pages to map */ 539 if (io_ctl->index >= io_ctl->num_pages) 540 return 0; 541 542 /* map the next page */ 543 io_ctl_map_page(io_ctl, 1); 544 return 0; 545 } 546 547 static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) 548 { 549 if (!io_ctl->cur) 550 return -ENOSPC; 551 552 /* 553 * If we aren't at the start of the current page, unmap this one and 554 * map the next one if there is any left. 555 */ 556 if (io_ctl->cur != io_ctl->orig) { 557 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 558 if (io_ctl->index >= io_ctl->num_pages) 559 return -ENOSPC; 560 io_ctl_map_page(io_ctl, 0); 561 } 562 563 copy_page(io_ctl->cur, bitmap); 564 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 565 if (io_ctl->index < io_ctl->num_pages) 566 io_ctl_map_page(io_ctl, 0); 567 return 0; 568 } 569 570 static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) 571 { 572 /* 573 * If we're not on the boundary we know we've modified the page and we 574 * need to crc the page. 575 */ 576 if (io_ctl->cur != io_ctl->orig) 577 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 578 else 579 io_ctl_unmap_page(io_ctl); 580 581 while (io_ctl->index < io_ctl->num_pages) { 582 io_ctl_map_page(io_ctl, 1); 583 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 584 } 585 } 586 587 static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, 588 struct btrfs_free_space *entry, u8 *type) 589 { 590 struct btrfs_free_space_entry *e; 591 int ret; 592 593 if (!io_ctl->cur) { 594 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 595 if (ret) 596 return ret; 597 } 598 599 e = io_ctl->cur; 600 entry->offset = get_unaligned_le64(&e->offset); 601 entry->bytes = get_unaligned_le64(&e->bytes); 602 *type = e->type; 603 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 604 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 605 606 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 607 return 0; 608 609 io_ctl_unmap_page(io_ctl); 610 611 return 0; 612 } 613 614 static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, 615 struct btrfs_free_space *entry) 616 { 617 int ret; 618 619 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 620 if (ret) 621 return ret; 622 623 copy_page(entry->bitmap, io_ctl->cur); 624 io_ctl_unmap_page(io_ctl); 625 626 return 0; 627 } 628 629 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 630 struct btrfs_free_space_ctl *ctl, 631 struct btrfs_path *path, u64 offset) 632 { 633 struct btrfs_fs_info *fs_info = root->fs_info; 634 struct btrfs_free_space_header *header; 635 struct extent_buffer *leaf; 636 struct btrfs_io_ctl io_ctl; 637 struct btrfs_key key; 638 struct btrfs_free_space *e, *n; 639 LIST_HEAD(bitmaps); 640 u64 num_entries; 641 u64 num_bitmaps; 642 u64 generation; 643 u8 type; 644 int ret = 0; 645 646 /* Nothing in the space cache, goodbye */ 647 if (!i_size_read(inode)) 648 return 0; 649 650 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 651 key.offset = offset; 652 key.type = 0; 653 654 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 655 if (ret < 0) 656 return 0; 657 else if (ret > 0) { 658 btrfs_release_path(path); 659 return 0; 660 } 661 662 ret = -1; 663 664 leaf = path->nodes[0]; 665 header = btrfs_item_ptr(leaf, path->slots[0], 666 struct 
btrfs_free_space_header); 667 num_entries = btrfs_free_space_entries(leaf, header); 668 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 669 generation = btrfs_free_space_generation(leaf, header); 670 btrfs_release_path(path); 671 672 if (!BTRFS_I(inode)->generation) { 673 btrfs_info(fs_info, 674 "the free space cache file (%llu) is invalid, skip it", 675 offset); 676 return 0; 677 } 678 679 if (BTRFS_I(inode)->generation != generation) { 680 btrfs_err(fs_info, 681 "free space inode generation (%llu) did not match free space cache generation (%llu)", 682 BTRFS_I(inode)->generation, generation); 683 return 0; 684 } 685 686 if (!num_entries) 687 return 0; 688 689 ret = io_ctl_init(&io_ctl, inode, 0); 690 if (ret) 691 return ret; 692 693 readahead_cache(inode); 694 695 ret = io_ctl_prepare_pages(&io_ctl, true); 696 if (ret) 697 goto out; 698 699 ret = io_ctl_check_crc(&io_ctl, 0); 700 if (ret) 701 goto free_cache; 702 703 ret = io_ctl_check_generation(&io_ctl, generation); 704 if (ret) 705 goto free_cache; 706 707 while (num_entries) { 708 e = kmem_cache_zalloc(btrfs_free_space_cachep, 709 GFP_NOFS); 710 if (!e) 711 goto free_cache; 712 713 ret = io_ctl_read_entry(&io_ctl, e, &type); 714 if (ret) { 715 kmem_cache_free(btrfs_free_space_cachep, e); 716 goto free_cache; 717 } 718 719 if (!e->bytes) { 720 kmem_cache_free(btrfs_free_space_cachep, e); 721 goto free_cache; 722 } 723 724 if (type == BTRFS_FREE_SPACE_EXTENT) { 725 spin_lock(&ctl->tree_lock); 726 ret = link_free_space(ctl, e); 727 spin_unlock(&ctl->tree_lock); 728 if (ret) { 729 btrfs_err(fs_info, 730 "Duplicate entries in free space cache, dumping"); 731 kmem_cache_free(btrfs_free_space_cachep, e); 732 goto free_cache; 733 } 734 } else { 735 ASSERT(num_bitmaps); 736 num_bitmaps--; 737 e->bitmap = kmem_cache_zalloc( 738 btrfs_free_space_bitmap_cachep, GFP_NOFS); 739 if (!e->bitmap) { 740 kmem_cache_free( 741 btrfs_free_space_cachep, e); 742 goto free_cache; 743 } 744 spin_lock(&ctl->tree_lock); 745 ret = link_free_space(ctl, e); 746 ctl->total_bitmaps++; 747 ctl->op->recalc_thresholds(ctl); 748 spin_unlock(&ctl->tree_lock); 749 if (ret) { 750 btrfs_err(fs_info, 751 "Duplicate entries in free space cache, dumping"); 752 kmem_cache_free(btrfs_free_space_cachep, e); 753 goto free_cache; 754 } 755 list_add_tail(&e->list, &bitmaps); 756 } 757 758 num_entries--; 759 } 760 761 io_ctl_unmap_page(&io_ctl); 762 763 /* 764 * We add the bitmaps at the end of the entries in order that 765 * the bitmap entries are added to the cache. 
766 */ 767 list_for_each_entry_safe(e, n, &bitmaps, list) { 768 list_del_init(&e->list); 769 ret = io_ctl_read_bitmap(&io_ctl, e); 770 if (ret) 771 goto free_cache; 772 } 773 774 io_ctl_drop_pages(&io_ctl); 775 ret = 1; 776 out: 777 io_ctl_free(&io_ctl); 778 return ret; 779 free_cache: 780 io_ctl_drop_pages(&io_ctl); 781 __btrfs_remove_free_space_cache(ctl); 782 goto out; 783 } 784 785 static int copy_free_space_cache(struct btrfs_block_group *block_group, 786 struct btrfs_free_space_ctl *ctl) 787 { 788 struct btrfs_free_space *info; 789 struct rb_node *n; 790 int ret = 0; 791 792 while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { 793 info = rb_entry(n, struct btrfs_free_space, offset_index); 794 if (!info->bitmap) { 795 unlink_free_space(ctl, info); 796 ret = btrfs_add_free_space(block_group, info->offset, 797 info->bytes); 798 kmem_cache_free(btrfs_free_space_cachep, info); 799 } else { 800 u64 offset = info->offset; 801 u64 bytes = ctl->unit; 802 803 while (search_bitmap(ctl, info, &offset, &bytes, 804 false) == 0) { 805 ret = btrfs_add_free_space(block_group, offset, 806 bytes); 807 if (ret) 808 break; 809 bitmap_clear_bits(ctl, info, offset, bytes); 810 offset = info->offset; 811 bytes = ctl->unit; 812 } 813 free_bitmap(ctl, info); 814 } 815 cond_resched(); 816 } 817 return ret; 818 } 819 820 int load_free_space_cache(struct btrfs_block_group *block_group) 821 { 822 struct btrfs_fs_info *fs_info = block_group->fs_info; 823 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 824 struct btrfs_free_space_ctl tmp_ctl = {}; 825 struct inode *inode; 826 struct btrfs_path *path; 827 int ret = 0; 828 bool matched; 829 u64 used = block_group->used; 830 831 /* 832 * Because we could potentially discard our loaded free space, we want 833 * to load everything into a temporary structure first, and then if it's 834 * valid copy it all into the actual free space ctl. 835 */ 836 btrfs_init_free_space_ctl(block_group, &tmp_ctl); 837 838 /* 839 * If this block group has been marked to be cleared for one reason or 840 * another then we can't trust the on disk cache, so just return. 841 */ 842 spin_lock(&block_group->lock); 843 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 844 spin_unlock(&block_group->lock); 845 return 0; 846 } 847 spin_unlock(&block_group->lock); 848 849 path = btrfs_alloc_path(); 850 if (!path) 851 return 0; 852 path->search_commit_root = 1; 853 path->skip_locking = 1; 854 855 /* 856 * We must pass a path with search_commit_root set to btrfs_iget in 857 * order to avoid a deadlock when allocating extents for the tree root. 858 * 859 * When we are COWing an extent buffer from the tree root, when looking 860 * for a free extent, at extent-tree.c:find_free_extent(), we can find 861 * block group without its free space cache loaded. When we find one 862 * we must load its space cache which requires reading its free space 863 * cache's inode item from the root tree. If this inode item is located 864 * in the same leaf that we started COWing before, then we end up in 865 * deadlock on the extent buffer (trying to read lock it when we 866 * previously write locked it). 867 * 868 * It's safe to read the inode item using the commit root because 869 * block groups, once loaded, stay in memory forever (until they are 870 * removed) as well as their space caches once loaded. New block groups 871 * once created get their ->cached field set to BTRFS_CACHE_FINISHED so 872 * we will never try to read their inode item while the fs is mounted. 
873 */ 874 inode = lookup_free_space_inode(block_group, path); 875 if (IS_ERR(inode)) { 876 btrfs_free_path(path); 877 return 0; 878 } 879 880 /* We may have converted the inode and made the cache invalid. */ 881 spin_lock(&block_group->lock); 882 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 883 spin_unlock(&block_group->lock); 884 btrfs_free_path(path); 885 goto out; 886 } 887 spin_unlock(&block_group->lock); 888 889 ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl, 890 path, block_group->start); 891 btrfs_free_path(path); 892 if (ret <= 0) 893 goto out; 894 895 matched = (tmp_ctl.free_space == (block_group->length - used - 896 block_group->bytes_super)); 897 898 if (matched) { 899 ret = copy_free_space_cache(block_group, &tmp_ctl); 900 /* 901 * ret == 1 means we successfully loaded the free space cache, 902 * so we need to re-set it here. 903 */ 904 if (ret == 0) 905 ret = 1; 906 } else { 907 __btrfs_remove_free_space_cache(&tmp_ctl); 908 btrfs_warn(fs_info, 909 "block group %llu has wrong amount of free space", 910 block_group->start); 911 ret = -1; 912 } 913 out: 914 if (ret < 0) { 915 /* This cache is bogus, make sure it gets cleared */ 916 spin_lock(&block_group->lock); 917 block_group->disk_cache_state = BTRFS_DC_CLEAR; 918 spin_unlock(&block_group->lock); 919 ret = 0; 920 921 btrfs_warn(fs_info, 922 "failed to load free space cache for block group %llu, rebuilding it now", 923 block_group->start); 924 } 925 926 spin_lock(&ctl->tree_lock); 927 btrfs_discard_update_discardable(block_group); 928 spin_unlock(&ctl->tree_lock); 929 iput(inode); 930 return ret; 931 } 932 933 static noinline_for_stack 934 int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, 935 struct btrfs_free_space_ctl *ctl, 936 struct btrfs_block_group *block_group, 937 int *entries, int *bitmaps, 938 struct list_head *bitmap_list) 939 { 940 int ret; 941 struct btrfs_free_cluster *cluster = NULL; 942 struct btrfs_free_cluster *cluster_locked = NULL; 943 struct rb_node *node = rb_first(&ctl->free_space_offset); 944 struct btrfs_trim_range *trim_entry; 945 946 /* Get the cluster for this block_group if it exists */ 947 if (block_group && !list_empty(&block_group->cluster_list)) { 948 cluster = list_entry(block_group->cluster_list.next, 949 struct btrfs_free_cluster, 950 block_group_list); 951 } 952 953 if (!node && cluster) { 954 cluster_locked = cluster; 955 spin_lock(&cluster_locked->lock); 956 node = rb_first(&cluster->root); 957 cluster = NULL; 958 } 959 960 /* Write out the extent entries */ 961 while (node) { 962 struct btrfs_free_space *e; 963 964 e = rb_entry(node, struct btrfs_free_space, offset_index); 965 *entries += 1; 966 967 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 968 e->bitmap); 969 if (ret) 970 goto fail; 971 972 if (e->bitmap) { 973 list_add_tail(&e->list, bitmap_list); 974 *bitmaps += 1; 975 } 976 node = rb_next(node); 977 if (!node && cluster) { 978 node = rb_first(&cluster->root); 979 cluster_locked = cluster; 980 spin_lock(&cluster_locked->lock); 981 cluster = NULL; 982 } 983 } 984 if (cluster_locked) { 985 spin_unlock(&cluster_locked->lock); 986 cluster_locked = NULL; 987 } 988 989 /* 990 * Make sure we don't miss any range that was removed from our rbtree 991 * because trimming is running. Otherwise after a umount+mount (or crash 992 * after committing the transaction) we would leak free space and get 993 * an inconsistent free space cache report from fsck. 
994 */ 995 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 996 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 997 trim_entry->bytes, NULL); 998 if (ret) 999 goto fail; 1000 *entries += 1; 1001 } 1002 1003 return 0; 1004 fail: 1005 if (cluster_locked) 1006 spin_unlock(&cluster_locked->lock); 1007 return -ENOSPC; 1008 } 1009 1010 static noinline_for_stack int 1011 update_cache_item(struct btrfs_trans_handle *trans, 1012 struct btrfs_root *root, 1013 struct inode *inode, 1014 struct btrfs_path *path, u64 offset, 1015 int entries, int bitmaps) 1016 { 1017 struct btrfs_key key; 1018 struct btrfs_free_space_header *header; 1019 struct extent_buffer *leaf; 1020 int ret; 1021 1022 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 1023 key.offset = offset; 1024 key.type = 0; 1025 1026 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 1027 if (ret < 0) { 1028 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1029 EXTENT_DELALLOC, 0, 0, NULL); 1030 goto fail; 1031 } 1032 leaf = path->nodes[0]; 1033 if (ret > 0) { 1034 struct btrfs_key found_key; 1035 ASSERT(path->slots[0]); 1036 path->slots[0]--; 1037 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1038 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 1039 found_key.offset != offset) { 1040 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 1041 inode->i_size - 1, EXTENT_DELALLOC, 0, 1042 0, NULL); 1043 btrfs_release_path(path); 1044 goto fail; 1045 } 1046 } 1047 1048 BTRFS_I(inode)->generation = trans->transid; 1049 header = btrfs_item_ptr(leaf, path->slots[0], 1050 struct btrfs_free_space_header); 1051 btrfs_set_free_space_entries(leaf, header, entries); 1052 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1053 btrfs_set_free_space_generation(leaf, header, trans->transid); 1054 btrfs_mark_buffer_dirty(leaf); 1055 btrfs_release_path(path); 1056 1057 return 0; 1058 1059 fail: 1060 return -1; 1061 } 1062 1063 static noinline_for_stack int write_pinned_extent_entries( 1064 struct btrfs_trans_handle *trans, 1065 struct btrfs_block_group *block_group, 1066 struct btrfs_io_ctl *io_ctl, 1067 int *entries) 1068 { 1069 u64 start, extent_start, extent_end, len; 1070 struct extent_io_tree *unpin = NULL; 1071 int ret; 1072 1073 if (!block_group) 1074 return 0; 1075 1076 /* 1077 * We want to add any pinned extents to our free space cache 1078 * so we don't leak the space 1079 * 1080 * We shouldn't have switched the pinned extents yet so this is the 1081 * right one 1082 */ 1083 unpin = &trans->transaction->pinned_extents; 1084 1085 start = block_group->start; 1086 1087 while (start < block_group->start + block_group->length) { 1088 ret = find_first_extent_bit(unpin, start, 1089 &extent_start, &extent_end, 1090 EXTENT_DIRTY, NULL); 1091 if (ret) 1092 return 0; 1093 1094 /* This pinned extent is out of our range */ 1095 if (extent_start >= block_group->start + block_group->length) 1096 return 0; 1097 1098 extent_start = max(extent_start, start); 1099 extent_end = min(block_group->start + block_group->length, 1100 extent_end + 1); 1101 len = extent_end - extent_start; 1102 1103 *entries += 1; 1104 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1105 if (ret) 1106 return -ENOSPC; 1107 1108 start = extent_end; 1109 } 1110 1111 return 0; 1112 } 1113 1114 static noinline_for_stack int 1115 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) 1116 { 1117 struct btrfs_free_space *entry, *next; 1118 int ret; 1119 1120 /* Write out the bitmaps */ 1121 list_for_each_entry_safe(entry, next, 
bitmap_list, list) { 1122 ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); 1123 if (ret) 1124 return -ENOSPC; 1125 list_del_init(&entry->list); 1126 } 1127 1128 return 0; 1129 } 1130 1131 static int flush_dirty_cache(struct inode *inode) 1132 { 1133 int ret; 1134 1135 ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); 1136 if (ret) 1137 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1138 EXTENT_DELALLOC, 0, 0, NULL); 1139 1140 return ret; 1141 } 1142 1143 static void noinline_for_stack 1144 cleanup_bitmap_list(struct list_head *bitmap_list) 1145 { 1146 struct btrfs_free_space *entry, *next; 1147 1148 list_for_each_entry_safe(entry, next, bitmap_list, list) 1149 list_del_init(&entry->list); 1150 } 1151 1152 static void noinline_for_stack 1153 cleanup_write_cache_enospc(struct inode *inode, 1154 struct btrfs_io_ctl *io_ctl, 1155 struct extent_state **cached_state) 1156 { 1157 io_ctl_drop_pages(io_ctl); 1158 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1159 i_size_read(inode) - 1, cached_state); 1160 } 1161 1162 static int __btrfs_wait_cache_io(struct btrfs_root *root, 1163 struct btrfs_trans_handle *trans, 1164 struct btrfs_block_group *block_group, 1165 struct btrfs_io_ctl *io_ctl, 1166 struct btrfs_path *path, u64 offset) 1167 { 1168 int ret; 1169 struct inode *inode = io_ctl->inode; 1170 1171 if (!inode) 1172 return 0; 1173 1174 /* Flush the dirty pages in the cache file. */ 1175 ret = flush_dirty_cache(inode); 1176 if (ret) 1177 goto out; 1178 1179 /* Update the cache item to tell everyone this cache file is valid. */ 1180 ret = update_cache_item(trans, root, inode, path, offset, 1181 io_ctl->entries, io_ctl->bitmaps); 1182 out: 1183 if (ret) { 1184 invalidate_inode_pages2(inode->i_mapping); 1185 BTRFS_I(inode)->generation = 0; 1186 if (block_group) 1187 btrfs_debug(root->fs_info, 1188 "failed to write free space cache for block group %llu error %d", 1189 block_group->start, ret); 1190 } 1191 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1192 1193 if (block_group) { 1194 /* the dirty list is protected by the dirty_bgs_lock */ 1195 spin_lock(&trans->transaction->dirty_bgs_lock); 1196 1197 /* the disk_cache_state is protected by the block group lock */ 1198 spin_lock(&block_group->lock); 1199 1200 /* 1201 * only mark this as written if we didn't get put back on 1202 * the dirty list while waiting for IO. Otherwise our 1203 * cache state won't be right, and we won't get written again 1204 */ 1205 if (!ret && list_empty(&block_group->dirty_list)) 1206 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1207 else if (ret) 1208 block_group->disk_cache_state = BTRFS_DC_ERROR; 1209 1210 spin_unlock(&block_group->lock); 1211 spin_unlock(&trans->transaction->dirty_bgs_lock); 1212 io_ctl->inode = NULL; 1213 iput(inode); 1214 } 1215 1216 return ret; 1217 1218 } 1219 1220 int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, 1221 struct btrfs_block_group *block_group, 1222 struct btrfs_path *path) 1223 { 1224 return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, 1225 block_group, &block_group->io_ctl, 1226 path, block_group->start); 1227 } 1228 1229 /** 1230 * __btrfs_write_out_cache - write out cached info to an inode 1231 * @root - the root the inode belongs to 1232 * @ctl - the free space cache we are going to write out 1233 * @block_group - the block_group for this cache if it belongs to a block_group 1234 * @trans - the trans handle 1235 * 1236 * This function writes out a free space cache struct to disk for quick recovery 1237 * on mount. 
This will return 0 if it was successful in writing the cache out, 1238 * or an errno if it was not. 1239 */ 1240 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 1241 struct btrfs_free_space_ctl *ctl, 1242 struct btrfs_block_group *block_group, 1243 struct btrfs_io_ctl *io_ctl, 1244 struct btrfs_trans_handle *trans) 1245 { 1246 struct extent_state *cached_state = NULL; 1247 LIST_HEAD(bitmap_list); 1248 int entries = 0; 1249 int bitmaps = 0; 1250 int ret; 1251 int must_iput = 0; 1252 1253 if (!i_size_read(inode)) 1254 return -EIO; 1255 1256 WARN_ON(io_ctl->pages); 1257 ret = io_ctl_init(io_ctl, inode, 1); 1258 if (ret) 1259 return ret; 1260 1261 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { 1262 down_write(&block_group->data_rwsem); 1263 spin_lock(&block_group->lock); 1264 if (block_group->delalloc_bytes) { 1265 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1266 spin_unlock(&block_group->lock); 1267 up_write(&block_group->data_rwsem); 1268 BTRFS_I(inode)->generation = 0; 1269 ret = 0; 1270 must_iput = 1; 1271 goto out; 1272 } 1273 spin_unlock(&block_group->lock); 1274 } 1275 1276 /* Lock all pages first so we can lock the extent safely. */ 1277 ret = io_ctl_prepare_pages(io_ctl, false); 1278 if (ret) 1279 goto out_unlock; 1280 1281 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1282 &cached_state); 1283 1284 io_ctl_set_generation(io_ctl, trans->transid); 1285 1286 mutex_lock(&ctl->cache_writeout_mutex); 1287 /* Write out the extent entries in the free space cache */ 1288 spin_lock(&ctl->tree_lock); 1289 ret = write_cache_extent_entries(io_ctl, ctl, 1290 block_group, &entries, &bitmaps, 1291 &bitmap_list); 1292 if (ret) 1293 goto out_nospc_locked; 1294 1295 /* 1296 * Some spaces that are freed in the current transaction are pinned, 1297 * they will be added into free space cache after the transaction is 1298 * committed, we shouldn't lose them. 1299 * 1300 * If this changes while we are working we'll get added back to 1301 * the dirty list and redo it. No locking needed 1302 */ 1303 ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries); 1304 if (ret) 1305 goto out_nospc_locked; 1306 1307 /* 1308 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1309 * locked while doing it because a concurrent trim can be manipulating 1310 * or freeing the bitmap. 1311 */ 1312 ret = write_bitmap_entries(io_ctl, &bitmap_list); 1313 spin_unlock(&ctl->tree_lock); 1314 mutex_unlock(&ctl->cache_writeout_mutex); 1315 if (ret) 1316 goto out_nospc; 1317 1318 /* Zero out the rest of the pages just to make sure */ 1319 io_ctl_zero_remaining_pages(io_ctl); 1320 1321 /* Everything is written out, now we dirty the pages in the file. 
*/ 1322 ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, 1323 io_ctl->num_pages, 0, i_size_read(inode), 1324 &cached_state, false); 1325 if (ret) 1326 goto out_nospc; 1327 1328 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1329 up_write(&block_group->data_rwsem); 1330 /* 1331 * Release the pages and unlock the extent, we will flush 1332 * them out later 1333 */ 1334 io_ctl_drop_pages(io_ctl); 1335 io_ctl_free(io_ctl); 1336 1337 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1338 i_size_read(inode) - 1, &cached_state); 1339 1340 /* 1341 * at this point the pages are under IO and we're happy, 1342 * The caller is responsible for waiting on them and updating 1343 * the cache and the inode 1344 */ 1345 io_ctl->entries = entries; 1346 io_ctl->bitmaps = bitmaps; 1347 1348 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); 1349 if (ret) 1350 goto out; 1351 1352 return 0; 1353 1354 out_nospc_locked: 1355 cleanup_bitmap_list(&bitmap_list); 1356 spin_unlock(&ctl->tree_lock); 1357 mutex_unlock(&ctl->cache_writeout_mutex); 1358 1359 out_nospc: 1360 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1361 1362 out_unlock: 1363 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1364 up_write(&block_group->data_rwsem); 1365 1366 out: 1367 io_ctl->inode = NULL; 1368 io_ctl_free(io_ctl); 1369 if (ret) { 1370 invalidate_inode_pages2(inode->i_mapping); 1371 BTRFS_I(inode)->generation = 0; 1372 } 1373 btrfs_update_inode(trans, root, BTRFS_I(inode)); 1374 if (must_iput) 1375 iput(inode); 1376 return ret; 1377 } 1378 1379 int btrfs_write_out_cache(struct btrfs_trans_handle *trans, 1380 struct btrfs_block_group *block_group, 1381 struct btrfs_path *path) 1382 { 1383 struct btrfs_fs_info *fs_info = trans->fs_info; 1384 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1385 struct inode *inode; 1386 int ret = 0; 1387 1388 spin_lock(&block_group->lock); 1389 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1390 spin_unlock(&block_group->lock); 1391 return 0; 1392 } 1393 spin_unlock(&block_group->lock); 1394 1395 inode = lookup_free_space_inode(block_group, path); 1396 if (IS_ERR(inode)) 1397 return 0; 1398 1399 ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, 1400 block_group, &block_group->io_ctl, trans); 1401 if (ret) { 1402 btrfs_debug(fs_info, 1403 "failed to write free space cache for block group %llu error %d", 1404 block_group->start, ret); 1405 spin_lock(&block_group->lock); 1406 block_group->disk_cache_state = BTRFS_DC_ERROR; 1407 spin_unlock(&block_group->lock); 1408 1409 block_group->io_ctl.inode = NULL; 1410 iput(inode); 1411 } 1412 1413 /* 1414 * if ret == 0 the caller is expected to call btrfs_wait_cache_io 1415 * to wait for IO and put the inode 1416 */ 1417 1418 return ret; 1419 } 1420 1421 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1422 u64 offset) 1423 { 1424 ASSERT(offset >= bitmap_start); 1425 offset -= bitmap_start; 1426 return (unsigned long)(div_u64(offset, unit)); 1427 } 1428 1429 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 1430 { 1431 return (unsigned long)(div_u64(bytes, unit)); 1432 } 1433 1434 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1435 u64 offset) 1436 { 1437 u64 bitmap_start; 1438 u64 bytes_per_bitmap; 1439 1440 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1441 bitmap_start = offset - ctl->start; 1442 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1443 bitmap_start *= bytes_per_bitmap; 1444 bitmap_start += ctl->start; 
1445 1446 return bitmap_start; 1447 } 1448 1449 static int tree_insert_offset(struct rb_root *root, u64 offset, 1450 struct rb_node *node, int bitmap) 1451 { 1452 struct rb_node **p = &root->rb_node; 1453 struct rb_node *parent = NULL; 1454 struct btrfs_free_space *info; 1455 1456 while (*p) { 1457 parent = *p; 1458 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1459 1460 if (offset < info->offset) { 1461 p = &(*p)->rb_left; 1462 } else if (offset > info->offset) { 1463 p = &(*p)->rb_right; 1464 } else { 1465 /* 1466 * we could have a bitmap entry and an extent entry 1467 * share the same offset. If this is the case, we want 1468 * the extent entry to always be found first if we do a 1469 * linear search through the tree, since we want to have 1470 * the quickest allocation time, and allocating from an 1471 * extent is faster than allocating from a bitmap. So 1472 * if we're inserting a bitmap and we find an entry at 1473 * this offset, we want to go right, or after this entry 1474 * logically. If we are inserting an extent and we've 1475 * found a bitmap, we want to go left, or before 1476 * logically. 1477 */ 1478 if (bitmap) { 1479 if (info->bitmap) { 1480 WARN_ON_ONCE(1); 1481 return -EEXIST; 1482 } 1483 p = &(*p)->rb_right; 1484 } else { 1485 if (!info->bitmap) { 1486 WARN_ON_ONCE(1); 1487 return -EEXIST; 1488 } 1489 p = &(*p)->rb_left; 1490 } 1491 } 1492 } 1493 1494 rb_link_node(node, parent, p); 1495 rb_insert_color(node, root); 1496 1497 return 0; 1498 } 1499 1500 /* 1501 * searches the tree for the given offset. 1502 * 1503 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1504 * want a section that has at least bytes size and comes at or after the given 1505 * offset. 1506 */ 1507 static struct btrfs_free_space * 1508 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1509 u64 offset, int bitmap_only, int fuzzy) 1510 { 1511 struct rb_node *n = ctl->free_space_offset.rb_node; 1512 struct btrfs_free_space *entry, *prev = NULL; 1513 1514 /* find entry that is closest to the 'offset' */ 1515 while (1) { 1516 if (!n) { 1517 entry = NULL; 1518 break; 1519 } 1520 1521 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1522 prev = entry; 1523 1524 if (offset < entry->offset) 1525 n = n->rb_left; 1526 else if (offset > entry->offset) 1527 n = n->rb_right; 1528 else 1529 break; 1530 } 1531 1532 if (bitmap_only) { 1533 if (!entry) 1534 return NULL; 1535 if (entry->bitmap) 1536 return entry; 1537 1538 /* 1539 * bitmap entry and extent entry may share same offset, 1540 * in that case, bitmap entry comes after extent entry. 
1541 */ 1542 n = rb_next(n); 1543 if (!n) 1544 return NULL; 1545 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1546 if (entry->offset != offset) 1547 return NULL; 1548 1549 WARN_ON(!entry->bitmap); 1550 return entry; 1551 } else if (entry) { 1552 if (entry->bitmap) { 1553 /* 1554 * if previous extent entry covers the offset, 1555 * we should return it instead of the bitmap entry 1556 */ 1557 n = rb_prev(&entry->offset_index); 1558 if (n) { 1559 prev = rb_entry(n, struct btrfs_free_space, 1560 offset_index); 1561 if (!prev->bitmap && 1562 prev->offset + prev->bytes > offset) 1563 entry = prev; 1564 } 1565 } 1566 return entry; 1567 } 1568 1569 if (!prev) 1570 return NULL; 1571 1572 /* find last entry before the 'offset' */ 1573 entry = prev; 1574 if (entry->offset > offset) { 1575 n = rb_prev(&entry->offset_index); 1576 if (n) { 1577 entry = rb_entry(n, struct btrfs_free_space, 1578 offset_index); 1579 ASSERT(entry->offset <= offset); 1580 } else { 1581 if (fuzzy) 1582 return entry; 1583 else 1584 return NULL; 1585 } 1586 } 1587 1588 if (entry->bitmap) { 1589 n = rb_prev(&entry->offset_index); 1590 if (n) { 1591 prev = rb_entry(n, struct btrfs_free_space, 1592 offset_index); 1593 if (!prev->bitmap && 1594 prev->offset + prev->bytes > offset) 1595 return prev; 1596 } 1597 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1598 return entry; 1599 } else if (entry->offset + entry->bytes > offset) 1600 return entry; 1601 1602 if (!fuzzy) 1603 return NULL; 1604 1605 while (1) { 1606 if (entry->bitmap) { 1607 if (entry->offset + BITS_PER_BITMAP * 1608 ctl->unit > offset) 1609 break; 1610 } else { 1611 if (entry->offset + entry->bytes > offset) 1612 break; 1613 } 1614 1615 n = rb_next(&entry->offset_index); 1616 if (!n) 1617 return NULL; 1618 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1619 } 1620 return entry; 1621 } 1622 1623 static inline void 1624 __unlink_free_space(struct btrfs_free_space_ctl *ctl, 1625 struct btrfs_free_space *info) 1626 { 1627 rb_erase(&info->offset_index, &ctl->free_space_offset); 1628 ctl->free_extents--; 1629 1630 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1631 ctl->discardable_extents[BTRFS_STAT_CURR]--; 1632 ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; 1633 } 1634 } 1635 1636 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1637 struct btrfs_free_space *info) 1638 { 1639 __unlink_free_space(ctl, info); 1640 ctl->free_space -= info->bytes; 1641 } 1642 1643 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1644 struct btrfs_free_space *info) 1645 { 1646 int ret = 0; 1647 1648 ASSERT(info->bytes || info->bitmap); 1649 ret = tree_insert_offset(&ctl->free_space_offset, info->offset, 1650 &info->offset_index, (info->bitmap != NULL)); 1651 if (ret) 1652 return ret; 1653 1654 if (!info->bitmap && !btrfs_free_space_trimmed(info)) { 1655 ctl->discardable_extents[BTRFS_STAT_CURR]++; 1656 ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; 1657 } 1658 1659 ctl->free_space += info->bytes; 1660 ctl->free_extents++; 1661 return ret; 1662 } 1663 1664 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) 1665 { 1666 struct btrfs_block_group *block_group = ctl->private; 1667 u64 max_bytes; 1668 u64 bitmap_bytes; 1669 u64 extent_bytes; 1670 u64 size = block_group->length; 1671 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 1672 u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1673 1674 max_bitmaps = max_t(u64, max_bitmaps, 1); 1675 1676 ASSERT(ctl->total_bitmaps 
<= max_bitmaps); 1677 1678 /* 1679 * We are trying to keep the total amount of memory used per 1GiB of 1680 * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation 1681 * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of 1682 * bitmaps, we may end up using more memory than this. 1683 */ 1684 if (size < SZ_1G) 1685 max_bytes = MAX_CACHE_BYTES_PER_GIG; 1686 else 1687 max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); 1688 1689 bitmap_bytes = ctl->total_bitmaps * ctl->unit; 1690 1691 /* 1692 * we want the extent entry threshold to always be at most 1/2 the max 1693 * bytes we can have, or whatever is less than that. 1694 */ 1695 extent_bytes = max_bytes - bitmap_bytes; 1696 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); 1697 1698 ctl->extents_thresh = 1699 div_u64(extent_bytes, sizeof(struct btrfs_free_space)); 1700 } 1701 1702 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1703 struct btrfs_free_space *info, 1704 u64 offset, u64 bytes) 1705 { 1706 unsigned long start, count, end; 1707 int extent_delta = -1; 1708 1709 start = offset_to_bit(info->offset, ctl->unit, offset); 1710 count = bytes_to_bits(bytes, ctl->unit); 1711 end = start + count; 1712 ASSERT(end <= BITS_PER_BITMAP); 1713 1714 bitmap_clear(info->bitmap, start, count); 1715 1716 info->bytes -= bytes; 1717 if (info->max_extent_size > ctl->unit) 1718 info->max_extent_size = 0; 1719 1720 if (start && test_bit(start - 1, info->bitmap)) 1721 extent_delta++; 1722 1723 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1724 extent_delta++; 1725 1726 info->bitmap_extents += extent_delta; 1727 if (!btrfs_free_space_trimmed(info)) { 1728 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1729 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 1730 } 1731 } 1732 1733 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1734 struct btrfs_free_space *info, u64 offset, 1735 u64 bytes) 1736 { 1737 __bitmap_clear_bits(ctl, info, offset, bytes); 1738 ctl->free_space -= bytes; 1739 } 1740 1741 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1742 struct btrfs_free_space *info, u64 offset, 1743 u64 bytes) 1744 { 1745 unsigned long start, count, end; 1746 int extent_delta = 1; 1747 1748 start = offset_to_bit(info->offset, ctl->unit, offset); 1749 count = bytes_to_bits(bytes, ctl->unit); 1750 end = start + count; 1751 ASSERT(end <= BITS_PER_BITMAP); 1752 1753 bitmap_set(info->bitmap, start, count); 1754 1755 info->bytes += bytes; 1756 ctl->free_space += bytes; 1757 1758 if (start && test_bit(start - 1, info->bitmap)) 1759 extent_delta--; 1760 1761 if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) 1762 extent_delta--; 1763 1764 info->bitmap_extents += extent_delta; 1765 if (!btrfs_free_space_trimmed(info)) { 1766 ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; 1767 ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; 1768 } 1769 } 1770 1771 /* 1772 * If we can not find suitable extent, we will use bytes to record 1773 * the size of the max extent. 1774 */ 1775 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1776 struct btrfs_free_space *bitmap_info, u64 *offset, 1777 u64 *bytes, bool for_alloc) 1778 { 1779 unsigned long found_bits = 0; 1780 unsigned long max_bits = 0; 1781 unsigned long bits, i; 1782 unsigned long next_zero; 1783 unsigned long extent_bits; 1784 1785 /* 1786 * Skip searching the bitmap if we don't have a contiguous section that 1787 * is large enough for this allocation. 
1788 */ 1789 if (for_alloc && 1790 bitmap_info->max_extent_size && 1791 bitmap_info->max_extent_size < *bytes) { 1792 *bytes = bitmap_info->max_extent_size; 1793 return -1; 1794 } 1795 1796 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1797 max_t(u64, *offset, bitmap_info->offset)); 1798 bits = bytes_to_bits(*bytes, ctl->unit); 1799 1800 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1801 if (for_alloc && bits == 1) { 1802 found_bits = 1; 1803 break; 1804 } 1805 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1806 BITS_PER_BITMAP, i); 1807 extent_bits = next_zero - i; 1808 if (extent_bits >= bits) { 1809 found_bits = extent_bits; 1810 break; 1811 } else if (extent_bits > max_bits) { 1812 max_bits = extent_bits; 1813 } 1814 i = next_zero; 1815 } 1816 1817 if (found_bits) { 1818 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1819 *bytes = (u64)(found_bits) * ctl->unit; 1820 return 0; 1821 } 1822 1823 *bytes = (u64)(max_bits) * ctl->unit; 1824 bitmap_info->max_extent_size = *bytes; 1825 return -1; 1826 } 1827 1828 static inline u64 get_max_extent_size(struct btrfs_free_space *entry) 1829 { 1830 if (entry->bitmap) 1831 return entry->max_extent_size; 1832 return entry->bytes; 1833 } 1834 1835 /* Cache the size of the max extent in bytes */ 1836 static struct btrfs_free_space * 1837 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 1838 unsigned long align, u64 *max_extent_size) 1839 { 1840 struct btrfs_free_space *entry; 1841 struct rb_node *node; 1842 u64 tmp; 1843 u64 align_off; 1844 int ret; 1845 1846 if (!ctl->free_space_offset.rb_node) 1847 goto out; 1848 1849 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); 1850 if (!entry) 1851 goto out; 1852 1853 for (node = &entry->offset_index; node; node = rb_next(node)) { 1854 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1855 if (entry->bytes < *bytes) { 1856 *max_extent_size = max(get_max_extent_size(entry), 1857 *max_extent_size); 1858 continue; 1859 } 1860 1861 /* make sure the space returned is big enough 1862 * to match our requested alignment 1863 */ 1864 if (*bytes >= align) { 1865 tmp = entry->offset - ctl->start + align - 1; 1866 tmp = div64_u64(tmp, align); 1867 tmp = tmp * align + ctl->start; 1868 align_off = tmp - entry->offset; 1869 } else { 1870 align_off = 0; 1871 tmp = entry->offset; 1872 } 1873 1874 if (entry->bytes < *bytes + align_off) { 1875 *max_extent_size = max(get_max_extent_size(entry), 1876 *max_extent_size); 1877 continue; 1878 } 1879 1880 if (entry->bitmap) { 1881 u64 size = *bytes; 1882 1883 ret = search_bitmap(ctl, entry, &tmp, &size, true); 1884 if (!ret) { 1885 *offset = tmp; 1886 *bytes = size; 1887 return entry; 1888 } else { 1889 *max_extent_size = 1890 max(get_max_extent_size(entry), 1891 *max_extent_size); 1892 } 1893 continue; 1894 } 1895 1896 *offset = tmp; 1897 *bytes = entry->bytes - align_off; 1898 return entry; 1899 } 1900 out: 1901 return NULL; 1902 } 1903 1904 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 1905 struct btrfs_free_space *info, u64 offset) 1906 { 1907 info->offset = offset_to_bitmap(ctl, offset); 1908 info->bytes = 0; 1909 info->bitmap_extents = 0; 1910 INIT_LIST_HEAD(&info->list); 1911 link_free_space(ctl, info); 1912 ctl->total_bitmaps++; 1913 1914 ctl->op->recalc_thresholds(ctl); 1915 } 1916 1917 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 1918 struct btrfs_free_space *bitmap_info) 1919 { 1920 /* 1921 * Normally when this is called, the bitmap is completely empty. 
	 * However,
	 * if we are blowing up the free space cache for one reason or another
	 * via __btrfs_remove_free_space_cache(), then it may not be freed and
	 * we may leave stats on the table.
	 */
	if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			bitmap_info->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;

	}
	unlink_free_space(ctl, bitmap_info);
	kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
	ctl->total_bitmaps--;
	ctl->op->recalc_thresholds(ctl);
}

static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
			struct btrfs_free_space *bitmap_info,
			u64 *offset, u64 *bytes)
{
	u64 end;
	u64 search_start, search_bytes;
	int ret;

again:
	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;

	/*
	 * We need to search for bits in this bitmap.  We could only cover some
	 * of the extent in this bitmap thanks to how we add space, so we need
	 * to search for as much of it as we can and clear that amount, and
	 * then go searching for the next bit.
	 */
	search_start = *offset;
	search_bytes = ctl->unit;
	search_bytes = min(search_bytes, end - search_start + 1);
	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
			    false);
	if (ret < 0 || search_start != *offset)
		return -EINVAL;

	/* We may have found more bits than what we need */
	search_bytes = min(search_bytes, *bytes);

	/* Cannot clear past the end of the bitmap */
	search_bytes = min(search_bytes, end - search_start + 1);

	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
	*offset += search_bytes;
	*bytes -= search_bytes;

	if (*bytes) {
		struct rb_node *next = rb_next(&bitmap_info->offset_index);
		if (!bitmap_info->bytes)
			free_bitmap(ctl, bitmap_info);

		/*
		 * no entry after this bitmap, but we still have bytes to
		 * remove, so something has gone wrong.
		 */
		if (!next)
			return -EINVAL;

		bitmap_info = rb_entry(next, struct btrfs_free_space,
				       offset_index);

		/*
		 * if the next entry isn't a bitmap we need to return to let the
		 * extent stuff do its work.
		 */
		if (!bitmap_info->bitmap)
			return -EAGAIN;

		/*
		 * Ok the next item is a bitmap, but it may not actually hold
		 * the information for the rest of this free space stuff, so
		 * look for it, and if we don't find it return so we can try
		 * everything over again.
		 */
		search_start = *offset;
		search_bytes = ctl->unit;
		ret = search_bitmap(ctl, bitmap_info, &search_start,
				    &search_bytes, false);
		if (ret < 0 || search_start != *offset)
			return -EAGAIN;

		goto again;
	} else if (!bitmap_info->bytes)
		free_bitmap(ctl, bitmap_info);

	return 0;
}

static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
			       struct btrfs_free_space *info, u64 offset,
			       u64 bytes, enum btrfs_trim_state trim_state)
{
	u64 bytes_to_set = 0;
	u64 end;

	/*
	 * This is a tradeoff to make bitmap trim state minimal.  We mark the
	 * whole bitmap untrimmed if at any point we add untrimmed regions.
	 */
	if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
		if (btrfs_free_space_trimmed(info)) {
			ctl->discardable_extents[BTRFS_STAT_CURR] +=
				info->bitmap_extents;
			ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
		}
		info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
	}

	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);

	bytes_to_set = min(end - offset, bytes);

	bitmap_set_bits(ctl, info, offset, bytes_to_set);

	/*
	 * We set some bytes, we have no idea what the max extent size is
	 * anymore.
	 */
	info->max_extent_size = 0;

	return bytes_to_set;

}

static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
		       struct btrfs_free_space *info)
{
	struct btrfs_block_group *block_group = ctl->private;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	bool forced = false;

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(block_group))
		forced = true;
#endif

	/* This is a way to reclaim large regions from the bitmaps. */
	if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
		return false;

	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent, and don't have to deal with the bitmap
	 */
	if (!forced && ctl->free_extents < ctl->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them for larger extents, however if we have plenty
		 * of cache left then go ahead and add them, no sense in adding
		 * the overhead of a bitmap if we don't have to.
		 */
		if (info->bytes <= fs_info->sectorsize * 8) {
			if (ctl->free_extents * 3 <= ctl->extents_thresh)
				return false;
		} else {
			return false;
		}
	}

	/*
	 * The original block groups from mkfs can be really small, like 8
	 * megabytes, so don't bother with a bitmap for those entries.  However
	 * some block groups can be smaller than what a bitmap would cover but
	 * are still large enough that they could overflow the 32k memory
	 * limit, so allow those block groups to still have a bitmap entry.
	 */
	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
		return false;

	return true;
}

static const struct btrfs_free_space_op free_space_op = {
	.recalc_thresholds	= recalculate_thresholds,
	.use_bitmap		= use_bitmap,
};

static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	struct btrfs_free_space *bitmap_info;
	struct btrfs_block_group *block_group = NULL;
	int added = 0;
	u64 bytes, offset, bytes_added;
	enum btrfs_trim_state trim_state;
	int ret;

	bytes = info->bytes;
	offset = info->offset;
	trim_state = info->trim_state;

	if (!ctl->op->use_bitmap(ctl, info))
		return 0;

	if (ctl->op == &free_space_op)
		block_group = ctl->private;
again:
	/*
	 * Since we link bitmaps right into the cluster we need to see if we
	 * have a cluster here, and if so and it has our bitmap we need to add
	 * the free space to that bitmap.
2131 */ 2132 if (block_group && !list_empty(&block_group->cluster_list)) { 2133 struct btrfs_free_cluster *cluster; 2134 struct rb_node *node; 2135 struct btrfs_free_space *entry; 2136 2137 cluster = list_entry(block_group->cluster_list.next, 2138 struct btrfs_free_cluster, 2139 block_group_list); 2140 spin_lock(&cluster->lock); 2141 node = rb_first(&cluster->root); 2142 if (!node) { 2143 spin_unlock(&cluster->lock); 2144 goto no_cluster_bitmap; 2145 } 2146 2147 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2148 if (!entry->bitmap) { 2149 spin_unlock(&cluster->lock); 2150 goto no_cluster_bitmap; 2151 } 2152 2153 if (entry->offset == offset_to_bitmap(ctl, offset)) { 2154 bytes_added = add_bytes_to_bitmap(ctl, entry, offset, 2155 bytes, trim_state); 2156 bytes -= bytes_added; 2157 offset += bytes_added; 2158 } 2159 spin_unlock(&cluster->lock); 2160 if (!bytes) { 2161 ret = 1; 2162 goto out; 2163 } 2164 } 2165 2166 no_cluster_bitmap: 2167 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2168 1, 0); 2169 if (!bitmap_info) { 2170 ASSERT(added == 0); 2171 goto new_bitmap; 2172 } 2173 2174 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 2175 trim_state); 2176 bytes -= bytes_added; 2177 offset += bytes_added; 2178 added = 0; 2179 2180 if (!bytes) { 2181 ret = 1; 2182 goto out; 2183 } else 2184 goto again; 2185 2186 new_bitmap: 2187 if (info && info->bitmap) { 2188 add_new_bitmap(ctl, info, offset); 2189 added = 1; 2190 info = NULL; 2191 goto again; 2192 } else { 2193 spin_unlock(&ctl->tree_lock); 2194 2195 /* no pre-allocated info, allocate a new one */ 2196 if (!info) { 2197 info = kmem_cache_zalloc(btrfs_free_space_cachep, 2198 GFP_NOFS); 2199 if (!info) { 2200 spin_lock(&ctl->tree_lock); 2201 ret = -ENOMEM; 2202 goto out; 2203 } 2204 } 2205 2206 /* allocate the bitmap */ 2207 info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, 2208 GFP_NOFS); 2209 info->trim_state = BTRFS_TRIM_STATE_TRIMMED; 2210 spin_lock(&ctl->tree_lock); 2211 if (!info->bitmap) { 2212 ret = -ENOMEM; 2213 goto out; 2214 } 2215 goto again; 2216 } 2217 2218 out: 2219 if (info) { 2220 if (info->bitmap) 2221 kmem_cache_free(btrfs_free_space_bitmap_cachep, 2222 info->bitmap); 2223 kmem_cache_free(btrfs_free_space_cachep, info); 2224 } 2225 2226 return ret; 2227 } 2228 2229 /* 2230 * Free space merging rules: 2231 * 1) Merge trimmed areas together 2232 * 2) Let untrimmed areas coalesce with trimmed areas 2233 * 3) Always pull neighboring regions from bitmaps 2234 * 2235 * The above rules are for when we merge free space based on btrfs_trim_state. 2236 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the 2237 * same reason: to promote larger extent regions which makes life easier for 2238 * find_free_extent(). Rule 2 enables coalescing based on the common path 2239 * being returning free space from btrfs_finish_extent_commit(). So when free 2240 * space is trimmed, it will prevent aggregating trimmed new region and 2241 * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents 2242 * and provide find_free_extent() with the largest extents possible hoping for 2243 * the reuse path. 
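 *
 * A concrete sketch (offsets and sizes here are made up): freeing an
 * untrimmed 64K extent at offset 0 next to an already trimmed extent
 * [64K, 128K) merges them into a single untrimmed extent [0, 128K)
 * (rule 2); had the new extent already been trimmed, it would only merge
 * with a trimmed neighbor and stay trimmed (rule 1).  If a neighboring
 * bitmap also records free space at [128K, 192K), those bits are stolen
 * and the extent grows to [0, 192K), turning untrimmed if the bitmap was
 * untrimmed (rule 3).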
2244 */ 2245 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 2246 struct btrfs_free_space *info, bool update_stat) 2247 { 2248 struct btrfs_free_space *left_info = NULL; 2249 struct btrfs_free_space *right_info; 2250 bool merged = false; 2251 u64 offset = info->offset; 2252 u64 bytes = info->bytes; 2253 const bool is_trimmed = btrfs_free_space_trimmed(info); 2254 2255 /* 2256 * first we want to see if there is free space adjacent to the range we 2257 * are adding, if there is remove that struct and add a new one to 2258 * cover the entire range 2259 */ 2260 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2261 if (right_info && rb_prev(&right_info->offset_index)) 2262 left_info = rb_entry(rb_prev(&right_info->offset_index), 2263 struct btrfs_free_space, offset_index); 2264 else if (!right_info) 2265 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2266 2267 /* See try_merge_free_space() comment. */ 2268 if (right_info && !right_info->bitmap && 2269 (!is_trimmed || btrfs_free_space_trimmed(right_info))) { 2270 if (update_stat) 2271 unlink_free_space(ctl, right_info); 2272 else 2273 __unlink_free_space(ctl, right_info); 2274 info->bytes += right_info->bytes; 2275 kmem_cache_free(btrfs_free_space_cachep, right_info); 2276 merged = true; 2277 } 2278 2279 /* See try_merge_free_space() comment. */ 2280 if (left_info && !left_info->bitmap && 2281 left_info->offset + left_info->bytes == offset && 2282 (!is_trimmed || btrfs_free_space_trimmed(left_info))) { 2283 if (update_stat) 2284 unlink_free_space(ctl, left_info); 2285 else 2286 __unlink_free_space(ctl, left_info); 2287 info->offset = left_info->offset; 2288 info->bytes += left_info->bytes; 2289 kmem_cache_free(btrfs_free_space_cachep, left_info); 2290 merged = true; 2291 } 2292 2293 return merged; 2294 } 2295 2296 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2297 struct btrfs_free_space *info, 2298 bool update_stat) 2299 { 2300 struct btrfs_free_space *bitmap; 2301 unsigned long i; 2302 unsigned long j; 2303 const u64 end = info->offset + info->bytes; 2304 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2305 u64 bytes; 2306 2307 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2308 if (!bitmap) 2309 return false; 2310 2311 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2312 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2313 if (j == i) 2314 return false; 2315 bytes = (j - i) * ctl->unit; 2316 info->bytes += bytes; 2317 2318 /* See try_merge_free_space() comment. */ 2319 if (!btrfs_free_space_trimmed(bitmap)) 2320 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2321 2322 if (update_stat) 2323 bitmap_clear_bits(ctl, bitmap, end, bytes); 2324 else 2325 __bitmap_clear_bits(ctl, bitmap, end, bytes); 2326 2327 if (!bitmap->bytes) 2328 free_bitmap(ctl, bitmap); 2329 2330 return true; 2331 } 2332 2333 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2334 struct btrfs_free_space *info, 2335 bool update_stat) 2336 { 2337 struct btrfs_free_space *bitmap; 2338 u64 bitmap_offset; 2339 unsigned long i; 2340 unsigned long j; 2341 unsigned long prev_j; 2342 u64 bytes; 2343 2344 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2345 /* If we're on a boundary, try the previous logical bitmap. 
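	 * (If info->offset sits exactly on a bitmap boundary, the bits in
	 * front of it belong to the bitmap that ends there, not to the one
	 * that would start at info->offset.)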
*/ 2346 if (bitmap_offset == info->offset) { 2347 if (info->offset == 0) 2348 return false; 2349 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2350 } 2351 2352 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2353 if (!bitmap) 2354 return false; 2355 2356 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2357 j = 0; 2358 prev_j = (unsigned long)-1; 2359 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2360 if (j > i) 2361 break; 2362 prev_j = j; 2363 } 2364 if (prev_j == i) 2365 return false; 2366 2367 if (prev_j == (unsigned long)-1) 2368 bytes = (i + 1) * ctl->unit; 2369 else 2370 bytes = (i - prev_j) * ctl->unit; 2371 2372 info->offset -= bytes; 2373 info->bytes += bytes; 2374 2375 /* See try_merge_free_space() comment. */ 2376 if (!btrfs_free_space_trimmed(bitmap)) 2377 info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2378 2379 if (update_stat) 2380 bitmap_clear_bits(ctl, bitmap, info->offset, bytes); 2381 else 2382 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); 2383 2384 if (!bitmap->bytes) 2385 free_bitmap(ctl, bitmap); 2386 2387 return true; 2388 } 2389 2390 /* 2391 * We prefer always to allocate from extent entries, both for clustered and 2392 * non-clustered allocation requests. So when attempting to add a new extent 2393 * entry, try to see if there's adjacent free space in bitmap entries, and if 2394 * there is, migrate that space from the bitmaps to the extent. 2395 * Like this we get better chances of satisfying space allocation requests 2396 * because we attempt to satisfy them based on a single cache entry, and never 2397 * on 2 or more entries - even if the entries represent a contiguous free space 2398 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2399 * ends). 2400 */ 2401 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2402 struct btrfs_free_space *info, 2403 bool update_stat) 2404 { 2405 /* 2406 * Only work with disconnected entries, as we can change their offset, 2407 * and must be extent entries. 
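	 * (e.g. a freshly allocated entry in __btrfs_add_free_space(), or an
	 * entry just erased from a cluster's rbtree; both are unlinked,
	 * non-bitmap entries.)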
	 */
	ASSERT(!info->bitmap);
	ASSERT(RB_EMPTY_NODE(&info->offset_index));

	if (ctl->total_bitmaps > 0) {
		bool stole_end;
		bool stole_front = false;

		stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
		if (ctl->total_bitmaps > 0)
			stole_front = steal_from_bitmap_to_front(ctl, info,
								 update_stat);

		if (stole_end || stole_front)
			try_merge_free_space(ctl, info, update_stat);
	}
}

int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
			   struct btrfs_free_space_ctl *ctl,
			   u64 offset, u64 bytes,
			   enum btrfs_trim_state trim_state)
{
	struct btrfs_block_group *block_group = ctl->private;
	struct btrfs_free_space *info;
	int ret = 0;
	u64 filter_bytes = bytes;

	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;
	info->trim_state = trim_state;
	RB_CLEAR_NODE(&info->offset_index);

	spin_lock(&ctl->tree_lock);

	if (try_merge_free_space(ctl, info, true))
		goto link;

	/*
	 * If there was no extent directly to the left or right of this new
	 * extent then we know we're going to have to allocate a new extent,
	 * so before we do that see if we need to drop this into a bitmap.
	 */
	ret = insert_into_bitmap(ctl, info);
	if (ret < 0) {
		goto out;
	} else if (ret) {
		ret = 0;
		goto out;
	}
link:
	/*
	 * Only steal free space from adjacent bitmaps if we're sure we're not
	 * going to add the new free space to existing bitmap entries - because
	 * that would mean unnecessary work that would be reverted. Therefore
	 * attempt to steal space from bitmaps if we're adding an extent entry.
	 */
	steal_from_bitmap(ctl, info, true);

	filter_bytes = max(filter_bytes, info->bytes);

	ret = link_free_space(ctl, info);
	if (ret)
		kmem_cache_free(btrfs_free_space_cachep, info);
out:
	btrfs_discard_update_discardable(block_group);
	spin_unlock(&ctl->tree_lock);

	if (ret) {
		btrfs_crit(fs_info, "unable to add free space: %d", ret);
		ASSERT(ret != -EEXIST);
	}

	if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
		btrfs_discard_check_filter(block_group, filter_bytes);
		btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
	}

	return ret;
}

int btrfs_add_free_space(struct btrfs_block_group *block_group,
			 u64 bytenr, u64 size)
{
	enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;

	if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
		trim_state = BTRFS_TRIM_STATE_TRIMMED;

	return __btrfs_add_free_space(block_group->fs_info,
				      block_group->free_space_ctl,
				      bytenr, size, trim_state);
}

/*
 * This is a subtle distinction because when adding free space back in general,
 * we want it to be added as untrimmed for async. But in the case where we add
 * it on loading of a block group, we want to consider it trimmed.
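 * e.g. with DISCARD_ASYNC enabled, space added while loading a block group
 * is marked trimmed here, so it is not queued up only to be discarded all
 * over again by the async discard worker.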
2510 */ 2511 int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, 2512 u64 bytenr, u64 size) 2513 { 2514 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2515 2516 if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || 2517 btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 2518 trim_state = BTRFS_TRIM_STATE_TRIMMED; 2519 2520 return __btrfs_add_free_space(block_group->fs_info, 2521 block_group->free_space_ctl, 2522 bytenr, size, trim_state); 2523 } 2524 2525 int btrfs_remove_free_space(struct btrfs_block_group *block_group, 2526 u64 offset, u64 bytes) 2527 { 2528 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2529 struct btrfs_free_space *info; 2530 int ret; 2531 bool re_search = false; 2532 2533 spin_lock(&ctl->tree_lock); 2534 2535 again: 2536 ret = 0; 2537 if (!bytes) 2538 goto out_lock; 2539 2540 info = tree_search_offset(ctl, offset, 0, 0); 2541 if (!info) { 2542 /* 2543 * oops didn't find an extent that matched the space we wanted 2544 * to remove, look for a bitmap instead 2545 */ 2546 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2547 1, 0); 2548 if (!info) { 2549 /* 2550 * If we found a partial bit of our free space in a 2551 * bitmap but then couldn't find the other part this may 2552 * be a problem, so WARN about it. 2553 */ 2554 WARN_ON(re_search); 2555 goto out_lock; 2556 } 2557 } 2558 2559 re_search = false; 2560 if (!info->bitmap) { 2561 unlink_free_space(ctl, info); 2562 if (offset == info->offset) { 2563 u64 to_free = min(bytes, info->bytes); 2564 2565 info->bytes -= to_free; 2566 info->offset += to_free; 2567 if (info->bytes) { 2568 ret = link_free_space(ctl, info); 2569 WARN_ON(ret); 2570 } else { 2571 kmem_cache_free(btrfs_free_space_cachep, info); 2572 } 2573 2574 offset += to_free; 2575 bytes -= to_free; 2576 goto again; 2577 } else { 2578 u64 old_end = info->bytes + info->offset; 2579 2580 info->bytes = offset - info->offset; 2581 ret = link_free_space(ctl, info); 2582 WARN_ON(ret); 2583 if (ret) 2584 goto out_lock; 2585 2586 /* Not enough bytes in this entry to satisfy us */ 2587 if (old_end < offset + bytes) { 2588 bytes -= old_end - offset; 2589 offset = old_end; 2590 goto again; 2591 } else if (old_end == offset + bytes) { 2592 /* all done */ 2593 goto out_lock; 2594 } 2595 spin_unlock(&ctl->tree_lock); 2596 2597 ret = __btrfs_add_free_space(block_group->fs_info, ctl, 2598 offset + bytes, 2599 old_end - (offset + bytes), 2600 info->trim_state); 2601 WARN_ON(ret); 2602 goto out; 2603 } 2604 } 2605 2606 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2607 if (ret == -EAGAIN) { 2608 re_search = true; 2609 goto again; 2610 } 2611 out_lock: 2612 btrfs_discard_update_discardable(block_group); 2613 spin_unlock(&ctl->tree_lock); 2614 out: 2615 return ret; 2616 } 2617 2618 void btrfs_dump_free_space(struct btrfs_block_group *block_group, 2619 u64 bytes) 2620 { 2621 struct btrfs_fs_info *fs_info = block_group->fs_info; 2622 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2623 struct btrfs_free_space *info; 2624 struct rb_node *n; 2625 int count = 0; 2626 2627 spin_lock(&ctl->tree_lock); 2628 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2629 info = rb_entry(n, struct btrfs_free_space, offset_index); 2630 if (info->bytes >= bytes && !block_group->ro) 2631 count++; 2632 btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", 2633 info->offset, info->bytes, 2634 (info->bitmap) ? 
"yes" : "no"); 2635 } 2636 spin_unlock(&ctl->tree_lock); 2637 btrfs_info(fs_info, "block group has cluster?: %s", 2638 list_empty(&block_group->cluster_list) ? "no" : "yes"); 2639 btrfs_info(fs_info, 2640 "%d blocks of free space at or bigger than bytes is", count); 2641 } 2642 2643 void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, 2644 struct btrfs_free_space_ctl *ctl) 2645 { 2646 struct btrfs_fs_info *fs_info = block_group->fs_info; 2647 2648 spin_lock_init(&ctl->tree_lock); 2649 ctl->unit = fs_info->sectorsize; 2650 ctl->start = block_group->start; 2651 ctl->private = block_group; 2652 ctl->op = &free_space_op; 2653 INIT_LIST_HEAD(&ctl->trimming_ranges); 2654 mutex_init(&ctl->cache_writeout_mutex); 2655 2656 /* 2657 * we only want to have 32k of ram per block group for keeping 2658 * track of free space, and if we pass 1/2 of that we want to 2659 * start converting things over to using bitmaps 2660 */ 2661 ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); 2662 } 2663 2664 /* 2665 * for a given cluster, put all of its extents back into the free 2666 * space cache. If the block group passed doesn't match the block group 2667 * pointed to by the cluster, someone else raced in and freed the 2668 * cluster already. In that case, we just return without changing anything 2669 */ 2670 static void __btrfs_return_cluster_to_free_space( 2671 struct btrfs_block_group *block_group, 2672 struct btrfs_free_cluster *cluster) 2673 { 2674 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2675 struct btrfs_free_space *entry; 2676 struct rb_node *node; 2677 2678 spin_lock(&cluster->lock); 2679 if (cluster->block_group != block_group) 2680 goto out; 2681 2682 cluster->block_group = NULL; 2683 cluster->window_start = 0; 2684 list_del_init(&cluster->block_group_list); 2685 2686 node = rb_first(&cluster->root); 2687 while (node) { 2688 bool bitmap; 2689 2690 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2691 node = rb_next(&entry->offset_index); 2692 rb_erase(&entry->offset_index, &cluster->root); 2693 RB_CLEAR_NODE(&entry->offset_index); 2694 2695 bitmap = (entry->bitmap != NULL); 2696 if (!bitmap) { 2697 /* Merging treats extents as if they were new */ 2698 if (!btrfs_free_space_trimmed(entry)) { 2699 ctl->discardable_extents[BTRFS_STAT_CURR]--; 2700 ctl->discardable_bytes[BTRFS_STAT_CURR] -= 2701 entry->bytes; 2702 } 2703 2704 try_merge_free_space(ctl, entry, false); 2705 steal_from_bitmap(ctl, entry, false); 2706 2707 /* As we insert directly, update these statistics */ 2708 if (!btrfs_free_space_trimmed(entry)) { 2709 ctl->discardable_extents[BTRFS_STAT_CURR]++; 2710 ctl->discardable_bytes[BTRFS_STAT_CURR] += 2711 entry->bytes; 2712 } 2713 } 2714 tree_insert_offset(&ctl->free_space_offset, 2715 entry->offset, &entry->offset_index, bitmap); 2716 } 2717 cluster->root = RB_ROOT; 2718 2719 out: 2720 spin_unlock(&cluster->lock); 2721 btrfs_put_block_group(block_group); 2722 } 2723 2724 static void __btrfs_remove_free_space_cache_locked( 2725 struct btrfs_free_space_ctl *ctl) 2726 { 2727 struct btrfs_free_space *info; 2728 struct rb_node *node; 2729 2730 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 2731 info = rb_entry(node, struct btrfs_free_space, offset_index); 2732 if (!info->bitmap) { 2733 unlink_free_space(ctl, info); 2734 kmem_cache_free(btrfs_free_space_cachep, info); 2735 } else { 2736 free_bitmap(ctl, info); 2737 } 2738 2739 cond_resched_lock(&ctl->tree_lock); 2740 } 2741 } 2742 2743 void 
__btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 2744 { 2745 spin_lock(&ctl->tree_lock); 2746 __btrfs_remove_free_space_cache_locked(ctl); 2747 if (ctl->private) 2748 btrfs_discard_update_discardable(ctl->private); 2749 spin_unlock(&ctl->tree_lock); 2750 } 2751 2752 void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) 2753 { 2754 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2755 struct btrfs_free_cluster *cluster; 2756 struct list_head *head; 2757 2758 spin_lock(&ctl->tree_lock); 2759 while ((head = block_group->cluster_list.next) != 2760 &block_group->cluster_list) { 2761 cluster = list_entry(head, struct btrfs_free_cluster, 2762 block_group_list); 2763 2764 WARN_ON(cluster->block_group != block_group); 2765 __btrfs_return_cluster_to_free_space(block_group, cluster); 2766 2767 cond_resched_lock(&ctl->tree_lock); 2768 } 2769 __btrfs_remove_free_space_cache_locked(ctl); 2770 btrfs_discard_update_discardable(block_group); 2771 spin_unlock(&ctl->tree_lock); 2772 2773 } 2774 2775 /** 2776 * btrfs_is_free_space_trimmed - see if everything is trimmed 2777 * @block_group: block_group of interest 2778 * 2779 * Walk @block_group's free space rb_tree to determine if everything is trimmed. 2780 */ 2781 bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) 2782 { 2783 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2784 struct btrfs_free_space *info; 2785 struct rb_node *node; 2786 bool ret = true; 2787 2788 spin_lock(&ctl->tree_lock); 2789 node = rb_first(&ctl->free_space_offset); 2790 2791 while (node) { 2792 info = rb_entry(node, struct btrfs_free_space, offset_index); 2793 2794 if (!btrfs_free_space_trimmed(info)) { 2795 ret = false; 2796 break; 2797 } 2798 2799 node = rb_next(node); 2800 } 2801 2802 spin_unlock(&ctl->tree_lock); 2803 return ret; 2804 } 2805 2806 u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, 2807 u64 offset, u64 bytes, u64 empty_size, 2808 u64 *max_extent_size) 2809 { 2810 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2811 struct btrfs_discard_ctl *discard_ctl = 2812 &block_group->fs_info->discard_ctl; 2813 struct btrfs_free_space *entry = NULL; 2814 u64 bytes_search = bytes + empty_size; 2815 u64 ret = 0; 2816 u64 align_gap = 0; 2817 u64 align_gap_len = 0; 2818 enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2819 2820 spin_lock(&ctl->tree_lock); 2821 entry = find_free_space(ctl, &offset, &bytes_search, 2822 block_group->full_stripe_len, max_extent_size); 2823 if (!entry) 2824 goto out; 2825 2826 ret = offset; 2827 if (entry->bitmap) { 2828 bitmap_clear_bits(ctl, entry, offset, bytes); 2829 2830 if (!btrfs_free_space_trimmed(entry)) 2831 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 2832 2833 if (!entry->bytes) 2834 free_bitmap(ctl, entry); 2835 } else { 2836 unlink_free_space(ctl, entry); 2837 align_gap_len = offset - entry->offset; 2838 align_gap = entry->offset; 2839 align_gap_trim_state = entry->trim_state; 2840 2841 if (!btrfs_free_space_trimmed(entry)) 2842 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 2843 2844 entry->offset = offset + bytes; 2845 WARN_ON(entry->bytes < bytes + align_gap_len); 2846 2847 entry->bytes -= bytes + align_gap_len; 2848 if (!entry->bytes) 2849 kmem_cache_free(btrfs_free_space_cachep, entry); 2850 else 2851 link_free_space(ctl, entry); 2852 } 2853 out: 2854 btrfs_discard_update_discardable(block_group); 2855 spin_unlock(&ctl->tree_lock); 2856 2857 if (align_gap_len) 2858 
__btrfs_add_free_space(block_group->fs_info, ctl, 2859 align_gap, align_gap_len, 2860 align_gap_trim_state); 2861 return ret; 2862 } 2863 2864 /* 2865 * given a cluster, put all of its extents back into the free space 2866 * cache. If a block group is passed, this function will only free 2867 * a cluster that belongs to the passed block group. 2868 * 2869 * Otherwise, it'll get a reference on the block group pointed to by the 2870 * cluster and remove the cluster from it. 2871 */ 2872 void btrfs_return_cluster_to_free_space( 2873 struct btrfs_block_group *block_group, 2874 struct btrfs_free_cluster *cluster) 2875 { 2876 struct btrfs_free_space_ctl *ctl; 2877 2878 /* first, get a safe pointer to the block group */ 2879 spin_lock(&cluster->lock); 2880 if (!block_group) { 2881 block_group = cluster->block_group; 2882 if (!block_group) { 2883 spin_unlock(&cluster->lock); 2884 return; 2885 } 2886 } else if (cluster->block_group != block_group) { 2887 /* someone else has already freed it don't redo their work */ 2888 spin_unlock(&cluster->lock); 2889 return; 2890 } 2891 btrfs_get_block_group(block_group); 2892 spin_unlock(&cluster->lock); 2893 2894 ctl = block_group->free_space_ctl; 2895 2896 /* now return any extents the cluster had on it */ 2897 spin_lock(&ctl->tree_lock); 2898 __btrfs_return_cluster_to_free_space(block_group, cluster); 2899 spin_unlock(&ctl->tree_lock); 2900 2901 btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); 2902 2903 /* finally drop our ref */ 2904 btrfs_put_block_group(block_group); 2905 } 2906 2907 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group, 2908 struct btrfs_free_cluster *cluster, 2909 struct btrfs_free_space *entry, 2910 u64 bytes, u64 min_start, 2911 u64 *max_extent_size) 2912 { 2913 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2914 int err; 2915 u64 search_start = cluster->window_start; 2916 u64 search_bytes = bytes; 2917 u64 ret = 0; 2918 2919 search_start = min_start; 2920 search_bytes = bytes; 2921 2922 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); 2923 if (err) { 2924 *max_extent_size = max(get_max_extent_size(entry), 2925 *max_extent_size); 2926 return 0; 2927 } 2928 2929 ret = search_start; 2930 __bitmap_clear_bits(ctl, entry, ret, bytes); 2931 2932 return ret; 2933 } 2934 2935 /* 2936 * given a cluster, try to allocate 'bytes' from it, returns 0 2937 * if it couldn't find anything suitably large, or a logical disk offset 2938 * if things worked out 2939 */ 2940 u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, 2941 struct btrfs_free_cluster *cluster, u64 bytes, 2942 u64 min_start, u64 *max_extent_size) 2943 { 2944 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2945 struct btrfs_discard_ctl *discard_ctl = 2946 &block_group->fs_info->discard_ctl; 2947 struct btrfs_free_space *entry = NULL; 2948 struct rb_node *node; 2949 u64 ret = 0; 2950 2951 spin_lock(&cluster->lock); 2952 if (bytes > cluster->max_size) 2953 goto out; 2954 2955 if (cluster->block_group != block_group) 2956 goto out; 2957 2958 node = rb_first(&cluster->root); 2959 if (!node) 2960 goto out; 2961 2962 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2963 while (1) { 2964 if (entry->bytes < bytes) 2965 *max_extent_size = max(get_max_extent_size(entry), 2966 *max_extent_size); 2967 2968 if (entry->bytes < bytes || 2969 (!entry->bitmap && entry->offset < min_start)) { 2970 node = rb_next(&entry->offset_index); 2971 if (!node) 2972 break; 2973 entry = 
rb_entry(node, struct btrfs_free_space, 2974 offset_index); 2975 continue; 2976 } 2977 2978 if (entry->bitmap) { 2979 ret = btrfs_alloc_from_bitmap(block_group, 2980 cluster, entry, bytes, 2981 cluster->window_start, 2982 max_extent_size); 2983 if (ret == 0) { 2984 node = rb_next(&entry->offset_index); 2985 if (!node) 2986 break; 2987 entry = rb_entry(node, struct btrfs_free_space, 2988 offset_index); 2989 continue; 2990 } 2991 cluster->window_start += bytes; 2992 } else { 2993 ret = entry->offset; 2994 2995 entry->offset += bytes; 2996 entry->bytes -= bytes; 2997 } 2998 2999 if (entry->bytes == 0) 3000 rb_erase(&entry->offset_index, &cluster->root); 3001 break; 3002 } 3003 out: 3004 spin_unlock(&cluster->lock); 3005 3006 if (!ret) 3007 return 0; 3008 3009 spin_lock(&ctl->tree_lock); 3010 3011 if (!btrfs_free_space_trimmed(entry)) 3012 atomic64_add(bytes, &discard_ctl->discard_bytes_saved); 3013 3014 ctl->free_space -= bytes; 3015 if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) 3016 ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; 3017 if (entry->bytes == 0) { 3018 ctl->free_extents--; 3019 if (entry->bitmap) { 3020 kmem_cache_free(btrfs_free_space_bitmap_cachep, 3021 entry->bitmap); 3022 ctl->total_bitmaps--; 3023 ctl->op->recalc_thresholds(ctl); 3024 } else if (!btrfs_free_space_trimmed(entry)) { 3025 ctl->discardable_extents[BTRFS_STAT_CURR]--; 3026 } 3027 kmem_cache_free(btrfs_free_space_cachep, entry); 3028 } 3029 3030 spin_unlock(&ctl->tree_lock); 3031 3032 return ret; 3033 } 3034 3035 static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, 3036 struct btrfs_free_space *entry, 3037 struct btrfs_free_cluster *cluster, 3038 u64 offset, u64 bytes, 3039 u64 cont1_bytes, u64 min_bytes) 3040 { 3041 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3042 unsigned long next_zero; 3043 unsigned long i; 3044 unsigned long want_bits; 3045 unsigned long min_bits; 3046 unsigned long found_bits; 3047 unsigned long max_bits = 0; 3048 unsigned long start = 0; 3049 unsigned long total_found = 0; 3050 int ret; 3051 3052 i = offset_to_bit(entry->offset, ctl->unit, 3053 max_t(u64, offset, entry->offset)); 3054 want_bits = bytes_to_bits(bytes, ctl->unit); 3055 min_bits = bytes_to_bits(min_bytes, ctl->unit); 3056 3057 /* 3058 * Don't bother looking for a cluster in this bitmap if it's heavily 3059 * fragmented. 
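	 * e.g. if an earlier scan cached a max_extent_size of 64K and
	 * cont1_bytes asks for 256K, there is no point in walking the bits
	 * again (the sizes are only illustrative).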
3060 */ 3061 if (entry->max_extent_size && 3062 entry->max_extent_size < cont1_bytes) 3063 return -ENOSPC; 3064 again: 3065 found_bits = 0; 3066 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 3067 next_zero = find_next_zero_bit(entry->bitmap, 3068 BITS_PER_BITMAP, i); 3069 if (next_zero - i >= min_bits) { 3070 found_bits = next_zero - i; 3071 if (found_bits > max_bits) 3072 max_bits = found_bits; 3073 break; 3074 } 3075 if (next_zero - i > max_bits) 3076 max_bits = next_zero - i; 3077 i = next_zero; 3078 } 3079 3080 if (!found_bits) { 3081 entry->max_extent_size = (u64)max_bits * ctl->unit; 3082 return -ENOSPC; 3083 } 3084 3085 if (!total_found) { 3086 start = i; 3087 cluster->max_size = 0; 3088 } 3089 3090 total_found += found_bits; 3091 3092 if (cluster->max_size < found_bits * ctl->unit) 3093 cluster->max_size = found_bits * ctl->unit; 3094 3095 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 3096 i = next_zero + 1; 3097 goto again; 3098 } 3099 3100 cluster->window_start = start * ctl->unit + entry->offset; 3101 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3102 ret = tree_insert_offset(&cluster->root, entry->offset, 3103 &entry->offset_index, 1); 3104 ASSERT(!ret); /* -EEXIST; Logic error */ 3105 3106 trace_btrfs_setup_cluster(block_group, cluster, 3107 total_found * ctl->unit, 1); 3108 return 0; 3109 } 3110 3111 /* 3112 * This searches the block group for just extents to fill the cluster with. 3113 * Try to find a cluster with at least bytes total bytes, at least one 3114 * extent of cont1_bytes, and other clusters of at least min_bytes. 3115 */ 3116 static noinline int 3117 setup_cluster_no_bitmap(struct btrfs_block_group *block_group, 3118 struct btrfs_free_cluster *cluster, 3119 struct list_head *bitmaps, u64 offset, u64 bytes, 3120 u64 cont1_bytes, u64 min_bytes) 3121 { 3122 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3123 struct btrfs_free_space *first = NULL; 3124 struct btrfs_free_space *entry = NULL; 3125 struct btrfs_free_space *last; 3126 struct rb_node *node; 3127 u64 window_free; 3128 u64 max_extent; 3129 u64 total_size = 0; 3130 3131 entry = tree_search_offset(ctl, offset, 0, 1); 3132 if (!entry) 3133 return -ENOSPC; 3134 3135 /* 3136 * We don't want bitmaps, so just move along until we find a normal 3137 * extent entry. 
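	 * Bitmaps we walk past are remembered on the @bitmaps list so that
	 * setup_cluster_bitmap() can fall back to them if no suitable run of
	 * extent entries is found.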
3138 */ 3139 while (entry->bitmap || entry->bytes < min_bytes) { 3140 if (entry->bitmap && list_empty(&entry->list)) 3141 list_add_tail(&entry->list, bitmaps); 3142 node = rb_next(&entry->offset_index); 3143 if (!node) 3144 return -ENOSPC; 3145 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3146 } 3147 3148 window_free = entry->bytes; 3149 max_extent = entry->bytes; 3150 first = entry; 3151 last = entry; 3152 3153 for (node = rb_next(&entry->offset_index); node; 3154 node = rb_next(&entry->offset_index)) { 3155 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3156 3157 if (entry->bitmap) { 3158 if (list_empty(&entry->list)) 3159 list_add_tail(&entry->list, bitmaps); 3160 continue; 3161 } 3162 3163 if (entry->bytes < min_bytes) 3164 continue; 3165 3166 last = entry; 3167 window_free += entry->bytes; 3168 if (entry->bytes > max_extent) 3169 max_extent = entry->bytes; 3170 } 3171 3172 if (window_free < bytes || max_extent < cont1_bytes) 3173 return -ENOSPC; 3174 3175 cluster->window_start = first->offset; 3176 3177 node = &first->offset_index; 3178 3179 /* 3180 * now we've found our entries, pull them out of the free space 3181 * cache and put them into the cluster rbtree 3182 */ 3183 do { 3184 int ret; 3185 3186 entry = rb_entry(node, struct btrfs_free_space, offset_index); 3187 node = rb_next(&entry->offset_index); 3188 if (entry->bitmap || entry->bytes < min_bytes) 3189 continue; 3190 3191 rb_erase(&entry->offset_index, &ctl->free_space_offset); 3192 ret = tree_insert_offset(&cluster->root, entry->offset, 3193 &entry->offset_index, 0); 3194 total_size += entry->bytes; 3195 ASSERT(!ret); /* -EEXIST; Logic error */ 3196 } while (node && entry != last); 3197 3198 cluster->max_size = max_extent; 3199 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 3200 return 0; 3201 } 3202 3203 /* 3204 * This specifically looks for bitmaps that may work in the cluster, we assume 3205 * that we have already failed to find extents that will work. 3206 */ 3207 static noinline int 3208 setup_cluster_bitmap(struct btrfs_block_group *block_group, 3209 struct btrfs_free_cluster *cluster, 3210 struct list_head *bitmaps, u64 offset, u64 bytes, 3211 u64 cont1_bytes, u64 min_bytes) 3212 { 3213 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3214 struct btrfs_free_space *entry = NULL; 3215 int ret = -ENOSPC; 3216 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 3217 3218 if (ctl->total_bitmaps == 0) 3219 return -ENOSPC; 3220 3221 /* 3222 * The bitmap that covers offset won't be in the list unless offset 3223 * is just its start offset. 3224 */ 3225 if (!list_empty(bitmaps)) 3226 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 3227 3228 if (!entry || entry->offset != bitmap_offset) { 3229 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 3230 if (entry && list_empty(&entry->list)) 3231 list_add(&entry->list, bitmaps); 3232 } 3233 3234 list_for_each_entry(entry, bitmaps, list) { 3235 if (entry->bytes < bytes) 3236 continue; 3237 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 3238 bytes, cont1_bytes, min_bytes); 3239 if (!ret) 3240 return 0; 3241 } 3242 3243 /* 3244 * The bitmaps list has all the bitmaps that record free space 3245 * starting after offset, so no more search is required. 3246 */ 3247 return -ENOSPC; 3248 } 3249 3250 /* 3251 * here we try to find a cluster of blocks in a block group. The goal 3252 * is to find at least bytes+empty_size. 3253 * We might not find them all in one contiguous area. 
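 *
 * For example (an illustrative sketch assuming a 4K sectorsize, a data
 * block group and no SSD_SPREAD): a request with bytes=1M and
 * empty_size=256K ends up with cont1_bytes = max(1M, 1.25M >> 2) = 1M and
 * min_bytes = 4K, i.e. at least one contiguous 1M extent plus smaller
 * extents of at least 4K until the 1.25M window is filled.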
3254 * 3255 * returns zero and sets up cluster if things worked out, otherwise 3256 * it returns -enospc 3257 */ 3258 int btrfs_find_space_cluster(struct btrfs_block_group *block_group, 3259 struct btrfs_free_cluster *cluster, 3260 u64 offset, u64 bytes, u64 empty_size) 3261 { 3262 struct btrfs_fs_info *fs_info = block_group->fs_info; 3263 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3264 struct btrfs_free_space *entry, *tmp; 3265 LIST_HEAD(bitmaps); 3266 u64 min_bytes; 3267 u64 cont1_bytes; 3268 int ret; 3269 3270 /* 3271 * Choose the minimum extent size we'll require for this 3272 * cluster. For SSD_SPREAD, don't allow any fragmentation. 3273 * For metadata, allow allocates with smaller extents. For 3274 * data, keep it dense. 3275 */ 3276 if (btrfs_test_opt(fs_info, SSD_SPREAD)) { 3277 cont1_bytes = min_bytes = bytes + empty_size; 3278 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 3279 cont1_bytes = bytes; 3280 min_bytes = fs_info->sectorsize; 3281 } else { 3282 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 3283 min_bytes = fs_info->sectorsize; 3284 } 3285 3286 spin_lock(&ctl->tree_lock); 3287 3288 /* 3289 * If we know we don't have enough space to make a cluster don't even 3290 * bother doing all the work to try and find one. 3291 */ 3292 if (ctl->free_space < bytes) { 3293 spin_unlock(&ctl->tree_lock); 3294 return -ENOSPC; 3295 } 3296 3297 spin_lock(&cluster->lock); 3298 3299 /* someone already found a cluster, hooray */ 3300 if (cluster->block_group) { 3301 ret = 0; 3302 goto out; 3303 } 3304 3305 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 3306 min_bytes); 3307 3308 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 3309 bytes + empty_size, 3310 cont1_bytes, min_bytes); 3311 if (ret) 3312 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 3313 offset, bytes + empty_size, 3314 cont1_bytes, min_bytes); 3315 3316 /* Clear our temporary list */ 3317 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 3318 list_del_init(&entry->list); 3319 3320 if (!ret) { 3321 btrfs_get_block_group(block_group); 3322 list_add_tail(&cluster->block_group_list, 3323 &block_group->cluster_list); 3324 cluster->block_group = block_group; 3325 } else { 3326 trace_btrfs_failed_cluster_setup(block_group); 3327 } 3328 out: 3329 spin_unlock(&cluster->lock); 3330 spin_unlock(&ctl->tree_lock); 3331 3332 return ret; 3333 } 3334 3335 /* 3336 * simple code to zero out a cluster 3337 */ 3338 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 3339 { 3340 spin_lock_init(&cluster->lock); 3341 spin_lock_init(&cluster->refill_lock); 3342 cluster->root = RB_ROOT; 3343 cluster->max_size = 0; 3344 cluster->fragmented = false; 3345 INIT_LIST_HEAD(&cluster->block_group_list); 3346 cluster->block_group = NULL; 3347 } 3348 3349 static int do_trimming(struct btrfs_block_group *block_group, 3350 u64 *total_trimmed, u64 start, u64 bytes, 3351 u64 reserved_start, u64 reserved_bytes, 3352 enum btrfs_trim_state reserved_trim_state, 3353 struct btrfs_trim_range *trim_entry) 3354 { 3355 struct btrfs_space_info *space_info = block_group->space_info; 3356 struct btrfs_fs_info *fs_info = block_group->fs_info; 3357 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3358 int ret; 3359 int update = 0; 3360 const u64 end = start + bytes; 3361 const u64 reserved_end = reserved_start + reserved_bytes; 3362 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 3363 u64 trimmed = 0; 3364 3365 spin_lock(&space_info->lock); 
3366 spin_lock(&block_group->lock); 3367 if (!block_group->ro) { 3368 block_group->reserved += reserved_bytes; 3369 space_info->bytes_reserved += reserved_bytes; 3370 update = 1; 3371 } 3372 spin_unlock(&block_group->lock); 3373 spin_unlock(&space_info->lock); 3374 3375 ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3376 if (!ret) { 3377 *total_trimmed += trimmed; 3378 trim_state = BTRFS_TRIM_STATE_TRIMMED; 3379 } 3380 3381 mutex_lock(&ctl->cache_writeout_mutex); 3382 if (reserved_start < start) 3383 __btrfs_add_free_space(fs_info, ctl, reserved_start, 3384 start - reserved_start, 3385 reserved_trim_state); 3386 if (start + bytes < reserved_start + reserved_bytes) 3387 __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end, 3388 reserved_trim_state); 3389 __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state); 3390 list_del(&trim_entry->list); 3391 mutex_unlock(&ctl->cache_writeout_mutex); 3392 3393 if (update) { 3394 spin_lock(&space_info->lock); 3395 spin_lock(&block_group->lock); 3396 if (block_group->ro) 3397 space_info->bytes_readonly += reserved_bytes; 3398 block_group->reserved -= reserved_bytes; 3399 space_info->bytes_reserved -= reserved_bytes; 3400 spin_unlock(&block_group->lock); 3401 spin_unlock(&space_info->lock); 3402 } 3403 3404 return ret; 3405 } 3406 3407 /* 3408 * If @async is set, then we will trim 1 region and return. 3409 */ 3410 static int trim_no_bitmap(struct btrfs_block_group *block_group, 3411 u64 *total_trimmed, u64 start, u64 end, u64 minlen, 3412 bool async) 3413 { 3414 struct btrfs_discard_ctl *discard_ctl = 3415 &block_group->fs_info->discard_ctl; 3416 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3417 struct btrfs_free_space *entry; 3418 struct rb_node *node; 3419 int ret = 0; 3420 u64 extent_start; 3421 u64 extent_bytes; 3422 enum btrfs_trim_state extent_trim_state; 3423 u64 bytes; 3424 const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3425 3426 while (start < end) { 3427 struct btrfs_trim_range trim_entry; 3428 3429 mutex_lock(&ctl->cache_writeout_mutex); 3430 spin_lock(&ctl->tree_lock); 3431 3432 if (ctl->free_space < minlen) 3433 goto out_unlock; 3434 3435 entry = tree_search_offset(ctl, start, 0, 1); 3436 if (!entry) 3437 goto out_unlock; 3438 3439 /* Skip bitmaps and if async, already trimmed entries */ 3440 while (entry->bitmap || 3441 (async && btrfs_free_space_trimmed(entry))) { 3442 node = rb_next(&entry->offset_index); 3443 if (!node) 3444 goto out_unlock; 3445 entry = rb_entry(node, struct btrfs_free_space, 3446 offset_index); 3447 } 3448 3449 if (entry->offset >= end) 3450 goto out_unlock; 3451 3452 extent_start = entry->offset; 3453 extent_bytes = entry->bytes; 3454 extent_trim_state = entry->trim_state; 3455 if (async) { 3456 start = entry->offset; 3457 bytes = entry->bytes; 3458 if (bytes < minlen) { 3459 spin_unlock(&ctl->tree_lock); 3460 mutex_unlock(&ctl->cache_writeout_mutex); 3461 goto next; 3462 } 3463 unlink_free_space(ctl, entry); 3464 /* 3465 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. 3466 * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim 3467 * X when we come back around. So trim it now. 
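			 *
			 * In other words: an entry of max_discard_size plus a
			 * tail smaller than BTRFS_ASYNC_DISCARD_MIN_FILTER is
			 * discarded as one oversized request, while a tail of
			 * at least that size is re-linked below and picked up
			 * by a later pass.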
			 */
			if (max_discard_size &&
			    bytes >= (max_discard_size +
				      BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
				bytes = max_discard_size;
				extent_bytes = max_discard_size;
				entry->offset += max_discard_size;
				entry->bytes -= max_discard_size;
				link_free_space(ctl, entry);
			} else {
				kmem_cache_free(btrfs_free_space_cachep, entry);
			}
		} else {
			start = max(start, extent_start);
			bytes = min(extent_start + extent_bytes, end) - start;
			if (bytes < minlen) {
				spin_unlock(&ctl->tree_lock);
				mutex_unlock(&ctl->cache_writeout_mutex);
				goto next;
			}

			unlink_free_space(ctl, entry);
			kmem_cache_free(btrfs_free_space_cachep, entry);
		}

		spin_unlock(&ctl->tree_lock);
		trim_entry.start = extent_start;
		trim_entry.bytes = extent_bytes;
		list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
		mutex_unlock(&ctl->cache_writeout_mutex);

		ret = do_trimming(block_group, total_trimmed, start, bytes,
				  extent_start, extent_bytes, extent_trim_state,
				  &trim_entry);
		if (ret) {
			block_group->discard_cursor = start + bytes;
			break;
		}
next:
		start += bytes;
		block_group->discard_cursor = start;
		if (async && *total_trimmed)
			break;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;

out_unlock:
	block_group->discard_cursor = btrfs_block_group_end(block_group);
	spin_unlock(&ctl->tree_lock);
	mutex_unlock(&ctl->cache_writeout_mutex);

	return ret;
}

/*
 * If we break out of trimming a bitmap prematurely, we should reset the
 * trimming bit.  In a rather contrived case, it's possible to race here so
 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
 *
 * start = start of bitmap
 * end = near end of bitmap
 *
 * Thread 1:			Thread 2:
 * trim_bitmaps(start)
 *				trim_bitmaps(end)
 *				end_trimming_bitmap()
 * reset_trimming_bitmap()
 */
static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
{
	struct btrfs_free_space *entry;

	spin_lock(&ctl->tree_lock);
	entry = tree_search_offset(ctl, offset, 1, 0);
	if (entry) {
		if (btrfs_free_space_trimmed(entry)) {
			ctl->discardable_extents[BTRFS_STAT_CURR] +=
				entry->bitmap_extents;
			ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
		}
		entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
	}

	spin_unlock(&ctl->tree_lock);
}

static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
				struct btrfs_free_space *entry)
{
	if (btrfs_free_space_trimming_bitmap(entry)) {
		entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
		ctl->discardable_extents[BTRFS_STAT_CURR] -=
			entry->bitmap_extents;
		ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
	}
}

/*
 * If @async is set, then we will trim 1 region and return.
 */
static int trim_bitmaps(struct btrfs_block_group *block_group,
			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			u64 maxlen, bool async)
{
	struct btrfs_discard_ctl *discard_ctl =
					&block_group->fs_info->discard_ctl;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	int ret = 0;
	int ret2;
	u64 bytes;
	u64 offset = offset_to_bitmap(ctl, start);
	const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);

	while (offset < end) {
		bool next_bitmap = false;
		struct btrfs_trim_range trim_entry;

		mutex_lock(&ctl->cache_writeout_mutex);
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen) {
			block_group->discard_cursor =
				btrfs_block_group_end(block_group);
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			break;
		}

		entry = tree_search_offset(ctl, offset, 1, 0);
		/*
		 * Bitmaps are marked trimmed lossily now to prevent constant
		 * discarding of the same bitmap (the reason why we are bound
		 * by the filters).  So, retrim the block group bitmaps when we
		 * are preparing to punt to the unused_bgs list.  This uses
		 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
		 * which is the only discard index which sets minlen to 0.
		 */
		if (!entry || (async && minlen && start == offset &&
			       btrfs_free_space_trimmed(entry))) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * Async discard bitmap trimming begins by setting the start to
		 * key.objectid, and offset_to_bitmap() aligns to the start of
		 * the bitmap.  This lets us know we are fully scanning the
		 * bitmap rather than only some portion of it.
		 */
		if (start == offset)
			entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;

		bytes = minlen;
		ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
		if (ret2 || start >= end) {
			/*
			 * We lossily consider a bitmap trimmed if we only skip
			 * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
			 */
			if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
				end_trimming_bitmap(ctl, entry);
			else
				entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			next_bitmap = true;
			goto next;
		}

		/*
		 * We already trimmed a region, but are using the locking above
		 * to reset the trim_state.
		 */
		if (async && *total_trimmed) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto out;
		}

		bytes = min(bytes, end - start);
		if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
			spin_unlock(&ctl->tree_lock);
			mutex_unlock(&ctl->cache_writeout_mutex);
			goto next;
		}

		/*
		 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
		 * If X < @minlen, we won't trim X when we come back around.
		 * So trim it now.  We differ here from trimming extents as we
		 * don't keep individual state per bit.
3670 */ 3671 if (async && 3672 max_discard_size && 3673 bytes > (max_discard_size + minlen)) 3674 bytes = max_discard_size; 3675 3676 bitmap_clear_bits(ctl, entry, start, bytes); 3677 if (entry->bytes == 0) 3678 free_bitmap(ctl, entry); 3679 3680 spin_unlock(&ctl->tree_lock); 3681 trim_entry.start = start; 3682 trim_entry.bytes = bytes; 3683 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3684 mutex_unlock(&ctl->cache_writeout_mutex); 3685 3686 ret = do_trimming(block_group, total_trimmed, start, bytes, 3687 start, bytes, 0, &trim_entry); 3688 if (ret) { 3689 reset_trimming_bitmap(ctl, offset); 3690 block_group->discard_cursor = 3691 btrfs_block_group_end(block_group); 3692 break; 3693 } 3694 next: 3695 if (next_bitmap) { 3696 offset += BITS_PER_BITMAP * ctl->unit; 3697 start = offset; 3698 } else { 3699 start += bytes; 3700 } 3701 block_group->discard_cursor = start; 3702 3703 if (fatal_signal_pending(current)) { 3704 if (start != offset) 3705 reset_trimming_bitmap(ctl, offset); 3706 ret = -ERESTARTSYS; 3707 break; 3708 } 3709 3710 cond_resched(); 3711 } 3712 3713 if (offset >= end) 3714 block_group->discard_cursor = end; 3715 3716 out: 3717 return ret; 3718 } 3719 3720 int btrfs_trim_block_group(struct btrfs_block_group *block_group, 3721 u64 *trimmed, u64 start, u64 end, u64 minlen) 3722 { 3723 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3724 int ret; 3725 u64 rem = 0; 3726 3727 *trimmed = 0; 3728 3729 spin_lock(&block_group->lock); 3730 if (block_group->removed) { 3731 spin_unlock(&block_group->lock); 3732 return 0; 3733 } 3734 btrfs_freeze_block_group(block_group); 3735 spin_unlock(&block_group->lock); 3736 3737 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); 3738 if (ret) 3739 goto out; 3740 3741 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); 3742 div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); 3743 /* If we ended in the middle of a bitmap, reset the trimming flag */ 3744 if (rem) 3745 reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); 3746 out: 3747 btrfs_unfreeze_block_group(block_group); 3748 return ret; 3749 } 3750 3751 int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, 3752 u64 *trimmed, u64 start, u64 end, u64 minlen, 3753 bool async) 3754 { 3755 int ret; 3756 3757 *trimmed = 0; 3758 3759 spin_lock(&block_group->lock); 3760 if (block_group->removed) { 3761 spin_unlock(&block_group->lock); 3762 return 0; 3763 } 3764 btrfs_freeze_block_group(block_group); 3765 spin_unlock(&block_group->lock); 3766 3767 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); 3768 btrfs_unfreeze_block_group(block_group); 3769 3770 return ret; 3771 } 3772 3773 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, 3774 u64 *trimmed, u64 start, u64 end, u64 minlen, 3775 u64 maxlen, bool async) 3776 { 3777 int ret; 3778 3779 *trimmed = 0; 3780 3781 spin_lock(&block_group->lock); 3782 if (block_group->removed) { 3783 spin_unlock(&block_group->lock); 3784 return 0; 3785 } 3786 btrfs_freeze_block_group(block_group); 3787 spin_unlock(&block_group->lock); 3788 3789 ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, 3790 async); 3791 3792 btrfs_unfreeze_block_group(block_group); 3793 3794 return ret; 3795 } 3796 3797 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3798 /* 3799 * Use this if you need to make a bitmap or extent entry specifically, it 3800 * doesn't do any of the merging that add_free_space does, this acts a lot like 3801 * how the free 
space cache loading stuff works, so you can get really weird 3802 * configurations. 3803 */ 3804 int test_add_free_space_entry(struct btrfs_block_group *cache, 3805 u64 offset, u64 bytes, bool bitmap) 3806 { 3807 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 3808 struct btrfs_free_space *info = NULL, *bitmap_info; 3809 void *map = NULL; 3810 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; 3811 u64 bytes_added; 3812 int ret; 3813 3814 again: 3815 if (!info) { 3816 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 3817 if (!info) 3818 return -ENOMEM; 3819 } 3820 3821 if (!bitmap) { 3822 spin_lock(&ctl->tree_lock); 3823 info->offset = offset; 3824 info->bytes = bytes; 3825 info->max_extent_size = 0; 3826 ret = link_free_space(ctl, info); 3827 spin_unlock(&ctl->tree_lock); 3828 if (ret) 3829 kmem_cache_free(btrfs_free_space_cachep, info); 3830 return ret; 3831 } 3832 3833 if (!map) { 3834 map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); 3835 if (!map) { 3836 kmem_cache_free(btrfs_free_space_cachep, info); 3837 return -ENOMEM; 3838 } 3839 } 3840 3841 spin_lock(&ctl->tree_lock); 3842 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 3843 1, 0); 3844 if (!bitmap_info) { 3845 info->bitmap = map; 3846 map = NULL; 3847 add_new_bitmap(ctl, info, offset); 3848 bitmap_info = info; 3849 info = NULL; 3850 } 3851 3852 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, 3853 trim_state); 3854 3855 bytes -= bytes_added; 3856 offset += bytes_added; 3857 spin_unlock(&ctl->tree_lock); 3858 3859 if (bytes) 3860 goto again; 3861 3862 if (info) 3863 kmem_cache_free(btrfs_free_space_cachep, info); 3864 if (map) 3865 kmem_cache_free(btrfs_free_space_bitmap_cachep, map); 3866 return 0; 3867 } 3868 3869 /* 3870 * Checks to see if the given range is in the free space cache. This is really 3871 * just used to check the absence of space, so if there is free space in the 3872 * range at all we will return 1. 
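 * e.g. test_check_exists(cache, SZ_1M, SZ_4M) is expected to return 1 as
 * soon as any byte in [1M, 5M) is still tracked as free, whether by an
 * extent entry or by a bitmap.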
3873 */ 3874 int test_check_exists(struct btrfs_block_group *cache, 3875 u64 offset, u64 bytes) 3876 { 3877 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 3878 struct btrfs_free_space *info; 3879 int ret = 0; 3880 3881 spin_lock(&ctl->tree_lock); 3882 info = tree_search_offset(ctl, offset, 0, 0); 3883 if (!info) { 3884 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 3885 1, 0); 3886 if (!info) 3887 goto out; 3888 } 3889 3890 have_info: 3891 if (info->bitmap) { 3892 u64 bit_off, bit_bytes; 3893 struct rb_node *n; 3894 struct btrfs_free_space *tmp; 3895 3896 bit_off = offset; 3897 bit_bytes = ctl->unit; 3898 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); 3899 if (!ret) { 3900 if (bit_off == offset) { 3901 ret = 1; 3902 goto out; 3903 } else if (bit_off > offset && 3904 offset + bytes > bit_off) { 3905 ret = 1; 3906 goto out; 3907 } 3908 } 3909 3910 n = rb_prev(&info->offset_index); 3911 while (n) { 3912 tmp = rb_entry(n, struct btrfs_free_space, 3913 offset_index); 3914 if (tmp->offset + tmp->bytes < offset) 3915 break; 3916 if (offset + bytes < tmp->offset) { 3917 n = rb_prev(&tmp->offset_index); 3918 continue; 3919 } 3920 info = tmp; 3921 goto have_info; 3922 } 3923 3924 n = rb_next(&info->offset_index); 3925 while (n) { 3926 tmp = rb_entry(n, struct btrfs_free_space, 3927 offset_index); 3928 if (offset + bytes < tmp->offset) 3929 break; 3930 if (tmp->offset + tmp->bytes < offset) { 3931 n = rb_next(&tmp->offset_index); 3932 continue; 3933 } 3934 info = tmp; 3935 goto have_info; 3936 } 3937 3938 ret = 0; 3939 goto out; 3940 } 3941 3942 if (info->offset == offset) { 3943 ret = 1; 3944 goto out; 3945 } 3946 3947 if (offset > info->offset && offset < info->offset + info->bytes) 3948 ret = 1; 3949 out: 3950 spin_unlock(&ctl->tree_lock); 3951 return ret; 3952 } 3953 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 3954