1 /* 2 * Copyright (C) 2008 Red Hat. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/pagemap.h> 20 #include <linux/sched.h> 21 #include <linux/slab.h> 22 #include <linux/math64.h> 23 #include <linux/ratelimit.h> 24 #include "ctree.h" 25 #include "free-space-cache.h" 26 #include "transaction.h" 27 #include "disk-io.h" 28 #include "extent_io.h" 29 #include "inode-map.h" 30 #include "volumes.h" 31 32 #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 33 #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 34 35 struct btrfs_trim_range { 36 u64 start; 37 u64 bytes; 38 struct list_head list; 39 }; 40 41 static int link_free_space(struct btrfs_free_space_ctl *ctl, 42 struct btrfs_free_space *info); 43 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 44 struct btrfs_free_space *info); 45 46 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 47 struct btrfs_path *path, 48 u64 offset) 49 { 50 struct btrfs_key key; 51 struct btrfs_key location; 52 struct btrfs_disk_key disk_key; 53 struct btrfs_free_space_header *header; 54 struct extent_buffer *leaf; 55 struct inode *inode = NULL; 56 int ret; 57 58 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 59 key.offset = offset; 60 key.type = 0; 61 62 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 63 if (ret < 0) 64 return ERR_PTR(ret); 65 if (ret > 0) { 66 btrfs_release_path(path); 67 return ERR_PTR(-ENOENT); 68 } 69 70 leaf = path->nodes[0]; 71 header = btrfs_item_ptr(leaf, path->slots[0], 72 struct btrfs_free_space_header); 73 btrfs_free_space_key(leaf, header, &disk_key); 74 btrfs_disk_key_to_cpu(&location, &disk_key); 75 btrfs_release_path(path); 76 77 inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); 78 if (!inode) 79 return ERR_PTR(-ENOENT); 80 if (IS_ERR(inode)) 81 return inode; 82 if (is_bad_inode(inode)) { 83 iput(inode); 84 return ERR_PTR(-ENOENT); 85 } 86 87 mapping_set_gfp_mask(inode->i_mapping, 88 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 89 90 return inode; 91 } 92 93 struct inode *lookup_free_space_inode(struct btrfs_root *root, 94 struct btrfs_block_group_cache 95 *block_group, struct btrfs_path *path) 96 { 97 struct inode *inode = NULL; 98 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 99 100 spin_lock(&block_group->lock); 101 if (block_group->inode) 102 inode = igrab(block_group->inode); 103 spin_unlock(&block_group->lock); 104 if (inode) 105 return inode; 106 107 inode = __lookup_free_space_inode(root, path, 108 block_group->key.objectid); 109 if (IS_ERR(inode)) 110 return inode; 111 112 spin_lock(&block_group->lock); 113 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 114 btrfs_info(root->fs_info, 115 "Old style space inode found, converting."); 116 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 117 BTRFS_INODE_NODATACOW; 118 block_group->disk_cache_state = BTRFS_DC_CLEAR; 119 } 120 121 if (!block_group->iref) { 122 
block_group->inode = igrab(inode); 123 block_group->iref = 1; 124 } 125 spin_unlock(&block_group->lock); 126 127 return inode; 128 } 129 130 static int __create_free_space_inode(struct btrfs_root *root, 131 struct btrfs_trans_handle *trans, 132 struct btrfs_path *path, 133 u64 ino, u64 offset) 134 { 135 struct btrfs_key key; 136 struct btrfs_disk_key disk_key; 137 struct btrfs_free_space_header *header; 138 struct btrfs_inode_item *inode_item; 139 struct extent_buffer *leaf; 140 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; 141 int ret; 142 143 ret = btrfs_insert_empty_inode(trans, root, path, ino); 144 if (ret) 145 return ret; 146 147 /* We inline crc's for the free disk space cache */ 148 if (ino != BTRFS_FREE_INO_OBJECTID) 149 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 150 151 leaf = path->nodes[0]; 152 inode_item = btrfs_item_ptr(leaf, path->slots[0], 153 struct btrfs_inode_item); 154 btrfs_item_key(leaf, &disk_key, path->slots[0]); 155 memset_extent_buffer(leaf, 0, (unsigned long)inode_item, 156 sizeof(*inode_item)); 157 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 158 btrfs_set_inode_size(leaf, inode_item, 0); 159 btrfs_set_inode_nbytes(leaf, inode_item, 0); 160 btrfs_set_inode_uid(leaf, inode_item, 0); 161 btrfs_set_inode_gid(leaf, inode_item, 0); 162 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 163 btrfs_set_inode_flags(leaf, inode_item, flags); 164 btrfs_set_inode_nlink(leaf, inode_item, 1); 165 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 166 btrfs_set_inode_block_group(leaf, inode_item, offset); 167 btrfs_mark_buffer_dirty(leaf); 168 btrfs_release_path(path); 169 170 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 171 key.offset = offset; 172 key.type = 0; 173 174 ret = btrfs_insert_empty_item(trans, root, path, &key, 175 sizeof(struct btrfs_free_space_header)); 176 if (ret < 0) { 177 btrfs_release_path(path); 178 return ret; 179 } 180 leaf = path->nodes[0]; 181 header = btrfs_item_ptr(leaf, path->slots[0], 182 struct btrfs_free_space_header); 183 memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); 184 btrfs_set_free_space_key(leaf, header, &disk_key); 185 btrfs_mark_buffer_dirty(leaf); 186 btrfs_release_path(path); 187 188 return 0; 189 } 190 191 int create_free_space_inode(struct btrfs_root *root, 192 struct btrfs_trans_handle *trans, 193 struct btrfs_block_group_cache *block_group, 194 struct btrfs_path *path) 195 { 196 int ret; 197 u64 ino; 198 199 ret = btrfs_find_free_objectid(root, &ino); 200 if (ret < 0) 201 return ret; 202 203 return __create_free_space_inode(root, trans, path, ino, 204 block_group->key.objectid); 205 } 206 207 int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, 208 struct btrfs_block_rsv *rsv) 209 { 210 u64 needed_bytes; 211 int ret; 212 213 /* 1 for slack space, 1 for updating the inode */ 214 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + 215 btrfs_calc_trans_metadata_size(root, 1); 216 217 spin_lock(&rsv->lock); 218 if (rsv->reserved < needed_bytes) 219 ret = -ENOSPC; 220 else 221 ret = 0; 222 spin_unlock(&rsv->lock); 223 return ret; 224 } 225 226 int btrfs_truncate_free_space_cache(struct btrfs_root *root, 227 struct btrfs_trans_handle *trans, 228 struct inode *inode) 229 { 230 int ret = 0; 231 232 btrfs_i_size_write(inode, 0); 233 truncate_pagecache(inode, 0); 234 235 /* 236 * We don't need an orphan item because truncating the free space cache 237 * will never be split across transactions. 
238 */ 239 ret = btrfs_truncate_inode_items(trans, root, inode, 240 0, BTRFS_EXTENT_DATA_KEY); 241 if (ret) { 242 btrfs_abort_transaction(trans, root, ret); 243 return ret; 244 } 245 246 ret = btrfs_update_inode(trans, root, inode); 247 if (ret) 248 btrfs_abort_transaction(trans, root, ret); 249 250 return ret; 251 } 252 253 static int readahead_cache(struct inode *inode) 254 { 255 struct file_ra_state *ra; 256 unsigned long last_index; 257 258 ra = kzalloc(sizeof(*ra), GFP_NOFS); 259 if (!ra) 260 return -ENOMEM; 261 262 file_ra_state_init(ra, inode->i_mapping); 263 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 264 265 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); 266 267 kfree(ra); 268 269 return 0; 270 } 271 272 struct io_ctl { 273 void *cur, *orig; 274 struct page *page; 275 struct page **pages; 276 struct btrfs_root *root; 277 unsigned long size; 278 int index; 279 int num_pages; 280 unsigned check_crcs:1; 281 }; 282 283 static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, 284 struct btrfs_root *root, int write) 285 { 286 int num_pages; 287 int check_crcs = 0; 288 289 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); 290 291 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) 292 check_crcs = 1; 293 294 /* Make sure we can fit our crcs into the first page */ 295 if (write && check_crcs && 296 (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) 297 return -ENOSPC; 298 299 memset(io_ctl, 0, sizeof(struct io_ctl)); 300 301 io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); 302 if (!io_ctl->pages) 303 return -ENOMEM; 304 305 io_ctl->num_pages = num_pages; 306 io_ctl->root = root; 307 io_ctl->check_crcs = check_crcs; 308 309 return 0; 310 } 311 312 static void io_ctl_free(struct io_ctl *io_ctl) 313 { 314 kfree(io_ctl->pages); 315 } 316 317 static void io_ctl_unmap_page(struct io_ctl *io_ctl) 318 { 319 if (io_ctl->cur) { 320 kunmap(io_ctl->page); 321 io_ctl->cur = NULL; 322 io_ctl->orig = NULL; 323 } 324 } 325 326 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 327 { 328 ASSERT(io_ctl->index < io_ctl->num_pages); 329 io_ctl->page = io_ctl->pages[io_ctl->index++]; 330 io_ctl->cur = kmap(io_ctl->page); 331 io_ctl->orig = io_ctl->cur; 332 io_ctl->size = PAGE_CACHE_SIZE; 333 if (clear) 334 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); 335 } 336 337 static void io_ctl_drop_pages(struct io_ctl *io_ctl) 338 { 339 int i; 340 341 io_ctl_unmap_page(io_ctl); 342 343 for (i = 0; i < io_ctl->num_pages; i++) { 344 if (io_ctl->pages[i]) { 345 ClearPageChecked(io_ctl->pages[i]); 346 unlock_page(io_ctl->pages[i]); 347 page_cache_release(io_ctl->pages[i]); 348 } 349 } 350 } 351 352 static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, 353 int uptodate) 354 { 355 struct page *page; 356 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 357 int i; 358 359 for (i = 0; i < io_ctl->num_pages; i++) { 360 page = find_or_create_page(inode->i_mapping, i, mask); 361 if (!page) { 362 io_ctl_drop_pages(io_ctl); 363 return -ENOMEM; 364 } 365 io_ctl->pages[i] = page; 366 if (uptodate && !PageUptodate(page)) { 367 btrfs_readpage(NULL, page); 368 lock_page(page); 369 if (!PageUptodate(page)) { 370 btrfs_err(BTRFS_I(inode)->root->fs_info, 371 "error reading free space cache"); 372 io_ctl_drop_pages(io_ctl); 373 return -EIO; 374 } 375 } 376 } 377 378 for (i = 0; i < io_ctl->num_pages; i++) { 379 clear_page_dirty_for_io(io_ctl->pages[i]); 380 set_page_extent_mapped(io_ctl->pages[i]); 381 } 382 383 return 0; 384 } 385 386 
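/*
 * Summary of the cache file layout handled by the io_ctl helpers below
 * (derived from io_ctl_set_generation/io_ctl_set_crc/io_ctl_add_entry in
 * this file): when check_crcs is set, the first page starts with an array
 * of num_pages u32 crc32c checksums (one per page) followed by a __le64
 * generation; free space entries are packed after that, and every bitmap
 * occupies a full page of its own.  When check_crcs is not set (the
 * free-ino cache), there is just a 64bit chunk at the front of the first
 * page, followed by the generation.
 */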
static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) 387 { 388 __le64 *val; 389 390 io_ctl_map_page(io_ctl, 1); 391 392 /* 393 * Skip the csum areas. If we don't check crcs then we just have a 394 * 64bit chunk at the front of the first page. 395 */ 396 if (io_ctl->check_crcs) { 397 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); 398 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); 399 } else { 400 io_ctl->cur += sizeof(u64); 401 io_ctl->size -= sizeof(u64) * 2; 402 } 403 404 val = io_ctl->cur; 405 *val = cpu_to_le64(generation); 406 io_ctl->cur += sizeof(u64); 407 } 408 409 static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) 410 { 411 __le64 *gen; 412 413 /* 414 * Skip the crc area. If we don't check crcs then we just have a 64bit 415 * chunk at the front of the first page. 416 */ 417 if (io_ctl->check_crcs) { 418 io_ctl->cur += sizeof(u32) * io_ctl->num_pages; 419 io_ctl->size -= sizeof(u64) + 420 (sizeof(u32) * io_ctl->num_pages); 421 } else { 422 io_ctl->cur += sizeof(u64); 423 io_ctl->size -= sizeof(u64) * 2; 424 } 425 426 gen = io_ctl->cur; 427 if (le64_to_cpu(*gen) != generation) { 428 printk_ratelimited(KERN_ERR "BTRFS: space cache generation " 429 "(%Lu) does not match inode (%Lu)\n", *gen, 430 generation); 431 io_ctl_unmap_page(io_ctl); 432 return -EIO; 433 } 434 io_ctl->cur += sizeof(u64); 435 return 0; 436 } 437 438 static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) 439 { 440 u32 *tmp; 441 u32 crc = ~(u32)0; 442 unsigned offset = 0; 443 444 if (!io_ctl->check_crcs) { 445 io_ctl_unmap_page(io_ctl); 446 return; 447 } 448 449 if (index == 0) 450 offset = sizeof(u32) * io_ctl->num_pages; 451 452 crc = btrfs_csum_data(io_ctl->orig + offset, crc, 453 PAGE_CACHE_SIZE - offset); 454 btrfs_csum_final(crc, (char *)&crc); 455 io_ctl_unmap_page(io_ctl); 456 tmp = kmap(io_ctl->pages[0]); 457 tmp += index; 458 *tmp = crc; 459 kunmap(io_ctl->pages[0]); 460 } 461 462 static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) 463 { 464 u32 *tmp, val; 465 u32 crc = ~(u32)0; 466 unsigned offset = 0; 467 468 if (!io_ctl->check_crcs) { 469 io_ctl_map_page(io_ctl, 0); 470 return 0; 471 } 472 473 if (index == 0) 474 offset = sizeof(u32) * io_ctl->num_pages; 475 476 tmp = kmap(io_ctl->pages[0]); 477 tmp += index; 478 val = *tmp; 479 kunmap(io_ctl->pages[0]); 480 481 io_ctl_map_page(io_ctl, 0); 482 crc = btrfs_csum_data(io_ctl->orig + offset, crc, 483 PAGE_CACHE_SIZE - offset); 484 btrfs_csum_final(crc, (char *)&crc); 485 if (val != crc) { 486 printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " 487 "space cache\n"); 488 io_ctl_unmap_page(io_ctl); 489 return -EIO; 490 } 491 492 return 0; 493 } 494 495 static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, 496 void *bitmap) 497 { 498 struct btrfs_free_space_entry *entry; 499 500 if (!io_ctl->cur) 501 return -ENOSPC; 502 503 entry = io_ctl->cur; 504 entry->offset = cpu_to_le64(offset); 505 entry->bytes = cpu_to_le64(bytes); 506 entry->type = (bitmap) ? 
BTRFS_FREE_SPACE_BITMAP : 507 BTRFS_FREE_SPACE_EXTENT; 508 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 509 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 510 511 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 512 return 0; 513 514 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 515 516 /* No more pages to map */ 517 if (io_ctl->index >= io_ctl->num_pages) 518 return 0; 519 520 /* map the next page */ 521 io_ctl_map_page(io_ctl, 1); 522 return 0; 523 } 524 525 static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) 526 { 527 if (!io_ctl->cur) 528 return -ENOSPC; 529 530 /* 531 * If we aren't at the start of the current page, unmap this one and 532 * map the next one if there is any left. 533 */ 534 if (io_ctl->cur != io_ctl->orig) { 535 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 536 if (io_ctl->index >= io_ctl->num_pages) 537 return -ENOSPC; 538 io_ctl_map_page(io_ctl, 0); 539 } 540 541 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); 542 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 543 if (io_ctl->index < io_ctl->num_pages) 544 io_ctl_map_page(io_ctl, 0); 545 return 0; 546 } 547 548 static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) 549 { 550 /* 551 * If we're not on the boundary we know we've modified the page and we 552 * need to crc the page. 553 */ 554 if (io_ctl->cur != io_ctl->orig) 555 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 556 else 557 io_ctl_unmap_page(io_ctl); 558 559 while (io_ctl->index < io_ctl->num_pages) { 560 io_ctl_map_page(io_ctl, 1); 561 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 562 } 563 } 564 565 static int io_ctl_read_entry(struct io_ctl *io_ctl, 566 struct btrfs_free_space *entry, u8 *type) 567 { 568 struct btrfs_free_space_entry *e; 569 int ret; 570 571 if (!io_ctl->cur) { 572 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 573 if (ret) 574 return ret; 575 } 576 577 e = io_ctl->cur; 578 entry->offset = le64_to_cpu(e->offset); 579 entry->bytes = le64_to_cpu(e->bytes); 580 *type = e->type; 581 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 582 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 583 584 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 585 return 0; 586 587 io_ctl_unmap_page(io_ctl); 588 589 return 0; 590 } 591 592 static int io_ctl_read_bitmap(struct io_ctl *io_ctl, 593 struct btrfs_free_space *entry) 594 { 595 int ret; 596 597 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 598 if (ret) 599 return ret; 600 601 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); 602 io_ctl_unmap_page(io_ctl); 603 604 return 0; 605 } 606 607 /* 608 * Since we attach pinned extents after the fact we can have contiguous sections 609 * of free space that are split up in entries. This poses a problem with the 610 * tree logging stuff since it could have allocated across what appears to be 2 611 * entries since we would have merged the entries when adding the pinned extents 612 * back to the free space cache. So run through the space cache that we just 613 * loaded and merge contiguous entries. This will make the log replay stuff not 614 * blow up and it will make for nicer allocator behavior. 
615 */ 616 static void merge_space_tree(struct btrfs_free_space_ctl *ctl) 617 { 618 struct btrfs_free_space *e, *prev = NULL; 619 struct rb_node *n; 620 621 again: 622 spin_lock(&ctl->tree_lock); 623 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 624 e = rb_entry(n, struct btrfs_free_space, offset_index); 625 if (!prev) 626 goto next; 627 if (e->bitmap || prev->bitmap) 628 goto next; 629 if (prev->offset + prev->bytes == e->offset) { 630 unlink_free_space(ctl, prev); 631 unlink_free_space(ctl, e); 632 prev->bytes += e->bytes; 633 kmem_cache_free(btrfs_free_space_cachep, e); 634 link_free_space(ctl, prev); 635 prev = NULL; 636 spin_unlock(&ctl->tree_lock); 637 goto again; 638 } 639 next: 640 prev = e; 641 } 642 spin_unlock(&ctl->tree_lock); 643 } 644 645 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 646 struct btrfs_free_space_ctl *ctl, 647 struct btrfs_path *path, u64 offset) 648 { 649 struct btrfs_free_space_header *header; 650 struct extent_buffer *leaf; 651 struct io_ctl io_ctl; 652 struct btrfs_key key; 653 struct btrfs_free_space *e, *n; 654 struct list_head bitmaps; 655 u64 num_entries; 656 u64 num_bitmaps; 657 u64 generation; 658 u8 type; 659 int ret = 0; 660 661 INIT_LIST_HEAD(&bitmaps); 662 663 /* Nothing in the space cache, goodbye */ 664 if (!i_size_read(inode)) 665 return 0; 666 667 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 668 key.offset = offset; 669 key.type = 0; 670 671 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 672 if (ret < 0) 673 return 0; 674 else if (ret > 0) { 675 btrfs_release_path(path); 676 return 0; 677 } 678 679 ret = -1; 680 681 leaf = path->nodes[0]; 682 header = btrfs_item_ptr(leaf, path->slots[0], 683 struct btrfs_free_space_header); 684 num_entries = btrfs_free_space_entries(leaf, header); 685 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 686 generation = btrfs_free_space_generation(leaf, header); 687 btrfs_release_path(path); 688 689 if (!BTRFS_I(inode)->generation) { 690 btrfs_info(root->fs_info, 691 "The free space cache file (%llu) is invalid. 
skip it\n", 692 offset); 693 return 0; 694 } 695 696 if (BTRFS_I(inode)->generation != generation) { 697 btrfs_err(root->fs_info, 698 "free space inode generation (%llu) " 699 "did not match free space cache generation (%llu)", 700 BTRFS_I(inode)->generation, generation); 701 return 0; 702 } 703 704 if (!num_entries) 705 return 0; 706 707 ret = io_ctl_init(&io_ctl, inode, root, 0); 708 if (ret) 709 return ret; 710 711 ret = readahead_cache(inode); 712 if (ret) 713 goto out; 714 715 ret = io_ctl_prepare_pages(&io_ctl, inode, 1); 716 if (ret) 717 goto out; 718 719 ret = io_ctl_check_crc(&io_ctl, 0); 720 if (ret) 721 goto free_cache; 722 723 ret = io_ctl_check_generation(&io_ctl, generation); 724 if (ret) 725 goto free_cache; 726 727 while (num_entries) { 728 e = kmem_cache_zalloc(btrfs_free_space_cachep, 729 GFP_NOFS); 730 if (!e) 731 goto free_cache; 732 733 ret = io_ctl_read_entry(&io_ctl, e, &type); 734 if (ret) { 735 kmem_cache_free(btrfs_free_space_cachep, e); 736 goto free_cache; 737 } 738 739 if (!e->bytes) { 740 kmem_cache_free(btrfs_free_space_cachep, e); 741 goto free_cache; 742 } 743 744 if (type == BTRFS_FREE_SPACE_EXTENT) { 745 spin_lock(&ctl->tree_lock); 746 ret = link_free_space(ctl, e); 747 spin_unlock(&ctl->tree_lock); 748 if (ret) { 749 btrfs_err(root->fs_info, 750 "Duplicate entries in free space cache, dumping"); 751 kmem_cache_free(btrfs_free_space_cachep, e); 752 goto free_cache; 753 } 754 } else { 755 ASSERT(num_bitmaps); 756 num_bitmaps--; 757 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 758 if (!e->bitmap) { 759 kmem_cache_free( 760 btrfs_free_space_cachep, e); 761 goto free_cache; 762 } 763 spin_lock(&ctl->tree_lock); 764 ret = link_free_space(ctl, e); 765 ctl->total_bitmaps++; 766 ctl->op->recalc_thresholds(ctl); 767 spin_unlock(&ctl->tree_lock); 768 if (ret) { 769 btrfs_err(root->fs_info, 770 "Duplicate entries in free space cache, dumping"); 771 kmem_cache_free(btrfs_free_space_cachep, e); 772 goto free_cache; 773 } 774 list_add_tail(&e->list, &bitmaps); 775 } 776 777 num_entries--; 778 } 779 780 io_ctl_unmap_page(&io_ctl); 781 782 /* 783 * We add the bitmaps at the end of the entries in order that 784 * the bitmap entries are added to the cache. 785 */ 786 list_for_each_entry_safe(e, n, &bitmaps, list) { 787 list_del_init(&e->list); 788 ret = io_ctl_read_bitmap(&io_ctl, e); 789 if (ret) 790 goto free_cache; 791 } 792 793 io_ctl_drop_pages(&io_ctl); 794 merge_space_tree(ctl); 795 ret = 1; 796 out: 797 io_ctl_free(&io_ctl); 798 return ret; 799 free_cache: 800 io_ctl_drop_pages(&io_ctl); 801 __btrfs_remove_free_space_cache(ctl); 802 goto out; 803 } 804 805 int load_free_space_cache(struct btrfs_fs_info *fs_info, 806 struct btrfs_block_group_cache *block_group) 807 { 808 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 809 struct btrfs_root *root = fs_info->tree_root; 810 struct inode *inode; 811 struct btrfs_path *path; 812 int ret = 0; 813 bool matched; 814 u64 used = btrfs_block_group_used(&block_group->item); 815 816 /* 817 * If this block group has been marked to be cleared for one reason or 818 * another then we can't trust the on disk cache, so just return. 
819 */ 820 spin_lock(&block_group->lock); 821 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 822 spin_unlock(&block_group->lock); 823 return 0; 824 } 825 spin_unlock(&block_group->lock); 826 827 path = btrfs_alloc_path(); 828 if (!path) 829 return 0; 830 path->search_commit_root = 1; 831 path->skip_locking = 1; 832 833 inode = lookup_free_space_inode(root, block_group, path); 834 if (IS_ERR(inode)) { 835 btrfs_free_path(path); 836 return 0; 837 } 838 839 /* We may have converted the inode and made the cache invalid. */ 840 spin_lock(&block_group->lock); 841 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 842 spin_unlock(&block_group->lock); 843 btrfs_free_path(path); 844 goto out; 845 } 846 spin_unlock(&block_group->lock); 847 848 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 849 path, block_group->key.objectid); 850 btrfs_free_path(path); 851 if (ret <= 0) 852 goto out; 853 854 spin_lock(&ctl->tree_lock); 855 matched = (ctl->free_space == (block_group->key.offset - used - 856 block_group->bytes_super)); 857 spin_unlock(&ctl->tree_lock); 858 859 if (!matched) { 860 __btrfs_remove_free_space_cache(ctl); 861 btrfs_warn(fs_info, "block group %llu has wrong amount of free space", 862 block_group->key.objectid); 863 ret = -1; 864 } 865 out: 866 if (ret < 0) { 867 /* This cache is bogus, make sure it gets cleared */ 868 spin_lock(&block_group->lock); 869 block_group->disk_cache_state = BTRFS_DC_CLEAR; 870 spin_unlock(&block_group->lock); 871 ret = 0; 872 873 btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now", 874 block_group->key.objectid); 875 } 876 877 iput(inode); 878 return ret; 879 } 880 881 static noinline_for_stack 882 int write_cache_extent_entries(struct io_ctl *io_ctl, 883 struct btrfs_free_space_ctl *ctl, 884 struct btrfs_block_group_cache *block_group, 885 int *entries, int *bitmaps, 886 struct list_head *bitmap_list) 887 { 888 int ret; 889 struct btrfs_free_cluster *cluster = NULL; 890 struct rb_node *node = rb_first(&ctl->free_space_offset); 891 struct btrfs_trim_range *trim_entry; 892 893 /* Get the cluster for this block_group if it exists */ 894 if (block_group && !list_empty(&block_group->cluster_list)) { 895 cluster = list_entry(block_group->cluster_list.next, 896 struct btrfs_free_cluster, 897 block_group_list); 898 } 899 900 if (!node && cluster) { 901 node = rb_first(&cluster->root); 902 cluster = NULL; 903 } 904 905 /* Write out the extent entries */ 906 while (node) { 907 struct btrfs_free_space *e; 908 909 e = rb_entry(node, struct btrfs_free_space, offset_index); 910 *entries += 1; 911 912 ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes, 913 e->bitmap); 914 if (ret) 915 goto fail; 916 917 if (e->bitmap) { 918 list_add_tail(&e->list, bitmap_list); 919 *bitmaps += 1; 920 } 921 node = rb_next(node); 922 if (!node && cluster) { 923 node = rb_first(&cluster->root); 924 cluster = NULL; 925 } 926 } 927 928 /* 929 * Make sure we don't miss any range that was removed from our rbtree 930 * because trimming is running. Otherwise after a umount+mount (or crash 931 * after committing the transaction) we would leak free space and get 932 * an inconsistent free space cache report from fsck. 
933 */ 934 list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { 935 ret = io_ctl_add_entry(io_ctl, trim_entry->start, 936 trim_entry->bytes, NULL); 937 if (ret) 938 goto fail; 939 *entries += 1; 940 } 941 942 return 0; 943 fail: 944 return -ENOSPC; 945 } 946 947 static noinline_for_stack int 948 update_cache_item(struct btrfs_trans_handle *trans, 949 struct btrfs_root *root, 950 struct inode *inode, 951 struct btrfs_path *path, u64 offset, 952 int entries, int bitmaps) 953 { 954 struct btrfs_key key; 955 struct btrfs_free_space_header *header; 956 struct extent_buffer *leaf; 957 int ret; 958 959 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 960 key.offset = offset; 961 key.type = 0; 962 963 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 964 if (ret < 0) { 965 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 966 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, 967 GFP_NOFS); 968 goto fail; 969 } 970 leaf = path->nodes[0]; 971 if (ret > 0) { 972 struct btrfs_key found_key; 973 ASSERT(path->slots[0]); 974 path->slots[0]--; 975 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 976 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 977 found_key.offset != offset) { 978 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 979 inode->i_size - 1, 980 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, 981 NULL, GFP_NOFS); 982 btrfs_release_path(path); 983 goto fail; 984 } 985 } 986 987 BTRFS_I(inode)->generation = trans->transid; 988 header = btrfs_item_ptr(leaf, path->slots[0], 989 struct btrfs_free_space_header); 990 btrfs_set_free_space_entries(leaf, header, entries); 991 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 992 btrfs_set_free_space_generation(leaf, header, trans->transid); 993 btrfs_mark_buffer_dirty(leaf); 994 btrfs_release_path(path); 995 996 return 0; 997 998 fail: 999 return -1; 1000 } 1001 1002 static noinline_for_stack int 1003 write_pinned_extent_entries(struct btrfs_root *root, 1004 struct btrfs_block_group_cache *block_group, 1005 struct io_ctl *io_ctl, 1006 int *entries) 1007 { 1008 u64 start, extent_start, extent_end, len; 1009 struct extent_io_tree *unpin = NULL; 1010 int ret; 1011 1012 if (!block_group) 1013 return 0; 1014 1015 /* 1016 * We want to add any pinned extents to our free space cache 1017 * so we don't leak the space 1018 * 1019 * We shouldn't have switched the pinned extents yet so this is the 1020 * right one 1021 */ 1022 unpin = root->fs_info->pinned_extents; 1023 1024 start = block_group->key.objectid; 1025 1026 while (start < block_group->key.objectid + block_group->key.offset) { 1027 ret = find_first_extent_bit(unpin, start, 1028 &extent_start, &extent_end, 1029 EXTENT_DIRTY, NULL); 1030 if (ret) 1031 return 0; 1032 1033 /* This pinned extent is out of our range */ 1034 if (extent_start >= block_group->key.objectid + 1035 block_group->key.offset) 1036 return 0; 1037 1038 extent_start = max(extent_start, start); 1039 extent_end = min(block_group->key.objectid + 1040 block_group->key.offset, extent_end + 1); 1041 len = extent_end - extent_start; 1042 1043 *entries += 1; 1044 ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL); 1045 if (ret) 1046 return -ENOSPC; 1047 1048 start = extent_end; 1049 } 1050 1051 return 0; 1052 } 1053 1054 static noinline_for_stack int 1055 write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) 1056 { 1057 struct list_head *pos, *n; 1058 int ret; 1059 1060 /* Write out the bitmaps */ 1061 list_for_each_safe(pos, n, bitmap_list) { 1062 struct btrfs_free_space *entry = 1063 
			list_entry(pos, struct btrfs_free_space, list);

		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
		if (ret)
			return -ENOSPC;
		list_del_init(&entry->list);
	}

	return 0;
}

static int flush_dirty_cache(struct inode *inode)
{
	int ret;

	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret)
		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
				 GFP_NOFS);

	return ret;
}

static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
			   struct io_ctl *io_ctl,
			   struct extent_state **cached_state,
			   struct list_head *bitmap_list)
{
	struct list_head *pos, *n;

	list_for_each_safe(pos, n, bitmap_list) {
		struct btrfs_free_space *entry =
			list_entry(pos, struct btrfs_free_space, list);
		list_del_init(&entry->list);
	}
	io_ctl_drop_pages(io_ctl);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, cached_state,
			     GFP_NOFS);
}

/**
 * __btrfs_write_out_cache - write out cached info to an inode
 * @root - the root the inode belongs to
 * @ctl - the free space cache we are going to write out
 * @block_group - the block_group for this cache if it belongs to a block_group
 * @trans - the trans handle
 * @path - the path to use
 * @offset - the offset for the key we'll insert
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount. This will return 0 if it was successful in writing the cache out,
 * and -1 if it was not.
 */
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
				   struct btrfs_free_space_ctl *ctl,
				   struct btrfs_block_group_cache *block_group,
				   struct btrfs_trans_handle *trans,
				   struct btrfs_path *path, u64 offset)
{
	struct extent_state *cached_state = NULL;
	struct io_ctl io_ctl;
	LIST_HEAD(bitmap_list);
	int entries = 0;
	int bitmaps = 0;
	int ret;

	if (!i_size_read(inode))
		return -1;

	ret = io_ctl_init(&io_ctl, inode, root, 1);
	if (ret)
		return -1;

	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
		down_write(&block_group->data_rwsem);
		spin_lock(&block_group->lock);
		if (block_group->delalloc_bytes) {
			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
			spin_unlock(&block_group->lock);
			up_write(&block_group->data_rwsem);
			BTRFS_I(inode)->generation = 0;
			ret = 0;
			goto out;
		}
		spin_unlock(&block_group->lock);
	}

	/* Lock all pages first so we can lock the extent safely. */
	io_ctl_prepare_pages(&io_ctl, inode, 0);

	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
			 0, &cached_state);

	io_ctl_set_generation(&io_ctl, trans->transid);

	mutex_lock(&ctl->cache_writeout_mutex);
	/* Write out the extent entries in the free space cache */
	ret = write_cache_extent_entries(&io_ctl, ctl,
					 block_group, &entries, &bitmaps,
					 &bitmap_list);
	if (ret) {
		mutex_unlock(&ctl->cache_writeout_mutex);
		goto out_nospc;
	}

	/*
	 * Some spaces that are freed in the current transaction are pinned,
	 * they will be added into free space cache after the transaction is
	 * committed, so we shouldn't lose them.
1175 */ 1176 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); 1177 if (ret) { 1178 mutex_unlock(&ctl->cache_writeout_mutex); 1179 goto out_nospc; 1180 } 1181 1182 /* 1183 * At last, we write out all the bitmaps and keep cache_writeout_mutex 1184 * locked while doing it because a concurrent trim can be manipulating 1185 * or freeing the bitmap. 1186 */ 1187 ret = write_bitmap_entries(&io_ctl, &bitmap_list); 1188 mutex_unlock(&ctl->cache_writeout_mutex); 1189 if (ret) 1190 goto out_nospc; 1191 1192 /* Zero out the rest of the pages just to make sure */ 1193 io_ctl_zero_remaining_pages(&io_ctl); 1194 1195 /* Everything is written out, now we dirty the pages in the file. */ 1196 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, 1197 0, i_size_read(inode), &cached_state); 1198 if (ret) 1199 goto out_nospc; 1200 1201 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1202 up_write(&block_group->data_rwsem); 1203 /* 1204 * Release the pages and unlock the extent, we will flush 1205 * them out later 1206 */ 1207 io_ctl_drop_pages(&io_ctl); 1208 1209 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1210 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 1211 1212 /* Flush the dirty pages in the cache file. */ 1213 ret = flush_dirty_cache(inode); 1214 if (ret) 1215 goto out; 1216 1217 /* Update the cache item to tell everyone this cache file is valid. */ 1218 ret = update_cache_item(trans, root, inode, path, offset, 1219 entries, bitmaps); 1220 out: 1221 io_ctl_free(&io_ctl); 1222 if (ret) { 1223 invalidate_inode_pages2(inode->i_mapping); 1224 BTRFS_I(inode)->generation = 0; 1225 } 1226 btrfs_update_inode(trans, root, inode); 1227 return ret; 1228 1229 out_nospc: 1230 cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); 1231 1232 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1233 up_write(&block_group->data_rwsem); 1234 1235 goto out; 1236 } 1237 1238 int btrfs_write_out_cache(struct btrfs_root *root, 1239 struct btrfs_trans_handle *trans, 1240 struct btrfs_block_group_cache *block_group, 1241 struct btrfs_path *path) 1242 { 1243 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1244 struct inode *inode; 1245 int ret = 0; 1246 1247 root = root->fs_info->tree_root; 1248 1249 spin_lock(&block_group->lock); 1250 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 1251 spin_unlock(&block_group->lock); 1252 return 0; 1253 } 1254 1255 if (block_group->delalloc_bytes) { 1256 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 1257 spin_unlock(&block_group->lock); 1258 return 0; 1259 } 1260 spin_unlock(&block_group->lock); 1261 1262 inode = lookup_free_space_inode(root, block_group, path); 1263 if (IS_ERR(inode)) 1264 return 0; 1265 1266 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1267 path, block_group->key.objectid); 1268 if (ret) { 1269 spin_lock(&block_group->lock); 1270 block_group->disk_cache_state = BTRFS_DC_ERROR; 1271 spin_unlock(&block_group->lock); 1272 ret = 0; 1273 #ifdef DEBUG 1274 btrfs_err(root->fs_info, 1275 "failed to write free space cache for block group %llu", 1276 block_group->key.objectid); 1277 #endif 1278 } 1279 1280 iput(inode); 1281 return ret; 1282 } 1283 1284 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, 1285 u64 offset) 1286 { 1287 ASSERT(offset >= bitmap_start); 1288 offset -= bitmap_start; 1289 return (unsigned long)(div_u64(offset, unit)); 1290 } 1291 1292 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit) 
1293 { 1294 return (unsigned long)(div_u64(bytes, unit)); 1295 } 1296 1297 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, 1298 u64 offset) 1299 { 1300 u64 bitmap_start; 1301 u64 bytes_per_bitmap; 1302 1303 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1304 bitmap_start = offset - ctl->start; 1305 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1306 bitmap_start *= bytes_per_bitmap; 1307 bitmap_start += ctl->start; 1308 1309 return bitmap_start; 1310 } 1311 1312 static int tree_insert_offset(struct rb_root *root, u64 offset, 1313 struct rb_node *node, int bitmap) 1314 { 1315 struct rb_node **p = &root->rb_node; 1316 struct rb_node *parent = NULL; 1317 struct btrfs_free_space *info; 1318 1319 while (*p) { 1320 parent = *p; 1321 info = rb_entry(parent, struct btrfs_free_space, offset_index); 1322 1323 if (offset < info->offset) { 1324 p = &(*p)->rb_left; 1325 } else if (offset > info->offset) { 1326 p = &(*p)->rb_right; 1327 } else { 1328 /* 1329 * we could have a bitmap entry and an extent entry 1330 * share the same offset. If this is the case, we want 1331 * the extent entry to always be found first if we do a 1332 * linear search through the tree, since we want to have 1333 * the quickest allocation time, and allocating from an 1334 * extent is faster than allocating from a bitmap. So 1335 * if we're inserting a bitmap and we find an entry at 1336 * this offset, we want to go right, or after this entry 1337 * logically. If we are inserting an extent and we've 1338 * found a bitmap, we want to go left, or before 1339 * logically. 1340 */ 1341 if (bitmap) { 1342 if (info->bitmap) { 1343 WARN_ON_ONCE(1); 1344 return -EEXIST; 1345 } 1346 p = &(*p)->rb_right; 1347 } else { 1348 if (!info->bitmap) { 1349 WARN_ON_ONCE(1); 1350 return -EEXIST; 1351 } 1352 p = &(*p)->rb_left; 1353 } 1354 } 1355 } 1356 1357 rb_link_node(node, parent, p); 1358 rb_insert_color(node, root); 1359 1360 return 0; 1361 } 1362 1363 /* 1364 * searches the tree for the given offset. 1365 * 1366 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1367 * want a section that has at least bytes size and comes at or after the given 1368 * offset. 1369 */ 1370 static struct btrfs_free_space * 1371 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1372 u64 offset, int bitmap_only, int fuzzy) 1373 { 1374 struct rb_node *n = ctl->free_space_offset.rb_node; 1375 struct btrfs_free_space *entry, *prev = NULL; 1376 1377 /* find entry that is closest to the 'offset' */ 1378 while (1) { 1379 if (!n) { 1380 entry = NULL; 1381 break; 1382 } 1383 1384 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1385 prev = entry; 1386 1387 if (offset < entry->offset) 1388 n = n->rb_left; 1389 else if (offset > entry->offset) 1390 n = n->rb_right; 1391 else 1392 break; 1393 } 1394 1395 if (bitmap_only) { 1396 if (!entry) 1397 return NULL; 1398 if (entry->bitmap) 1399 return entry; 1400 1401 /* 1402 * bitmap entry and extent entry may share same offset, 1403 * in that case, bitmap entry comes after extent entry. 
1404 */ 1405 n = rb_next(n); 1406 if (!n) 1407 return NULL; 1408 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1409 if (entry->offset != offset) 1410 return NULL; 1411 1412 WARN_ON(!entry->bitmap); 1413 return entry; 1414 } else if (entry) { 1415 if (entry->bitmap) { 1416 /* 1417 * if previous extent entry covers the offset, 1418 * we should return it instead of the bitmap entry 1419 */ 1420 n = rb_prev(&entry->offset_index); 1421 if (n) { 1422 prev = rb_entry(n, struct btrfs_free_space, 1423 offset_index); 1424 if (!prev->bitmap && 1425 prev->offset + prev->bytes > offset) 1426 entry = prev; 1427 } 1428 } 1429 return entry; 1430 } 1431 1432 if (!prev) 1433 return NULL; 1434 1435 /* find last entry before the 'offset' */ 1436 entry = prev; 1437 if (entry->offset > offset) { 1438 n = rb_prev(&entry->offset_index); 1439 if (n) { 1440 entry = rb_entry(n, struct btrfs_free_space, 1441 offset_index); 1442 ASSERT(entry->offset <= offset); 1443 } else { 1444 if (fuzzy) 1445 return entry; 1446 else 1447 return NULL; 1448 } 1449 } 1450 1451 if (entry->bitmap) { 1452 n = rb_prev(&entry->offset_index); 1453 if (n) { 1454 prev = rb_entry(n, struct btrfs_free_space, 1455 offset_index); 1456 if (!prev->bitmap && 1457 prev->offset + prev->bytes > offset) 1458 return prev; 1459 } 1460 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1461 return entry; 1462 } else if (entry->offset + entry->bytes > offset) 1463 return entry; 1464 1465 if (!fuzzy) 1466 return NULL; 1467 1468 while (1) { 1469 if (entry->bitmap) { 1470 if (entry->offset + BITS_PER_BITMAP * 1471 ctl->unit > offset) 1472 break; 1473 } else { 1474 if (entry->offset + entry->bytes > offset) 1475 break; 1476 } 1477 1478 n = rb_next(&entry->offset_index); 1479 if (!n) 1480 return NULL; 1481 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1482 } 1483 return entry; 1484 } 1485 1486 static inline void 1487 __unlink_free_space(struct btrfs_free_space_ctl *ctl, 1488 struct btrfs_free_space *info) 1489 { 1490 rb_erase(&info->offset_index, &ctl->free_space_offset); 1491 ctl->free_extents--; 1492 } 1493 1494 static void unlink_free_space(struct btrfs_free_space_ctl *ctl, 1495 struct btrfs_free_space *info) 1496 { 1497 __unlink_free_space(ctl, info); 1498 ctl->free_space -= info->bytes; 1499 } 1500 1501 static int link_free_space(struct btrfs_free_space_ctl *ctl, 1502 struct btrfs_free_space *info) 1503 { 1504 int ret = 0; 1505 1506 ASSERT(info->bytes || info->bitmap); 1507 ret = tree_insert_offset(&ctl->free_space_offset, info->offset, 1508 &info->offset_index, (info->bitmap != NULL)); 1509 if (ret) 1510 return ret; 1511 1512 ctl->free_space += info->bytes; 1513 ctl->free_extents++; 1514 return ret; 1515 } 1516 1517 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) 1518 { 1519 struct btrfs_block_group_cache *block_group = ctl->private; 1520 u64 max_bytes; 1521 u64 bitmap_bytes; 1522 u64 extent_bytes; 1523 u64 size = block_group->key.offset; 1524 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 1525 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1526 1527 max_bitmaps = max(max_bitmaps, 1); 1528 1529 ASSERT(ctl->total_bitmaps <= max_bitmaps); 1530 1531 /* 1532 * The goal is to keep the total amount of memory used per 1gb of space 1533 * at or below 32k, so we need to adjust how much memory we allow to be 1534 * used by extent based free space tracking 1535 */ 1536 if (size < 1024 * 1024 * 1024) 1537 max_bytes = MAX_CACHE_BYTES_PER_GIG; 1538 else 1539 max_bytes = MAX_CACHE_BYTES_PER_GIG 
* div64_u64(size, 1024 * 1024 * 1024);

	/*
	 * we want to account for 1 more bitmap than what we have so we can make
	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
	 * we add more bitmaps.
	 */
	bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;

	if (bitmap_bytes >= max_bytes) {
		ctl->extents_thresh = 0;
		return;
	}

	/*
	 * we want the extent entry threshold to always be at most 1/2 the max
	 * bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));

	ctl->extents_thresh =
		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
}

static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
				       struct btrfs_free_space *info,
				       u64 offset, u64 bytes)
{
	unsigned long start, count;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	ASSERT(start + count <= BITS_PER_BITMAP);

	bitmap_clear(info->bitmap, start, count);

	info->bytes -= bytes;
}

static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes)
{
	__bitmap_clear_bits(ctl, info, offset, bytes);
	ctl->free_space -= bytes;
}

static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
{
	unsigned long start, count;

	start = offset_to_bit(info->offset, ctl->unit, offset);
	count = bytes_to_bits(bytes, ctl->unit);
	ASSERT(start + count <= BITS_PER_BITMAP);

	bitmap_set(info->bitmap, start, count);

	info->bytes += bytes;
	ctl->free_space += bytes;
}

/*
 * If we cannot find a suitable extent, we will use *bytes to record
 * the size of the largest free extent found.
1607 */ 1608 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1609 struct btrfs_free_space *bitmap_info, u64 *offset, 1610 u64 *bytes) 1611 { 1612 unsigned long found_bits = 0; 1613 unsigned long max_bits = 0; 1614 unsigned long bits, i; 1615 unsigned long next_zero; 1616 unsigned long extent_bits; 1617 1618 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1619 max_t(u64, *offset, bitmap_info->offset)); 1620 bits = bytes_to_bits(*bytes, ctl->unit); 1621 1622 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1623 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1624 BITS_PER_BITMAP, i); 1625 extent_bits = next_zero - i; 1626 if (extent_bits >= bits) { 1627 found_bits = extent_bits; 1628 break; 1629 } else if (extent_bits > max_bits) { 1630 max_bits = extent_bits; 1631 } 1632 i = next_zero; 1633 } 1634 1635 if (found_bits) { 1636 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1637 *bytes = (u64)(found_bits) * ctl->unit; 1638 return 0; 1639 } 1640 1641 *bytes = (u64)(max_bits) * ctl->unit; 1642 return -1; 1643 } 1644 1645 /* Cache the size of the max extent in bytes */ 1646 static struct btrfs_free_space * 1647 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 1648 unsigned long align, u64 *max_extent_size) 1649 { 1650 struct btrfs_free_space *entry; 1651 struct rb_node *node; 1652 u64 tmp; 1653 u64 align_off; 1654 int ret; 1655 1656 if (!ctl->free_space_offset.rb_node) 1657 goto out; 1658 1659 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); 1660 if (!entry) 1661 goto out; 1662 1663 for (node = &entry->offset_index; node; node = rb_next(node)) { 1664 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1665 if (entry->bytes < *bytes) { 1666 if (entry->bytes > *max_extent_size) 1667 *max_extent_size = entry->bytes; 1668 continue; 1669 } 1670 1671 /* make sure the space returned is big enough 1672 * to match our requested alignment 1673 */ 1674 if (*bytes >= align) { 1675 tmp = entry->offset - ctl->start + align - 1; 1676 do_div(tmp, align); 1677 tmp = tmp * align + ctl->start; 1678 align_off = tmp - entry->offset; 1679 } else { 1680 align_off = 0; 1681 tmp = entry->offset; 1682 } 1683 1684 if (entry->bytes < *bytes + align_off) { 1685 if (entry->bytes > *max_extent_size) 1686 *max_extent_size = entry->bytes; 1687 continue; 1688 } 1689 1690 if (entry->bitmap) { 1691 u64 size = *bytes; 1692 1693 ret = search_bitmap(ctl, entry, &tmp, &size); 1694 if (!ret) { 1695 *offset = tmp; 1696 *bytes = size; 1697 return entry; 1698 } else if (size > *max_extent_size) { 1699 *max_extent_size = size; 1700 } 1701 continue; 1702 } 1703 1704 *offset = tmp; 1705 *bytes = entry->bytes - align_off; 1706 return entry; 1707 } 1708 out: 1709 return NULL; 1710 } 1711 1712 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 1713 struct btrfs_free_space *info, u64 offset) 1714 { 1715 info->offset = offset_to_bitmap(ctl, offset); 1716 info->bytes = 0; 1717 INIT_LIST_HEAD(&info->list); 1718 link_free_space(ctl, info); 1719 ctl->total_bitmaps++; 1720 1721 ctl->op->recalc_thresholds(ctl); 1722 } 1723 1724 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 1725 struct btrfs_free_space *bitmap_info) 1726 { 1727 unlink_free_space(ctl, bitmap_info); 1728 kfree(bitmap_info->bitmap); 1729 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 1730 ctl->total_bitmaps--; 1731 ctl->op->recalc_thresholds(ctl); 1732 } 1733 1734 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 1735 struct btrfs_free_space 
*bitmap_info,
				       u64 *offset, u64 *bytes)
{
	u64 end;
	u64 search_start, search_bytes;
	int ret;

again:
	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;

	/*
	 * We need to search for bits in this bitmap. We could only cover some
	 * of the extent in this bitmap thanks to how we add space, so we need
	 * to search for as much of it as we can and clear that amount, and then
	 * go searching for the next bit.
	 */
	search_start = *offset;
	search_bytes = ctl->unit;
	search_bytes = min(search_bytes, end - search_start + 1);
	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
	if (ret < 0 || search_start != *offset)
		return -EINVAL;

	/* We may have found more bits than what we need */
	search_bytes = min(search_bytes, *bytes);

	/* Cannot clear past the end of the bitmap */
	search_bytes = min(search_bytes, end - search_start + 1);

	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
	*offset += search_bytes;
	*bytes -= search_bytes;

	if (*bytes) {
		struct rb_node *next = rb_next(&bitmap_info->offset_index);
		if (!bitmap_info->bytes)
			free_bitmap(ctl, bitmap_info);

		/*
		 * no entry after this bitmap, but we still have bytes to
		 * remove, so something has gone wrong.
		 */
		if (!next)
			return -EINVAL;

		bitmap_info = rb_entry(next, struct btrfs_free_space,
				       offset_index);

		/*
		 * if the next entry isn't a bitmap we need to return to let the
		 * extent stuff do its work.
		 */
		if (!bitmap_info->bitmap)
			return -EAGAIN;

		/*
		 * Ok the next item is a bitmap, but it may not actually hold
		 * the information for the rest of this free space stuff, so
		 * look for it, and if we don't find it return so we can try
		 * everything over again.
		 */
		search_start = *offset;
		search_bytes = ctl->unit;
		ret = search_bitmap(ctl, bitmap_info, &search_start,
				    &search_bytes);
		if (ret < 0 || search_start != *offset)
			return -EAGAIN;

		goto again;
	} else if (!bitmap_info->bytes)
		free_bitmap(ctl, bitmap_info);

	return 0;
}

static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
			       struct btrfs_free_space *info, u64 offset,
			       u64 bytes)
{
	u64 bytes_to_set = 0;
	u64 end;

	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);

	bytes_to_set = min(end - offset, bytes);

	bitmap_set_bits(ctl, info, offset, bytes_to_set);

	return bytes_to_set;

}

static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
		       struct btrfs_free_space *info)
{
	struct btrfs_block_group_cache *block_group = ctl->private;

	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent, and don't have to deal with the bitmap
	 */
	if (ctl->free_extents < ctl->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them for larger extents, however if we have plenty
		 * of cache left then go ahead and add them, no sense in adding
		 * the overhead of a bitmap if we don't have to.
1843 */ 1844 if (info->bytes <= block_group->sectorsize * 4) { 1845 if (ctl->free_extents * 2 <= ctl->extents_thresh) 1846 return false; 1847 } else { 1848 return false; 1849 } 1850 } 1851 1852 /* 1853 * The original block groups from mkfs can be really small, like 8 1854 * megabytes, so don't bother with a bitmap for those entries. However 1855 * some block groups can be smaller than what a bitmap would cover but 1856 * are still large enough that they could overflow the 32k memory limit, 1857 * so allow those block groups to still be allowed to have a bitmap 1858 * entry. 1859 */ 1860 if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) 1861 return false; 1862 1863 return true; 1864 } 1865 1866 static struct btrfs_free_space_op free_space_op = { 1867 .recalc_thresholds = recalculate_thresholds, 1868 .use_bitmap = use_bitmap, 1869 }; 1870 1871 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, 1872 struct btrfs_free_space *info) 1873 { 1874 struct btrfs_free_space *bitmap_info; 1875 struct btrfs_block_group_cache *block_group = NULL; 1876 int added = 0; 1877 u64 bytes, offset, bytes_added; 1878 int ret; 1879 1880 bytes = info->bytes; 1881 offset = info->offset; 1882 1883 if (!ctl->op->use_bitmap(ctl, info)) 1884 return 0; 1885 1886 if (ctl->op == &free_space_op) 1887 block_group = ctl->private; 1888 again: 1889 /* 1890 * Since we link bitmaps right into the cluster we need to see if we 1891 * have a cluster here, and if so and it has our bitmap we need to add 1892 * the free space to that bitmap. 1893 */ 1894 if (block_group && !list_empty(&block_group->cluster_list)) { 1895 struct btrfs_free_cluster *cluster; 1896 struct rb_node *node; 1897 struct btrfs_free_space *entry; 1898 1899 cluster = list_entry(block_group->cluster_list.next, 1900 struct btrfs_free_cluster, 1901 block_group_list); 1902 spin_lock(&cluster->lock); 1903 node = rb_first(&cluster->root); 1904 if (!node) { 1905 spin_unlock(&cluster->lock); 1906 goto no_cluster_bitmap; 1907 } 1908 1909 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1910 if (!entry->bitmap) { 1911 spin_unlock(&cluster->lock); 1912 goto no_cluster_bitmap; 1913 } 1914 1915 if (entry->offset == offset_to_bitmap(ctl, offset)) { 1916 bytes_added = add_bytes_to_bitmap(ctl, entry, 1917 offset, bytes); 1918 bytes -= bytes_added; 1919 offset += bytes_added; 1920 } 1921 spin_unlock(&cluster->lock); 1922 if (!bytes) { 1923 ret = 1; 1924 goto out; 1925 } 1926 } 1927 1928 no_cluster_bitmap: 1929 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1930 1, 0); 1931 if (!bitmap_info) { 1932 ASSERT(added == 0); 1933 goto new_bitmap; 1934 } 1935 1936 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); 1937 bytes -= bytes_added; 1938 offset += bytes_added; 1939 added = 0; 1940 1941 if (!bytes) { 1942 ret = 1; 1943 goto out; 1944 } else 1945 goto again; 1946 1947 new_bitmap: 1948 if (info && info->bitmap) { 1949 add_new_bitmap(ctl, info, offset); 1950 added = 1; 1951 info = NULL; 1952 goto again; 1953 } else { 1954 spin_unlock(&ctl->tree_lock); 1955 1956 /* no pre-allocated info, allocate a new one */ 1957 if (!info) { 1958 info = kmem_cache_zalloc(btrfs_free_space_cachep, 1959 GFP_NOFS); 1960 if (!info) { 1961 spin_lock(&ctl->tree_lock); 1962 ret = -ENOMEM; 1963 goto out; 1964 } 1965 } 1966 1967 /* allocate the bitmap */ 1968 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 1969 spin_lock(&ctl->tree_lock); 1970 if (!info->bitmap) { 1971 ret = -ENOMEM; 1972 goto out; 1973 } 1974 goto again; 1975 
} 1976 1977 out: 1978 if (info) { 1979 if (info->bitmap) 1980 kfree(info->bitmap); 1981 kmem_cache_free(btrfs_free_space_cachep, info); 1982 } 1983 1984 return ret; 1985 } 1986 1987 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 1988 struct btrfs_free_space *info, bool update_stat) 1989 { 1990 struct btrfs_free_space *left_info; 1991 struct btrfs_free_space *right_info; 1992 bool merged = false; 1993 u64 offset = info->offset; 1994 u64 bytes = info->bytes; 1995 1996 /* 1997 * first we want to see if there is free space adjacent to the range we 1998 * are adding, if there is remove that struct and add a new one to 1999 * cover the entire range 2000 */ 2001 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 2002 if (right_info && rb_prev(&right_info->offset_index)) 2003 left_info = rb_entry(rb_prev(&right_info->offset_index), 2004 struct btrfs_free_space, offset_index); 2005 else 2006 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 2007 2008 if (right_info && !right_info->bitmap) { 2009 if (update_stat) 2010 unlink_free_space(ctl, right_info); 2011 else 2012 __unlink_free_space(ctl, right_info); 2013 info->bytes += right_info->bytes; 2014 kmem_cache_free(btrfs_free_space_cachep, right_info); 2015 merged = true; 2016 } 2017 2018 if (left_info && !left_info->bitmap && 2019 left_info->offset + left_info->bytes == offset) { 2020 if (update_stat) 2021 unlink_free_space(ctl, left_info); 2022 else 2023 __unlink_free_space(ctl, left_info); 2024 info->offset = left_info->offset; 2025 info->bytes += left_info->bytes; 2026 kmem_cache_free(btrfs_free_space_cachep, left_info); 2027 merged = true; 2028 } 2029 2030 return merged; 2031 } 2032 2033 static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, 2034 struct btrfs_free_space *info, 2035 bool update_stat) 2036 { 2037 struct btrfs_free_space *bitmap; 2038 unsigned long i; 2039 unsigned long j; 2040 const u64 end = info->offset + info->bytes; 2041 const u64 bitmap_offset = offset_to_bitmap(ctl, end); 2042 u64 bytes; 2043 2044 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2045 if (!bitmap) 2046 return false; 2047 2048 i = offset_to_bit(bitmap->offset, ctl->unit, end); 2049 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); 2050 if (j == i) 2051 return false; 2052 bytes = (j - i) * ctl->unit; 2053 info->bytes += bytes; 2054 2055 if (update_stat) 2056 bitmap_clear_bits(ctl, bitmap, end, bytes); 2057 else 2058 __bitmap_clear_bits(ctl, bitmap, end, bytes); 2059 2060 if (!bitmap->bytes) 2061 free_bitmap(ctl, bitmap); 2062 2063 return true; 2064 } 2065 2066 static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, 2067 struct btrfs_free_space *info, 2068 bool update_stat) 2069 { 2070 struct btrfs_free_space *bitmap; 2071 u64 bitmap_offset; 2072 unsigned long i; 2073 unsigned long j; 2074 unsigned long prev_j; 2075 u64 bytes; 2076 2077 bitmap_offset = offset_to_bitmap(ctl, info->offset); 2078 /* If we're on a boundary, try the previous logical bitmap. 
*/ 2079 if (bitmap_offset == info->offset) { 2080 if (info->offset == 0) 2081 return false; 2082 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); 2083 } 2084 2085 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); 2086 if (!bitmap) 2087 return false; 2088 2089 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; 2090 j = 0; 2091 prev_j = (unsigned long)-1; 2092 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { 2093 if (j > i) 2094 break; 2095 prev_j = j; 2096 } 2097 if (prev_j == i) 2098 return false; 2099 2100 if (prev_j == (unsigned long)-1) 2101 bytes = (i + 1) * ctl->unit; 2102 else 2103 bytes = (i - prev_j) * ctl->unit; 2104 2105 info->offset -= bytes; 2106 info->bytes += bytes; 2107 2108 if (update_stat) 2109 bitmap_clear_bits(ctl, bitmap, info->offset, bytes); 2110 else 2111 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); 2112 2113 if (!bitmap->bytes) 2114 free_bitmap(ctl, bitmap); 2115 2116 return true; 2117 } 2118 2119 /* 2120 * We prefer always to allocate from extent entries, both for clustered and 2121 * non-clustered allocation requests. So when attempting to add a new extent 2122 * entry, try to see if there's adjacent free space in bitmap entries, and if 2123 * there is, migrate that space from the bitmaps to the extent. 2124 * Like this we get better chances of satisfying space allocation requests 2125 * because we attempt to satisfy them based on a single cache entry, and never 2126 * on 2 or more entries - even if the entries represent a contiguous free space 2127 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry 2128 * ends). 2129 */ 2130 static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, 2131 struct btrfs_free_space *info, 2132 bool update_stat) 2133 { 2134 /* 2135 * Only work with disconnected entries, as we can change their offset, 2136 * and must be extent entries. 2137 */ 2138 ASSERT(!info->bitmap); 2139 ASSERT(RB_EMPTY_NODE(&info->offset_index)); 2140 2141 if (ctl->total_bitmaps > 0) { 2142 bool stole_end; 2143 bool stole_front = false; 2144 2145 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); 2146 if (ctl->total_bitmaps > 0) 2147 stole_front = steal_from_bitmap_to_front(ctl, info, 2148 update_stat); 2149 2150 if (stole_end || stole_front) 2151 try_merge_free_space(ctl, info, update_stat); 2152 } 2153 } 2154 2155 int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, 2156 u64 offset, u64 bytes) 2157 { 2158 struct btrfs_free_space *info; 2159 int ret = 0; 2160 2161 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 2162 if (!info) 2163 return -ENOMEM; 2164 2165 info->offset = offset; 2166 info->bytes = bytes; 2167 RB_CLEAR_NODE(&info->offset_index); 2168 2169 spin_lock(&ctl->tree_lock); 2170 2171 if (try_merge_free_space(ctl, info, true)) 2172 goto link; 2173 2174 /* 2175 * There was no extent directly to the left or right of this new 2176 * extent then we know we're going to have to allocate a new extent, so 2177 * before we do that see if we need to drop this into a bitmap 2178 */ 2179 ret = insert_into_bitmap(ctl, info); 2180 if (ret < 0) { 2181 goto out; 2182 } else if (ret) { 2183 ret = 0; 2184 goto out; 2185 } 2186 link: 2187 /* 2188 * Only steal free space from adjacent bitmaps if we're sure we're not 2189 * going to add the new free space to existing bitmap entries - because 2190 * that would mean unnecessary work that would be reverted. Therefore 2191 * attempt to steal space from bitmaps if we're adding an extent entry. 
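	 *
	 * Sketch of the effect (offsets purely illustrative): if the new
	 * extent is [1MiB, 1MiB + 256KiB) and neighbouring bitmaps happen
	 * to have a contiguous 64KiB of set bits ending at 1MiB plus 128KiB
	 * of set bits starting at 1MiB + 256KiB, steal_from_bitmap() clears
	 * those bits and the entry linked below becomes a single extent
	 * covering [1MiB - 64KiB, 1MiB + 384KiB).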
2192 */ 2193 steal_from_bitmap(ctl, info, true); 2194 2195 ret = link_free_space(ctl, info); 2196 if (ret) 2197 kmem_cache_free(btrfs_free_space_cachep, info); 2198 out: 2199 spin_unlock(&ctl->tree_lock); 2200 2201 if (ret) { 2202 printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret); 2203 ASSERT(ret != -EEXIST); 2204 } 2205 2206 return ret; 2207 } 2208 2209 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 2210 u64 offset, u64 bytes) 2211 { 2212 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2213 struct btrfs_free_space *info; 2214 int ret; 2215 bool re_search = false; 2216 2217 spin_lock(&ctl->tree_lock); 2218 2219 again: 2220 ret = 0; 2221 if (!bytes) 2222 goto out_lock; 2223 2224 info = tree_search_offset(ctl, offset, 0, 0); 2225 if (!info) { 2226 /* 2227 * oops didn't find an extent that matched the space we wanted 2228 * to remove, look for a bitmap instead 2229 */ 2230 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 2231 1, 0); 2232 if (!info) { 2233 /* 2234 * If we found a partial bit of our free space in a 2235 * bitmap but then couldn't find the other part this may 2236 * be a problem, so WARN about it. 2237 */ 2238 WARN_ON(re_search); 2239 goto out_lock; 2240 } 2241 } 2242 2243 re_search = false; 2244 if (!info->bitmap) { 2245 unlink_free_space(ctl, info); 2246 if (offset == info->offset) { 2247 u64 to_free = min(bytes, info->bytes); 2248 2249 info->bytes -= to_free; 2250 info->offset += to_free; 2251 if (info->bytes) { 2252 ret = link_free_space(ctl, info); 2253 WARN_ON(ret); 2254 } else { 2255 kmem_cache_free(btrfs_free_space_cachep, info); 2256 } 2257 2258 offset += to_free; 2259 bytes -= to_free; 2260 goto again; 2261 } else { 2262 u64 old_end = info->bytes + info->offset; 2263 2264 info->bytes = offset - info->offset; 2265 ret = link_free_space(ctl, info); 2266 WARN_ON(ret); 2267 if (ret) 2268 goto out_lock; 2269 2270 /* Not enough bytes in this entry to satisfy us */ 2271 if (old_end < offset + bytes) { 2272 bytes -= old_end - offset; 2273 offset = old_end; 2274 goto again; 2275 } else if (old_end == offset + bytes) { 2276 /* all done */ 2277 goto out_lock; 2278 } 2279 spin_unlock(&ctl->tree_lock); 2280 2281 ret = btrfs_add_free_space(block_group, offset + bytes, 2282 old_end - (offset + bytes)); 2283 WARN_ON(ret); 2284 goto out; 2285 } 2286 } 2287 2288 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 2289 if (ret == -EAGAIN) { 2290 re_search = true; 2291 goto again; 2292 } 2293 out_lock: 2294 spin_unlock(&ctl->tree_lock); 2295 out: 2296 return ret; 2297 } 2298 2299 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 2300 u64 bytes) 2301 { 2302 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2303 struct btrfs_free_space *info; 2304 struct rb_node *n; 2305 int count = 0; 2306 2307 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 2308 info = rb_entry(n, struct btrfs_free_space, offset_index); 2309 if (info->bytes >= bytes && !block_group->ro) 2310 count++; 2311 btrfs_crit(block_group->fs_info, 2312 "entry offset %llu, bytes %llu, bitmap %s", 2313 info->offset, info->bytes, 2314 (info->bitmap) ? "yes" : "no"); 2315 } 2316 btrfs_info(block_group->fs_info, "block group has cluster?: %s", 2317 list_empty(&block_group->cluster_list) ? 
"no" : "yes"); 2318 btrfs_info(block_group->fs_info, 2319 "%d blocks of free space at or bigger than bytes is", count); 2320 } 2321 2322 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) 2323 { 2324 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2325 2326 spin_lock_init(&ctl->tree_lock); 2327 ctl->unit = block_group->sectorsize; 2328 ctl->start = block_group->key.objectid; 2329 ctl->private = block_group; 2330 ctl->op = &free_space_op; 2331 INIT_LIST_HEAD(&ctl->trimming_ranges); 2332 mutex_init(&ctl->cache_writeout_mutex); 2333 2334 /* 2335 * we only want to have 32k of ram per block group for keeping 2336 * track of free space, and if we pass 1/2 of that we want to 2337 * start converting things over to using bitmaps 2338 */ 2339 ctl->extents_thresh = ((1024 * 32) / 2) / 2340 sizeof(struct btrfs_free_space); 2341 } 2342 2343 /* 2344 * for a given cluster, put all of its extents back into the free 2345 * space cache. If the block group passed doesn't match the block group 2346 * pointed to by the cluster, someone else raced in and freed the 2347 * cluster already. In that case, we just return without changing anything 2348 */ 2349 static int 2350 __btrfs_return_cluster_to_free_space( 2351 struct btrfs_block_group_cache *block_group, 2352 struct btrfs_free_cluster *cluster) 2353 { 2354 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2355 struct btrfs_free_space *entry; 2356 struct rb_node *node; 2357 2358 spin_lock(&cluster->lock); 2359 if (cluster->block_group != block_group) 2360 goto out; 2361 2362 cluster->block_group = NULL; 2363 cluster->window_start = 0; 2364 list_del_init(&cluster->block_group_list); 2365 2366 node = rb_first(&cluster->root); 2367 while (node) { 2368 bool bitmap; 2369 2370 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2371 node = rb_next(&entry->offset_index); 2372 rb_erase(&entry->offset_index, &cluster->root); 2373 RB_CLEAR_NODE(&entry->offset_index); 2374 2375 bitmap = (entry->bitmap != NULL); 2376 if (!bitmap) { 2377 try_merge_free_space(ctl, entry, false); 2378 steal_from_bitmap(ctl, entry, false); 2379 } 2380 tree_insert_offset(&ctl->free_space_offset, 2381 entry->offset, &entry->offset_index, bitmap); 2382 } 2383 cluster->root = RB_ROOT; 2384 2385 out: 2386 spin_unlock(&cluster->lock); 2387 btrfs_put_block_group(block_group); 2388 return 0; 2389 } 2390 2391 static void __btrfs_remove_free_space_cache_locked( 2392 struct btrfs_free_space_ctl *ctl) 2393 { 2394 struct btrfs_free_space *info; 2395 struct rb_node *node; 2396 2397 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 2398 info = rb_entry(node, struct btrfs_free_space, offset_index); 2399 if (!info->bitmap) { 2400 unlink_free_space(ctl, info); 2401 kmem_cache_free(btrfs_free_space_cachep, info); 2402 } else { 2403 free_bitmap(ctl, info); 2404 } 2405 if (need_resched()) { 2406 spin_unlock(&ctl->tree_lock); 2407 cond_resched(); 2408 spin_lock(&ctl->tree_lock); 2409 } 2410 } 2411 } 2412 2413 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 2414 { 2415 spin_lock(&ctl->tree_lock); 2416 __btrfs_remove_free_space_cache_locked(ctl); 2417 spin_unlock(&ctl->tree_lock); 2418 } 2419 2420 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 2421 { 2422 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2423 struct btrfs_free_cluster *cluster; 2424 struct list_head *head; 2425 2426 spin_lock(&ctl->tree_lock); 2427 while ((head = block_group->cluster_list.next) != 2428 
&block_group->cluster_list) { 2429 cluster = list_entry(head, struct btrfs_free_cluster, 2430 block_group_list); 2431 2432 WARN_ON(cluster->block_group != block_group); 2433 __btrfs_return_cluster_to_free_space(block_group, cluster); 2434 if (need_resched()) { 2435 spin_unlock(&ctl->tree_lock); 2436 cond_resched(); 2437 spin_lock(&ctl->tree_lock); 2438 } 2439 } 2440 __btrfs_remove_free_space_cache_locked(ctl); 2441 spin_unlock(&ctl->tree_lock); 2442 2443 } 2444 2445 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, 2446 u64 offset, u64 bytes, u64 empty_size, 2447 u64 *max_extent_size) 2448 { 2449 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2450 struct btrfs_free_space *entry = NULL; 2451 u64 bytes_search = bytes + empty_size; 2452 u64 ret = 0; 2453 u64 align_gap = 0; 2454 u64 align_gap_len = 0; 2455 2456 spin_lock(&ctl->tree_lock); 2457 entry = find_free_space(ctl, &offset, &bytes_search, 2458 block_group->full_stripe_len, max_extent_size); 2459 if (!entry) 2460 goto out; 2461 2462 ret = offset; 2463 if (entry->bitmap) { 2464 bitmap_clear_bits(ctl, entry, offset, bytes); 2465 if (!entry->bytes) 2466 free_bitmap(ctl, entry); 2467 } else { 2468 unlink_free_space(ctl, entry); 2469 align_gap_len = offset - entry->offset; 2470 align_gap = entry->offset; 2471 2472 entry->offset = offset + bytes; 2473 WARN_ON(entry->bytes < bytes + align_gap_len); 2474 2475 entry->bytes -= bytes + align_gap_len; 2476 if (!entry->bytes) 2477 kmem_cache_free(btrfs_free_space_cachep, entry); 2478 else 2479 link_free_space(ctl, entry); 2480 } 2481 out: 2482 spin_unlock(&ctl->tree_lock); 2483 2484 if (align_gap_len) 2485 __btrfs_add_free_space(ctl, align_gap, align_gap_len); 2486 return ret; 2487 } 2488 2489 /* 2490 * given a cluster, put all of its extents back into the free space 2491 * cache. If a block group is passed, this function will only free 2492 * a cluster that belongs to the passed block group. 2493 * 2494 * Otherwise, it'll get a reference on the block group pointed to by the 2495 * cluster and remove the cluster from it. 
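 *
 * Note on references: the cluster itself holds a reference on the block
 * group (taken in btrfs_find_space_cluster()), and
 * __btrfs_return_cluster_to_free_space() drops that reference once the
 * cluster is detached; the extra count taken below only keeps the block
 * group alive while we dereference its free_space_ctl.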
2496 */ 2497 int btrfs_return_cluster_to_free_space( 2498 struct btrfs_block_group_cache *block_group, 2499 struct btrfs_free_cluster *cluster) 2500 { 2501 struct btrfs_free_space_ctl *ctl; 2502 int ret; 2503 2504 /* first, get a safe pointer to the block group */ 2505 spin_lock(&cluster->lock); 2506 if (!block_group) { 2507 block_group = cluster->block_group; 2508 if (!block_group) { 2509 spin_unlock(&cluster->lock); 2510 return 0; 2511 } 2512 } else if (cluster->block_group != block_group) { 2513 /* someone else has already freed it don't redo their work */ 2514 spin_unlock(&cluster->lock); 2515 return 0; 2516 } 2517 atomic_inc(&block_group->count); 2518 spin_unlock(&cluster->lock); 2519 2520 ctl = block_group->free_space_ctl; 2521 2522 /* now return any extents the cluster had on it */ 2523 spin_lock(&ctl->tree_lock); 2524 ret = __btrfs_return_cluster_to_free_space(block_group, cluster); 2525 spin_unlock(&ctl->tree_lock); 2526 2527 /* finally drop our ref */ 2528 btrfs_put_block_group(block_group); 2529 return ret; 2530 } 2531 2532 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, 2533 struct btrfs_free_cluster *cluster, 2534 struct btrfs_free_space *entry, 2535 u64 bytes, u64 min_start, 2536 u64 *max_extent_size) 2537 { 2538 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2539 int err; 2540 u64 search_start = cluster->window_start; 2541 u64 search_bytes = bytes; 2542 u64 ret = 0; 2543 2544 search_start = min_start; 2545 search_bytes = bytes; 2546 2547 err = search_bitmap(ctl, entry, &search_start, &search_bytes); 2548 if (err) { 2549 if (search_bytes > *max_extent_size) 2550 *max_extent_size = search_bytes; 2551 return 0; 2552 } 2553 2554 ret = search_start; 2555 __bitmap_clear_bits(ctl, entry, ret, bytes); 2556 2557 return ret; 2558 } 2559 2560 /* 2561 * given a cluster, try to allocate 'bytes' from it, returns 0 2562 * if it couldn't find anything suitably large, or a logical disk offset 2563 * if things worked out 2564 */ 2565 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, 2566 struct btrfs_free_cluster *cluster, u64 bytes, 2567 u64 min_start, u64 *max_extent_size) 2568 { 2569 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2570 struct btrfs_free_space *entry = NULL; 2571 struct rb_node *node; 2572 u64 ret = 0; 2573 2574 spin_lock(&cluster->lock); 2575 if (bytes > cluster->max_size) 2576 goto out; 2577 2578 if (cluster->block_group != block_group) 2579 goto out; 2580 2581 node = rb_first(&cluster->root); 2582 if (!node) 2583 goto out; 2584 2585 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2586 while (1) { 2587 if (entry->bytes < bytes && entry->bytes > *max_extent_size) 2588 *max_extent_size = entry->bytes; 2589 2590 if (entry->bytes < bytes || 2591 (!entry->bitmap && entry->offset < min_start)) { 2592 node = rb_next(&entry->offset_index); 2593 if (!node) 2594 break; 2595 entry = rb_entry(node, struct btrfs_free_space, 2596 offset_index); 2597 continue; 2598 } 2599 2600 if (entry->bitmap) { 2601 ret = btrfs_alloc_from_bitmap(block_group, 2602 cluster, entry, bytes, 2603 cluster->window_start, 2604 max_extent_size); 2605 if (ret == 0) { 2606 node = rb_next(&entry->offset_index); 2607 if (!node) 2608 break; 2609 entry = rb_entry(node, struct btrfs_free_space, 2610 offset_index); 2611 continue; 2612 } 2613 cluster->window_start += bytes; 2614 } else { 2615 ret = entry->offset; 2616 2617 entry->offset += bytes; 2618 entry->bytes -= bytes; 2619 } 2620 2621 if (entry->bytes == 0) 
2622 rb_erase(&entry->offset_index, &cluster->root); 2623 break; 2624 } 2625 out: 2626 spin_unlock(&cluster->lock); 2627 2628 if (!ret) 2629 return 0; 2630 2631 spin_lock(&ctl->tree_lock); 2632 2633 ctl->free_space -= bytes; 2634 if (entry->bytes == 0) { 2635 ctl->free_extents--; 2636 if (entry->bitmap) { 2637 kfree(entry->bitmap); 2638 ctl->total_bitmaps--; 2639 ctl->op->recalc_thresholds(ctl); 2640 } 2641 kmem_cache_free(btrfs_free_space_cachep, entry); 2642 } 2643 2644 spin_unlock(&ctl->tree_lock); 2645 2646 return ret; 2647 } 2648 2649 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, 2650 struct btrfs_free_space *entry, 2651 struct btrfs_free_cluster *cluster, 2652 u64 offset, u64 bytes, 2653 u64 cont1_bytes, u64 min_bytes) 2654 { 2655 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2656 unsigned long next_zero; 2657 unsigned long i; 2658 unsigned long want_bits; 2659 unsigned long min_bits; 2660 unsigned long found_bits; 2661 unsigned long start = 0; 2662 unsigned long total_found = 0; 2663 int ret; 2664 2665 i = offset_to_bit(entry->offset, ctl->unit, 2666 max_t(u64, offset, entry->offset)); 2667 want_bits = bytes_to_bits(bytes, ctl->unit); 2668 min_bits = bytes_to_bits(min_bytes, ctl->unit); 2669 2670 again: 2671 found_bits = 0; 2672 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 2673 next_zero = find_next_zero_bit(entry->bitmap, 2674 BITS_PER_BITMAP, i); 2675 if (next_zero - i >= min_bits) { 2676 found_bits = next_zero - i; 2677 break; 2678 } 2679 i = next_zero; 2680 } 2681 2682 if (!found_bits) 2683 return -ENOSPC; 2684 2685 if (!total_found) { 2686 start = i; 2687 cluster->max_size = 0; 2688 } 2689 2690 total_found += found_bits; 2691 2692 if (cluster->max_size < found_bits * ctl->unit) 2693 cluster->max_size = found_bits * ctl->unit; 2694 2695 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2696 i = next_zero + 1; 2697 goto again; 2698 } 2699 2700 cluster->window_start = start * ctl->unit + entry->offset; 2701 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2702 ret = tree_insert_offset(&cluster->root, entry->offset, 2703 &entry->offset_index, 1); 2704 ASSERT(!ret); /* -EEXIST; Logic error */ 2705 2706 trace_btrfs_setup_cluster(block_group, cluster, 2707 total_found * ctl->unit, 1); 2708 return 0; 2709 } 2710 2711 /* 2712 * This searches the block group for just extents to fill the cluster with. 2713 * Try to find a cluster with at least bytes total bytes, at least one 2714 * extent of cont1_bytes, and other clusters of at least min_bytes. 2715 */ 2716 static noinline int 2717 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, 2718 struct btrfs_free_cluster *cluster, 2719 struct list_head *bitmaps, u64 offset, u64 bytes, 2720 u64 cont1_bytes, u64 min_bytes) 2721 { 2722 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2723 struct btrfs_free_space *first = NULL; 2724 struct btrfs_free_space *entry = NULL; 2725 struct btrfs_free_space *last; 2726 struct rb_node *node; 2727 u64 window_free; 2728 u64 max_extent; 2729 u64 total_size = 0; 2730 2731 entry = tree_search_offset(ctl, offset, 0, 1); 2732 if (!entry) 2733 return -ENOSPC; 2734 2735 /* 2736 * We don't want bitmaps, so just move along until we find a normal 2737 * extent entry. 
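	 * Bitmaps we skip over are stashed on the caller-provided 'bitmaps'
	 * list so that setup_cluster_bitmap() can fall back to them if no
	 * run of extent entries turns out to be big enough.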
2738 */ 2739 while (entry->bitmap || entry->bytes < min_bytes) { 2740 if (entry->bitmap && list_empty(&entry->list)) 2741 list_add_tail(&entry->list, bitmaps); 2742 node = rb_next(&entry->offset_index); 2743 if (!node) 2744 return -ENOSPC; 2745 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2746 } 2747 2748 window_free = entry->bytes; 2749 max_extent = entry->bytes; 2750 first = entry; 2751 last = entry; 2752 2753 for (node = rb_next(&entry->offset_index); node; 2754 node = rb_next(&entry->offset_index)) { 2755 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2756 2757 if (entry->bitmap) { 2758 if (list_empty(&entry->list)) 2759 list_add_tail(&entry->list, bitmaps); 2760 continue; 2761 } 2762 2763 if (entry->bytes < min_bytes) 2764 continue; 2765 2766 last = entry; 2767 window_free += entry->bytes; 2768 if (entry->bytes > max_extent) 2769 max_extent = entry->bytes; 2770 } 2771 2772 if (window_free < bytes || max_extent < cont1_bytes) 2773 return -ENOSPC; 2774 2775 cluster->window_start = first->offset; 2776 2777 node = &first->offset_index; 2778 2779 /* 2780 * now we've found our entries, pull them out of the free space 2781 * cache and put them into the cluster rbtree 2782 */ 2783 do { 2784 int ret; 2785 2786 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2787 node = rb_next(&entry->offset_index); 2788 if (entry->bitmap || entry->bytes < min_bytes) 2789 continue; 2790 2791 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2792 ret = tree_insert_offset(&cluster->root, entry->offset, 2793 &entry->offset_index, 0); 2794 total_size += entry->bytes; 2795 ASSERT(!ret); /* -EEXIST; Logic error */ 2796 } while (node && entry != last); 2797 2798 cluster->max_size = max_extent; 2799 trace_btrfs_setup_cluster(block_group, cluster, total_size, 0); 2800 return 0; 2801 } 2802 2803 /* 2804 * This specifically looks for bitmaps that may work in the cluster, we assume 2805 * that we have already failed to find extents that will work. 2806 */ 2807 static noinline int 2808 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, 2809 struct btrfs_free_cluster *cluster, 2810 struct list_head *bitmaps, u64 offset, u64 bytes, 2811 u64 cont1_bytes, u64 min_bytes) 2812 { 2813 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2814 struct btrfs_free_space *entry; 2815 int ret = -ENOSPC; 2816 u64 bitmap_offset = offset_to_bitmap(ctl, offset); 2817 2818 if (ctl->total_bitmaps == 0) 2819 return -ENOSPC; 2820 2821 /* 2822 * The bitmap that covers offset won't be in the list unless offset 2823 * is just its start offset. 2824 */ 2825 entry = list_first_entry(bitmaps, struct btrfs_free_space, list); 2826 if (entry->offset != bitmap_offset) { 2827 entry = tree_search_offset(ctl, bitmap_offset, 1, 0); 2828 if (entry && list_empty(&entry->list)) 2829 list_add(&entry->list, bitmaps); 2830 } 2831 2832 list_for_each_entry(entry, bitmaps, list) { 2833 if (entry->bytes < bytes) 2834 continue; 2835 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 2836 bytes, cont1_bytes, min_bytes); 2837 if (!ret) 2838 return 0; 2839 } 2840 2841 /* 2842 * The bitmaps list has all the bitmaps that record free space 2843 * starting after offset, so no more search is required. 2844 */ 2845 return -ENOSPC; 2846 } 2847 2848 /* 2849 * here we try to find a cluster of blocks in a block group. The goal 2850 * is to find at least bytes+empty_size. 2851 * We might not find them all in one contiguous area. 
2852 * 2853 * returns zero and sets up cluster if things worked out, otherwise 2854 * it returns -enospc 2855 */ 2856 int btrfs_find_space_cluster(struct btrfs_root *root, 2857 struct btrfs_block_group_cache *block_group, 2858 struct btrfs_free_cluster *cluster, 2859 u64 offset, u64 bytes, u64 empty_size) 2860 { 2861 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2862 struct btrfs_free_space *entry, *tmp; 2863 LIST_HEAD(bitmaps); 2864 u64 min_bytes; 2865 u64 cont1_bytes; 2866 int ret; 2867 2868 /* 2869 * Choose the minimum extent size we'll require for this 2870 * cluster. For SSD_SPREAD, don't allow any fragmentation. 2871 * For metadata, allow allocates with smaller extents. For 2872 * data, keep it dense. 2873 */ 2874 if (btrfs_test_opt(root, SSD_SPREAD)) { 2875 cont1_bytes = min_bytes = bytes + empty_size; 2876 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { 2877 cont1_bytes = bytes; 2878 min_bytes = block_group->sectorsize; 2879 } else { 2880 cont1_bytes = max(bytes, (bytes + empty_size) >> 2); 2881 min_bytes = block_group->sectorsize; 2882 } 2883 2884 spin_lock(&ctl->tree_lock); 2885 2886 /* 2887 * If we know we don't have enough space to make a cluster don't even 2888 * bother doing all the work to try and find one. 2889 */ 2890 if (ctl->free_space < bytes) { 2891 spin_unlock(&ctl->tree_lock); 2892 return -ENOSPC; 2893 } 2894 2895 spin_lock(&cluster->lock); 2896 2897 /* someone already found a cluster, hooray */ 2898 if (cluster->block_group) { 2899 ret = 0; 2900 goto out; 2901 } 2902 2903 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 2904 min_bytes); 2905 2906 INIT_LIST_HEAD(&bitmaps); 2907 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2908 bytes + empty_size, 2909 cont1_bytes, min_bytes); 2910 if (ret) 2911 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 2912 offset, bytes + empty_size, 2913 cont1_bytes, min_bytes); 2914 2915 /* Clear our temporary list */ 2916 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 2917 list_del_init(&entry->list); 2918 2919 if (!ret) { 2920 atomic_inc(&block_group->count); 2921 list_add_tail(&cluster->block_group_list, 2922 &block_group->cluster_list); 2923 cluster->block_group = block_group; 2924 } else { 2925 trace_btrfs_failed_cluster_setup(block_group); 2926 } 2927 out: 2928 spin_unlock(&cluster->lock); 2929 spin_unlock(&ctl->tree_lock); 2930 2931 return ret; 2932 } 2933 2934 /* 2935 * simple code to zero out a cluster 2936 */ 2937 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 2938 { 2939 spin_lock_init(&cluster->lock); 2940 spin_lock_init(&cluster->refill_lock); 2941 cluster->root = RB_ROOT; 2942 cluster->max_size = 0; 2943 INIT_LIST_HEAD(&cluster->block_group_list); 2944 cluster->block_group = NULL; 2945 } 2946 2947 static int do_trimming(struct btrfs_block_group_cache *block_group, 2948 u64 *total_trimmed, u64 start, u64 bytes, 2949 u64 reserved_start, u64 reserved_bytes, 2950 struct btrfs_trim_range *trim_entry) 2951 { 2952 struct btrfs_space_info *space_info = block_group->space_info; 2953 struct btrfs_fs_info *fs_info = block_group->fs_info; 2954 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2955 int ret; 2956 int update = 0; 2957 u64 trimmed = 0; 2958 2959 spin_lock(&space_info->lock); 2960 spin_lock(&block_group->lock); 2961 if (!block_group->ro) { 2962 block_group->reserved += reserved_bytes; 2963 space_info->bytes_reserved += reserved_bytes; 2964 update = 1; 2965 } 2966 spin_unlock(&block_group->lock); 2967 
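	/*
	 * The caller already pulled this range out of the free space cache,
	 * so while the discard below runs the bytes are temporarily carried
	 * as reserved; that accounting is reverted further down once
	 * btrfs_add_free_space() has put the range back.
	 */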
spin_unlock(&space_info->lock); 2968 2969 ret = btrfs_discard_extent(fs_info->extent_root, 2970 start, bytes, &trimmed); 2971 if (!ret) 2972 *total_trimmed += trimmed; 2973 2974 mutex_lock(&ctl->cache_writeout_mutex); 2975 btrfs_add_free_space(block_group, reserved_start, reserved_bytes); 2976 list_del(&trim_entry->list); 2977 mutex_unlock(&ctl->cache_writeout_mutex); 2978 2979 if (update) { 2980 spin_lock(&space_info->lock); 2981 spin_lock(&block_group->lock); 2982 if (block_group->ro) 2983 space_info->bytes_readonly += reserved_bytes; 2984 block_group->reserved -= reserved_bytes; 2985 space_info->bytes_reserved -= reserved_bytes; 2986 spin_unlock(&space_info->lock); 2987 spin_unlock(&block_group->lock); 2988 } 2989 2990 return ret; 2991 } 2992 2993 static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, 2994 u64 *total_trimmed, u64 start, u64 end, u64 minlen) 2995 { 2996 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2997 struct btrfs_free_space *entry; 2998 struct rb_node *node; 2999 int ret = 0; 3000 u64 extent_start; 3001 u64 extent_bytes; 3002 u64 bytes; 3003 3004 while (start < end) { 3005 struct btrfs_trim_range trim_entry; 3006 3007 mutex_lock(&ctl->cache_writeout_mutex); 3008 spin_lock(&ctl->tree_lock); 3009 3010 if (ctl->free_space < minlen) { 3011 spin_unlock(&ctl->tree_lock); 3012 mutex_unlock(&ctl->cache_writeout_mutex); 3013 break; 3014 } 3015 3016 entry = tree_search_offset(ctl, start, 0, 1); 3017 if (!entry) { 3018 spin_unlock(&ctl->tree_lock); 3019 mutex_unlock(&ctl->cache_writeout_mutex); 3020 break; 3021 } 3022 3023 /* skip bitmaps */ 3024 while (entry->bitmap) { 3025 node = rb_next(&entry->offset_index); 3026 if (!node) { 3027 spin_unlock(&ctl->tree_lock); 3028 mutex_unlock(&ctl->cache_writeout_mutex); 3029 goto out; 3030 } 3031 entry = rb_entry(node, struct btrfs_free_space, 3032 offset_index); 3033 } 3034 3035 if (entry->offset >= end) { 3036 spin_unlock(&ctl->tree_lock); 3037 mutex_unlock(&ctl->cache_writeout_mutex); 3038 break; 3039 } 3040 3041 extent_start = entry->offset; 3042 extent_bytes = entry->bytes; 3043 start = max(start, extent_start); 3044 bytes = min(extent_start + extent_bytes, end) - start; 3045 if (bytes < minlen) { 3046 spin_unlock(&ctl->tree_lock); 3047 mutex_unlock(&ctl->cache_writeout_mutex); 3048 goto next; 3049 } 3050 3051 unlink_free_space(ctl, entry); 3052 kmem_cache_free(btrfs_free_space_cachep, entry); 3053 3054 spin_unlock(&ctl->tree_lock); 3055 trim_entry.start = extent_start; 3056 trim_entry.bytes = extent_bytes; 3057 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3058 mutex_unlock(&ctl->cache_writeout_mutex); 3059 3060 ret = do_trimming(block_group, total_trimmed, start, bytes, 3061 extent_start, extent_bytes, &trim_entry); 3062 if (ret) 3063 break; 3064 next: 3065 start += bytes; 3066 3067 if (fatal_signal_pending(current)) { 3068 ret = -ERESTARTSYS; 3069 break; 3070 } 3071 3072 cond_resched(); 3073 } 3074 out: 3075 return ret; 3076 } 3077 3078 static int trim_bitmaps(struct btrfs_block_group_cache *block_group, 3079 u64 *total_trimmed, u64 start, u64 end, u64 minlen) 3080 { 3081 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 3082 struct btrfs_free_space *entry; 3083 int ret = 0; 3084 int ret2; 3085 u64 bytes; 3086 u64 offset = offset_to_bitmap(ctl, start); 3087 3088 while (offset < end) { 3089 bool next_bitmap = false; 3090 struct btrfs_trim_range trim_entry; 3091 3092 mutex_lock(&ctl->cache_writeout_mutex); 3093 spin_lock(&ctl->tree_lock); 3094 3095 if (ctl->free_space < 
minlen) { 3096 spin_unlock(&ctl->tree_lock); 3097 mutex_unlock(&ctl->cache_writeout_mutex); 3098 break; 3099 } 3100 3101 entry = tree_search_offset(ctl, offset, 1, 0); 3102 if (!entry) { 3103 spin_unlock(&ctl->tree_lock); 3104 mutex_unlock(&ctl->cache_writeout_mutex); 3105 next_bitmap = true; 3106 goto next; 3107 } 3108 3109 bytes = minlen; 3110 ret2 = search_bitmap(ctl, entry, &start, &bytes); 3111 if (ret2 || start >= end) { 3112 spin_unlock(&ctl->tree_lock); 3113 mutex_unlock(&ctl->cache_writeout_mutex); 3114 next_bitmap = true; 3115 goto next; 3116 } 3117 3118 bytes = min(bytes, end - start); 3119 if (bytes < minlen) { 3120 spin_unlock(&ctl->tree_lock); 3121 mutex_unlock(&ctl->cache_writeout_mutex); 3122 goto next; 3123 } 3124 3125 bitmap_clear_bits(ctl, entry, start, bytes); 3126 if (entry->bytes == 0) 3127 free_bitmap(ctl, entry); 3128 3129 spin_unlock(&ctl->tree_lock); 3130 trim_entry.start = start; 3131 trim_entry.bytes = bytes; 3132 list_add_tail(&trim_entry.list, &ctl->trimming_ranges); 3133 mutex_unlock(&ctl->cache_writeout_mutex); 3134 3135 ret = do_trimming(block_group, total_trimmed, start, bytes, 3136 start, bytes, &trim_entry); 3137 if (ret) 3138 break; 3139 next: 3140 if (next_bitmap) { 3141 offset += BITS_PER_BITMAP * ctl->unit; 3142 } else { 3143 start += bytes; 3144 if (start >= offset + BITS_PER_BITMAP * ctl->unit) 3145 offset += BITS_PER_BITMAP * ctl->unit; 3146 } 3147 3148 if (fatal_signal_pending(current)) { 3149 ret = -ERESTARTSYS; 3150 break; 3151 } 3152 3153 cond_resched(); 3154 } 3155 3156 return ret; 3157 } 3158 3159 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, 3160 u64 *trimmed, u64 start, u64 end, u64 minlen) 3161 { 3162 int ret; 3163 3164 *trimmed = 0; 3165 3166 spin_lock(&block_group->lock); 3167 if (block_group->removed) { 3168 spin_unlock(&block_group->lock); 3169 return 0; 3170 } 3171 atomic_inc(&block_group->trimming); 3172 spin_unlock(&block_group->lock); 3173 3174 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); 3175 if (ret) 3176 goto out; 3177 3178 ret = trim_bitmaps(block_group, trimmed, start, end, minlen); 3179 out: 3180 spin_lock(&block_group->lock); 3181 if (atomic_dec_and_test(&block_group->trimming) && 3182 block_group->removed) { 3183 struct extent_map_tree *em_tree; 3184 struct extent_map *em; 3185 3186 spin_unlock(&block_group->lock); 3187 3188 lock_chunks(block_group->fs_info->chunk_root); 3189 em_tree = &block_group->fs_info->mapping_tree.map_tree; 3190 write_lock(&em_tree->lock); 3191 em = lookup_extent_mapping(em_tree, block_group->key.objectid, 3192 1); 3193 BUG_ON(!em); /* logic error, can't happen */ 3194 /* 3195 * remove_extent_mapping() will delete us from the pinned_chunks 3196 * list, which is protected by the chunk mutex. 3197 */ 3198 remove_extent_mapping(em_tree, em); 3199 write_unlock(&em_tree->lock); 3200 unlock_chunks(block_group->fs_info->chunk_root); 3201 3202 /* once for us and once for the tree */ 3203 free_extent_map(em); 3204 free_extent_map(em); 3205 3206 /* 3207 * We've left one free space entry and other tasks trimming 3208 * this block group have left 1 entry each one. Free them. 3209 */ 3210 __btrfs_remove_free_space_cache(block_group->free_space_ctl); 3211 } else { 3212 spin_unlock(&block_group->lock); 3213 } 3214 3215 return ret; 3216 } 3217 3218 /* 3219 * Find the left-most item in the cache tree, and then return the 3220 * smallest inode number in the item. 
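 * For an extent entry that is simply the entry's offset; for a bitmap
 * entry we return the offset of its first set bit.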
3221 * 3222 * Note: the returned inode number may not be the smallest one in 3223 * the tree, if the left-most item is a bitmap. 3224 */ 3225 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) 3226 { 3227 struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl; 3228 struct btrfs_free_space *entry = NULL; 3229 u64 ino = 0; 3230 3231 spin_lock(&ctl->tree_lock); 3232 3233 if (RB_EMPTY_ROOT(&ctl->free_space_offset)) 3234 goto out; 3235 3236 entry = rb_entry(rb_first(&ctl->free_space_offset), 3237 struct btrfs_free_space, offset_index); 3238 3239 if (!entry->bitmap) { 3240 ino = entry->offset; 3241 3242 unlink_free_space(ctl, entry); 3243 entry->offset++; 3244 entry->bytes--; 3245 if (!entry->bytes) 3246 kmem_cache_free(btrfs_free_space_cachep, entry); 3247 else 3248 link_free_space(ctl, entry); 3249 } else { 3250 u64 offset = 0; 3251 u64 count = 1; 3252 int ret; 3253 3254 ret = search_bitmap(ctl, entry, &offset, &count); 3255 /* Logic error; Should be empty if it can't find anything */ 3256 ASSERT(!ret); 3257 3258 ino = offset; 3259 bitmap_clear_bits(ctl, entry, offset, 1); 3260 if (entry->bytes == 0) 3261 free_bitmap(ctl, entry); 3262 } 3263 out: 3264 spin_unlock(&ctl->tree_lock); 3265 3266 return ino; 3267 } 3268 3269 struct inode *lookup_free_ino_inode(struct btrfs_root *root, 3270 struct btrfs_path *path) 3271 { 3272 struct inode *inode = NULL; 3273 3274 spin_lock(&root->ino_cache_lock); 3275 if (root->ino_cache_inode) 3276 inode = igrab(root->ino_cache_inode); 3277 spin_unlock(&root->ino_cache_lock); 3278 if (inode) 3279 return inode; 3280 3281 inode = __lookup_free_space_inode(root, path, 0); 3282 if (IS_ERR(inode)) 3283 return inode; 3284 3285 spin_lock(&root->ino_cache_lock); 3286 if (!btrfs_fs_closing(root->fs_info)) 3287 root->ino_cache_inode = igrab(inode); 3288 spin_unlock(&root->ino_cache_lock); 3289 3290 return inode; 3291 } 3292 3293 int create_free_ino_inode(struct btrfs_root *root, 3294 struct btrfs_trans_handle *trans, 3295 struct btrfs_path *path) 3296 { 3297 return __create_free_space_inode(root, trans, path, 3298 BTRFS_FREE_INO_OBJECTID, 0); 3299 } 3300 3301 int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 3302 { 3303 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 3304 struct btrfs_path *path; 3305 struct inode *inode; 3306 int ret = 0; 3307 u64 root_gen = btrfs_root_generation(&root->root_item); 3308 3309 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 3310 return 0; 3311 3312 /* 3313 * If we're unmounting then just return, since this does a search on the 3314 * normal root and not the commit root and we could deadlock. 
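	 * Bailing out here is harmless; it just means the in-memory cache
	 * starts out empty.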
3315 */ 3316 if (btrfs_fs_closing(fs_info)) 3317 return 0; 3318 3319 path = btrfs_alloc_path(); 3320 if (!path) 3321 return 0; 3322 3323 inode = lookup_free_ino_inode(root, path); 3324 if (IS_ERR(inode)) 3325 goto out; 3326 3327 if (root_gen != BTRFS_I(inode)->generation) 3328 goto out_put; 3329 3330 ret = __load_free_space_cache(root, inode, ctl, path, 0); 3331 3332 if (ret < 0) 3333 btrfs_err(fs_info, 3334 "failed to load free ino cache for root %llu", 3335 root->root_key.objectid); 3336 out_put: 3337 iput(inode); 3338 out: 3339 btrfs_free_path(path); 3340 return ret; 3341 } 3342 3343 int btrfs_write_out_ino_cache(struct btrfs_root *root, 3344 struct btrfs_trans_handle *trans, 3345 struct btrfs_path *path, 3346 struct inode *inode) 3347 { 3348 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 3349 int ret; 3350 3351 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 3352 return 0; 3353 3354 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 3355 if (ret) { 3356 btrfs_delalloc_release_metadata(inode, inode->i_size); 3357 #ifdef DEBUG 3358 btrfs_err(root->fs_info, 3359 "failed to write free ino cache for root %llu", 3360 root->root_key.objectid); 3361 #endif 3362 } 3363 3364 return ret; 3365 } 3366 3367 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 3368 /* 3369 * Use this if you need to make a bitmap or extent entry specifically, it 3370 * doesn't do any of the merging that add_free_space does, this acts a lot like 3371 * how the free space cache loading stuff works, so you can get really weird 3372 * configurations. 3373 */ 3374 int test_add_free_space_entry(struct btrfs_block_group_cache *cache, 3375 u64 offset, u64 bytes, bool bitmap) 3376 { 3377 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 3378 struct btrfs_free_space *info = NULL, *bitmap_info; 3379 void *map = NULL; 3380 u64 bytes_added; 3381 int ret; 3382 3383 again: 3384 if (!info) { 3385 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 3386 if (!info) 3387 return -ENOMEM; 3388 } 3389 3390 if (!bitmap) { 3391 spin_lock(&ctl->tree_lock); 3392 info->offset = offset; 3393 info->bytes = bytes; 3394 ret = link_free_space(ctl, info); 3395 spin_unlock(&ctl->tree_lock); 3396 if (ret) 3397 kmem_cache_free(btrfs_free_space_cachep, info); 3398 return ret; 3399 } 3400 3401 if (!map) { 3402 map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 3403 if (!map) { 3404 kmem_cache_free(btrfs_free_space_cachep, info); 3405 return -ENOMEM; 3406 } 3407 } 3408 3409 spin_lock(&ctl->tree_lock); 3410 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 3411 1, 0); 3412 if (!bitmap_info) { 3413 info->bitmap = map; 3414 map = NULL; 3415 add_new_bitmap(ctl, info, offset); 3416 bitmap_info = info; 3417 info = NULL; 3418 } 3419 3420 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); 3421 bytes -= bytes_added; 3422 offset += bytes_added; 3423 spin_unlock(&ctl->tree_lock); 3424 3425 if (bytes) 3426 goto again; 3427 3428 if (info) 3429 kmem_cache_free(btrfs_free_space_cachep, info); 3430 if (map) 3431 kfree(map); 3432 return 0; 3433 } 3434 3435 /* 3436 * Checks to see if the given range is in the free space cache. This is really 3437 * just used to check the absence of space, so if there is free space in the 3438 * range at all we will return 1. 
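 *
 * Typical use from a sanity test (illustrative only): after
 * test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 0), a call to
 * test_check_exists(cache, 0, 4 * 1024 * 1024) returns 1, and once that
 * range has been removed again it returns 0.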
3439 */ 3440 int test_check_exists(struct btrfs_block_group_cache *cache, 3441 u64 offset, u64 bytes) 3442 { 3443 struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; 3444 struct btrfs_free_space *info; 3445 int ret = 0; 3446 3447 spin_lock(&ctl->tree_lock); 3448 info = tree_search_offset(ctl, offset, 0, 0); 3449 if (!info) { 3450 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 3451 1, 0); 3452 if (!info) 3453 goto out; 3454 } 3455 3456 have_info: 3457 if (info->bitmap) { 3458 u64 bit_off, bit_bytes; 3459 struct rb_node *n; 3460 struct btrfs_free_space *tmp; 3461 3462 bit_off = offset; 3463 bit_bytes = ctl->unit; 3464 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); 3465 if (!ret) { 3466 if (bit_off == offset) { 3467 ret = 1; 3468 goto out; 3469 } else if (bit_off > offset && 3470 offset + bytes > bit_off) { 3471 ret = 1; 3472 goto out; 3473 } 3474 } 3475 3476 n = rb_prev(&info->offset_index); 3477 while (n) { 3478 tmp = rb_entry(n, struct btrfs_free_space, 3479 offset_index); 3480 if (tmp->offset + tmp->bytes < offset) 3481 break; 3482 if (offset + bytes < tmp->offset) { 3483 n = rb_prev(&info->offset_index); 3484 continue; 3485 } 3486 info = tmp; 3487 goto have_info; 3488 } 3489 3490 n = rb_next(&info->offset_index); 3491 while (n) { 3492 tmp = rb_entry(n, struct btrfs_free_space, 3493 offset_index); 3494 if (offset + bytes < tmp->offset) 3495 break; 3496 if (tmp->offset + tmp->bytes < offset) { 3497 n = rb_next(&info->offset_index); 3498 continue; 3499 } 3500 info = tmp; 3501 goto have_info; 3502 } 3503 3504 ret = 0; 3505 goto out; 3506 } 3507 3508 if (info->offset == offset) { 3509 ret = 1; 3510 goto out; 3511 } 3512 3513 if (offset > info->offset && offset < info->offset + info->bytes) 3514 ret = 1; 3515 out: 3516 spin_unlock(&ctl->tree_lock); 3517 return ret; 3518 } 3519 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ 3520