/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"

#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)

static void recalculate_thresholds(struct btrfs_block_group_cache
				   *block_group);
static int link_free_space(struct btrfs_block_group_cache *block_group,
			   struct btrfs_free_space *info);

struct inode *lookup_free_space_inode(struct btrfs_root *root,
				      struct btrfs_block_group_cache
				      *block_group, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_key location;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct inode *inode = NULL;
	int ret;

	spin_lock(&block_group->lock);
	if (block_group->inode)
		inode = igrab(block_group->inode);
	spin_unlock(&block_group->lock);
	if (inode)
		return inode;

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret > 0) {
		btrfs_release_path(root, path);
		return ERR_PTR(-ENOENT);
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	btrfs_free_space_key(leaf, header, &disk_key);
	btrfs_disk_key_to_cpu(&location, &disk_key);
	btrfs_release_path(root, path);

	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
	if (!inode)
		return ERR_PTR(-ENOENT);
	if (IS_ERR(inode))
		return inode;
	if (is_bad_inode(inode)) {
		iput(inode);
		return ERR_PTR(-ENOENT);
	}

	spin_lock(&block_group->lock);
	if (!root->fs_info->closing) {
		block_group->inode = igrab(inode);
		block_group->iref = 1;
	}
	spin_unlock(&block_group->lock);

	return inode;
}
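
/*
 * Editorial note, derived from the lookup above: the free space header for
 * a block group lives at key (BTRFS_FREE_SPACE_OBJECTID, 0, <block group
 * start>).  For example, a block group whose key.objectid is 1073741824
 * (1GB) is found via
 *
 *	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 *	key.type     = 0;
 *	key.offset   = 1073741824;
 *
 * and the header item then records the key of the inode backing the cache
 * file, which btrfs_iget() turns into a struct inode.
 */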

int create_free_space_inode(struct btrfs_root *root,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_block_group_cache *block_group,
			    struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_disk_key disk_key;
	struct btrfs_free_space_header *header;
	struct btrfs_inode_item *inode_item;
	struct extent_buffer *leaf;
	u64 objectid;
	int ret;

	ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
	if (ret < 0)
		return ret;

	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
	if (ret)
		return ret;

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	btrfs_item_key(leaf, &disk_key, path->slots[0]);
	memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
			     sizeof(*inode_item));
	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
	btrfs_set_inode_size(leaf, inode_item, 0);
	btrfs_set_inode_nbytes(leaf, inode_item, 0);
	btrfs_set_inode_uid(leaf, inode_item, 0);
	btrfs_set_inode_gid(leaf, inode_item, 0);
	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
			      BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
	btrfs_set_inode_nlink(leaf, inode_item, 1);
	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
	btrfs_set_inode_block_group(leaf, inode_item,
				    block_group->key.objectid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root, path);

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(struct btrfs_free_space_header));
	if (ret < 0) {
		btrfs_release_path(root, path);
		return ret;
	}
	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
	btrfs_set_free_space_key(leaf, header, &disk_key);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(root, path);

	return 0;
}

int btrfs_truncate_free_space_cache(struct btrfs_root *root,
				    struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct inode *inode)
{
	loff_t oldsize;
	int ret = 0;

	trans->block_rsv = root->orphan_block_rsv;
	ret = btrfs_block_rsv_check(trans, root,
				    root->orphan_block_rsv,
				    0, 5);
	if (ret)
		return ret;

	oldsize = i_size_read(inode);
	btrfs_i_size_write(inode, 0);
	truncate_pagecache(inode, oldsize, 0);

	/*
	 * We don't need an orphan item because truncating the free space cache
	 * will never be split across transactions.
	 */
	ret = btrfs_truncate_inode_items(trans, root, inode,
					 0, BTRFS_EXTENT_DATA_KEY);
	if (ret) {
		WARN_ON(1);
		return ret;
	}

	return btrfs_update_inode(trans, root, inode);
}

static int readahead_cache(struct inode *inode)
{
	struct file_ra_state *ra;
	unsigned long last_index;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -ENOMEM;

	file_ra_state_init(ra, inode->i_mapping);
	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;

	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);

	kfree(ra);

	return 0;
}
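
/*
 * Layout sketch of the cache file, as implied by the load/write code below
 * (an editorial summary, not from the original source):
 *
 *	page 0:		u32 crc[num_checksums] | u64 generation | entries...
 *	pages 1..N:	packed struct btrfs_free_space_entry records
 *	last pages:	one full page of bitmap data per bitmap entry, in
 *			the order the bitmap entries appear
 *
 * num_checksums is i_size / PAGE_CACHE_SIZE: one crc32c per page, each
 * covering that page's payload (on page 0, the payload starts after the
 * checksum array and generation).
 */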

int load_free_space_cache(struct btrfs_fs_info *fs_info,
			  struct btrfs_block_group_cache *block_group)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct inode *inode;
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct page *page;
	struct btrfs_path *path;
	u32 *checksums = NULL, *crc;
	char *disk_crcs = NULL;
	struct btrfs_key key;
	struct list_head bitmaps;
	u64 num_entries;
	u64 num_bitmaps;
	u64 generation;
	u32 cur_crc = ~(u32)0;
	pgoff_t index = 0;
	unsigned long first_page_offset;
	int num_checksums;
	int ret = 0;

	/*
	 * If we're unmounting then just return, since this does a search on
	 * the normal root and not the commit root and we could deadlock.
	 */
	smp_mb();
	if (fs_info->closing)
		return 0;

	/*
	 * If this block group has been marked to be cleared for one reason or
	 * another then we can't trust the on disk cache, so just return.
	 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	INIT_LIST_HEAD(&bitmaps);

	path = btrfs_alloc_path();
	if (!path)
		return 0;

	inode = lookup_free_space_inode(root, block_group, path);
	if (IS_ERR(inode)) {
		btrfs_free_path(path);
		return 0;
	}

	/* Nothing in the space cache, goodbye */
	if (!i_size_read(inode)) {
		btrfs_free_path(path);
		goto out;
	}

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret) {
		btrfs_free_path(path);
		goto out;
	}

	leaf = path->nodes[0];
	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	num_entries = btrfs_free_space_entries(leaf, header);
	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
	generation = btrfs_free_space_generation(leaf, header);
	btrfs_free_path(path);

	if (BTRFS_I(inode)->generation != generation) {
		printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
		       " not match free space cache generation (%llu) for "
		       "block group %llu\n",
		       (unsigned long long)BTRFS_I(inode)->generation,
		       (unsigned long long)generation,
		       (unsigned long long)block_group->key.objectid);
		goto out;
	}

	if (!num_entries)
		goto out;

	/* Set up everything for doing checksumming */
	num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
	checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
	if (!checksums)
		goto out;
	first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
	disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
	if (!disk_crcs)
		goto out;

	ret = readahead_cache(inode);
	if (ret) {
		ret = 0;
		goto out;
	}

	while (1) {
		struct btrfs_free_space_entry *entry;
		struct btrfs_free_space *e;
		void *addr;
		unsigned long offset = 0;
		unsigned long start_offset = 0;
		int need_loop = 0;

		if (!num_entries && !num_bitmaps)
			break;

		if (index == 0) {
			start_offset = first_page_offset;
			offset = start_offset;
		}

		page = grab_cache_page(inode->i_mapping, index);
		if (!page) {
			ret = 0;
			goto free_cache;
		}

		if (!PageUptodate(page)) {
			btrfs_readpage(NULL, page);
			lock_page(page);
			if (!PageUptodate(page)) {
				unlock_page(page);
				page_cache_release(page);
				printk(KERN_ERR "btrfs: error reading free "
				       "space cache: %llu\n",
				       (unsigned long long)
				       block_group->key.objectid);
				goto free_cache;
			}
		}
		addr = kmap(page);

		if (index == 0) {
			u64 *gen;

			memcpy(disk_crcs, addr, first_page_offset);
			gen = addr + (sizeof(u32) * num_checksums);
			if (*gen != BTRFS_I(inode)->generation) {
				printk(KERN_ERR "btrfs: space cache generation"
				       " (%llu) does not match inode (%llu) "
				       "for block group %llu\n",
				       (unsigned long long)*gen,
				       (unsigned long long)
				       BTRFS_I(inode)->generation,
				       (unsigned long long)
				       block_group->key.objectid);
				kunmap(page);
				unlock_page(page);
				page_cache_release(page);
				goto free_cache;
			}
			crc = (u32 *)disk_crcs;
		}
		entry = addr + start_offset;

		/* First let's check our crc before we do anything fun */
		cur_crc = ~(u32)0;
		cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
					  PAGE_CACHE_SIZE - start_offset);
		btrfs_csum_final(cur_crc, (char *)&cur_crc);
		if (cur_crc != *crc) {
			printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
			       "block group %llu\n", index,
			       (unsigned long long)block_group->key.objectid);
			kunmap(page);
			unlock_page(page);
			page_cache_release(page);
			goto free_cache;
		}
		crc++;

		while (1) {
			if (!num_entries)
				break;

			need_loop = 1;
			e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
			if (!e) {
				kunmap(page);
				unlock_page(page);
				page_cache_release(page);
				goto free_cache;
			}

			e->offset = le64_to_cpu(entry->offset);
			e->bytes = le64_to_cpu(entry->bytes);
			if (!e->bytes) {
				kunmap(page);
				kfree(e);
				unlock_page(page);
				page_cache_release(page);
				goto free_cache;
			}

			if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
				spin_lock(&block_group->tree_lock);
				ret = link_free_space(block_group, e);
				spin_unlock(&block_group->tree_lock);
				BUG_ON(ret);
			} else {
				e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
				if (!e->bitmap) {
					kunmap(page);
					kfree(e);
					unlock_page(page);
					page_cache_release(page);
					goto free_cache;
				}
				spin_lock(&block_group->tree_lock);
				ret = link_free_space(block_group, e);
				block_group->total_bitmaps++;
				recalculate_thresholds(block_group);
				spin_unlock(&block_group->tree_lock);
				list_add_tail(&e->list, &bitmaps);
			}

			num_entries--;
			offset += sizeof(struct btrfs_free_space_entry);
			if (offset + sizeof(struct btrfs_free_space_entry) >=
			    PAGE_CACHE_SIZE)
				break;
			entry++;
		}

		/*
		 * We read an entry out of this page, so we need to move on
		 * to the next page.
		 */
		if (need_loop) {
			kunmap(page);
			goto next;
		}

		/*
		 * The bitmaps are stored after all the regular entries, in
		 * the same order their bitmap entries were added to the
		 * cache, so consume the head of the pending list here.
		 */
456 */ 457 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 458 list_del_init(&e->list); 459 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 460 kunmap(page); 461 num_bitmaps--; 462 next: 463 unlock_page(page); 464 page_cache_release(page); 465 index++; 466 } 467 468 ret = 1; 469 out: 470 kfree(checksums); 471 kfree(disk_crcs); 472 iput(inode); 473 return ret; 474 475 free_cache: 476 /* This cache is bogus, make sure it gets cleared */ 477 spin_lock(&block_group->lock); 478 block_group->disk_cache_state = BTRFS_DC_CLEAR; 479 spin_unlock(&block_group->lock); 480 btrfs_remove_free_space_cache(block_group); 481 goto out; 482 } 483 484 int btrfs_write_out_cache(struct btrfs_root *root, 485 struct btrfs_trans_handle *trans, 486 struct btrfs_block_group_cache *block_group, 487 struct btrfs_path *path) 488 { 489 struct btrfs_free_space_header *header; 490 struct extent_buffer *leaf; 491 struct inode *inode; 492 struct rb_node *node; 493 struct list_head *pos, *n; 494 struct page *page; 495 struct extent_state *cached_state = NULL; 496 struct list_head bitmap_list; 497 struct btrfs_key key; 498 u64 bytes = 0; 499 u32 *crc, *checksums; 500 pgoff_t index = 0, last_index = 0; 501 unsigned long first_page_offset; 502 int num_checksums; 503 int entries = 0; 504 int bitmaps = 0; 505 int ret = 0; 506 507 root = root->fs_info->tree_root; 508 509 INIT_LIST_HEAD(&bitmap_list); 510 511 spin_lock(&block_group->lock); 512 if (block_group->disk_cache_state < BTRFS_DC_SETUP) { 513 spin_unlock(&block_group->lock); 514 return 0; 515 } 516 spin_unlock(&block_group->lock); 517 518 inode = lookup_free_space_inode(root, block_group, path); 519 if (IS_ERR(inode)) 520 return 0; 521 522 if (!i_size_read(inode)) { 523 iput(inode); 524 return 0; 525 } 526 527 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 528 filemap_write_and_wait(inode->i_mapping); 529 btrfs_wait_ordered_range(inode, inode->i_size & 530 ~(root->sectorsize - 1), (u64)-1); 531 532 /* We need a checksum per page. */ 533 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; 534 crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); 535 if (!crc) { 536 iput(inode); 537 return 0; 538 } 539 540 /* Since the first page has all of our checksums and our generation we 541 * need to calculate the offset into the page that we can start writing 542 * our entries. 543 */ 544 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 545 546 node = rb_first(&block_group->free_space_offset); 547 if (!node) 548 goto out_free; 549 550 /* 551 * Lock all pages first so we can lock the extent safely. 552 * 553 * NOTE: Because we hold the ref the entire time we're going to write to 554 * the page find_get_page should never fail, so we don't do a check 555 * after find_get_page at this point. Just putting this here so people 556 * know and don't freak out. 
557 */ 558 while (index <= last_index) { 559 page = grab_cache_page(inode->i_mapping, index); 560 if (!page) { 561 pgoff_t i = 0; 562 563 while (i < index) { 564 page = find_get_page(inode->i_mapping, i); 565 unlock_page(page); 566 page_cache_release(page); 567 page_cache_release(page); 568 i++; 569 } 570 goto out_free; 571 } 572 index++; 573 } 574 575 index = 0; 576 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 577 0, &cached_state, GFP_NOFS); 578 579 /* Write out the extent entries */ 580 do { 581 struct btrfs_free_space_entry *entry; 582 void *addr; 583 unsigned long offset = 0; 584 unsigned long start_offset = 0; 585 586 if (index == 0) { 587 start_offset = first_page_offset; 588 offset = start_offset; 589 } 590 591 page = find_get_page(inode->i_mapping, index); 592 593 addr = kmap(page); 594 entry = addr + start_offset; 595 596 memset(addr, 0, PAGE_CACHE_SIZE); 597 while (1) { 598 struct btrfs_free_space *e; 599 600 e = rb_entry(node, struct btrfs_free_space, offset_index); 601 entries++; 602 603 entry->offset = cpu_to_le64(e->offset); 604 entry->bytes = cpu_to_le64(e->bytes); 605 if (e->bitmap) { 606 entry->type = BTRFS_FREE_SPACE_BITMAP; 607 list_add_tail(&e->list, &bitmap_list); 608 bitmaps++; 609 } else { 610 entry->type = BTRFS_FREE_SPACE_EXTENT; 611 } 612 node = rb_next(node); 613 if (!node) 614 break; 615 offset += sizeof(struct btrfs_free_space_entry); 616 if (offset + sizeof(struct btrfs_free_space_entry) >= 617 PAGE_CACHE_SIZE) 618 break; 619 entry++; 620 } 621 *crc = ~(u32)0; 622 *crc = btrfs_csum_data(root, addr + start_offset, *crc, 623 PAGE_CACHE_SIZE - start_offset); 624 kunmap(page); 625 626 btrfs_csum_final(*crc, (char *)crc); 627 crc++; 628 629 bytes += PAGE_CACHE_SIZE; 630 631 ClearPageChecked(page); 632 set_page_extent_mapped(page); 633 SetPageUptodate(page); 634 set_page_dirty(page); 635 636 /* 637 * We need to release our reference we got for grab_cache_page, 638 * except for the first page which will hold our checksums, we 639 * do that below. 
640 */ 641 if (index != 0) { 642 unlock_page(page); 643 page_cache_release(page); 644 } 645 646 page_cache_release(page); 647 648 index++; 649 } while (node); 650 651 /* Write out the bitmaps */ 652 list_for_each_safe(pos, n, &bitmap_list) { 653 void *addr; 654 struct btrfs_free_space *entry = 655 list_entry(pos, struct btrfs_free_space, list); 656 657 page = find_get_page(inode->i_mapping, index); 658 659 addr = kmap(page); 660 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); 661 *crc = ~(u32)0; 662 *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); 663 kunmap(page); 664 btrfs_csum_final(*crc, (char *)crc); 665 crc++; 666 bytes += PAGE_CACHE_SIZE; 667 668 ClearPageChecked(page); 669 set_page_extent_mapped(page); 670 SetPageUptodate(page); 671 set_page_dirty(page); 672 unlock_page(page); 673 page_cache_release(page); 674 page_cache_release(page); 675 list_del_init(&entry->list); 676 index++; 677 } 678 679 /* Zero out the rest of the pages just to make sure */ 680 while (index <= last_index) { 681 void *addr; 682 683 page = find_get_page(inode->i_mapping, index); 684 685 addr = kmap(page); 686 memset(addr, 0, PAGE_CACHE_SIZE); 687 kunmap(page); 688 ClearPageChecked(page); 689 set_page_extent_mapped(page); 690 SetPageUptodate(page); 691 set_page_dirty(page); 692 unlock_page(page); 693 page_cache_release(page); 694 page_cache_release(page); 695 bytes += PAGE_CACHE_SIZE; 696 index++; 697 } 698 699 btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state); 700 701 /* Write the checksums and trans id to the first page */ 702 { 703 void *addr; 704 u64 *gen; 705 706 page = find_get_page(inode->i_mapping, 0); 707 708 addr = kmap(page); 709 memcpy(addr, checksums, sizeof(u32) * num_checksums); 710 gen = addr + (sizeof(u32) * num_checksums); 711 *gen = trans->transid; 712 kunmap(page); 713 ClearPageChecked(page); 714 set_page_extent_mapped(page); 715 SetPageUptodate(page); 716 set_page_dirty(page); 717 unlock_page(page); 718 page_cache_release(page); 719 page_cache_release(page); 720 } 721 BTRFS_I(inode)->generation = trans->transid; 722 723 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 724 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 725 726 filemap_write_and_wait(inode->i_mapping); 727 728 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 729 key.offset = block_group->key.objectid; 730 key.type = 0; 731 732 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 733 if (ret < 0) { 734 ret = 0; 735 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 736 EXTENT_DIRTY | EXTENT_DELALLOC | 737 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); 738 goto out_free; 739 } 740 leaf = path->nodes[0]; 741 if (ret > 0) { 742 struct btrfs_key found_key; 743 BUG_ON(!path->slots[0]); 744 path->slots[0]--; 745 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 746 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 747 found_key.offset != block_group->key.objectid) { 748 ret = 0; 749 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 750 EXTENT_DIRTY | EXTENT_DELALLOC | 751 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 752 GFP_NOFS); 753 btrfs_release_path(root, path); 754 goto out_free; 755 } 756 } 757 header = btrfs_item_ptr(leaf, path->slots[0], 758 struct btrfs_free_space_header); 759 btrfs_set_free_space_entries(leaf, header, entries); 760 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 761 btrfs_set_free_space_generation(leaf, header, trans->transid); 762 btrfs_mark_buffer_dirty(leaf); 763 btrfs_release_path(root, path); 764 765 ret = 1; 766 767 out_free: 768 if (ret == 0) { 769 
		invalidate_inode_pages2_range(inode->i_mapping, 0, index);
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);
		BTRFS_I(inode)->generation = 0;
	}
	kfree(checksums);
	btrfs_update_inode(trans, root, inode);
	iput(inode);
	return ret;
}

static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
					  u64 offset)
{
	BUG_ON(offset < bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div64_u64(offset, sectorsize));
}

static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
{
	return (unsigned long)(div64_u64(bytes, sectorsize));
}

static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
	bitmap_start = offset - block_group->key.objectid;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += block_group->key.objectid;

	return bitmap_start;
}
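
/*
 * Worked example for the helpers above (assuming 4KB pages and a 4KB
 * sectorsize, neither of which is guaranteed): BITS_PER_BITMAP is
 * 4096 * 8 = 32768, so one bitmap covers 32768 * 4096 bytes = 128MB.
 * For a block group starting at 1GB, offset_to_bitmap(bg, 1GB + 200MB)
 * rounds down to 1GB + 128MB, the start of the second 128MB window, and
 * offset_to_bit(1GB + 128MB, 4096, 1GB + 200MB) is 72MB / 4KB = 18432.
 */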
860 */ 861 static struct btrfs_free_space * 862 tree_search_offset(struct btrfs_block_group_cache *block_group, 863 u64 offset, int bitmap_only, int fuzzy) 864 { 865 struct rb_node *n = block_group->free_space_offset.rb_node; 866 struct btrfs_free_space *entry, *prev = NULL; 867 868 /* find entry that is closest to the 'offset' */ 869 while (1) { 870 if (!n) { 871 entry = NULL; 872 break; 873 } 874 875 entry = rb_entry(n, struct btrfs_free_space, offset_index); 876 prev = entry; 877 878 if (offset < entry->offset) 879 n = n->rb_left; 880 else if (offset > entry->offset) 881 n = n->rb_right; 882 else 883 break; 884 } 885 886 if (bitmap_only) { 887 if (!entry) 888 return NULL; 889 if (entry->bitmap) 890 return entry; 891 892 /* 893 * bitmap entry and extent entry may share same offset, 894 * in that case, bitmap entry comes after extent entry. 895 */ 896 n = rb_next(n); 897 if (!n) 898 return NULL; 899 entry = rb_entry(n, struct btrfs_free_space, offset_index); 900 if (entry->offset != offset) 901 return NULL; 902 903 WARN_ON(!entry->bitmap); 904 return entry; 905 } else if (entry) { 906 if (entry->bitmap) { 907 /* 908 * if previous extent entry covers the offset, 909 * we should return it instead of the bitmap entry 910 */ 911 n = &entry->offset_index; 912 while (1) { 913 n = rb_prev(n); 914 if (!n) 915 break; 916 prev = rb_entry(n, struct btrfs_free_space, 917 offset_index); 918 if (!prev->bitmap) { 919 if (prev->offset + prev->bytes > offset) 920 entry = prev; 921 break; 922 } 923 } 924 } 925 return entry; 926 } 927 928 if (!prev) 929 return NULL; 930 931 /* find last entry before the 'offset' */ 932 entry = prev; 933 if (entry->offset > offset) { 934 n = rb_prev(&entry->offset_index); 935 if (n) { 936 entry = rb_entry(n, struct btrfs_free_space, 937 offset_index); 938 BUG_ON(entry->offset > offset); 939 } else { 940 if (fuzzy) 941 return entry; 942 else 943 return NULL; 944 } 945 } 946 947 if (entry->bitmap) { 948 n = &entry->offset_index; 949 while (1) { 950 n = rb_prev(n); 951 if (!n) 952 break; 953 prev = rb_entry(n, struct btrfs_free_space, 954 offset_index); 955 if (!prev->bitmap) { 956 if (prev->offset + prev->bytes > offset) 957 return prev; 958 break; 959 } 960 } 961 if (entry->offset + BITS_PER_BITMAP * 962 block_group->sectorsize > offset) 963 return entry; 964 } else if (entry->offset + entry->bytes > offset) 965 return entry; 966 967 if (!fuzzy) 968 return NULL; 969 970 while (1) { 971 if (entry->bitmap) { 972 if (entry->offset + BITS_PER_BITMAP * 973 block_group->sectorsize > offset) 974 break; 975 } else { 976 if (entry->offset + entry->bytes > offset) 977 break; 978 } 979 980 n = rb_next(&entry->offset_index); 981 if (!n) 982 return NULL; 983 entry = rb_entry(n, struct btrfs_free_space, offset_index); 984 } 985 return entry; 986 } 987 988 static void unlink_free_space(struct btrfs_block_group_cache *block_group, 989 struct btrfs_free_space *info) 990 { 991 rb_erase(&info->offset_index, &block_group->free_space_offset); 992 block_group->free_extents--; 993 block_group->free_space -= info->bytes; 994 } 995 996 static int link_free_space(struct btrfs_block_group_cache *block_group, 997 struct btrfs_free_space *info) 998 { 999 int ret = 0; 1000 1001 BUG_ON(!info->bitmap && !info->bytes); 1002 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 1003 &info->offset_index, (info->bitmap != NULL)); 1004 if (ret) 1005 return ret; 1006 1007 block_group->free_space += info->bytes; 1008 block_group->free_extents++; 1009 return ret; 1010 } 1011 1012 static void 

static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
{
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;

	/*
	 * The goal is to keep the total amount of memory used per 1GB of
	 * space at or below 32KB, so we need to adjust how much memory we
	 * allow to be used by extent based free space tracking.
	 */
	max_bytes = MAX_CACHE_BYTES_PER_GIG *
		(div64_u64(block_group->key.offset, 1024 * 1024 * 1024));

	/*
	 * We want to account for one more bitmap than we currently have so
	 * we can make sure we don't go over our overall goal of
	 * MAX_CACHE_BYTES_PER_GIG as we add more bitmaps.
	 */
	bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;

	if (bitmap_bytes >= max_bytes) {
		block_group->extents_thresh = 0;
		return;
	}

	/*
	 * We want the extent entry threshold to always be at most 1/2 the
	 * max bytes we can have, or whatever is less than that.
	 */
	extent_bytes = max_bytes - bitmap_bytes;
	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));

	block_group->extents_thresh =
		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
}

static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
			      struct btrfs_free_space *info, u64 offset,
			      u64 bytes)
{
	unsigned long start, end;
	unsigned long i;

	start = offset_to_bit(info->offset, block_group->sectorsize, offset);
	end = start + bytes_to_bits(bytes, block_group->sectorsize);
	BUG_ON(end > BITS_PER_BITMAP);

	for (i = start; i < end; i++)
		clear_bit(i, info->bitmap);

	info->bytes -= bytes;
	block_group->free_space -= bytes;
}

static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
			    struct btrfs_free_space *info, u64 offset,
			    u64 bytes)
{
	unsigned long start, end;
	unsigned long i;

	start = offset_to_bit(info->offset, block_group->sectorsize, offset);
	end = start + bytes_to_bits(bytes, block_group->sectorsize);
	BUG_ON(end > BITS_PER_BITMAP);

	for (i = start; i < end; i++)
		set_bit(i, info->bitmap);

	info->bytes += bytes;
	block_group->free_space += bytes;
}

static int search_bitmap(struct btrfs_block_group_cache *block_group,
			 struct btrfs_free_space *bitmap_info, u64 *offset,
			 u64 *bytes)
{
	unsigned long found_bits = 0;
	unsigned long bits, i;
	unsigned long next_zero;

	i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
			  max_t(u64, *offset, bitmap_info->offset));
	bits = bytes_to_bits(*bytes, block_group->sectorsize);

	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
	     i < BITS_PER_BITMAP;
	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
		next_zero = find_next_zero_bit(bitmap_info->bitmap,
					       BITS_PER_BITMAP, i);
		if ((next_zero - i) >= bits) {
			found_bits = next_zero - i;
			break;
		}
		i = next_zero;
	}

	if (found_bits) {
		*offset = (u64)(i * block_group->sectorsize) +
			bitmap_info->offset;
		*bytes = (u64)(found_bits) * block_group->sectorsize;
		return 0;
	}

	return -1;
}
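
/*
 * Example of the search above (illustrative, assuming a 4KB sectorsize):
 * asking search_bitmap() for *bytes = 8192 means bits = 2, so the loop
 * looks for the first run of at least two consecutive set bits at or after
 * *offset.  On success it rewrites *offset to the start of the run and
 * *bytes to the full run length found, which may be larger than requested.
 */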

static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
						*block_group, u64 *offset,
						u64 *bytes, int debug)
{
	struct btrfs_free_space *entry;
	struct rb_node *node;
	int ret;

	if (!block_group->free_space_offset.rb_node)
		return NULL;

	entry = tree_search_offset(block_group,
				   offset_to_bitmap(block_group, *offset),
				   0, 1);
	if (!entry)
		return NULL;

	for (node = &entry->offset_index; node; node = rb_next(node)) {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		if (entry->bytes < *bytes)
			continue;

		if (entry->bitmap) {
			ret = search_bitmap(block_group, entry, offset, bytes);
			if (!ret)
				return entry;
			continue;
		}

		*offset = entry->offset;
		*bytes = entry->bytes;
		return entry;
	}

	return NULL;
}

static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
			   struct btrfs_free_space *info, u64 offset)
{
	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
	int max_bitmaps = (int)div64_u64(block_group->key.offset +
					 bytes_per_bg - 1, bytes_per_bg);
	BUG_ON(block_group->total_bitmaps >= max_bitmaps);

	info->offset = offset_to_bitmap(block_group, offset);
	info->bytes = 0;
	link_free_space(block_group, info);
	block_group->total_bitmaps++;

	recalculate_thresholds(block_group);
}
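
/*
 * Editorial note on the helper below: a range being removed can straddle
 * the per-bitmap windows (see offset_to_bitmap above), so
 * remove_from_bitmap() clears what falls inside the current bitmap,
 * advances *offset/*bytes, and walks to the next entry.  If that next
 * entry is an extent rather than a bitmap it returns -EAGAIN so the
 * caller (btrfs_remove_free_space) restarts from the top with the
 * remaining range.
 */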
1233 */ 1234 if (!bitmap_info->bitmap) 1235 return -EAGAIN; 1236 1237 /* 1238 * Ok the next item is a bitmap, but it may not actually hold 1239 * the information for the rest of this free space stuff, so 1240 * look for it, and if we don't find it return so we can try 1241 * everything over again. 1242 */ 1243 search_start = *offset; 1244 search_bytes = *bytes; 1245 ret = search_bitmap(block_group, bitmap_info, &search_start, 1246 &search_bytes); 1247 if (ret < 0 || search_start != *offset) 1248 return -EAGAIN; 1249 1250 goto again; 1251 } else if (!bitmap_info->bytes) { 1252 unlink_free_space(block_group, bitmap_info); 1253 kfree(bitmap_info->bitmap); 1254 kfree(bitmap_info); 1255 block_group->total_bitmaps--; 1256 recalculate_thresholds(block_group); 1257 } 1258 1259 return 0; 1260 } 1261 1262 static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, 1263 struct btrfs_free_space *info) 1264 { 1265 struct btrfs_free_space *bitmap_info; 1266 int added = 0; 1267 u64 bytes, offset, end; 1268 int ret; 1269 1270 /* 1271 * If we are below the extents threshold then we can add this as an 1272 * extent, and don't have to deal with the bitmap 1273 */ 1274 if (block_group->free_extents < block_group->extents_thresh && 1275 info->bytes > block_group->sectorsize * 4) 1276 return 0; 1277 1278 /* 1279 * some block groups are so tiny they can't be enveloped by a bitmap, so 1280 * don't even bother to create a bitmap for this 1281 */ 1282 if (BITS_PER_BITMAP * block_group->sectorsize > 1283 block_group->key.offset) 1284 return 0; 1285 1286 bytes = info->bytes; 1287 offset = info->offset; 1288 1289 again: 1290 bitmap_info = tree_search_offset(block_group, 1291 offset_to_bitmap(block_group, offset), 1292 1, 0); 1293 if (!bitmap_info) { 1294 BUG_ON(added); 1295 goto new_bitmap; 1296 } 1297 1298 end = bitmap_info->offset + 1299 (u64)(BITS_PER_BITMAP * block_group->sectorsize); 1300 1301 if (offset >= bitmap_info->offset && offset + bytes > end) { 1302 bitmap_set_bits(block_group, bitmap_info, offset, 1303 end - offset); 1304 bytes -= end - offset; 1305 offset = end; 1306 added = 0; 1307 } else if (offset >= bitmap_info->offset && offset + bytes <= end) { 1308 bitmap_set_bits(block_group, bitmap_info, offset, bytes); 1309 bytes = 0; 1310 } else { 1311 BUG(); 1312 } 1313 1314 if (!bytes) { 1315 ret = 1; 1316 goto out; 1317 } else 1318 goto again; 1319 1320 new_bitmap: 1321 if (info && info->bitmap) { 1322 add_new_bitmap(block_group, info, offset); 1323 added = 1; 1324 info = NULL; 1325 goto again; 1326 } else { 1327 spin_unlock(&block_group->tree_lock); 1328 1329 /* no pre-allocated info, allocate a new one */ 1330 if (!info) { 1331 info = kzalloc(sizeof(struct btrfs_free_space), 1332 GFP_NOFS); 1333 if (!info) { 1334 spin_lock(&block_group->tree_lock); 1335 ret = -ENOMEM; 1336 goto out; 1337 } 1338 } 1339 1340 /* allocate the bitmap */ 1341 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 1342 spin_lock(&block_group->tree_lock); 1343 if (!info->bitmap) { 1344 ret = -ENOMEM; 1345 goto out; 1346 } 1347 goto again; 1348 } 1349 1350 out: 1351 if (info) { 1352 if (info->bitmap) 1353 kfree(info->bitmap); 1354 kfree(info); 1355 } 1356 1357 return ret; 1358 } 1359 1360 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 1361 u64 offset, u64 bytes) 1362 { 1363 struct btrfs_free_space *right_info = NULL; 1364 struct btrfs_free_space *left_info = NULL; 1365 struct btrfs_free_space *info = NULL; 1366 int ret = 0; 1367 1368 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 

int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
			 u64 offset, u64 bytes)
{
	struct btrfs_free_space *right_info = NULL;
	struct btrfs_free_space *left_info = NULL;
	struct btrfs_free_space *info = NULL;
	int ret = 0;

	info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
	if (!info)
		return -ENOMEM;

	info->offset = offset;
	info->bytes = bytes;

	spin_lock(&block_group->tree_lock);

	/*
	 * First we want to see if there is free space adjacent to the range
	 * we are adding.  If there is, remove that struct and add a new one
	 * to cover the entire range.
	 */
	right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
	if (right_info && rb_prev(&right_info->offset_index))
		left_info = rb_entry(rb_prev(&right_info->offset_index),
				     struct btrfs_free_space, offset_index);
	else
		left_info = tree_search_offset(block_group, offset - 1, 0, 0);

	/*
	 * If there was no extent directly to the left or right of this new
	 * extent then we know we're going to have to allocate a new extent,
	 * so before we do that see if we need to drop this into a bitmap.
	 */
	if ((!left_info || left_info->bitmap) &&
	    (!right_info || right_info->bitmap)) {
		ret = insert_into_bitmap(block_group, info);

		if (ret < 0) {
			goto out;
		} else if (ret) {
			ret = 0;
			goto out;
		}
	}

	if (right_info && !right_info->bitmap) {
		unlink_free_space(block_group, right_info);
		info->bytes += right_info->bytes;
		kfree(right_info);
	}

	if (left_info && !left_info->bitmap &&
	    left_info->offset + left_info->bytes == offset) {
		unlink_free_space(block_group, left_info);
		info->offset = left_info->offset;
		info->bytes += left_info->bytes;
		kfree(left_info);
	}

	ret = link_free_space(block_group, info);
	if (ret)
		kfree(info);
out:
	spin_unlock(&block_group->tree_lock);

	if (ret) {
		printk(KERN_CRIT "btrfs: unable to add free space: %d\n", ret);
		BUG_ON(ret == -EEXIST);
	}

	return ret;
}

int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
			    u64 offset, u64 bytes)
{
	struct btrfs_free_space *info;
	struct btrfs_free_space *next_info = NULL;
	int ret = 0;

	spin_lock(&block_group->tree_lock);

again:
	info = tree_search_offset(block_group, offset, 0, 0);
	if (!info) {
		/*
		 * Oops, we didn't find an extent that matched the space we
		 * wanted to remove; look for a bitmap instead.
		 */
		info = tree_search_offset(block_group,
					  offset_to_bitmap(block_group, offset),
					  1, 0);
		if (!info) {
			WARN_ON(1);
			goto out_lock;
		}
	}

	if (info->bytes < bytes && rb_next(&info->offset_index)) {
		u64 end;
		next_info = rb_entry(rb_next(&info->offset_index),
				     struct btrfs_free_space,
				     offset_index);

		if (next_info->bitmap)
			end = next_info->offset + BITS_PER_BITMAP *
				block_group->sectorsize - 1;
		else
			end = next_info->offset + next_info->bytes;

		if (next_info->bytes < bytes ||
		    next_info->offset > offset || offset > end) {
			printk(KERN_CRIT "Found free space at %llu, size %llu,"
			       " trying to use %llu\n",
			       (unsigned long long)info->offset,
			       (unsigned long long)info->bytes,
			       (unsigned long long)bytes);
			WARN_ON(1);
			ret = -EINVAL;
			goto out_lock;
		}

		info = next_info;
	}

	if (info->bytes == bytes) {
		unlink_free_space(block_group, info);
		if (info->bitmap) {
			kfree(info->bitmap);
			block_group->total_bitmaps--;
		}
		kfree(info);
		goto out_lock;
	}

	if (!info->bitmap && info->offset == offset) {
		unlink_free_space(block_group, info);
		info->offset += bytes;
		info->bytes -= bytes;
		link_free_space(block_group, info);
		goto out_lock;
	}

	if (!info->bitmap && info->offset <= offset &&
	    info->offset + info->bytes >= offset + bytes) {
		u64 old_start = info->offset;
		/*
		 * We're freeing space in the middle of the info; this can
		 * happen during tree log replay.
		 *
		 * First unlink the old info, then insert it again after the
		 * hole we're creating.
		 */
		unlink_free_space(block_group, info);
		if (offset + bytes < info->offset + info->bytes) {
			u64 old_end = info->offset + info->bytes;

			info->offset = offset + bytes;
			info->bytes = old_end - info->offset;
			ret = link_free_space(block_group, info);
			WARN_ON(ret);
			if (ret)
				goto out_lock;
		} else {
			/*
			 * The hole we're creating ends at the end of the
			 * info struct, so just free the info.
			 */
			kfree(info);
		}
		spin_unlock(&block_group->tree_lock);

		/*
		 * Step two: insert a new info struct to cover anything
		 * before the hole.
		 */
		ret = btrfs_add_free_space(block_group, old_start,
					   offset - old_start);
		WARN_ON(ret);
		goto out;
	}

	ret = remove_from_bitmap(block_group, info, &offset, &bytes);
	if (ret == -EAGAIN)
		goto again;
	BUG_ON(ret);
out_lock:
	spin_unlock(&block_group->tree_lock);
out:
	return ret;
}

void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
			   u64 bytes)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	int count = 0;

	for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		if (info->bytes >= bytes)
			count++;
		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
		       (unsigned long long)info->offset,
		       (unsigned long long)info->bytes,
		       (info->bitmap) ? "yes" : "no");
	}
	printk(KERN_INFO "block group has cluster?: %s\n",
	       list_empty(&block_group->cluster_list) ? "no" : "yes");
	printk(KERN_INFO "%d blocks of free space at or bigger than the "
	       "given bytes\n", count);
}

u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_free_space *info;
	struct rb_node *n;
	u64 ret = 0;

	for (n = rb_first(&block_group->free_space_offset); n;
	     n = rb_next(n)) {
		info = rb_entry(n, struct btrfs_free_space, offset_index);
		ret += info->bytes;
	}

	return ret;
}

/*
 * For a given cluster, put all of its extents back into the free space
 * cache.  If the block group passed doesn't match the block group pointed
 * to by the cluster, someone else raced in and freed the cluster already.
 * In that case, we just return without changing anything.
 */
static int
__btrfs_return_cluster_to_free_space(
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster)
{
	struct btrfs_free_space *entry;
	struct rb_node *node;
	bool bitmap;

	spin_lock(&cluster->lock);
	if (cluster->block_group != block_group)
		goto out;

	bitmap = cluster->points_to_bitmap;
	cluster->block_group = NULL;
	cluster->window_start = 0;
	list_del_init(&cluster->block_group_list);
	cluster->points_to_bitmap = false;

	if (bitmap)
		goto out;

	node = rb_first(&cluster->root);
	while (node) {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		rb_erase(&entry->offset_index, &cluster->root);
		BUG_ON(entry->bitmap);
		tree_insert_offset(&block_group->free_space_offset,
				   entry->offset, &entry->offset_index, 0);
	}
	cluster->root = RB_ROOT;

out:
	spin_unlock(&cluster->lock);
	btrfs_put_block_group(block_group);
	return 0;
}

void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
{
	struct btrfs_free_space *info;
	struct rb_node *node;
	struct btrfs_free_cluster *cluster;
	struct list_head *head;

	spin_lock(&block_group->tree_lock);
	while ((head = block_group->cluster_list.next) !=
	       &block_group->cluster_list) {
		cluster = list_entry(head, struct btrfs_free_cluster,
				     block_group_list);

		WARN_ON(cluster->block_group != block_group);
		__btrfs_return_cluster_to_free_space(block_group, cluster);
		if (need_resched()) {
			spin_unlock(&block_group->tree_lock);
			cond_resched();
			spin_lock(&block_group->tree_lock);
		}
	}

	while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
		info = rb_entry(node, struct btrfs_free_space, offset_index);
		unlink_free_space(block_group, info);
		if (info->bitmap)
			kfree(info->bitmap);
		kfree(info);
		if (need_resched()) {
			spin_unlock(&block_group->tree_lock);
			cond_resched();
			spin_lock(&block_group->tree_lock);
		}
	}

	spin_unlock(&block_group->tree_lock);
}

u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
			       u64 offset, u64 bytes, u64 empty_size)
{
	struct btrfs_free_space *entry = NULL;
	u64 bytes_search = bytes + empty_size;
	u64 ret = 0;

	spin_lock(&block_group->tree_lock);
	entry = find_free_space(block_group, &offset, &bytes_search, 0);
	if (!entry)
		goto out;

	ret = offset;
	if (entry->bitmap) {
		bitmap_clear_bits(block_group, entry, offset, bytes);
		if (!entry->bytes) {
			unlink_free_space(block_group, entry);
			kfree(entry->bitmap);
			kfree(entry);
			block_group->total_bitmaps--;
			recalculate_thresholds(block_group);
		}
	} else {
		unlink_free_space(block_group, entry);
		entry->offset += bytes;
		entry->bytes -= bytes;
		if (!entry->bytes)
			kfree(entry);
		else
			link_free_space(block_group, entry);
	}

out:
	spin_unlock(&block_group->tree_lock);

	return ret;
}
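
/*
 * Allocation example for btrfs_find_space_for_alloc() above (editorial):
 * with an extent entry [X, +1MB) and a request for bytes = 4KB,
 * empty_size = 0, the entry is unlinked, trimmed to [X + 4KB, +1MB - 4KB),
 * relinked, and X is returned.  A bitmap entry is handled by clearing the
 * matching bits instead, and is freed entirely once its last bit is gone.
 */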
1713 * 1714 * Otherwise, it'll get a reference on the block group pointed to by the 1715 * cluster and remove the cluster from it. 1716 */ 1717 int btrfs_return_cluster_to_free_space( 1718 struct btrfs_block_group_cache *block_group, 1719 struct btrfs_free_cluster *cluster) 1720 { 1721 int ret; 1722 1723 /* first, get a safe pointer to the block group */ 1724 spin_lock(&cluster->lock); 1725 if (!block_group) { 1726 block_group = cluster->block_group; 1727 if (!block_group) { 1728 spin_unlock(&cluster->lock); 1729 return 0; 1730 } 1731 } else if (cluster->block_group != block_group) { 1732 /* someone else has already freed it don't redo their work */ 1733 spin_unlock(&cluster->lock); 1734 return 0; 1735 } 1736 atomic_inc(&block_group->count); 1737 spin_unlock(&cluster->lock); 1738 1739 /* now return any extents the cluster had on it */ 1740 spin_lock(&block_group->tree_lock); 1741 ret = __btrfs_return_cluster_to_free_space(block_group, cluster); 1742 spin_unlock(&block_group->tree_lock); 1743 1744 /* finally drop our ref */ 1745 btrfs_put_block_group(block_group); 1746 return ret; 1747 } 1748 1749 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, 1750 struct btrfs_free_cluster *cluster, 1751 u64 bytes, u64 min_start) 1752 { 1753 struct btrfs_free_space *entry; 1754 int err; 1755 u64 search_start = cluster->window_start; 1756 u64 search_bytes = bytes; 1757 u64 ret = 0; 1758 1759 spin_lock(&block_group->tree_lock); 1760 spin_lock(&cluster->lock); 1761 1762 if (!cluster->points_to_bitmap) 1763 goto out; 1764 1765 if (cluster->block_group != block_group) 1766 goto out; 1767 1768 /* 1769 * search_start is the beginning of the bitmap, but at some point it may 1770 * be a good idea to point to the actual start of the free area in the 1771 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only 1772 * to 1 to make sure we get the bitmap entry 1773 */ 1774 entry = tree_search_offset(block_group, 1775 offset_to_bitmap(block_group, search_start), 1776 1, 0); 1777 if (!entry || !entry->bitmap) 1778 goto out; 1779 1780 search_start = min_start; 1781 search_bytes = bytes; 1782 1783 err = search_bitmap(block_group, entry, &search_start, 1784 &search_bytes); 1785 if (err) 1786 goto out; 1787 1788 ret = search_start; 1789 bitmap_clear_bits(block_group, entry, ret, bytes); 1790 out: 1791 spin_unlock(&cluster->lock); 1792 spin_unlock(&block_group->tree_lock); 1793 1794 return ret; 1795 } 1796 1797 /* 1798 * given a cluster, try to allocate 'bytes' from it, returns 0 1799 * if it couldn't find anything suitably large, or a logical disk offset 1800 * if things worked out 1801 */ 1802 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, 1803 struct btrfs_free_cluster *cluster, u64 bytes, 1804 u64 min_start) 1805 { 1806 struct btrfs_free_space *entry = NULL; 1807 struct rb_node *node; 1808 u64 ret = 0; 1809 1810 if (cluster->points_to_bitmap) 1811 return btrfs_alloc_from_bitmap(block_group, cluster, bytes, 1812 min_start); 1813 1814 spin_lock(&cluster->lock); 1815 if (bytes > cluster->max_size) 1816 goto out; 1817 1818 if (cluster->block_group != block_group) 1819 goto out; 1820 1821 node = rb_first(&cluster->root); 1822 if (!node) 1823 goto out; 1824 1825 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1826 1827 while(1) { 1828 if (entry->bytes < bytes || entry->offset < min_start) { 1829 struct rb_node *node; 1830 1831 node = rb_next(&entry->offset_index); 1832 if (!node) 1833 break; 1834 entry = rb_entry(node, struct 
		ret = entry->offset;

		entry->offset += bytes;
		entry->bytes -= bytes;

		if (entry->bytes == 0) {
			rb_erase(&entry->offset_index, &cluster->root);
			kfree(entry);
		}
		break;
	}
out:
	spin_unlock(&cluster->lock);

	return ret;
}

static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
				struct btrfs_free_space *entry,
				struct btrfs_free_cluster *cluster,
				u64 offset, u64 bytes, u64 min_bytes)
{
	unsigned long next_zero;
	unsigned long i;
	unsigned long search_bits;
	unsigned long total_bits;
	unsigned long found_bits;
	unsigned long start = 0;
	unsigned long total_found = 0;
	bool found = false;

	i = offset_to_bit(entry->offset, block_group->sectorsize,
			  max_t(u64, offset, entry->offset));
	search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
	total_bits = bytes_to_bits(bytes, block_group->sectorsize);

again:
	found_bits = 0;
	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
	     i < BITS_PER_BITMAP;
	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
		next_zero = find_next_zero_bit(entry->bitmap,
					       BITS_PER_BITMAP, i);
		if (next_zero - i >= search_bits) {
			found_bits = next_zero - i;
			break;
		}
		i = next_zero;
	}

	if (!found_bits)
		return -1;

	if (!found) {
		start = i;
		found = true;
	}

	total_found += found_bits;

	if (cluster->max_size < found_bits * block_group->sectorsize)
		cluster->max_size = found_bits * block_group->sectorsize;

	if (total_found < total_bits) {
		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
		if (i - start > total_bits * 2) {
			total_found = 0;
			cluster->max_size = 0;
			found = false;
		}
		goto again;
	}

	cluster->window_start = start * block_group->sectorsize +
		entry->offset;
	cluster->points_to_bitmap = true;

	return 0;
}
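
/*
 * Editorial note on btrfs_bitmap_cluster() above: search_bits comes from
 * min_bytes and total_bits from the full bytes target, so the loop accepts
 * any run of at least min_bytes and keeps accumulating runs until
 * total_found reaches total_bits.  If the window between the first run and
 * the current position grows past total_bits * 2, the window is reset and
 * the scan starts over from the current bit.
 */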

/*
 * Here we try to find a cluster of blocks in a block group.  The goal is
 * to find at least 'bytes' of free space and up to empty_size + bytes of
 * free space.  We might not find them all in one contiguous area.
 *
 * Returns zero and sets up the cluster if things worked out; otherwise
 * it returns -ENOSPC.
 */
int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster,
			     u64 offset, u64 bytes, u64 empty_size)
{
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;
	struct btrfs_free_space *next;
	struct btrfs_free_space *last = NULL;
	u64 min_bytes;
	u64 window_start;
	u64 window_free;
	u64 max_extent = 0;
	bool found_bitmap = false;
	int ret;

	/* for metadata, allow allocations with more holes */
	if (btrfs_test_opt(root, SSD_SPREAD)) {
		min_bytes = bytes + empty_size;
	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/*
		 * We want to do larger allocations when we are flushing out
		 * the delayed refs; it helps prevent making more work as we
		 * go along.
		 */
		if (trans->transaction->delayed_refs.flushing)
			min_bytes = max(bytes, (bytes + empty_size) >> 1);
		else
			min_bytes = max(bytes, (bytes + empty_size) >> 4);
	} else
		min_bytes = max(bytes, (bytes + empty_size) >> 2);

	spin_lock(&block_group->tree_lock);
	spin_lock(&cluster->lock);

	/* someone already found a cluster, hooray */
	if (cluster->block_group) {
		ret = 0;
		goto out;
	}
again:
	entry = tree_search_offset(block_group, offset, found_bitmap, 1);
	if (!entry) {
		ret = -ENOSPC;
		goto out;
	}

	/*
	 * If found_bitmap is true, we exhausted our search for extent
	 * entries, and we just want to search all of the bitmaps that we
	 * can find, and ignore any extent entries we find.
	 */
	while (entry->bitmap || found_bitmap ||
	       (!entry->bitmap && entry->bytes < min_bytes)) {
		struct rb_node *node = rb_next(&entry->offset_index);

		if (entry->bitmap && entry->bytes > bytes + empty_size) {
			ret = btrfs_bitmap_cluster(block_group, entry, cluster,
						   offset, bytes + empty_size,
						   min_bytes);
			if (!ret)
				goto got_it;
		}

		if (!node) {
			ret = -ENOSPC;
			goto out;
		}
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
	}

	/*
	 * We already searched all the extent entries from the passed in
	 * offset to the end and didn't find enough space for the cluster,
	 * and we also didn't find any bitmaps that met our criteria, so
	 * just go ahead and exit.
	 */
	if (found_bitmap) {
		ret = -ENOSPC;
		goto out;
	}

	cluster->points_to_bitmap = false;
	window_start = entry->offset;
	window_free = entry->bytes;
	last = entry;
	max_extent = entry->bytes;

	while (1) {
		/* our window is just right, let's fill it */
		if (window_free >= bytes + empty_size)
			break;

		node = rb_next(&last->offset_index);
		if (!node) {
			if (found_bitmap)
				goto again;
			ret = -ENOSPC;
			goto out;
		}
		next = rb_entry(node, struct btrfs_free_space, offset_index);

		/*
		 * We found a bitmap, so if this search doesn't result in a
		 * cluster, we know to go and search again for the bitmaps
		 * and start looking for space there.
		 */
		if (next->bitmap) {
			if (!found_bitmap)
				offset = next->offset;
			found_bitmap = true;
			last = next;
			continue;
		}

		/*
		 * We haven't filled the empty size and the window is very
		 * large; reset and try again.
		 */
		if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
		    next->offset - window_start > (bytes + empty_size) * 2) {
			entry = next;
			window_start = entry->offset;
			window_free = entry->bytes;
			last = entry;
			max_extent = entry->bytes;
		} else {
			last = next;
			window_free += next->bytes;
			if (entry->bytes > max_extent)
				max_extent = entry->bytes;
		}
	}

	cluster->window_start = entry->offset;

	/*
	 * Now we've found our entries, pull them out of the free space
	 * cache and put them into the cluster rbtree.
	 *
	 * The cluster includes an rbtree, but only uses the offset index
	 * of each free space cache entry.
	 */
2068 */ 2069 while (1) { 2070 node = rb_next(&entry->offset_index); 2071 if (entry->bitmap && node) { 2072 entry = rb_entry(node, struct btrfs_free_space, 2073 offset_index); 2074 continue; 2075 } else if (entry->bitmap && !node) { 2076 break; 2077 } 2078 2079 rb_erase(&entry->offset_index, &block_group->free_space_offset); 2080 ret = tree_insert_offset(&cluster->root, entry->offset, 2081 &entry->offset_index, 0); 2082 BUG_ON(ret); 2083 2084 if (!node || entry == last) 2085 break; 2086 2087 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2088 } 2089 2090 cluster->max_size = max_extent; 2091 got_it: 2092 ret = 0; 2093 atomic_inc(&block_group->count); 2094 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 2095 cluster->block_group = block_group; 2096 out: 2097 spin_unlock(&cluster->lock); 2098 spin_unlock(&block_group->tree_lock); 2099 2100 return ret; 2101 } 2102 2103 /* 2104 * simple code to zero out a cluster 2105 */ 2106 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) 2107 { 2108 spin_lock_init(&cluster->lock); 2109 spin_lock_init(&cluster->refill_lock); 2110 cluster->root = RB_ROOT; 2111 cluster->max_size = 0; 2112 cluster->points_to_bitmap = false; 2113 INIT_LIST_HEAD(&cluster->block_group_list); 2114 cluster->block_group = NULL; 2115 } 2116 2117