1 /* 2 * Copyright (C) 2008 Red Hat. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public 6 * License v2 as published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 * 13 * You should have received a copy of the GNU General Public 14 * License along with this program; if not, write to the 15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 16 * Boston, MA 021110-1307, USA. 17 */ 18 19 #include <linux/pagemap.h> 20 #include <linux/sched.h> 21 #include <linux/slab.h> 22 #include <linux/math64.h> 23 #include <linux/ratelimit.h> 24 #include "ctree.h" 25 #include "free-space-cache.h" 26 #include "transaction.h" 27 #include "disk-io.h" 28 #include "extent_io.h" 29 #include "inode-map.h" 30 31 #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 32 #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 33 34 static int link_free_space(struct btrfs_free_space_ctl *ctl, 35 struct btrfs_free_space *info); 36 37 static struct inode *__lookup_free_space_inode(struct btrfs_root *root, 38 struct btrfs_path *path, 39 u64 offset) 40 { 41 struct btrfs_key key; 42 struct btrfs_key location; 43 struct btrfs_disk_key disk_key; 44 struct btrfs_free_space_header *header; 45 struct extent_buffer *leaf; 46 struct inode *inode = NULL; 47 int ret; 48 49 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 50 key.offset = offset; 51 key.type = 0; 52 53 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 54 if (ret < 0) 55 return ERR_PTR(ret); 56 if (ret > 0) { 57 btrfs_release_path(path); 58 return ERR_PTR(-ENOENT); 59 } 60 61 leaf = path->nodes[0]; 62 header = btrfs_item_ptr(leaf, path->slots[0], 63 struct btrfs_free_space_header); 64 btrfs_free_space_key(leaf, header, &disk_key); 65 btrfs_disk_key_to_cpu(&location, &disk_key); 66 btrfs_release_path(path); 67 68 inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); 69 if (!inode) 70 return ERR_PTR(-ENOENT); 71 if (IS_ERR(inode)) 72 return inode; 73 if (is_bad_inode(inode)) { 74 iput(inode); 75 return ERR_PTR(-ENOENT); 76 } 77 78 inode->i_mapping->flags &= ~__GFP_FS; 79 80 return inode; 81 } 82 83 struct inode *lookup_free_space_inode(struct btrfs_root *root, 84 struct btrfs_block_group_cache 85 *block_group, struct btrfs_path *path) 86 { 87 struct inode *inode = NULL; 88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 89 90 spin_lock(&block_group->lock); 91 if (block_group->inode) 92 inode = igrab(block_group->inode); 93 spin_unlock(&block_group->lock); 94 if (inode) 95 return inode; 96 97 inode = __lookup_free_space_inode(root, path, 98 block_group->key.objectid); 99 if (IS_ERR(inode)) 100 return inode; 101 102 spin_lock(&block_group->lock); 103 if (!((BTRFS_I(inode)->flags & flags) == flags)) { 104 printk(KERN_INFO "Old style space inode found, converting.\n"); 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | 106 BTRFS_INODE_NODATACOW; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR; 108 } 109 110 if (!block_group->iref) { 111 block_group->inode = igrab(inode); 112 block_group->iref = 1; 113 } 114 spin_unlock(&block_group->lock); 115 116 return inode; 117 } 118 119 int __create_free_space_inode(struct btrfs_root *root, 120 struct btrfs_trans_handle *trans, 121 struct btrfs_path *path, u64 ino, u64 offset) 122 { 123 
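	/*
	 * Build the on-disk bits for a new free space cache: an empty inode
	 * item for the cache file (NOCOMPRESS | PREALLOC, plus NODATASUM and
	 * NODATACOW for block group caches) followed by the free space header
	 * item keyed by BTRFS_FREE_SPACE_OBJECTID and this offset.
	 */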
struct btrfs_key key; 124 struct btrfs_disk_key disk_key; 125 struct btrfs_free_space_header *header; 126 struct btrfs_inode_item *inode_item; 127 struct extent_buffer *leaf; 128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; 129 int ret; 130 131 ret = btrfs_insert_empty_inode(trans, root, path, ino); 132 if (ret) 133 return ret; 134 135 /* We inline crc's for the free disk space cache */ 136 if (ino != BTRFS_FREE_INO_OBJECTID) 137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; 138 139 leaf = path->nodes[0]; 140 inode_item = btrfs_item_ptr(leaf, path->slots[0], 141 struct btrfs_inode_item); 142 btrfs_item_key(leaf, &disk_key, path->slots[0]); 143 memset_extent_buffer(leaf, 0, (unsigned long)inode_item, 144 sizeof(*inode_item)); 145 btrfs_set_inode_generation(leaf, inode_item, trans->transid); 146 btrfs_set_inode_size(leaf, inode_item, 0); 147 btrfs_set_inode_nbytes(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 151 btrfs_set_inode_flags(leaf, inode_item, flags); 152 btrfs_set_inode_nlink(leaf, inode_item, 1); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 154 btrfs_set_inode_block_group(leaf, inode_item, offset); 155 btrfs_mark_buffer_dirty(leaf); 156 btrfs_release_path(path); 157 158 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 159 key.offset = offset; 160 key.type = 0; 161 162 ret = btrfs_insert_empty_item(trans, root, path, &key, 163 sizeof(struct btrfs_free_space_header)); 164 if (ret < 0) { 165 btrfs_release_path(path); 166 return ret; 167 } 168 leaf = path->nodes[0]; 169 header = btrfs_item_ptr(leaf, path->slots[0], 170 struct btrfs_free_space_header); 171 memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); 172 btrfs_set_free_space_key(leaf, header, &disk_key); 173 btrfs_mark_buffer_dirty(leaf); 174 btrfs_release_path(path); 175 176 return 0; 177 } 178 179 int create_free_space_inode(struct btrfs_root *root, 180 struct btrfs_trans_handle *trans, 181 struct btrfs_block_group_cache *block_group, 182 struct btrfs_path *path) 183 { 184 int ret; 185 u64 ino; 186 187 ret = btrfs_find_free_objectid(root, &ino); 188 if (ret < 0) 189 return ret; 190 191 return __create_free_space_inode(root, trans, path, ino, 192 block_group->key.objectid); 193 } 194 195 int btrfs_truncate_free_space_cache(struct btrfs_root *root, 196 struct btrfs_trans_handle *trans, 197 struct btrfs_path *path, 198 struct inode *inode) 199 { 200 struct btrfs_block_rsv *rsv; 201 u64 needed_bytes; 202 loff_t oldsize; 203 int ret = 0; 204 205 rsv = trans->block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv; 207 208 /* 1 for slack space, 1 for updating the inode */ 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + 210 btrfs_calc_trans_metadata_size(root, 1); 211 212 spin_lock(&trans->block_rsv->lock); 213 if (trans->block_rsv->reserved < needed_bytes) { 214 spin_unlock(&trans->block_rsv->lock); 215 trans->block_rsv = rsv; 216 return -ENOSPC; 217 } 218 spin_unlock(&trans->block_rsv->lock); 219 220 oldsize = i_size_read(inode); 221 btrfs_i_size_write(inode, 0); 222 truncate_pagecache(inode, oldsize, 0); 223 224 /* 225 * We don't need an orphan item because truncating the free space cache 226 * will never be split across transactions. 
227 */ 228 ret = btrfs_truncate_inode_items(trans, root, inode, 229 0, BTRFS_EXTENT_DATA_KEY); 230 231 if (ret) { 232 trans->block_rsv = rsv; 233 WARN_ON(1); 234 return ret; 235 } 236 237 ret = btrfs_update_inode(trans, root, inode); 238 trans->block_rsv = rsv; 239 240 return ret; 241 } 242 243 static int readahead_cache(struct inode *inode) 244 { 245 struct file_ra_state *ra; 246 unsigned long last_index; 247 248 ra = kzalloc(sizeof(*ra), GFP_NOFS); 249 if (!ra) 250 return -ENOMEM; 251 252 file_ra_state_init(ra, inode->i_mapping); 253 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 254 255 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); 256 257 kfree(ra); 258 259 return 0; 260 } 261 262 struct io_ctl { 263 void *cur, *orig; 264 struct page *page; 265 struct page **pages; 266 struct btrfs_root *root; 267 unsigned long size; 268 int index; 269 int num_pages; 270 unsigned check_crcs:1; 271 }; 272 273 static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, 274 struct btrfs_root *root) 275 { 276 memset(io_ctl, 0, sizeof(struct io_ctl)); 277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 278 PAGE_CACHE_SHIFT; 279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, 280 GFP_NOFS); 281 if (!io_ctl->pages) 282 return -ENOMEM; 283 io_ctl->root = root; 284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) 285 io_ctl->check_crcs = 1; 286 return 0; 287 } 288 289 static void io_ctl_free(struct io_ctl *io_ctl) 290 { 291 kfree(io_ctl->pages); 292 } 293 294 static void io_ctl_unmap_page(struct io_ctl *io_ctl) 295 { 296 if (io_ctl->cur) { 297 kunmap(io_ctl->page); 298 io_ctl->cur = NULL; 299 io_ctl->orig = NULL; 300 } 301 } 302 303 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 304 { 305 WARN_ON(io_ctl->cur); 306 BUG_ON(io_ctl->index >= io_ctl->num_pages); 307 io_ctl->page = io_ctl->pages[io_ctl->index++]; 308 io_ctl->cur = kmap(io_ctl->page); 309 io_ctl->orig = io_ctl->cur; 310 io_ctl->size = PAGE_CACHE_SIZE; 311 if (clear) 312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); 313 } 314 315 static void io_ctl_drop_pages(struct io_ctl *io_ctl) 316 { 317 int i; 318 319 io_ctl_unmap_page(io_ctl); 320 321 for (i = 0; i < io_ctl->num_pages; i++) { 322 ClearPageChecked(io_ctl->pages[i]); 323 unlock_page(io_ctl->pages[i]); 324 page_cache_release(io_ctl->pages[i]); 325 } 326 } 327 328 static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, 329 int uptodate) 330 { 331 struct page *page; 332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); 333 int i; 334 335 for (i = 0; i < io_ctl->num_pages; i++) { 336 page = find_or_create_page(inode->i_mapping, i, mask); 337 if (!page) { 338 io_ctl_drop_pages(io_ctl); 339 return -ENOMEM; 340 } 341 io_ctl->pages[i] = page; 342 if (uptodate && !PageUptodate(page)) { 343 btrfs_readpage(NULL, page); 344 lock_page(page); 345 if (!PageUptodate(page)) { 346 printk(KERN_ERR "btrfs: error reading free " 347 "space cache\n"); 348 io_ctl_drop_pages(io_ctl); 349 return -EIO; 350 } 351 } 352 } 353 354 return 0; 355 } 356 357 static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) 358 { 359 u64 *val; 360 361 io_ctl_map_page(io_ctl, 1); 362 363 /* 364 * Skip the csum areas. If we don't check crcs then we just have a 365 * 64bit chunk at the front of the first page. 
	 */
	if (io_ctl->check_crcs) {
		io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
		io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
	} else {
		io_ctl->cur += sizeof(u64);
		io_ctl->size -= sizeof(u64) * 2;
	}

	val = io_ctl->cur;
	*val = cpu_to_le64(generation);
	io_ctl->cur += sizeof(u64);
}

static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
{
	u64 *gen;

	/*
	 * Skip the crc area. If we don't check crcs then we just have a 64bit
	 * chunk at the front of the first page.
	 */
	if (io_ctl->check_crcs) {
		io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
		io_ctl->size -= sizeof(u64) +
			(sizeof(u32) * io_ctl->num_pages);
	} else {
		io_ctl->cur += sizeof(u64);
		io_ctl->size -= sizeof(u64) * 2;
	}

	gen = io_ctl->cur;
	if (le64_to_cpu(*gen) != generation) {
		printk_ratelimited(KERN_ERR "btrfs: space cache generation "
				   "(%Lu) does not match inode (%Lu)\n", *gen,
				   generation);
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}
	io_ctl->cur += sizeof(u64);
	return 0;
}

static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
{
	u32 *tmp;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (!io_ctl->check_crcs) {
		io_ctl_unmap_page(io_ctl);
		return;
	}

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
			      PAGE_CACHE_SIZE - offset);
	btrfs_csum_final(crc, (char *)&crc);
	io_ctl_unmap_page(io_ctl);
	tmp = kmap(io_ctl->pages[0]);
	tmp += index;
	*tmp = crc;
	kunmap(io_ctl->pages[0]);
}

static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
{
	u32 *tmp, val;
	u32 crc = ~(u32)0;
	unsigned offset = 0;

	if (!io_ctl->check_crcs) {
		io_ctl_map_page(io_ctl, 0);
		return 0;
	}

	if (index == 0)
		offset = sizeof(u32) * io_ctl->num_pages;

	tmp = kmap(io_ctl->pages[0]);
	tmp += index;
	val = *tmp;
	kunmap(io_ctl->pages[0]);

	io_ctl_map_page(io_ctl, 0);
	crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
			      PAGE_CACHE_SIZE - offset);
	btrfs_csum_final(crc, (char *)&crc);
	if (val != crc) {
		printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
				   "space cache\n");
		io_ctl_unmap_page(io_ctl);
		return -EIO;
	}

	return 0;
}

static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
			    void *bitmap)
{
	struct btrfs_free_space_entry *entry;

	if (!io_ctl->cur)
		return -ENOSPC;

	entry = io_ctl->cur;
	entry->offset = cpu_to_le64(offset);
	entry->bytes = cpu_to_le64(bytes);
	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
		BTRFS_FREE_SPACE_EXTENT;
	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
	io_ctl->size -= sizeof(struct btrfs_free_space_entry);

	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
		return 0;

	io_ctl_set_crc(io_ctl, io_ctl->index - 1);

	/* No more pages to map */
	if (io_ctl->index >= io_ctl->num_pages)
		return 0;

	/* map the next page */
	io_ctl_map_page(io_ctl, 1);
	return 0;
}

static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
{
	if (!io_ctl->cur)
		return -ENOSPC;

	/*
	 * If we aren't at the start of the current page, unmap this one and
	 * map the next one if there is any left.
504 */ 505 if (io_ctl->cur != io_ctl->orig) { 506 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 507 if (io_ctl->index >= io_ctl->num_pages) 508 return -ENOSPC; 509 io_ctl_map_page(io_ctl, 0); 510 } 511 512 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); 513 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 514 if (io_ctl->index < io_ctl->num_pages) 515 io_ctl_map_page(io_ctl, 0); 516 return 0; 517 } 518 519 static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) 520 { 521 /* 522 * If we're not on the boundary we know we've modified the page and we 523 * need to crc the page. 524 */ 525 if (io_ctl->cur != io_ctl->orig) 526 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 527 else 528 io_ctl_unmap_page(io_ctl); 529 530 while (io_ctl->index < io_ctl->num_pages) { 531 io_ctl_map_page(io_ctl, 1); 532 io_ctl_set_crc(io_ctl, io_ctl->index - 1); 533 } 534 } 535 536 static int io_ctl_read_entry(struct io_ctl *io_ctl, 537 struct btrfs_free_space *entry, u8 *type) 538 { 539 struct btrfs_free_space_entry *e; 540 int ret; 541 542 if (!io_ctl->cur) { 543 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 544 if (ret) 545 return ret; 546 } 547 548 e = io_ctl->cur; 549 entry->offset = le64_to_cpu(e->offset); 550 entry->bytes = le64_to_cpu(e->bytes); 551 *type = e->type; 552 io_ctl->cur += sizeof(struct btrfs_free_space_entry); 553 io_ctl->size -= sizeof(struct btrfs_free_space_entry); 554 555 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) 556 return 0; 557 558 io_ctl_unmap_page(io_ctl); 559 560 return 0; 561 } 562 563 static int io_ctl_read_bitmap(struct io_ctl *io_ctl, 564 struct btrfs_free_space *entry) 565 { 566 int ret; 567 568 ret = io_ctl_check_crc(io_ctl, io_ctl->index); 569 if (ret) 570 return ret; 571 572 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); 573 io_ctl_unmap_page(io_ctl); 574 575 return 0; 576 } 577 578 int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 579 struct btrfs_free_space_ctl *ctl, 580 struct btrfs_path *path, u64 offset) 581 { 582 struct btrfs_free_space_header *header; 583 struct extent_buffer *leaf; 584 struct io_ctl io_ctl; 585 struct btrfs_key key; 586 struct btrfs_free_space *e, *n; 587 struct list_head bitmaps; 588 u64 num_entries; 589 u64 num_bitmaps; 590 u64 generation; 591 u8 type; 592 int ret = 0; 593 594 INIT_LIST_HEAD(&bitmaps); 595 596 /* Nothing in the space cache, goodbye */ 597 if (!i_size_read(inode)) 598 return 0; 599 600 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 601 key.offset = offset; 602 key.type = 0; 603 604 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 605 if (ret < 0) 606 return 0; 607 else if (ret > 0) { 608 btrfs_release_path(path); 609 return 0; 610 } 611 612 ret = -1; 613 614 leaf = path->nodes[0]; 615 header = btrfs_item_ptr(leaf, path->slots[0], 616 struct btrfs_free_space_header); 617 num_entries = btrfs_free_space_entries(leaf, header); 618 num_bitmaps = btrfs_free_space_bitmaps(leaf, header); 619 generation = btrfs_free_space_generation(leaf, header); 620 btrfs_release_path(path); 621 622 if (BTRFS_I(inode)->generation != generation) { 623 printk(KERN_ERR "btrfs: free space inode generation (%llu) did" 624 " not match free space cache generation (%llu)\n", 625 (unsigned long long)BTRFS_I(inode)->generation, 626 (unsigned long long)generation); 627 return 0; 628 } 629 630 if (!num_entries) 631 return 0; 632 633 io_ctl_init(&io_ctl, inode, root); 634 ret = readahead_cache(inode); 635 if (ret) 636 goto out; 637 638 ret = io_ctl_prepare_pages(&io_ctl, inode, 1); 639 if (ret) 640 goto out; 641 642 ret = 
io_ctl_check_crc(&io_ctl, 0); 643 if (ret) 644 goto free_cache; 645 646 ret = io_ctl_check_generation(&io_ctl, generation); 647 if (ret) 648 goto free_cache; 649 650 while (num_entries) { 651 e = kmem_cache_zalloc(btrfs_free_space_cachep, 652 GFP_NOFS); 653 if (!e) 654 goto free_cache; 655 656 ret = io_ctl_read_entry(&io_ctl, e, &type); 657 if (ret) { 658 kmem_cache_free(btrfs_free_space_cachep, e); 659 goto free_cache; 660 } 661 662 if (!e->bytes) { 663 kmem_cache_free(btrfs_free_space_cachep, e); 664 goto free_cache; 665 } 666 667 if (type == BTRFS_FREE_SPACE_EXTENT) { 668 spin_lock(&ctl->tree_lock); 669 ret = link_free_space(ctl, e); 670 spin_unlock(&ctl->tree_lock); 671 if (ret) { 672 printk(KERN_ERR "Duplicate entries in " 673 "free space cache, dumping\n"); 674 kmem_cache_free(btrfs_free_space_cachep, e); 675 goto free_cache; 676 } 677 } else { 678 BUG_ON(!num_bitmaps); 679 num_bitmaps--; 680 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 681 if (!e->bitmap) { 682 kmem_cache_free( 683 btrfs_free_space_cachep, e); 684 goto free_cache; 685 } 686 spin_lock(&ctl->tree_lock); 687 ret = link_free_space(ctl, e); 688 ctl->total_bitmaps++; 689 ctl->op->recalc_thresholds(ctl); 690 spin_unlock(&ctl->tree_lock); 691 if (ret) { 692 printk(KERN_ERR "Duplicate entries in " 693 "free space cache, dumping\n"); 694 kmem_cache_free(btrfs_free_space_cachep, e); 695 goto free_cache; 696 } 697 list_add_tail(&e->list, &bitmaps); 698 } 699 700 num_entries--; 701 } 702 703 io_ctl_unmap_page(&io_ctl); 704 705 /* 706 * We add the bitmaps at the end of the entries in order that 707 * the bitmap entries are added to the cache. 708 */ 709 list_for_each_entry_safe(e, n, &bitmaps, list) { 710 list_del_init(&e->list); 711 ret = io_ctl_read_bitmap(&io_ctl, e); 712 if (ret) 713 goto free_cache; 714 } 715 716 io_ctl_drop_pages(&io_ctl); 717 ret = 1; 718 out: 719 io_ctl_free(&io_ctl); 720 return ret; 721 free_cache: 722 io_ctl_drop_pages(&io_ctl); 723 __btrfs_remove_free_space_cache(ctl); 724 goto out; 725 } 726 727 int load_free_space_cache(struct btrfs_fs_info *fs_info, 728 struct btrfs_block_group_cache *block_group) 729 { 730 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 731 struct btrfs_root *root = fs_info->tree_root; 732 struct inode *inode; 733 struct btrfs_path *path; 734 int ret = 0; 735 bool matched; 736 u64 used = btrfs_block_group_used(&block_group->item); 737 738 /* 739 * If we're unmounting then just return, since this does a search on the 740 * normal root and not the commit root and we could deadlock. 741 */ 742 if (btrfs_fs_closing(fs_info)) 743 return 0; 744 745 /* 746 * If this block group has been marked to be cleared for one reason or 747 * another then we can't trust the on disk cache, so just return. 748 */ 749 spin_lock(&block_group->lock); 750 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { 751 spin_unlock(&block_group->lock); 752 return 0; 753 } 754 spin_unlock(&block_group->lock); 755 756 path = btrfs_alloc_path(); 757 if (!path) 758 return 0; 759 760 inode = lookup_free_space_inode(root, block_group, path); 761 if (IS_ERR(inode)) { 762 btrfs_free_path(path); 763 return 0; 764 } 765 766 /* We may have converted the inode and made the cache invalid. 
 */
	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
		spin_unlock(&block_group->lock);
		goto out;
	}
	spin_unlock(&block_group->lock);

	ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
				      path, block_group->key.objectid);
	btrfs_free_path(path);
	if (ret <= 0)
		goto out;

	spin_lock(&ctl->tree_lock);
	matched = (ctl->free_space == (block_group->key.offset - used -
				       block_group->bytes_super));
	spin_unlock(&ctl->tree_lock);

	if (!matched) {
		__btrfs_remove_free_space_cache(ctl);
		printk(KERN_ERR "block group %llu has the wrong amount of free "
		       "space\n", block_group->key.objectid);
		ret = -1;
	}
out:
	if (ret < 0) {
		/* This cache is bogus, make sure it gets cleared */
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_CLEAR;
		spin_unlock(&block_group->lock);
		ret = 0;

		printk(KERN_ERR "btrfs: failed to load free space cache "
		       "for block group %llu\n", block_group->key.objectid);
	}

	iput(inode);
	return ret;
}

/**
 * __btrfs_write_out_cache - write out cached info to an inode
 * @root - the root the inode belongs to
 * @inode - the inode that backs the free space cache
 * @ctl - the free space cache we are going to write out
 * @block_group - the block_group for this cache if it belongs to a block_group
 * @trans - the trans handle
 * @path - the path to use
 * @offset - the offset for the key we'll insert
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount. This will return 0 if it was successful in writing the cache out,
 * and -1 if it was not.
 */
int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
			    struct btrfs_free_space_ctl *ctl,
			    struct btrfs_block_group_cache *block_group,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_path *path, u64 offset)
{
	struct btrfs_free_space_header *header;
	struct extent_buffer *leaf;
	struct rb_node *node;
	struct list_head *pos, *n;
	struct extent_state *cached_state = NULL;
	struct btrfs_free_cluster *cluster = NULL;
	struct extent_io_tree *unpin = NULL;
	struct io_ctl io_ctl;
	struct list_head bitmap_list;
	struct btrfs_key key;
	u64 start, end, len;
	int entries = 0;
	int bitmaps = 0;
	int ret;
	int err = -1;

	INIT_LIST_HEAD(&bitmap_list);

	if (!i_size_read(inode))
		return -1;

	io_ctl_init(&io_ctl, inode, root);

	/* Get the cluster for this block_group if it exists */
	if (block_group && !list_empty(&block_group->cluster_list))
		cluster = list_entry(block_group->cluster_list.next,
				     struct btrfs_free_cluster,
				     block_group_list);

	/*
	 * We shouldn't have switched the pinned extents yet so this is the
	 * right one
	 */
	unpin = root->fs_info->pinned_extents;

	/* Lock all pages first so we can lock the extent safely. */
	io_ctl_prepare_pages(&io_ctl, inode, 0);

	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
			 0, &cached_state, GFP_NOFS);

	/*
	 * When searching for pinned extents, we need to start at our start
	 * offset.
870 */ 871 if (block_group) 872 start = block_group->key.objectid; 873 874 node = rb_first(&ctl->free_space_offset); 875 if (!node && cluster) { 876 node = rb_first(&cluster->root); 877 cluster = NULL; 878 } 879 880 /* Make sure we can fit our crcs into the first page */ 881 if (io_ctl.check_crcs && 882 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { 883 WARN_ON(1); 884 goto out_nospc; 885 } 886 887 io_ctl_set_generation(&io_ctl, trans->transid); 888 889 /* Write out the extent entries */ 890 while (node) { 891 struct btrfs_free_space *e; 892 893 e = rb_entry(node, struct btrfs_free_space, offset_index); 894 entries++; 895 896 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, 897 e->bitmap); 898 if (ret) 899 goto out_nospc; 900 901 if (e->bitmap) { 902 list_add_tail(&e->list, &bitmap_list); 903 bitmaps++; 904 } 905 node = rb_next(node); 906 if (!node && cluster) { 907 node = rb_first(&cluster->root); 908 cluster = NULL; 909 } 910 } 911 912 /* 913 * We want to add any pinned extents to our free space cache 914 * so we don't leak the space 915 */ 916 while (block_group && (start < block_group->key.objectid + 917 block_group->key.offset)) { 918 ret = find_first_extent_bit(unpin, start, &start, &end, 919 EXTENT_DIRTY); 920 if (ret) { 921 ret = 0; 922 break; 923 } 924 925 /* This pinned extent is out of our range */ 926 if (start >= block_group->key.objectid + 927 block_group->key.offset) 928 break; 929 930 len = block_group->key.objectid + 931 block_group->key.offset - start; 932 len = min(len, end + 1 - start); 933 934 entries++; 935 ret = io_ctl_add_entry(&io_ctl, start, len, NULL); 936 if (ret) 937 goto out_nospc; 938 939 start = end + 1; 940 } 941 942 /* Write out the bitmaps */ 943 list_for_each_safe(pos, n, &bitmap_list) { 944 struct btrfs_free_space *entry = 945 list_entry(pos, struct btrfs_free_space, list); 946 947 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); 948 if (ret) 949 goto out_nospc; 950 list_del_init(&entry->list); 951 } 952 953 /* Zero out the rest of the pages just to make sure */ 954 io_ctl_zero_remaining_pages(&io_ctl); 955 956 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, 957 0, i_size_read(inode), &cached_state); 958 io_ctl_drop_pages(&io_ctl); 959 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 960 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 961 962 if (ret) 963 goto out; 964 965 966 ret = filemap_write_and_wait(inode->i_mapping); 967 if (ret) 968 goto out; 969 970 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 971 key.offset = offset; 972 key.type = 0; 973 974 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 975 if (ret < 0) { 976 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 977 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, 978 GFP_NOFS); 979 goto out; 980 } 981 leaf = path->nodes[0]; 982 if (ret > 0) { 983 struct btrfs_key found_key; 984 BUG_ON(!path->slots[0]); 985 path->slots[0]--; 986 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 987 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 988 found_key.offset != offset) { 989 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, 990 inode->i_size - 1, 991 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, 992 NULL, GFP_NOFS); 993 btrfs_release_path(path); 994 goto out; 995 } 996 } 997 998 BTRFS_I(inode)->generation = trans->transid; 999 header = btrfs_item_ptr(leaf, path->slots[0], 1000 struct btrfs_free_space_header); 1001 btrfs_set_free_space_entries(leaf, header, entries); 1002 btrfs_set_free_space_bitmaps(leaf, header, bitmaps); 1003 
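	/*
	 * Stamp the header with this transaction's generation; the loader
	 * compares it against the inode's generation and throws the cache
	 * away on a mismatch (see __load_free_space_cache).
	 */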
	btrfs_set_free_space_generation(leaf, header, trans->transid);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	err = 0;
out:
	io_ctl_free(&io_ctl);
	if (err) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
	}
	btrfs_update_inode(trans, root, inode);
	return err;

out_nospc:
	list_for_each_safe(pos, n, &bitmap_list) {
		struct btrfs_free_space *entry =
			list_entry(pos, struct btrfs_free_space, list);
		list_del_init(&entry->list);
	}
	io_ctl_drop_pages(&io_ctl);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
	goto out;
}

int btrfs_write_out_cache(struct btrfs_root *root,
			  struct btrfs_trans_handle *trans,
			  struct btrfs_block_group_cache *block_group,
			  struct btrfs_path *path)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct inode *inode;
	int ret = 0;

	root = root->fs_info->tree_root;

	spin_lock(&block_group->lock);
	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	spin_unlock(&block_group->lock);

	inode = lookup_free_space_inode(root, block_group, path);
	if (IS_ERR(inode))
		return 0;

	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
				      path, block_group->key.objectid);
	if (ret) {
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&block_group->lock);
		ret = 0;
#ifdef DEBUG
		printk(KERN_ERR "btrfs: failed to write free space cache "
		       "for block group %llu\n", block_group->key.objectid);
#endif
	}

	iput(inode);
	return ret;
}

static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
					  u64 offset)
{
	BUG_ON(offset < bitmap_start);
	offset -= bitmap_start;
	return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
	return (unsigned long)(div_u64(bytes, unit));
}

static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
				   u64 offset)
{
	u64 bitmap_start;
	u64 bytes_per_bitmap;

	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
	bitmap_start = offset - ctl->start;
	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
	bitmap_start *= bytes_per_bitmap;
	bitmap_start += ctl->start;

	return bitmap_start;
}

static int tree_insert_offset(struct rb_root *root, u64 offset,
			      struct rb_node *node, int bitmap)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_free_space *info;

	while (*p) {
		parent = *p;
		info = rb_entry(parent, struct btrfs_free_space, offset_index);

		if (offset < info->offset) {
			p = &(*p)->rb_left;
		} else if (offset > info->offset) {
			p = &(*p)->rb_right;
		} else {
			/*
			 * we could have a bitmap entry and an extent entry
			 * share the same offset. If this is the case, we want
			 * the extent entry to always be found first if we do a
			 * linear search through the tree, since we want to have
			 * the quickest allocation time, and allocating from an
			 * extent is faster than allocating from a bitmap.
So 1119 * if we're inserting a bitmap and we find an entry at 1120 * this offset, we want to go right, or after this entry 1121 * logically. If we are inserting an extent and we've 1122 * found a bitmap, we want to go left, or before 1123 * logically. 1124 */ 1125 if (bitmap) { 1126 if (info->bitmap) { 1127 WARN_ON_ONCE(1); 1128 return -EEXIST; 1129 } 1130 p = &(*p)->rb_right; 1131 } else { 1132 if (!info->bitmap) { 1133 WARN_ON_ONCE(1); 1134 return -EEXIST; 1135 } 1136 p = &(*p)->rb_left; 1137 } 1138 } 1139 } 1140 1141 rb_link_node(node, parent, p); 1142 rb_insert_color(node, root); 1143 1144 return 0; 1145 } 1146 1147 /* 1148 * searches the tree for the given offset. 1149 * 1150 * fuzzy - If this is set, then we are trying to make an allocation, and we just 1151 * want a section that has at least bytes size and comes at or after the given 1152 * offset. 1153 */ 1154 static struct btrfs_free_space * 1155 tree_search_offset(struct btrfs_free_space_ctl *ctl, 1156 u64 offset, int bitmap_only, int fuzzy) 1157 { 1158 struct rb_node *n = ctl->free_space_offset.rb_node; 1159 struct btrfs_free_space *entry, *prev = NULL; 1160 1161 /* find entry that is closest to the 'offset' */ 1162 while (1) { 1163 if (!n) { 1164 entry = NULL; 1165 break; 1166 } 1167 1168 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1169 prev = entry; 1170 1171 if (offset < entry->offset) 1172 n = n->rb_left; 1173 else if (offset > entry->offset) 1174 n = n->rb_right; 1175 else 1176 break; 1177 } 1178 1179 if (bitmap_only) { 1180 if (!entry) 1181 return NULL; 1182 if (entry->bitmap) 1183 return entry; 1184 1185 /* 1186 * bitmap entry and extent entry may share same offset, 1187 * in that case, bitmap entry comes after extent entry. 1188 */ 1189 n = rb_next(n); 1190 if (!n) 1191 return NULL; 1192 entry = rb_entry(n, struct btrfs_free_space, offset_index); 1193 if (entry->offset != offset) 1194 return NULL; 1195 1196 WARN_ON(!entry->bitmap); 1197 return entry; 1198 } else if (entry) { 1199 if (entry->bitmap) { 1200 /* 1201 * if previous extent entry covers the offset, 1202 * we should return it instead of the bitmap entry 1203 */ 1204 n = &entry->offset_index; 1205 while (1) { 1206 n = rb_prev(n); 1207 if (!n) 1208 break; 1209 prev = rb_entry(n, struct btrfs_free_space, 1210 offset_index); 1211 if (!prev->bitmap) { 1212 if (prev->offset + prev->bytes > offset) 1213 entry = prev; 1214 break; 1215 } 1216 } 1217 } 1218 return entry; 1219 } 1220 1221 if (!prev) 1222 return NULL; 1223 1224 /* find last entry before the 'offset' */ 1225 entry = prev; 1226 if (entry->offset > offset) { 1227 n = rb_prev(&entry->offset_index); 1228 if (n) { 1229 entry = rb_entry(n, struct btrfs_free_space, 1230 offset_index); 1231 BUG_ON(entry->offset > offset); 1232 } else { 1233 if (fuzzy) 1234 return entry; 1235 else 1236 return NULL; 1237 } 1238 } 1239 1240 if (entry->bitmap) { 1241 n = &entry->offset_index; 1242 while (1) { 1243 n = rb_prev(n); 1244 if (!n) 1245 break; 1246 prev = rb_entry(n, struct btrfs_free_space, 1247 offset_index); 1248 if (!prev->bitmap) { 1249 if (prev->offset + prev->bytes > offset) 1250 return prev; 1251 break; 1252 } 1253 } 1254 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset) 1255 return entry; 1256 } else if (entry->offset + entry->bytes > offset) 1257 return entry; 1258 1259 if (!fuzzy) 1260 return NULL; 1261 1262 while (1) { 1263 if (entry->bitmap) { 1264 if (entry->offset + BITS_PER_BITMAP * 1265 ctl->unit > offset) 1266 break; 1267 } else { 1268 if (entry->offset + entry->bytes > offset) 1269 
				break;
		}

		n = rb_next(&entry->offset_index);
		if (!n)
			return NULL;
		entry = rb_entry(n, struct btrfs_free_space, offset_index);
	}
	return entry;
}

static inline void
__unlink_free_space(struct btrfs_free_space_ctl *ctl,
		    struct btrfs_free_space *info)
{
	rb_erase(&info->offset_index, &ctl->free_space_offset);
	ctl->free_extents--;
}

static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	__unlink_free_space(ctl, info);
	ctl->free_space -= info->bytes;
}

static int link_free_space(struct btrfs_free_space_ctl *ctl,
			   struct btrfs_free_space *info)
{
	int ret = 0;

	BUG_ON(!info->bitmap && !info->bytes);
	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
				 &info->offset_index, (info->bitmap != NULL));
	if (ret)
		return ret;

	ctl->free_space += info->bytes;
	ctl->free_extents++;
	return ret;
}

static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
	struct btrfs_block_group_cache *block_group = ctl->private;
	u64 max_bytes;
	u64 bitmap_bytes;
	u64 extent_bytes;
	u64 size = block_group->key.offset;
	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);

	BUG_ON(ctl->total_bitmaps > max_bitmaps);

	/*
	 * The goal is to keep the total amount of memory used per 1gb of space
	 * at or below 32k, so we need to adjust how much memory we allow to be
	 * used by extent based free space tracking
	 */
	if (size < 1024 * 1024 * 1024)
		max_bytes = MAX_CACHE_BYTES_PER_GIG;
	else
		max_bytes = MAX_CACHE_BYTES_PER_GIG *
			div64_u64(size, 1024 * 1024 * 1024);

	/*
	 * we want to account for 1 more bitmap than what we have so we can make
	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
	 * we add more bitmaps.
	 */
	bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;

	if (bitmap_bytes >= max_bytes) {
		ctl->extents_thresh = 0;
		return;
	}

	/*
	 * we want the extent entry threshold to always be at most 1/2 the max
	 * bytes we can have, or whatever is less than that.
1349 */ 1350 extent_bytes = max_bytes - bitmap_bytes; 1351 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); 1352 1353 ctl->extents_thresh = 1354 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); 1355 } 1356 1357 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1358 struct btrfs_free_space *info, 1359 u64 offset, u64 bytes) 1360 { 1361 unsigned long start, count; 1362 1363 start = offset_to_bit(info->offset, ctl->unit, offset); 1364 count = bytes_to_bits(bytes, ctl->unit); 1365 BUG_ON(start + count > BITS_PER_BITMAP); 1366 1367 bitmap_clear(info->bitmap, start, count); 1368 1369 info->bytes -= bytes; 1370 } 1371 1372 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1373 struct btrfs_free_space *info, u64 offset, 1374 u64 bytes) 1375 { 1376 __bitmap_clear_bits(ctl, info, offset, bytes); 1377 ctl->free_space -= bytes; 1378 } 1379 1380 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, 1381 struct btrfs_free_space *info, u64 offset, 1382 u64 bytes) 1383 { 1384 unsigned long start, count; 1385 1386 start = offset_to_bit(info->offset, ctl->unit, offset); 1387 count = bytes_to_bits(bytes, ctl->unit); 1388 BUG_ON(start + count > BITS_PER_BITMAP); 1389 1390 bitmap_set(info->bitmap, start, count); 1391 1392 info->bytes += bytes; 1393 ctl->free_space += bytes; 1394 } 1395 1396 static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1397 struct btrfs_free_space *bitmap_info, u64 *offset, 1398 u64 *bytes) 1399 { 1400 unsigned long found_bits = 0; 1401 unsigned long bits, i; 1402 unsigned long next_zero; 1403 1404 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1405 max_t(u64, *offset, bitmap_info->offset)); 1406 bits = bytes_to_bits(*bytes, ctl->unit); 1407 1408 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); 1409 i < BITS_PER_BITMAP; 1410 i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { 1411 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1412 BITS_PER_BITMAP, i); 1413 if ((next_zero - i) >= bits) { 1414 found_bits = next_zero - i; 1415 break; 1416 } 1417 i = next_zero; 1418 } 1419 1420 if (found_bits) { 1421 *offset = (u64)(i * ctl->unit) + bitmap_info->offset; 1422 *bytes = (u64)(found_bits) * ctl->unit; 1423 return 0; 1424 } 1425 1426 return -1; 1427 } 1428 1429 static struct btrfs_free_space * 1430 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) 1431 { 1432 struct btrfs_free_space *entry; 1433 struct rb_node *node; 1434 int ret; 1435 1436 if (!ctl->free_space_offset.rb_node) 1437 return NULL; 1438 1439 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1); 1440 if (!entry) 1441 return NULL; 1442 1443 for (node = &entry->offset_index; node; node = rb_next(node)) { 1444 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1445 if (entry->bytes < *bytes) 1446 continue; 1447 1448 if (entry->bitmap) { 1449 ret = search_bitmap(ctl, entry, offset, bytes); 1450 if (!ret) 1451 return entry; 1452 continue; 1453 } 1454 1455 *offset = entry->offset; 1456 *bytes = entry->bytes; 1457 return entry; 1458 } 1459 1460 return NULL; 1461 } 1462 1463 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, 1464 struct btrfs_free_space *info, u64 offset) 1465 { 1466 info->offset = offset_to_bitmap(ctl, offset); 1467 info->bytes = 0; 1468 link_free_space(ctl, info); 1469 ctl->total_bitmaps++; 1470 1471 ctl->op->recalc_thresholds(ctl); 1472 } 1473 1474 static void free_bitmap(struct btrfs_free_space_ctl *ctl, 1475 struct btrfs_free_space 
*bitmap_info) 1476 { 1477 unlink_free_space(ctl, bitmap_info); 1478 kfree(bitmap_info->bitmap); 1479 kmem_cache_free(btrfs_free_space_cachep, bitmap_info); 1480 ctl->total_bitmaps--; 1481 ctl->op->recalc_thresholds(ctl); 1482 } 1483 1484 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, 1485 struct btrfs_free_space *bitmap_info, 1486 u64 *offset, u64 *bytes) 1487 { 1488 u64 end; 1489 u64 search_start, search_bytes; 1490 int ret; 1491 1492 again: 1493 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1; 1494 1495 /* 1496 * XXX - this can go away after a few releases. 1497 * 1498 * since the only user of btrfs_remove_free_space is the tree logging 1499 * stuff, and the only way to test that is under crash conditions, we 1500 * want to have this debug stuff here just in case somethings not 1501 * working. Search the bitmap for the space we are trying to use to 1502 * make sure its actually there. If its not there then we need to stop 1503 * because something has gone wrong. 1504 */ 1505 search_start = *offset; 1506 search_bytes = *bytes; 1507 search_bytes = min(search_bytes, end - search_start + 1); 1508 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); 1509 BUG_ON(ret < 0 || search_start != *offset); 1510 1511 if (*offset > bitmap_info->offset && *offset + *bytes > end) { 1512 bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1); 1513 *bytes -= end - *offset + 1; 1514 *offset = end + 1; 1515 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { 1516 bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes); 1517 *bytes = 0; 1518 } 1519 1520 if (*bytes) { 1521 struct rb_node *next = rb_next(&bitmap_info->offset_index); 1522 if (!bitmap_info->bytes) 1523 free_bitmap(ctl, bitmap_info); 1524 1525 /* 1526 * no entry after this bitmap, but we still have bytes to 1527 * remove, so something has gone wrong. 1528 */ 1529 if (!next) 1530 return -EINVAL; 1531 1532 bitmap_info = rb_entry(next, struct btrfs_free_space, 1533 offset_index); 1534 1535 /* 1536 * if the next entry isn't a bitmap we need to return to let the 1537 * extent stuff do its work. 1538 */ 1539 if (!bitmap_info->bitmap) 1540 return -EAGAIN; 1541 1542 /* 1543 * Ok the next item is a bitmap, but it may not actually hold 1544 * the information for the rest of this free space stuff, so 1545 * look for it, and if we don't find it return so we can try 1546 * everything over again. 
		 */
		search_start = *offset;
		search_bytes = *bytes;
		ret = search_bitmap(ctl, bitmap_info, &search_start,
				    &search_bytes);
		if (ret < 0 || search_start != *offset)
			return -EAGAIN;

		goto again;
	} else if (!bitmap_info->bytes)
		free_bitmap(ctl, bitmap_info);

	return 0;
}

static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
			       struct btrfs_free_space *info, u64 offset,
			       u64 bytes)
{
	u64 bytes_to_set = 0;
	u64 end;

	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);

	bytes_to_set = min(end - offset, bytes);

	bitmap_set_bits(ctl, info, offset, bytes_to_set);

	return bytes_to_set;

}

static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
		       struct btrfs_free_space *info)
{
	struct btrfs_block_group_cache *block_group = ctl->private;

	/*
	 * If we are below the extents threshold then we can add this as an
	 * extent, and don't have to deal with the bitmap
	 */
	if (ctl->free_extents < ctl->extents_thresh) {
		/*
		 * If this block group has some small extents we don't want to
		 * use up all of our free slots in the cache with them, we want
		 * to reserve them for larger extents, however if we have plenty
		 * of cache left then go ahead and add them, no sense in adding
		 * the overhead of a bitmap if we don't have to.
		 */
		if (info->bytes <= block_group->sectorsize * 4) {
			if (ctl->free_extents * 2 <= ctl->extents_thresh)
				return false;
		} else {
			return false;
		}
	}

	/*
	 * some block groups are so tiny they can't be enveloped by a bitmap, so
	 * don't even bother to create a bitmap for this
	 */
	if (BITS_PER_BITMAP * block_group->sectorsize >
	    block_group->key.offset)
		return false;

	return true;
}

static struct btrfs_free_space_op free_space_op = {
	.recalc_thresholds = recalculate_thresholds,
	.use_bitmap = use_bitmap,
};

static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
			      struct btrfs_free_space *info)
{
	struct btrfs_free_space *bitmap_info;
	struct btrfs_block_group_cache *block_group = NULL;
	int added = 0;
	u64 bytes, offset, bytes_added;
	int ret;

	bytes = info->bytes;
	offset = info->offset;

	if (!ctl->op->use_bitmap(ctl, info))
		return 0;

	if (ctl->op == &free_space_op)
		block_group = ctl->private;
again:
	/*
	 * Since we link bitmaps right into the cluster we need to see if we
	 * have a cluster here, and if so and it has our bitmap we need to add
	 * the free space to that bitmap.
1642 */ 1643 if (block_group && !list_empty(&block_group->cluster_list)) { 1644 struct btrfs_free_cluster *cluster; 1645 struct rb_node *node; 1646 struct btrfs_free_space *entry; 1647 1648 cluster = list_entry(block_group->cluster_list.next, 1649 struct btrfs_free_cluster, 1650 block_group_list); 1651 spin_lock(&cluster->lock); 1652 node = rb_first(&cluster->root); 1653 if (!node) { 1654 spin_unlock(&cluster->lock); 1655 goto no_cluster_bitmap; 1656 } 1657 1658 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1659 if (!entry->bitmap) { 1660 spin_unlock(&cluster->lock); 1661 goto no_cluster_bitmap; 1662 } 1663 1664 if (entry->offset == offset_to_bitmap(ctl, offset)) { 1665 bytes_added = add_bytes_to_bitmap(ctl, entry, 1666 offset, bytes); 1667 bytes -= bytes_added; 1668 offset += bytes_added; 1669 } 1670 spin_unlock(&cluster->lock); 1671 if (!bytes) { 1672 ret = 1; 1673 goto out; 1674 } 1675 } 1676 1677 no_cluster_bitmap: 1678 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1679 1, 0); 1680 if (!bitmap_info) { 1681 BUG_ON(added); 1682 goto new_bitmap; 1683 } 1684 1685 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); 1686 bytes -= bytes_added; 1687 offset += bytes_added; 1688 added = 0; 1689 1690 if (!bytes) { 1691 ret = 1; 1692 goto out; 1693 } else 1694 goto again; 1695 1696 new_bitmap: 1697 if (info && info->bitmap) { 1698 add_new_bitmap(ctl, info, offset); 1699 added = 1; 1700 info = NULL; 1701 goto again; 1702 } else { 1703 spin_unlock(&ctl->tree_lock); 1704 1705 /* no pre-allocated info, allocate a new one */ 1706 if (!info) { 1707 info = kmem_cache_zalloc(btrfs_free_space_cachep, 1708 GFP_NOFS); 1709 if (!info) { 1710 spin_lock(&ctl->tree_lock); 1711 ret = -ENOMEM; 1712 goto out; 1713 } 1714 } 1715 1716 /* allocate the bitmap */ 1717 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 1718 spin_lock(&ctl->tree_lock); 1719 if (!info->bitmap) { 1720 ret = -ENOMEM; 1721 goto out; 1722 } 1723 goto again; 1724 } 1725 1726 out: 1727 if (info) { 1728 if (info->bitmap) 1729 kfree(info->bitmap); 1730 kmem_cache_free(btrfs_free_space_cachep, info); 1731 } 1732 1733 return ret; 1734 } 1735 1736 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, 1737 struct btrfs_free_space *info, bool update_stat) 1738 { 1739 struct btrfs_free_space *left_info; 1740 struct btrfs_free_space *right_info; 1741 bool merged = false; 1742 u64 offset = info->offset; 1743 u64 bytes = info->bytes; 1744 1745 /* 1746 * first we want to see if there is free space adjacent to the range we 1747 * are adding, if there is remove that struct and add a new one to 1748 * cover the entire range 1749 */ 1750 right_info = tree_search_offset(ctl, offset + bytes, 0, 0); 1751 if (right_info && rb_prev(&right_info->offset_index)) 1752 left_info = rb_entry(rb_prev(&right_info->offset_index), 1753 struct btrfs_free_space, offset_index); 1754 else 1755 left_info = tree_search_offset(ctl, offset - 1, 0, 0); 1756 1757 if (right_info && !right_info->bitmap) { 1758 if (update_stat) 1759 unlink_free_space(ctl, right_info); 1760 else 1761 __unlink_free_space(ctl, right_info); 1762 info->bytes += right_info->bytes; 1763 kmem_cache_free(btrfs_free_space_cachep, right_info); 1764 merged = true; 1765 } 1766 1767 if (left_info && !left_info->bitmap && 1768 left_info->offset + left_info->bytes == offset) { 1769 if (update_stat) 1770 unlink_free_space(ctl, left_info); 1771 else 1772 __unlink_free_space(ctl, left_info); 1773 info->offset = left_info->offset; 1774 info->bytes += 
left_info->bytes; 1775 kmem_cache_free(btrfs_free_space_cachep, left_info); 1776 merged = true; 1777 } 1778 1779 return merged; 1780 } 1781 1782 int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, 1783 u64 offset, u64 bytes) 1784 { 1785 struct btrfs_free_space *info; 1786 int ret = 0; 1787 1788 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); 1789 if (!info) 1790 return -ENOMEM; 1791 1792 info->offset = offset; 1793 info->bytes = bytes; 1794 1795 spin_lock(&ctl->tree_lock); 1796 1797 if (try_merge_free_space(ctl, info, true)) 1798 goto link; 1799 1800 /* 1801 * There was no extent directly to the left or right of this new 1802 * extent then we know we're going to have to allocate a new extent, so 1803 * before we do that see if we need to drop this into a bitmap 1804 */ 1805 ret = insert_into_bitmap(ctl, info); 1806 if (ret < 0) { 1807 goto out; 1808 } else if (ret) { 1809 ret = 0; 1810 goto out; 1811 } 1812 link: 1813 ret = link_free_space(ctl, info); 1814 if (ret) 1815 kmem_cache_free(btrfs_free_space_cachep, info); 1816 out: 1817 spin_unlock(&ctl->tree_lock); 1818 1819 if (ret) { 1820 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); 1821 BUG_ON(ret == -EEXIST); 1822 } 1823 1824 return ret; 1825 } 1826 1827 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 1828 u64 offset, u64 bytes) 1829 { 1830 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1831 struct btrfs_free_space *info; 1832 struct btrfs_free_space *next_info = NULL; 1833 int ret = 0; 1834 1835 spin_lock(&ctl->tree_lock); 1836 1837 again: 1838 info = tree_search_offset(ctl, offset, 0, 0); 1839 if (!info) { 1840 /* 1841 * oops didn't find an extent that matched the space we wanted 1842 * to remove, look for a bitmap instead 1843 */ 1844 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1845 1, 0); 1846 if (!info) { 1847 WARN_ON(1); 1848 goto out_lock; 1849 } 1850 } 1851 1852 if (info->bytes < bytes && rb_next(&info->offset_index)) { 1853 u64 end; 1854 next_info = rb_entry(rb_next(&info->offset_index), 1855 struct btrfs_free_space, 1856 offset_index); 1857 1858 if (next_info->bitmap) 1859 end = next_info->offset + 1860 BITS_PER_BITMAP * ctl->unit - 1; 1861 else 1862 end = next_info->offset + next_info->bytes; 1863 1864 if (next_info->bytes < bytes || 1865 next_info->offset > offset || offset > end) { 1866 printk(KERN_CRIT "Found free space at %llu, size %llu," 1867 " trying to use %llu\n", 1868 (unsigned long long)info->offset, 1869 (unsigned long long)info->bytes, 1870 (unsigned long long)bytes); 1871 WARN_ON(1); 1872 ret = -EINVAL; 1873 goto out_lock; 1874 } 1875 1876 info = next_info; 1877 } 1878 1879 if (info->bytes == bytes) { 1880 unlink_free_space(ctl, info); 1881 if (info->bitmap) { 1882 kfree(info->bitmap); 1883 ctl->total_bitmaps--; 1884 } 1885 kmem_cache_free(btrfs_free_space_cachep, info); 1886 ret = 0; 1887 goto out_lock; 1888 } 1889 1890 if (!info->bitmap && info->offset == offset) { 1891 unlink_free_space(ctl, info); 1892 info->offset += bytes; 1893 info->bytes -= bytes; 1894 ret = link_free_space(ctl, info); 1895 WARN_ON(ret); 1896 goto out_lock; 1897 } 1898 1899 if (!info->bitmap && info->offset <= offset && 1900 info->offset + info->bytes >= offset + bytes) { 1901 u64 old_start = info->offset; 1902 /* 1903 * we're freeing space in the middle of the info, 1904 * this can happen during tree log replay 1905 * 1906 * first unlink the old info and then 1907 * insert it again after the hole we're creating 1908 */ 1909 
unlink_free_space(ctl, info); 1910 if (offset + bytes < info->offset + info->bytes) { 1911 u64 old_end = info->offset + info->bytes; 1912 1913 info->offset = offset + bytes; 1914 info->bytes = old_end - info->offset; 1915 ret = link_free_space(ctl, info); 1916 WARN_ON(ret); 1917 if (ret) 1918 goto out_lock; 1919 } else { 1920 /* the hole we're creating ends at the end 1921 * of the info struct, just free the info 1922 */ 1923 kmem_cache_free(btrfs_free_space_cachep, info); 1924 } 1925 spin_unlock(&ctl->tree_lock); 1926 1927 /* step two, insert a new info struct to cover 1928 * anything before the hole 1929 */ 1930 ret = btrfs_add_free_space(block_group, old_start, 1931 offset - old_start); 1932 WARN_ON(ret); 1933 goto out; 1934 } 1935 1936 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 1937 if (ret == -EAGAIN) 1938 goto again; 1939 BUG_ON(ret); 1940 out_lock: 1941 spin_unlock(&ctl->tree_lock); 1942 out: 1943 return ret; 1944 } 1945 1946 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 1947 u64 bytes) 1948 { 1949 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1950 struct btrfs_free_space *info; 1951 struct rb_node *n; 1952 int count = 0; 1953 1954 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { 1955 info = rb_entry(n, struct btrfs_free_space, offset_index); 1956 if (info->bytes >= bytes) 1957 count++; 1958 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", 1959 (unsigned long long)info->offset, 1960 (unsigned long long)info->bytes, 1961 (info->bitmap) ? "yes" : "no"); 1962 } 1963 printk(KERN_INFO "block group has cluster?: %s\n", 1964 list_empty(&block_group->cluster_list) ? "no" : "yes"); 1965 printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" 1966 "\n", count); 1967 } 1968 1969 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) 1970 { 1971 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1972 1973 spin_lock_init(&ctl->tree_lock); 1974 ctl->unit = block_group->sectorsize; 1975 ctl->start = block_group->key.objectid; 1976 ctl->private = block_group; 1977 ctl->op = &free_space_op; 1978 1979 /* 1980 * we only want to have 32k of ram per block group for keeping 1981 * track of free space, and if we pass 1/2 of that we want to 1982 * start converting things over to using bitmaps 1983 */ 1984 ctl->extents_thresh = ((1024 * 32) / 2) / 1985 sizeof(struct btrfs_free_space); 1986 } 1987 1988 /* 1989 * for a given cluster, put all of its extents back into the free 1990 * space cache. If the block group passed doesn't match the block group 1991 * pointed to by the cluster, someone else raced in and freed the 1992 * cluster already. 
In that case, we just return without changing anything 1993 */ 1994 static int 1995 __btrfs_return_cluster_to_free_space( 1996 struct btrfs_block_group_cache *block_group, 1997 struct btrfs_free_cluster *cluster) 1998 { 1999 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2000 struct btrfs_free_space *entry; 2001 struct rb_node *node; 2002 2003 spin_lock(&cluster->lock); 2004 if (cluster->block_group != block_group) 2005 goto out; 2006 2007 cluster->block_group = NULL; 2008 cluster->window_start = 0; 2009 list_del_init(&cluster->block_group_list); 2010 2011 node = rb_first(&cluster->root); 2012 while (node) { 2013 bool bitmap; 2014 2015 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2016 node = rb_next(&entry->offset_index); 2017 rb_erase(&entry->offset_index, &cluster->root); 2018 2019 bitmap = (entry->bitmap != NULL); 2020 if (!bitmap) 2021 try_merge_free_space(ctl, entry, false); 2022 tree_insert_offset(&ctl->free_space_offset, 2023 entry->offset, &entry->offset_index, bitmap); 2024 } 2025 cluster->root = RB_ROOT; 2026 2027 out: 2028 spin_unlock(&cluster->lock); 2029 btrfs_put_block_group(block_group); 2030 return 0; 2031 } 2032 2033 void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl) 2034 { 2035 struct btrfs_free_space *info; 2036 struct rb_node *node; 2037 2038 while ((node = rb_last(&ctl->free_space_offset)) != NULL) { 2039 info = rb_entry(node, struct btrfs_free_space, offset_index); 2040 if (!info->bitmap) { 2041 unlink_free_space(ctl, info); 2042 kmem_cache_free(btrfs_free_space_cachep, info); 2043 } else { 2044 free_bitmap(ctl, info); 2045 } 2046 if (need_resched()) { 2047 spin_unlock(&ctl->tree_lock); 2048 cond_resched(); 2049 spin_lock(&ctl->tree_lock); 2050 } 2051 } 2052 } 2053 2054 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) 2055 { 2056 spin_lock(&ctl->tree_lock); 2057 __btrfs_remove_free_space_cache_locked(ctl); 2058 spin_unlock(&ctl->tree_lock); 2059 } 2060 2061 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 2062 { 2063 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2064 struct btrfs_free_cluster *cluster; 2065 struct list_head *head; 2066 2067 spin_lock(&ctl->tree_lock); 2068 while ((head = block_group->cluster_list.next) != 2069 &block_group->cluster_list) { 2070 cluster = list_entry(head, struct btrfs_free_cluster, 2071 block_group_list); 2072 2073 WARN_ON(cluster->block_group != block_group); 2074 __btrfs_return_cluster_to_free_space(block_group, cluster); 2075 if (need_resched()) { 2076 spin_unlock(&ctl->tree_lock); 2077 cond_resched(); 2078 spin_lock(&ctl->tree_lock); 2079 } 2080 } 2081 __btrfs_remove_free_space_cache_locked(ctl); 2082 spin_unlock(&ctl->tree_lock); 2083 2084 } 2085 2086 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, 2087 u64 offset, u64 bytes, u64 empty_size) 2088 { 2089 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2090 struct btrfs_free_space *entry = NULL; 2091 u64 bytes_search = bytes + empty_size; 2092 u64 ret = 0; 2093 2094 spin_lock(&ctl->tree_lock); 2095 entry = find_free_space(ctl, &offset, &bytes_search); 2096 if (!entry) 2097 goto out; 2098 2099 ret = offset; 2100 if (entry->bitmap) { 2101 bitmap_clear_bits(ctl, entry, offset, bytes); 2102 if (!entry->bytes) 2103 free_bitmap(ctl, entry); 2104 } else { 2105 unlink_free_space(ctl, entry); 2106 entry->offset += bytes; 2107 entry->bytes -= bytes; 2108 if (!entry->bytes) 2109 kmem_cache_free(btrfs_free_space_cachep, 

u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
			       u64 offset, u64 bytes, u64 empty_size)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry = NULL;
	u64 bytes_search = bytes + empty_size;
	u64 ret = 0;

	spin_lock(&ctl->tree_lock);
	entry = find_free_space(ctl, &offset, &bytes_search);
	if (!entry)
		goto out;

	ret = offset;
	if (entry->bitmap) {
		bitmap_clear_bits(ctl, entry, offset, bytes);
		if (!entry->bytes)
			free_bitmap(ctl, entry);
	} else {
		unlink_free_space(ctl, entry);
		entry->offset += bytes;
		entry->bytes -= bytes;
		if (!entry->bytes)
			kmem_cache_free(btrfs_free_space_cachep, entry);
		else
			link_free_space(ctl, entry);
	}

out:
	spin_unlock(&ctl->tree_lock);

	return ret;
}
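
/*
 * A hedged sketch of how a caller might drive btrfs_find_space_for_alloc()
 * (simplified; the real caller is the extent allocator and carries much more
 * state).  A return value of 0 means nothing suitable was found; otherwise
 * the returned logical offset has already been removed from the free space
 * cache, so a caller that ends up not using it is expected to give it back:
 *
 *	u64 start;
 *
 *	start = btrfs_find_space_for_alloc(block_group, search_start,
 *					   num_bytes, empty_size);
 *	if (!start) {
 *		// nothing big enough here, move on to another block group
 *	} else {
 *		// use [start, start + num_bytes); on a later failure return
 *		// the range with
 *		// btrfs_add_free_space(block_group, start, num_bytes);
 *	}
 */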

/*
 * given a cluster, put all of its extents back into the free space
 * cache.  If a block group is passed, this function will only free
 * a cluster that belongs to the passed block group.
 *
 * Otherwise, it'll get a reference on the block group pointed to by the
 * cluster and remove the cluster from it.
 */
int btrfs_return_cluster_to_free_space(
			       struct btrfs_block_group_cache *block_group,
			       struct btrfs_free_cluster *cluster)
{
	struct btrfs_free_space_ctl *ctl;
	int ret;

	/* first, get a safe pointer to the block group */
	spin_lock(&cluster->lock);
	if (!block_group) {
		block_group = cluster->block_group;
		if (!block_group) {
			spin_unlock(&cluster->lock);
			return 0;
		}
	} else if (cluster->block_group != block_group) {
		/* someone else has already freed it, don't redo their work */
		spin_unlock(&cluster->lock);
		return 0;
	}
	atomic_inc(&block_group->count);
	spin_unlock(&cluster->lock);

	ctl = block_group->free_space_ctl;

	/* now return any extents the cluster had on it */
	spin_lock(&ctl->tree_lock);
	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&ctl->tree_lock);

	/* finally drop our ref */
	btrfs_put_block_group(block_group);
	return ret;
}

static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
				   struct btrfs_free_cluster *cluster,
				   struct btrfs_free_space *entry,
				   u64 bytes, u64 min_start)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	int err;
	u64 search_start = cluster->window_start;
	u64 search_bytes = bytes;
	u64 ret = 0;

	search_start = min_start;
	search_bytes = bytes;

	err = search_bitmap(ctl, entry, &search_start, &search_bytes);
	if (err)
		return 0;

	ret = search_start;
	__bitmap_clear_bits(ctl, entry, ret, bytes);

	return ret;
}

/*
 * given a cluster, try to allocate 'bytes' from it, returns 0
 * if it couldn't find anything suitably large, or a logical disk offset
 * if things worked out
 */
u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster, u64 bytes,
			     u64 min_start)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry = NULL;
	struct rb_node *node;
	u64 ret = 0;

	spin_lock(&cluster->lock);
	if (bytes > cluster->max_size)
		goto out;

	if (cluster->block_group != block_group)
		goto out;

	node = rb_first(&cluster->root);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_free_space, offset_index);
	while (1) {
		if (entry->bytes < bytes ||
		    (!entry->bitmap && entry->offset < min_start)) {
			node = rb_next(&entry->offset_index);
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
			continue;
		}

		if (entry->bitmap) {
			ret = btrfs_alloc_from_bitmap(block_group,
						      cluster, entry, bytes,
						      min_start);
			if (ret == 0) {
				node = rb_next(&entry->offset_index);
				if (!node)
					break;
				entry = rb_entry(node, struct btrfs_free_space,
						 offset_index);
				continue;
			}
		} else {
			ret = entry->offset;

			entry->offset += bytes;
			entry->bytes -= bytes;
		}

		if (entry->bytes == 0)
			rb_erase(&entry->offset_index, &cluster->root);
		break;
	}
out:
	spin_unlock(&cluster->lock);

	if (!ret)
		return 0;

	spin_lock(&ctl->tree_lock);

	ctl->free_space -= bytes;
	if (entry->bytes == 0) {
		ctl->free_extents--;
		if (entry->bitmap) {
			kfree(entry->bitmap);
			ctl->total_bitmaps--;
			ctl->op->recalc_thresholds(ctl);
		}
		kmem_cache_free(btrfs_free_space_cachep, entry);
	}

	spin_unlock(&ctl->tree_lock);

	return ret;
}
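
/*
 * Rough sketch of the cluster life cycle as seen from a caller (simplified,
 * error handling and retries omitted; the real driver of this API is the
 * extent allocator).  The cluster's refill_lock serializes setup and
 * allocation:
 *
 *	spin_lock(&cluster->refill_lock);
 *	ret = btrfs_find_space_cluster(trans, root, block_group, cluster,
 *				       search_start, num_bytes, empty_size);
 *	if (!ret)
 *		offset = btrfs_alloc_from_cluster(block_group, cluster,
 *						  num_bytes, search_start);
 *	spin_unlock(&cluster->refill_lock);
 *
 *	// once the block group is abandoned or the cluster stops helping:
 *	btrfs_return_cluster_to_free_space(block_group, cluster);
 */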

static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
				struct btrfs_free_space *entry,
				struct btrfs_free_cluster *cluster,
				u64 offset, u64 bytes, u64 min_bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	unsigned long next_zero;
	unsigned long i;
	unsigned long search_bits;
	unsigned long total_bits;
	unsigned long found_bits;
	unsigned long start = 0;
	unsigned long total_found = 0;
	int ret;
	bool found = false;

	i = offset_to_bit(entry->offset, block_group->sectorsize,
			  max_t(u64, offset, entry->offset));
	search_bits = bytes_to_bits(bytes, block_group->sectorsize);
	total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);

again:
	found_bits = 0;
	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
	     i < BITS_PER_BITMAP;
	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
		next_zero = find_next_zero_bit(entry->bitmap,
					       BITS_PER_BITMAP, i);
		if (next_zero - i >= search_bits) {
			found_bits = next_zero - i;
			break;
		}
		i = next_zero;
	}

	if (!found_bits)
		return -ENOSPC;

	if (!found) {
		start = i;
		found = true;
	}

	total_found += found_bits;

	if (cluster->max_size < found_bits * block_group->sectorsize)
		cluster->max_size = found_bits * block_group->sectorsize;

	if (total_found < total_bits) {
		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
		if (i - start > total_bits * 2) {
			total_found = 0;
			cluster->max_size = 0;
			found = false;
		}
		goto again;
	}

	cluster->window_start = start * block_group->sectorsize +
		entry->offset;
	rb_erase(&entry->offset_index, &ctl->free_space_offset);
	ret = tree_insert_offset(&cluster->root, entry->offset,
				 &entry->offset_index, 1);
	BUG_ON(ret);

	return 0;
}
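
/*
 * Worked example of the bit arithmetic above, assuming a 4KiB sectorsize and
 * 4KiB pages (so BITS_PER_BITMAP is 4096 * 8 = 32768): one bitmap entry then
 * covers 32768 * 4096 bytes = 128MiB of the block group.  Asking the cluster
 * for bytes = 1MiB gives search_bits = 1MiB / 4KiB = 256, i.e. a run of 256
 * consecutive set bits is needed, and min_bytes = 4MiB gives total_bits =
 * 1024, the total that must be collected before the window is accepted.
 */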

/*
 * This searches the block group for just extents to fill the cluster with.
 */
static noinline int
setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
			struct btrfs_free_cluster *cluster,
			struct list_head *bitmaps, u64 offset, u64 bytes,
			u64 min_bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *first = NULL;
	struct btrfs_free_space *entry = NULL;
	struct btrfs_free_space *prev = NULL;
	struct btrfs_free_space *last;
	struct rb_node *node;
	u64 window_start;
	u64 window_free;
	u64 max_extent;
	u64 max_gap = 128 * 1024;

	entry = tree_search_offset(ctl, offset, 0, 1);
	if (!entry)
		return -ENOSPC;

	/*
	 * We don't want bitmaps, so just move along until we find a normal
	 * extent entry.
	 */
	while (entry->bitmap) {
		if (list_empty(&entry->list))
			list_add_tail(&entry->list, bitmaps);
		node = rb_next(&entry->offset_index);
		if (!node)
			return -ENOSPC;
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
	}

	window_start = entry->offset;
	window_free = entry->bytes;
	max_extent = entry->bytes;
	first = entry;
	last = entry;
	prev = entry;

	while (window_free <= min_bytes) {
		node = rb_next(&entry->offset_index);
		if (!node)
			return -ENOSPC;
		entry = rb_entry(node, struct btrfs_free_space, offset_index);

		if (entry->bitmap) {
			if (list_empty(&entry->list))
				list_add_tail(&entry->list, bitmaps);
			continue;
		}

		/*
		 * we haven't filled the empty size and the window is
		 * very large.  reset and try again
		 */
		if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
		    entry->offset - window_start > (min_bytes * 2)) {
			first = entry;
			window_start = entry->offset;
			window_free = entry->bytes;
			last = entry;
			max_extent = entry->bytes;
		} else {
			last = entry;
			window_free += entry->bytes;
			if (entry->bytes > max_extent)
				max_extent = entry->bytes;
		}
		prev = entry;
	}

	cluster->window_start = first->offset;

	node = &first->offset_index;

	/*
	 * now we've found our entries, pull them out of the free space
	 * cache and put them into the cluster rbtree
	 */
	do {
		int ret;

		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		if (entry->bitmap)
			continue;

		rb_erase(&entry->offset_index, &ctl->free_space_offset);
		ret = tree_insert_offset(&cluster->root, entry->offset,
					 &entry->offset_index, 0);
		BUG_ON(ret);
	} while (node && entry != last);

	cluster->max_size = max_extent;

	return 0;
}
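
/*
 * A small numeric illustration of the window reset above (assumed numbers,
 * purely for explanation): with min_bytes = 1MiB, a window that starts at a
 * free extent at offset X keeps absorbing following extents as long as each
 * gap from the end of the previous extent is at most max_gap (128KiB) and
 * the window has not grown past 2 * min_bytes = 2MiB from window_start.  If
 * the next free extent only begins 300KiB after the previous one ends, the
 * window is thrown away and restarted at that extent rather than being
 * padded out with a huge hole.
 */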

/*
 * This specifically looks for bitmaps that may work in the cluster; we assume
 * that we have already failed to find extents that will work.
 */
static noinline int
setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
		     struct btrfs_free_cluster *cluster,
		     struct list_head *bitmaps, u64 offset, u64 bytes,
		     u64 min_bytes)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry;
	struct rb_node *node;
	int ret = -ENOSPC;

	if (ctl->total_bitmaps == 0)
		return -ENOSPC;

	/*
	 * First check our cached list of bitmaps and see if there is an entry
	 * here that will work.
	 */
	list_for_each_entry(entry, bitmaps, list) {
		if (entry->bytes < min_bytes)
			continue;
		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
					   bytes, min_bytes);
		if (!ret)
			return 0;
	}

	/*
	 * If we do have entries on our list and we are here, then we didn't
	 * find anything, so go ahead and get the next entry after the last
	 * entry in this list and start the search from there.
	 */
	if (!list_empty(bitmaps)) {
		entry = list_entry(bitmaps->prev, struct btrfs_free_space,
				   list);
		node = rb_next(&entry->offset_index);
		if (!node)
			return -ENOSPC;
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		goto search;
	}

	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
	if (!entry)
		return -ENOSPC;

search:
	node = &entry->offset_index;
	do {
		entry = rb_entry(node, struct btrfs_free_space, offset_index);
		node = rb_next(&entry->offset_index);
		if (!entry->bitmap)
			continue;
		if (entry->bytes < min_bytes)
			continue;
		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
					   bytes, min_bytes);
	} while (ret && node);

	return ret;
}

/*
 * here we try to find a cluster of blocks in a block group.  The goal
 * is to find at least bytes free and up to empty_size + bytes free.
 * We might not find them all in one contiguous area.
 *
 * returns zero and sets up cluster if things worked out, otherwise
 * it returns -ENOSPC
 */
int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_block_group_cache *block_group,
			     struct btrfs_free_cluster *cluster,
			     u64 offset, u64 bytes, u64 empty_size)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct list_head bitmaps;
	struct btrfs_free_space *entry, *tmp;
	u64 min_bytes;
	int ret;

	/* for metadata, allow allocations with more holes */
	if (btrfs_test_opt(root, SSD_SPREAD)) {
		min_bytes = bytes + empty_size;
	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/*
		 * we want to do larger allocations when we are
		 * flushing out the delayed refs, it helps prevent
		 * making more work as we go along.
		 */
		if (trans->transaction->delayed_refs.flushing)
			min_bytes = max(bytes, (bytes + empty_size) >> 1);
		else
			min_bytes = max(bytes, (bytes + empty_size) >> 4);
	} else
		min_bytes = max(bytes, (bytes + empty_size) >> 2);

	spin_lock(&ctl->tree_lock);

	/*
	 * If we know we don't have enough space to make a cluster don't even
	 * bother doing all the work to try and find one.
	 */
	if (ctl->free_space < min_bytes) {
		spin_unlock(&ctl->tree_lock);
		return -ENOSPC;
	}

	spin_lock(&cluster->lock);

	/* someone already found a cluster, hooray */
	if (cluster->block_group) {
		ret = 0;
		goto out;
	}

	INIT_LIST_HEAD(&bitmaps);
	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
				      bytes, min_bytes);
	if (ret)
		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
					   offset, bytes, min_bytes);

	/* Clear our temporary list */
	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
		list_del_init(&entry->list);

	if (!ret) {
		atomic_inc(&block_group->count);
		list_add_tail(&cluster->block_group_list,
			      &block_group->cluster_list);
		cluster->block_group = block_group;
	}
out:
	spin_unlock(&cluster->lock);
	spin_unlock(&ctl->tree_lock);

	return ret;
}
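
/*
 * Example of the min_bytes policy above (assumed numbers, for illustration
 * only): with bytes = 256KiB and empty_size = 768KiB, bytes + empty_size is
 * 1MiB, so ssd_spread demands the full 1MiB, a metadata block group demands
 * 512KiB while delayed refs are flushing (>> 1) and only 256KiB otherwise
 * (the >> 4 result of 64KiB is below bytes, so max() keeps 256KiB), and a
 * data block group demands 256KiB (>> 2 gives exactly 256KiB).
 */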

/*
 * simple code to zero out a cluster
 */
void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
{
	spin_lock_init(&cluster->lock);
	spin_lock_init(&cluster->refill_lock);
	cluster->root = RB_ROOT;
	cluster->max_size = 0;
	INIT_LIST_HEAD(&cluster->block_group_list);
	cluster->block_group = NULL;
}

int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
			   u64 *trimmed, u64 start, u64 end, u64 minlen)
{
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	struct btrfs_free_space *entry = NULL;
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 bytes = 0;
	u64 actually_trimmed;
	int ret = 0;

	*trimmed = 0;

	while (start < end) {
		spin_lock(&ctl->tree_lock);

		if (ctl->free_space < minlen) {
			spin_unlock(&ctl->tree_lock);
			break;
		}

		entry = tree_search_offset(ctl, start, 0, 1);
		if (!entry)
			entry = tree_search_offset(ctl,
						   offset_to_bitmap(ctl, start),
						   1, 1);

		if (!entry || entry->offset >= end) {
			spin_unlock(&ctl->tree_lock);
			break;
		}

		if (entry->bitmap) {
			ret = search_bitmap(ctl, entry, &start, &bytes);
			if (!ret) {
				if (start >= end) {
					spin_unlock(&ctl->tree_lock);
					break;
				}
				bytes = min(bytes, end - start);
				bitmap_clear_bits(ctl, entry, start, bytes);
				if (entry->bytes == 0)
					free_bitmap(ctl, entry);
			} else {
				start = entry->offset + BITS_PER_BITMAP *
					block_group->sectorsize;
				spin_unlock(&ctl->tree_lock);
				ret = 0;
				continue;
			}
		} else {
			start = entry->offset;
			bytes = min(entry->bytes, end - start);
			unlink_free_space(ctl, entry);
			kmem_cache_free(btrfs_free_space_cachep, entry);
		}

		spin_unlock(&ctl->tree_lock);

		if (bytes >= minlen) {
			struct btrfs_space_info *space_info;
			int update = 0;

			space_info = block_group->space_info;
			spin_lock(&space_info->lock);
			spin_lock(&block_group->lock);
			if (!block_group->ro) {
				block_group->reserved += bytes;
				space_info->bytes_reserved += bytes;
				update = 1;
			}
			spin_unlock(&block_group->lock);
			spin_unlock(&space_info->lock);

			ret = btrfs_error_discard_extent(fs_info->extent_root,
							 start,
							 bytes,
							 &actually_trimmed);

			btrfs_add_free_space(block_group, start, bytes);
			if (update) {
				spin_lock(&space_info->lock);
				spin_lock(&block_group->lock);
				if (block_group->ro)
					space_info->bytes_readonly += bytes;
				block_group->reserved -= bytes;
				space_info->bytes_reserved -= bytes;
				spin_unlock(&space_info->lock);
				spin_unlock(&block_group->lock);
			}

			if (ret)
				break;
			*trimmed += actually_trimmed;
		}
		start += bytes;
		bytes = 0;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}

/*
 * Find the left-most item in the cache tree, and then return the
 * smallest inode number in the item.
 *
 * Note: the returned inode number may not be the smallest one in
 * the tree, if the left-most item is a bitmap.
 */
u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
{
	struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
	struct btrfs_free_space *entry = NULL;
	u64 ino = 0;

	spin_lock(&ctl->tree_lock);

	if (RB_EMPTY_ROOT(&ctl->free_space_offset))
		goto out;

	entry = rb_entry(rb_first(&ctl->free_space_offset),
			 struct btrfs_free_space, offset_index);

	if (!entry->bitmap) {
		ino = entry->offset;

		unlink_free_space(ctl, entry);
		entry->offset++;
		entry->bytes--;
		if (!entry->bytes)
			kmem_cache_free(btrfs_free_space_cachep, entry);
		else
			link_free_space(ctl, entry);
	} else {
		u64 offset = 0;
		u64 count = 1;
		int ret;

		ret = search_bitmap(ctl, entry, &offset, &count);
		BUG_ON(ret);

		ino = offset;
		bitmap_clear_bits(ctl, entry, offset, 1);
		if (entry->bytes == 0)
			free_bitmap(ctl, entry);
	}
out:
	spin_unlock(&ctl->tree_lock);

	return ino;
}
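
/*
 * Hedged sketch of how the free-ino side of this cache is consumed
 * (simplified; the real consumer is the inode map code, and the fallback
 * path shown here is an assumption for illustration): a caller first tries
 * to reuse a number cached above and only falls back to scanning for a
 * brand new objectid when the cache has nothing to offer:
 *
 *	u64 ino;
 *
 *	ino = btrfs_find_ino_for_alloc(root);
 *	if (!ino) {
 *		// cache empty: fall back to finding an unused objectid
 *		ret = btrfs_find_free_objectid(root, &ino);
 *		if (ret)
 *			return ret;
 *	}
 */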

struct inode *lookup_free_ino_inode(struct btrfs_root *root,
				    struct btrfs_path *path)
{
	struct inode *inode = NULL;

	spin_lock(&root->cache_lock);
	if (root->cache_inode)
		inode = igrab(root->cache_inode);
	spin_unlock(&root->cache_lock);
	if (inode)
		return inode;

	inode = __lookup_free_space_inode(root, path, 0);
	if (IS_ERR(inode))
		return inode;

	spin_lock(&root->cache_lock);
	if (!btrfs_fs_closing(root->fs_info))
		root->cache_inode = igrab(inode);
	spin_unlock(&root->cache_lock);

	return inode;
}

int create_free_ino_inode(struct btrfs_root *root,
			  struct btrfs_trans_handle *trans,
			  struct btrfs_path *path)
{
	return __create_free_space_inode(root, trans, path,
					 BTRFS_FREE_INO_OBJECTID, 0);
}

int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct btrfs_path *path;
	struct inode *inode;
	int ret = 0;
	u64 root_gen = btrfs_root_generation(&root->root_item);

	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
		return 0;

	/*
	 * If we're unmounting then just return, since this does a search on
	 * the normal root and not the commit root and we could deadlock.
	 */
	if (btrfs_fs_closing(fs_info))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return 0;

	inode = lookup_free_ino_inode(root, path);
	if (IS_ERR(inode))
		goto out;

	if (root_gen != BTRFS_I(inode)->generation)
		goto out_put;

	ret = __load_free_space_cache(root, inode, ctl, path, 0);

	if (ret < 0)
		printk(KERN_ERR "btrfs: failed to load free ino cache for "
		       "root %llu\n", root->root_key.objectid);
out_put:
	iput(inode);
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_write_out_ino_cache(struct btrfs_root *root,
			      struct btrfs_trans_handle *trans,
			      struct btrfs_path *path)
{
	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
	struct inode *inode;
	int ret;

	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
		return 0;

	inode = lookup_free_ino_inode(root, path);
	if (IS_ERR(inode))
		return 0;

	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
	if (ret) {
		btrfs_delalloc_release_metadata(inode, inode->i_size);
#ifdef DEBUG
		printk(KERN_ERR "btrfs: failed to write free ino cache "
		       "for root %llu\n", root->root_key.objectid);
#endif
	}

	iput(inode);
	return ret;
}