/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

static u64 entry_end(struct btrfs_ordered_extent *entry)
{
	if (entry->file_offset + entry->len < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->len;
}

/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static void ordered_data_tree_panic(struct inode *inode, int errno,
				    u64 offset)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
		    "%llu\n", (unsigned long long)offset);
}

/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

/*
 * helper to check if a given offset is inside a given entry
 */
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->len <= file_offset)
		return 0;
	return 1;
}

/*
 * find the first ordered struct that contains this offset, otherwise
 * the first one less than this offset
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
{
	struct rb_root *root = &tree->tree;
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		tree->last = ret;
	return ret;
}

/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int dio, int compress_type)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;

	tree = &BTRFS_I(inode)->ordered_tree;
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
	if (!entry)
		return -ENOMEM;

	entry->file_offset = file_offset;
	entry->start = start;
	entry->len = len;
	entry->disk_len = disk_len;
	entry->bytes_left = len;
	entry->inode = igrab(inode);
	entry->compress_type = compress_type;
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
		set_bit(type, &entry->flags);

	if (dio)
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

	/* one ref for the tree */
	atomic_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->root_extent_list);
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);

	trace_btrfs_ordered_extent_add(inode, entry);

	spin_lock_irq(&tree->lock);
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
	if (node)
		ordered_data_tree_panic(inode, -EEXIST, file_offset);
	spin_unlock_irq(&tree->lock);

	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);

	return 0;
}

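/*
 * The wrappers below all map onto __btrfs_add_ordered_extent() and only
 * differ in the dio flag and the compression type they record on the new
 * ordered extent.
 */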
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 1,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  compress_type);
}

/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiple ordered extents.
 */
void btrfs_add_ordered_sum(struct inode *inode,
			   struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
{
	struct btrfs_ordered_inode_tree *tree;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	list_add_tail(&sum->list, &entry->list);
	spin_unlock_irq(&tree->lock);
}

/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO may span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete.  This allows you to walk forward in the file.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 *file_offset, u64 io_size, int uptodate)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
	unsigned long flags;
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size, entry->file_offset +
		      entry->len);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
		printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
		       (unsigned long long)dec_start,
		       (unsigned long long)dec_end);
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
		       (unsigned long long)entry->bytes_left,
		       (unsigned long long)to_dec);
	}
	entry->bytes_left -= to_dec;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0)
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
	else
		ret = 1;
out:
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return ret == 0;
}

/*
 * this is used to account for finished IO across a given range
 * of the file.  The IO should not span ordered extents.  If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 */
int btrfs_dec_test_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 file_offset, u64 io_size, int uptodate)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

	node = tree_search(tree, file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
	}

	if (io_size > entry->bytes_left) {
		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
		       (unsigned long long)entry->bytes_left,
		       (unsigned long long)io_size);
	}
	entry->bytes_left -= io_size;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0)
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
	else
		ret = 1;
out:
	if (!ret && cached && entry) {
		*cached = entry;
		atomic_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return ret == 0;
}

/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

	trace_btrfs_ordered_extent_put(entry->inode, entry);

	if (atomic_dec_and_test(&entry->refs)) {
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
		while (!list_empty(&entry->list)) {
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kfree(sum);
		}
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
	}
}

/*
 * remove an ordered extent from the tree.  No references are dropped,
 * but waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
{
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct rb_node *node;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = &entry->rb_node;
	rb_erase(node, &tree->tree);
	tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
	spin_unlock_irq(&tree->lock);

	spin_lock(&root->fs_info->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);

	trace_btrfs_ordered_extent_remove(inode, entry);

	/*
	 * we have no more ordered extents for this inode and
	 * no dirty pages.  We can safely remove it from the
	 * list of ordered extents
	 */
	if (RB_EMPTY_ROOT(&tree->tree) &&
	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
		list_del_init(&BTRFS_I(inode)->ordered_operations);
	}
	spin_unlock(&root->fs_info->ordered_extent_lock);
	wake_up(&entry->wait);
}

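/*
 * worker callback queued by btrfs_wait_ordered_extents().  It starts and
 * waits for a single ordered extent, then signals its completion so the
 * waiter can continue.
 */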
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
	complete(&ordered->completion);
}

/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
{
	struct list_head splice, works;
	struct list_head *cur;
	struct btrfs_ordered_extent *ordered, *next;
	struct inode *inode;

	INIT_LIST_HEAD(&splice);
	INIT_LIST_HEAD(&works);

	spin_lock(&root->fs_info->ordered_extent_lock);
	list_splice_init(&root->fs_info->ordered_extents, &splice);
	while (!list_empty(&splice)) {
		cur = splice.next;
		ordered = list_entry(cur, struct btrfs_ordered_extent,
				     root_extent_list);
		list_del_init(&ordered->root_extent_list);
		atomic_inc(&ordered->refs);

		/*
		 * the inode may be getting freed (in sys_unlink path).
		 */
		inode = igrab(ordered->inode);

		spin_unlock(&root->fs_info->ordered_extent_lock);

		if (inode) {
			ordered->flush_work.func = btrfs_run_ordered_extent_work;
			list_add_tail(&ordered->work_list, &works);
			btrfs_queue_worker(&root->fs_info->flush_workers,
					   &ordered->flush_work);
		} else {
			btrfs_put_ordered_extent(ordered);
		}

		cond_resched();
		spin_lock(&root->fs_info->ordered_extent_lock);
	}
	spin_unlock(&root->fs_info->ordered_extent_lock);

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);

		inode = ordered->inode;
		btrfs_put_ordered_extent(ordered);
		if (delay_iput)
			btrfs_add_delayed_iput(inode);
		else
			iput(inode);

		cond_resched();
	}
}

/*
 * this is used during transaction commit to write all the inodes
 * added to the ordered operation list.  These files must be fully on
 * disk before the transaction commits.
 *
 * we have two modes here, one is to just start the IO via filemap_flush
 * and the other is to wait for all the io.  When we wait, we have an
 * extra check to make sure the ordered operation list really is empty
 * before we return
 */
int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
{
	struct btrfs_inode *btrfs_inode;
	struct inode *inode;
	struct list_head splice;
	struct list_head works;
	struct btrfs_delalloc_work *work, *next;
	int ret = 0;

	INIT_LIST_HEAD(&splice);
	INIT_LIST_HEAD(&works);

	mutex_lock(&root->fs_info->ordered_operations_mutex);
	spin_lock(&root->fs_info->ordered_extent_lock);
again:
	list_splice_init(&root->fs_info->ordered_operations, &splice);

	while (!list_empty(&splice)) {

		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
					 ordered_operations);

		inode = &btrfs_inode->vfs_inode;

		list_del_init(&btrfs_inode->ordered_operations);

		/*
		 * the inode may be getting freed (in sys_unlink path).
		 */
		inode = igrab(inode);

		if (!wait && inode) {
			list_add_tail(&BTRFS_I(inode)->ordered_operations,
				      &root->fs_info->ordered_operations);
		}

		if (!inode)
			continue;
		spin_unlock(&root->fs_info->ordered_extent_lock);

		work = btrfs_alloc_delalloc_work(inode, wait, 1);
		if (!work) {
			if (list_empty(&BTRFS_I(inode)->ordered_operations))
				list_add_tail(&btrfs_inode->ordered_operations,
					      &splice);
			spin_lock(&root->fs_info->ordered_extent_lock);
			list_splice_tail(&splice,
					 &root->fs_info->ordered_operations);
			spin_unlock(&root->fs_info->ordered_extent_lock);
			ret = -ENOMEM;
			goto out;
		}
		list_add_tail(&work->list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers,
				   &work->work);

		cond_resched();
		spin_lock(&root->fs_info->ordered_extent_lock);
	}
	if (wait && !list_empty(&root->fs_info->ordered_operations))
		goto again;

	spin_unlock(&root->fs_info->ordered_extent_lock);
out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}
	mutex_unlock(&root->fs_info->ordered_operations_mutex);
	return ret;
}

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
				struct btrfs_ordered_extent *entry,
				int wait)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->len - 1;

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
	 * for the flusher thread to find them
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (wait) {
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
	}
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;

	if (start + len < start) {
		orig_end = INT_LIMIT(loff_t);
	} else {
		orig_end = start + len - 1;
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
	}

	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);

	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, setup an async extent, and immediately return
	 * with the entire range locked but with nobody actually marked with
	 * writeback.  So we can't just filemap_write_and_wait_range() and
	 * expect it to work since it will just kick off a thread to do the
	 * actual work.  So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there.  We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness.  Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags))
		filemap_fdatawrite_range(inode->i_mapping, start, orig_end);

	filemap_fdatawait_range(inode->i_mapping, start, orig_end);

	end = orig_end;
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->len < start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		btrfs_start_ordered_extent(inode, ordered, 1);
		end = ordered->file_offset;
		btrfs_put_ordered_extent(ordered);
		if (end == 0 || end == start)
			break;
		end--;
	}
}

/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		atomic_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
							u64 file_offset,
							u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		atomic_inc(&entry->refs);
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	atomic_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size.  i_size is updated to cover any fully written part of the file.
 */
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
				struct btrfs_ordered_extent *ordered)
{
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	u64 disk_i_size;
	u64 new_i_size;
	u64 i_size = i_size_read(inode);
	struct rb_node *node;
	struct rb_node *prev = NULL;
	struct btrfs_ordered_extent *test;
	int ret = 1;

	if (ordered)
		offset = entry_end(ordered);
	else
		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);

	spin_lock_irq(&tree->lock);
	disk_i_size = BTRFS_I(inode)->disk_i_size;

	/* truncate file */
	if (disk_i_size > i_size) {
		BTRFS_I(inode)->disk_i_size = i_size;
		ret = 0;
		goto out;
	}

	/*
	 * if the disk i_size is already at the inode->i_size, or
	 * this ordered extent is inside the disk i_size, we're done
	 */
	if (disk_i_size == i_size || offset <= disk_i_size) {
		goto out;
	}

	/*
	 * walk backward from this ordered extent to disk_i_size.
	 * if we find an ordered extent then we can't update disk i_size
	 * yet
	 */
	if (ordered) {
		node = rb_prev(&ordered->rb_node);
	} else {
		prev = tree_search(tree, offset);
		/*
		 * we insert file extents without involving ordered struct,
		 * so there should be no ordered struct covering this offset
		 */
		if (prev) {
			test = rb_entry(prev, struct btrfs_ordered_extent,
					rb_node);
			BUG_ON(offset_in_entry(test, offset));
		}
		node = prev;
	}
	for (; node; node = rb_prev(node)) {
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);

		/* We treat this entry as if it doesn't exist */
		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
			continue;
		if (test->file_offset + test->len <= disk_i_size)
			break;
		if (test->file_offset >= i_size)
			break;
		if (test->file_offset >= disk_i_size) {
			/*
			 * we don't update disk_i_size now, so record this
			 * pending i_size update.  Otherwise we will not know
			 * the real i_size.
			 */
			if (test->outstanding_isize < offset)
				test->outstanding_isize = offset;
			if (ordered &&
			    ordered->outstanding_isize >
			    test->outstanding_isize)
				test->outstanding_isize =
					ordered->outstanding_isize;
			goto out;
		}
	}
	new_i_size = min_t(u64, offset, i_size);

	/*
	 * Some ordered extents may have completed before the current one, and
	 * we hold the real i_size in ->outstanding_isize.
	 */
	if (ordered && ordered->outstanding_isize > new_i_size)
		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
	BTRFS_I(inode)->disk_i_size = new_i_size;
	ret = 0;
out:
	/*
	 * We need to do this because we can't remove ordered extents until
	 * after the disk_i_size has been updated and then the inode has been
	 * updated to reflect the change, so we need to tell anybody who finds
	 * this ordered extent that we've already done all the real work, we
	 * just haven't completed all the other work.
	 */
	if (ordered)
		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
	spin_unlock_irq(&tree->lock);
	return ret;
}

/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum.  This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
			   u32 *sum)
{
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_sector_sum *sector_sums;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
	int ret = 1;

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
		return 1;

	spin_lock_irq(&tree->lock);
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
		if (disk_bytenr >= ordered_sum->bytenr) {
			num_sectors = ordered_sum->len / sectorsize;
			sector_sums = ordered_sum->sums;
			for (i = 0; i < num_sectors; i++) {
				if (sector_sums[i].bytenr == disk_bytenr) {
					*sum = sector_sums[i].sum;
					ret = 0;
					goto out;
				}
			}
		}
	}
out:
	spin_unlock_irq(&tree->lock);
	btrfs_put_ordered_extent(ordered);
	return ret;
}

/*
 * add a given inode to the list of inodes that must be fully on
 * disk before a transaction commit finishes.
 *
 * This basically gives us the ext3 style data=ordered mode, and it is mostly
 * used to make sure renamed files are fully on disk.
 *
 * It is a noop if the inode is already fully on disk.
 *
 * If trans is not null, we'll do a friendly check for a transaction that
 * is already flushing things and force the IO down ourselves.
 */
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode)
{
	u64 last_mod;

	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);

	/*
	 * if this file hasn't been changed since the last transaction
	 * commit, we can safely return without doing anything
	 */
	if (last_mod < root->fs_info->last_trans_committed)
		return;

	spin_lock(&root->fs_info->ordered_extent_lock);
	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
		list_add_tail(&BTRFS_I(inode)->ordered_operations,
			      &root->fs_info->ordered_operations);
	}
	spin_unlock(&root->fs_info->ordered_extent_lock);
}

int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
					sizeof(struct btrfs_ordered_extent), 0,
					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

void ordered_data_exit(void)
{
	if (btrfs_ordered_extent_cache)
		kmem_cache_destroy(btrfs_ordered_extent_cache);
}