/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
	bh->b_end_io = handler;
	bh->b_private = private;
}
EXPORT_SYMBOL(init_buffer);

static int sleep_on_buffer(void *word)
{
	io_schedule();
	return 0;
}

void __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
							TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void unlock_buffer(struct buffer_head *bh)
{
	clear_bit_unlock(BH_Lock, &bh->b_state);
	smp_mb__after_clear_bit();
	wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
	wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);
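/*
 * A minimal sketch of the intended usage, not taken from this file:
 * __wait_on_buffer() only waits for the current lock holder, so a caller
 * that must keep the buffer stable takes the lock itself.  Assumes the
 * caller already holds a reference on bh; myfs_fill_buffer() is a
 * hypothetical helper.
 *
 *	lock_buffer(bh);
 *	if (!buffer_uptodate(bh))
 *		myfs_fill_buffer(bh);
 *	unlock_buffer(bh);
 */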
static void
__clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_cache_release(page);
}


static int quiet_error(struct buffer_head *bh)
{
	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
		return 0;
	return 1;
}


static void buffer_io_error(struct buffer_head *bh)
{
	char b[BDEVNAME_SIZE];
	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
			bdevname(bh->b_bdev, b),
			(unsigned long long)bh->b_blocknr);
}

/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 * unlock the buffer. This is what ll_rw_block uses too.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];

	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (!quiet_error(bh)) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
				       bdevname(bh->b_bdev, b));
		}
		set_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock. (But if
 * private_lock is contended then so is mapping->tree_lock).
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
	struct inode *bd_inode = bdev->bd_inode;
	struct address_space *bd_mapping = bd_inode->i_mapping;
	struct buffer_head *ret = NULL;
	pgoff_t index;
	struct buffer_head *bh;
	struct buffer_head *head;
	struct page *page;
	int all_mapped = 1;

	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
	page = find_get_page(bd_mapping, index);
	if (!page)
		goto out;

	spin_lock(&bd_mapping->private_lock);
	if (!page_has_buffers(page))
		goto out_unlock;
	head = page_buffers(page);
	bh = head;
	do {
		if (!buffer_mapped(bh))
			all_mapped = 0;
		else if (bh->b_blocknr == block) {
			ret = bh;
			get_bh(bh);
			goto out_unlock;
		}
		bh = bh->b_this_page;
	} while (bh != head);

	/* we might be here because some of the buffers on this page are
	 * not mapped.  This is due to various races between
	 * file io on the block device and getblk.  It gets dealt with
	 * elsewhere, don't buffer_error if we had some unmapped buffers
	 */
	if (all_mapped) {
		char b[BDEVNAME_SIZE];

		printk("__find_get_block_slow() failed. "
			"block=%llu, b_blocknr=%llu\n",
			(unsigned long long)block,
			(unsigned long long)bh->b_blocknr);
		printk("b_state=0x%08lx, b_size=%zu\n",
			bh->b_state, bh->b_size);
		printk("device %s blocksize: %d\n", bdevname(bdev, b),
			1 << bd_inode->i_blkbits);
	}
out_unlock:
	spin_unlock(&bd_mapping->private_lock);
	page_cache_release(page);
out:
	return ret;
}

/*
 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 */
static void free_more_memory(void)
{
	struct zone *zone;
	int nid;

	wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
	yield();

	for_each_online_node(nid) {
		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
						gfp_zone(GFP_NOFS), NULL,
						&zone);
		if (zone)
			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
						GFP_NOFS, NULL);
	}
}

/*
 * I/O completion handler for block_read_full_page() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;
	int page_uptodate = 1;

	BUG_ON(!buffer_async_read(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		clear_buffer_uptodate(bh);
		if (!quiet_error(bh))
			buffer_io_error(bh);
		SetPageError(page);
	}

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 */
	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			page_uptodate = 0;
		if (buffer_async_read(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);

	/*
	 * If none of the buffers had errors and they are all
	 * uptodate then we can set the page uptodate.
	 */
	if (page_uptodate && !PageError(page))
		SetPageUptodate(page);
	unlock_page(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
	char b[BDEVNAME_SIZE];
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;

	BUG_ON(!buffer_async_write(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		if (!quiet_error(bh)) {
			buffer_io_error(bh);
			printk(KERN_WARNING "lost page write due to "
					"I/O error on %s\n",
			       bdevname(bh->b_bdev, b));
		}
		set_bit(AS_EIO, &page->mapping->flags);
		set_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
		SetPageError(page);
	}

	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

	clear_buffer_async_write(bh);
	unlock_buffer(bh);
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (buffer_async_write(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	}
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	end_page_writeback(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}
EXPORT_SYMBOL(end_buffer_async_write);

/*
 * If a page's buffers are under async read-in (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone starting new async I/O reads any of
 * the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read;
	set_buffer_async_read(bh);
}

static void mark_buffer_async_write_endio(struct buffer_head *bh,
					  bh_end_io_t *handler)
{
	bh->b_end_io = handler;
	set_buffer_async_write(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
	mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
 * mapping->private_list will always be protected by the backing blockdev's
 * ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want.  The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
	list_del_init(&bh->b_assoc_buffers);
	WARN_ON(!bh->b_assoc_map);
	if (buffer_write_io_error(bh))
		set_bit(AS_EIO, &bh->b_assoc_map->flags);
	bh->b_assoc_map = NULL;
}

int inode_has_buffers(struct inode *inode)
{
	return !list_empty(&inode->i_data.private_list);
}
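/*
 * A minimal sketch of how a buffer-based filesystem might use this
 * machinery (the "myfs" names are hypothetical): metadata buffers that must
 * reach disk before fsync() returns are attached to the inode's
 * private_list with mark_buffer_dirty_inode() when they are dirtied, and
 * the fsync path then writes out and waits on that list via
 * sync_mapping_buffers().
 *
 *	mark_buffer_dirty_inode(bh, inode);	(when dirtying metadata)
 *
 *	static int myfs_fsync(struct file *file, loff_t start, loff_t end,
 *			      int datasync)
 *	{
 *		struct address_space *mapping = file->f_mapping;
 *		int err, err2;
 *
 *		err = filemap_write_and_wait_range(mapping, start, end);
 *		err2 = sync_mapping_buffers(mapping);
 *		return err ? err : err2;
 *	}
 */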
/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 * you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head *p;
	int err = 0;

	spin_lock(lock);
repeat:
	list_for_each_prev(p, list) {
		bh = BH_ENTRY(p);
		if (buffer_locked(bh)) {
			get_bh(bh);
			spin_unlock(lock);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				err = -EIO;
			brelse(bh);
			spin_lock(lock);
			goto repeat;
		}
	}
	spin_unlock(lock);
	return err;
}

static void do_thaw_one(struct super_block *sb, void *unused)
{
	char b[BDEVNAME_SIZE];
	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
		printk(KERN_WARNING "Emergency Thaw on %s\n",
		       bdevname(sb->s_bdev, b));
}

static void do_thaw_all(struct work_struct *work)
{
	iterate_supers(do_thaw_one, NULL);
	kfree(work);
	printk(KERN_WARNING "Emergency Thaw complete\n");
}

/**
 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 *
 * Used for emergency unfreeze of all filesystems via SysRq
 */
void emergency_thaw_all(void)
{
	struct work_struct *work;

	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work) {
		INIT_WORK(work, do_thaw_all);
		schedule_work(work);
	}
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
	struct address_space *buffer_mapping = mapping->private_data;

	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
		return 0;

	return fsync_buffers_list(&buffer_mapping->private_lock,
					&mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
			sector_t bblock, unsigned blocksize)
{
	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
	if (bh) {
		if (buffer_dirty(bh))
			ll_rw_block(WRITE, 1, &bh);
		put_bh(bh);
	}
}

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;
	struct address_space *buffer_mapping = bh->b_page->mapping;

	mark_buffer_dirty(bh);
	if (!mapping->private_data) {
		mapping->private_data = buffer_mapping;
	} else {
		BUG_ON(mapping->private_data != buffer_mapping);
	}
	if (!bh->b_assoc_map) {
		spin_lock(&buffer_mapping->private_lock);
		list_move_tail(&bh->b_assoc_buffers,
				&mapping->private_list);
		bh->b_assoc_map = mapping;
		spin_unlock(&buffer_mapping->private_lock);
	}
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);

/*
 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 * dirty.
 *
 * If warn is true, then emit a warning if the page is not uptodate and has
 * not been truncated.
 */
static void __set_page_dirty(struct page *page,
		struct address_space *mapping, int warn)
{
	spin_lock_irq(&mapping->tree_lock);
	if (page->mapping) {	/* Race with truncate? */
		WARN_ON_ONCE(warn && !PageUptodate(page));
		account_page_dirtied(page, mapping);
		radix_tree_tag_set(&mapping->page_tree,
				page_index(page), PAGECACHE_TAG_DIRTY);
	}
	spin_unlock_irq(&mapping->tree_lock);
	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}

/*
 * Add a page to the dirty page list.
 *
 * It is a sad fact of life that this function is called from several places
 * deeply under spinlocking.  It may not sleep.
 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers.  If the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 *
 * The buffers are dirtied before the page is dirtied.  There's a small race
 * window in which a writepage caller may see the page cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 * before the buffers, a concurrent writepage caller could clear the page dirty
 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * page on the dirty page list.
 *
 * We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list.  Also use this to protect against clean buffers being
 * added to the page after it was set dirty.
 *
 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 * address_space though.
 */
int __set_page_dirty_buffers(struct page *page)
{
	int newly_dirty;
	struct address_space *mapping = page_mapping(page);

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			set_buffer_dirty(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty)
		__set_page_dirty(page, mapping, 1);
	return newly_dirty;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head tmp;
	struct address_space *mapping;
	int err = 0, err2;
	struct blk_plug plug;

	INIT_LIST_HEAD(&tmp);
	blk_start_plug(&plug);

	spin_lock(lock);
	while (!list_empty(list)) {
		bh = BH_ENTRY(list->next);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh) || buffer_locked(bh)) {
			list_add(&bh->b_assoc_buffers, &tmp);
			bh->b_assoc_map = mapping;
			if (buffer_dirty(bh)) {
				get_bh(bh);
				spin_unlock(lock);
				/*
				 * Ensure any pending I/O completes so that
				 * write_dirty_buffer() actually writes the
				 * current contents - it is a noop if I/O is
				 * still in flight on potentially older
				 * contents.
				 */
				write_dirty_buffer(bh, WRITE_SYNC);

				/*
				 * Kick off IO for the previous mapping. Note
				 * that we will not run the very last mapping,
				 * wait_on_buffer() will do that for us
				 * through sync_buffer().
				 */
				brelse(bh);
				spin_lock(lock);
			}
		}
	}

	spin_unlock(lock);
	blk_finish_plug(&plug);
	spin_lock(lock);

	while (!list_empty(&tmp)) {
		bh = BH_ENTRY(tmp.prev);
		get_bh(bh);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh)) {
			list_add(&bh->b_assoc_buffers,
				 &mapping->private_list);
			bh->b_assoc_map = mapping;
		}
		spin_unlock(lock);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			err = -EIO;
		brelse(bh);
		spin_lock(lock);
	}

	spin_unlock(lock);
	err2 = osync_buffers_list(lock, list);
	if (err)
		return err;
	else
		return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 * assumes that all the buffers are against the blockdev.  Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->private_data;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list))
			__remove_assoc_queue(BH_ENTRY(list->next));
		spin_unlock(&buffer_mapping->private_lock);
	}
}
EXPORT_SYMBOL(invalidate_inode_buffers);

/*
 * Remove any clean buffers from the inode's buffer list.  This is called
 * when we're trying to free the inode itself.  Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
	int ret = 1;

	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->private_data;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list)) {
			struct buffer_head *bh = BH_ENTRY(list->next);
			if (buffer_dirty(bh)) {
				ret = 0;
				break;
			}
			__remove_assoc_queue(bh);
		}
		spin_unlock(&buffer_mapping->private_lock);
	}
	return ret;
}

/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 *
 * The retry flag is used to differentiate async IO (paging, swapping)
 * which may not fail from ordinary buffer allocations.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
		int retry)
{
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = alloc_buffer_head(GFP_NOFS);
		if (!bh)
			goto no_grow;

		bh->b_this_page = head;
		bh->b_blocknr = -1;
		head = bh;

		bh->b_size = size;

		/* Link the buffer to its page */
		set_bh_page(bh, page, offset);

		init_buffer(bh, NULL, NULL);
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	if (head) {
		do {
			bh = head;
			head = head->b_this_page;
			free_buffer_head(bh);
		} while (head);
	}

	/*
	 * Return failure for non-async IO requests.  Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available.  But we don't want tasks sleeping with
	 * partially complete buffers, so all were released above.
	 */
	if (!retry)
		return NULL;

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO.  Since this is an async request and
	 * the reserve list is empty, we're sure there are
	 * async buffer heads in use.
	 */
	free_more_memory();
	goto try_again;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);

static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
	struct buffer_head *bh, *tail;

	bh = head;
	do {
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;
	attach_page_buffers(page, head);
}

static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
	sector_t retval = ~((sector_t)0);
	loff_t sz = i_size_read(bdev->bd_inode);

	if (sz) {
		unsigned int sizebits = blksize_bits(size);
		retval = (sz >> sizebits);
	}
	return retval;
}

/*
 * Initialise the state of a blockdev page's buffers.
 */
static sector_t
init_page_buffers(struct page *page, struct block_device *bdev,
			sector_t block, int size)
{
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh = head;
	int uptodate = PageUptodate(page);
	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);

	do {
		if (!buffer_mapped(bh)) {
			init_buffer(bh, NULL, NULL);
			bh->b_bdev = bdev;
			bh->b_blocknr = block;
			if (uptodate)
				set_buffer_uptodate(bh);
			if (block < end_block)
				set_buffer_mapped(bh);
		}
		block++;
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * Caller needs to validate requested block against end of device.
	 */
	return end_block;
}

/*
 * Create the page-cache page that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 */
static int
grow_dev_page(struct block_device *bdev, sector_t block,
		pgoff_t index, int size, int sizebits)
{
	struct inode *inode = bdev->bd_inode;
	struct page *page;
	struct buffer_head *bh;
	sector_t end_block;
	int ret = 0;		/* Will call free_more_memory() */

	page = find_or_create_page(inode->i_mapping, index,
		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
	if (!page)
		return ret;

	BUG_ON(!PageLocked(page));

	if (page_has_buffers(page)) {
		bh = page_buffers(page);
		if (bh->b_size == size) {
			end_block = init_page_buffers(page, bdev,
						index << sizebits, size);
			goto done;
		}
		if (!try_to_free_buffers(page))
			goto failed;
	}

	/*
	 * Allocate some buffers for this page
	 */
	bh = alloc_page_buffers(page, size, 0);
	if (!bh)
		goto failed;

	/*
	 * Link the page to the buffers and initialise them.  Take the
	 * lock to be atomic wrt __find_get_block(), which does not
	 * run under the page lock.
	 */
	spin_lock(&inode->i_mapping->private_lock);
	link_dev_buffers(page, bh);
	end_block = init_page_buffers(page, bdev, index << sizebits, size);
	spin_unlock(&inode->i_mapping->private_lock);
done:
	ret = (block < end_block) ? 1 : -ENXIO;
failed:
	unlock_page(page);
	page_cache_release(page);
	return ret;
}

/*
 * Create buffers for the specified block device block's page.  If
 * that page was dirty, the buffers are set dirty also.
 */
static int
grow_buffers(struct block_device *bdev, sector_t block, int size)
{
	pgoff_t index;
	int sizebits;

	sizebits = -1;
	do {
		sizebits++;
	} while ((size << sizebits) < PAGE_SIZE);

	index = block >> sizebits;

	/*
	 * Check for a block which wants to lie outside our maximum possible
	 * pagecache index.  (this comparison is done using sector_t types).
	 */
	if (unlikely(index != block >> sizebits)) {
		char b[BDEVNAME_SIZE];

		printk(KERN_ERR "%s: requested out-of-range block %llu for "
			"device %s\n",
			__func__, (unsigned long long)block,
			bdevname(bdev, b));
		return -EIO;
	}

	/* Create a page with the proper size buffers.. */
	return grow_dev_page(bdev, block, index, size, sizebits);
}

static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
	/* Size must be multiple of hard sectorsize */
	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
			(size < 512 || size > PAGE_SIZE))) {
		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
					size);
		printk(KERN_ERR "logical block size: %d\n",
					bdev_logical_block_size(bdev));

		dump_stack();
		return NULL;
	}

	for (;;) {
		struct buffer_head *bh;
		int ret;

		bh = __find_get_block(bdev, block, size);
		if (bh)
			return bh;

		ret = grow_buffers(bdev, block, size);
		if (ret < 0)
			return NULL;
		if (ret == 0)
			free_more_memory();
	}
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in its radix tree.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page.  If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also.  When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate.  But their backing page remains not
 * uptodate - even if all of its buffers are uptodate.  A subsequent
 * block_read_full_page() against that page will discover all the uptodate
 * buffers, will set the page uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
 * backing page dirty, then tag the page as dirty in its address_space's radix
 * tree and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
 * mapping->tree_lock and mapping->host->i_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
	WARN_ON_ONCE(!buffer_uptodate(bh));

	/*
	 * Very *carefully* optimize the it-is-already-dirty case.
	 *
	 * Don't let the final "is it dirty" escape to before we
	 * perhaps modified the buffer.
	 */
	if (buffer_dirty(bh)) {
		smp_mb();
		if (buffer_dirty(bh))
			return;
	}

	if (!test_set_buffer_dirty(bh)) {
		struct page *page = bh->b_page;
		if (!TestSetPageDirty(page)) {
			struct address_space *mapping = page_mapping(page);
			if (mapping)
				__set_page_dirty(page, mapping, 0);
		}
	}
}
EXPORT_SYMBOL(mark_buffer_dirty);
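/*
 * A minimal sketch of the usual metadata-update pattern built on the
 * helpers above (sb_bread()/brelse() are the buffer_head.h wrappers; the
 * blocknr, offset, data and len names are placeholders):
 *
 *	bh = sb_bread(sb, blocknr);
 *	if (!bh)
 *		return -EIO;
 *	lock_buffer(bh);
 *	memcpy(bh->b_data + offset, data, len);
 *	unlock_buffer(bh);
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 *
 * mark_buffer_dirty() only marks the buffer, its page and the inode dirty;
 * the actual writeout is left to later writeback or an explicit sync.
 */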
/*
 * Decrement a buffer_head's reference count.  If all buffers against a page
 * have zero reference count, are clean and unlocked, and if the page is clean
 * and unlocked then try_to_free_buffers() may strip the buffers from the page
 * in preparation for freeing it (sometimes, rarely, buffers are removed from
 * a page but it ends up not being freed, and buffers may later be reattached).
 */
void __brelse(struct buffer_head * buf)
{
	if (atomic_read(&buf->b_count)) {
		put_bh(buf);
		return;
	}
	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
EXPORT_SYMBOL(__brelse);

/*
 * bforget() is like brelse(), except it discards any
 * potentially dirty data.
 */
void __bforget(struct buffer_head *bh)
{
	clear_buffer_dirty(bh);
	if (bh->b_assoc_map) {
		struct address_space *buffer_mapping = bh->b_page->mapping;

		spin_lock(&buffer_mapping->private_lock);
		list_del_init(&bh->b_assoc_buffers);
		bh->b_assoc_map = NULL;
		spin_unlock(&buffer_mapping->private_lock);
	}
	__brelse(bh);
}
EXPORT_SYMBOL(__bforget);

static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return bh;
	} else {
		get_bh(bh);
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);
		wait_on_buffer(bh);
		if (buffer_uptodate(bh))
			return bh;
	}
	brelse(bh);
	return NULL;
}

/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

#define BH_LRU_SIZE	8

struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif

static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}

/*
 * The LRU management algorithm is dopey-but-simple.  Sorry.
 */
static void bh_lru_install(struct buffer_head *bh)
{
	struct buffer_head *evictee = NULL;

	check_irqs_on();
	bh_lru_lock();
	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
		struct buffer_head *bhs[BH_LRU_SIZE];
		int in;
		int out = 0;

		get_bh(bh);
		bhs[out++] = bh;
		for (in = 0; in < BH_LRU_SIZE; in++) {
			struct buffer_head *bh2 =
				__this_cpu_read(bh_lrus.bhs[in]);

			if (bh2 == bh) {
				__brelse(bh2);
			} else {
				if (out >= BH_LRU_SIZE) {
					BUG_ON(evictee != NULL);
					evictee = bh2;
				} else {
					bhs[out++] = bh2;
				}
			}
		}
		while (out < BH_LRU_SIZE)
			bhs[out++] = NULL;
		memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
	}
	bh_lru_unlock();

	if (evictee)
		__brelse(evictee);
}

/*
 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *ret = NULL;
	unsigned int i;

	check_irqs_on();
	bh_lru_lock();
	for (i = 0; i < BH_LRU_SIZE; i++) {
		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);

		if (bh && bh->b_bdev == bdev &&
				bh->b_blocknr == block && bh->b_size == size) {
			if (i) {
				while (i) {
					__this_cpu_write(bh_lrus.bhs[i],
						__this_cpu_read(bh_lrus.bhs[i - 1]));
					i--;
				}
				__this_cpu_write(bh_lrus.bhs[0], bh);
			}
			get_bh(bh);
			ret = bh;
			break;
		}
	}
	bh_lru_unlock();
	return ret;
}

/*
 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
 * it in the LRU and mark it as accessed.  If it is not present then return
 * NULL
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

	if (bh == NULL) {
		bh = __find_get_block_slow(bdev, block);
		if (bh)
			bh_lru_install(bh);
	}
	if (bh)
		touch_buffer(bh);
	return bh;
}
EXPORT_SYMBOL(__find_get_block);

/*
 * __getblk will locate (and, if necessary, create) the buffer_head
 * which corresponds to the passed block_device, block and size. The
 * returned buffer has its reference count incremented.
 *
 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
 * attempt is failing.  FIXME, perhaps?
 */
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = __find_get_block(bdev, block, size);

	might_sleep();
	if (bh == NULL)
		bh = __getblk_slow(bdev, block, size);
	return bh;
}
EXPORT_SYMBOL(__getblk);

/*
 * Do async read-ahead on a buffer..
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);
	if (likely(bh)) {
		ll_rw_block(READA, 1, &bh);
		brelse(bh);
	}
}
EXPORT_SYMBOL(__breadahead);

/**
 * __bread() - reads a specified block and returns the bh
 * @bdev: the block_device to read from
 * @block: number of block
 * @size: size (in bytes) to read
 *
 * Reads a specified block, and returns buffer head that contains it.
 * It returns NULL if the block was unreadable.
 */
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	if (likely(bh) && !buffer_uptodate(bh))
		bh = __bread_slow(bh);
	return bh;
}
EXPORT_SYMBOL(__bread);
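/*
 * A minimal sketch of the blocking read path, assuming the caller may sleep
 * and knows the device's block size (examine_block() is a hypothetical
 * consumer):
 *
 *	struct buffer_head *bh;
 *
 *	bh = __bread(bdev, block, size);
 *	if (!bh)
 *		return -EIO;
 *	examine_block(bh->b_data);
 *	brelse(bh);
 *
 * __breadahead() can be issued beforehand for blocks that will probably be
 * needed soon but are not needed synchronously.
 */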
/*
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs in each cpu either in irq
 * or with preempt disabled.
 */
static void invalidate_bh_lru(void *arg)
{
	struct bh_lru *b = &get_cpu_var(bh_lrus);
	int i;

	for (i = 0; i < BH_LRU_SIZE; i++) {
		brelse(b->bhs[i]);
		b->bhs[i] = NULL;
	}
	put_cpu_var(bh_lrus);
}

static bool has_bh_in_lru(int cpu, void *dummy)
{
	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
	int i;

	for (i = 0; i < BH_LRU_SIZE; i++) {
		if (b->bhs[i])
			return 1;
	}

	return 0;
}

void invalidate_bh_lrus(void)
{
	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);

void set_bh_page(struct buffer_head *bh,
		struct page *page, unsigned long offset)
{
	bh->b_page = page;
	BUG_ON(offset >= PAGE_SIZE);
	if (PageHighMem(page))
		/*
		 * This catches illegal uses and preserves the offset:
		 */
		bh->b_data = (char *)(0 + offset);
	else
		bh->b_data = page_address(page) + offset;
}
EXPORT_SYMBOL(set_bh_page);

/*
 * Called when truncating a buffer on a page completely.
 */
static void discard_buffer(struct buffer_head * bh)
{
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	bh->b_bdev = NULL;
	clear_buffer_mapped(bh);
	clear_buffer_req(bh);
	clear_buffer_new(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
	unlock_buffer(bh);
}

/**
 * block_invalidatepage - invalidate part or all of a buffer-backed page
 *
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * block_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * block_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void block_invalidatepage(struct page *page, unsigned long offset)
{
	struct buffer_head *head, *bh, *next;
	unsigned int curr_off = 0;

	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		goto out;

	head = page_buffers(page);
	bh = head;
	do {
		unsigned int next_off = curr_off + bh->b_size;
		next = bh->b_this_page;

		/*
		 * is this block fully invalidated?
		 */
		if (offset <= curr_off)
			discard_buffer(bh);
		curr_off = next_off;
		bh = next;
	} while (bh != head);

	/*
	 * We release buffers only if the entire page is being invalidated.
	 * The get_block cached value has been unconditionally invalidated,
	 * so real IO is not possible anymore.
	 */
	if (offset == 0)
		try_to_release_page(page, 0);
out:
	return;
}
EXPORT_SYMBOL(block_invalidatepage);
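/*
 * A minimal sketch: a filesystem with no per-page private state beyond its
 * buffer_heads can simply forward its ->invalidatepage to
 * block_invalidatepage() (the "myfs" name is hypothetical):
 *
 *	static void myfs_invalidatepage(struct page *page, unsigned long offset)
 *	{
 *		block_invalidatepage(page, offset);
 *	}
 */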
/*
 * We attach and possibly dirty the buffers atomically wrt
 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
 * is already excluded via the page lock.
 */
void create_empty_buffers(struct page *page,
			unsigned long blocksize, unsigned long b_state)
{
	struct buffer_head *bh, *head, *tail;

	head = alloc_page_buffers(page, blocksize, 1);
	bh = head;
	do {
		bh->b_state |= b_state;
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;

	spin_lock(&page->mapping->private_lock);
	if (PageUptodate(page) || PageDirty(page)) {
		bh = head;
		do {
			if (PageDirty(page))
				set_buffer_dirty(bh);
			if (PageUptodate(page))
				set_buffer_uptodate(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	attach_page_buffers(page, head);
	spin_unlock(&page->mapping->private_lock);
}
EXPORT_SYMBOL(create_empty_buffers);

/*
 * We are taking a block for data and we don't want any output from any
 * buffer-cache aliases starting from return from that function and
 * until the moment when something will explicitly mark the buffer
 * dirty (hopefully that will not happen until we will free that block ;-)
 * We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway. We used to use
 * unmap_buffer() for such invalidation, but that was wrong. We definitely
 * don't want to mark the alias unmapped, for example - it would confuse
 * anyone who might pick it with bread() afterwards...
 *
 * Also..  Note that bforget() doesn't lock the buffer.  So there can
 * be writeout I/O going on against recently-freed buffers.  We don't
 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
 * only if we really need to.  That happens here.
 */
void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
{
	struct buffer_head *old_bh;

	might_sleep();

	old_bh = __find_get_block_slow(bdev, block);
	if (old_bh) {
		clear_buffer_dirty(old_bh);
		wait_on_buffer(old_bh);
		clear_buffer_req(old_bh);
		__brelse(old_bh);
	}
}
EXPORT_SYMBOL(unmap_underlying_metadata);

/*
 * Size is a power-of-two in the range 512..PAGE_SIZE,
 * and the case we care about most is PAGE_SIZE.
 *
 * So this *could* possibly be written with those
 * constraints in mind (relevant mostly if some
 * architecture has a slow bit-scan instruction)
 */
static inline int block_size_bits(unsigned int blocksize)
{
	return ilog2(blocksize);
}

static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
{
	BUG_ON(!PageLocked(page));

	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
	return page_buffers(page);
}

/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
 *
 *	No	No		"unknown" - must do get_block()
 *	No	Yes		"hole" - zero-filled
 *	Yes	No		"allocated" - allocated on disk, not read in
 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */

/*
 * While block_write_full_page is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_page() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This can only happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_page() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
			get_block_t *get_block, struct writeback_control *wbc,
			bh_end_io_t *handler)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	unsigned int blocksize, bbits;
	int nr_underway = 0;
	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
			WRITE_SYNC : WRITE);

	head = create_page_buffers(page, inode,
					(1 << BH_Dirty)|(1 << BH_Uptodate));

	/*
	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the page stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
	 * handle that here by just cleaning them.
	 */

	bh = head;
	blocksize = bh->b_size;
	bbits = block_size_bits(blocksize);

	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
	last_block = (i_size_read(inode) - 1) >> bbits;

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this page can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_page()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			clear_buffer_delay(bh);
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	do {
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the page.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} else if (!trylock_buffer(bh)) {
			redirty_page_for_writepage(wbc, page);
			continue;
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write_endio(bh, handler);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The page and its buffers are protected by PageWriteback(), so we can
	 * drop the bh refcounts early.
	 */
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);

	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(write_op, bh);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The page was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * ll_rw_block/submit_bh.  A rare case.
		 */
		end_page_writeback(page);

		/*
		 * The page and buffer_heads can be released at any time from
		 * here on.
		 */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The page is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write_endio(bh, handler);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty page.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	mapping_set_error(page->mapping, err);
	set_page_writeback(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(write_op, bh);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);
	goto done;
}

/*
 * If a page has any new buffers, zero them out here, and mark them uptodate
 * and dirty so they'll be written out (in order to prevent uninitialised
 * block data from leaking). And clear the new bit.
 */
void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
{
	unsigned int block_start, block_end;
	struct buffer_head *head, *bh;

	BUG_ON(!PageLocked(page));
	if (!page_has_buffers(page))
		return;

	bh = head = page_buffers(page);
	block_start = 0;
	do {
		block_end = block_start + bh->b_size;

		if (buffer_new(bh)) {
			if (block_end > from && block_start < to) {
				if (!PageUptodate(page)) {
					unsigned start, size;

					start = max(from, block_start);
					size = min(to, block_end) - start;

					zero_user(page, start, size);
					set_buffer_uptodate(bh);
				}

				clear_buffer_new(bh);
				mark_buffer_dirty(bh);
			}
		}

		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);
}
EXPORT_SYMBOL(page_zero_new_buffers);

int __block_write_begin(struct page *page, loff_t pos, unsigned len,
		get_block_t *get_block)
{
	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize, bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_CACHE_SIZE);
	BUG_ON(to > PAGE_CACHE_SIZE);
	BUG_ON(from > to);

	head = create_page_buffers(page, inode, 0);
	blocksize = head->b_size;
	bbits = block_size_bits(blocksize);

	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				break;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				if (block_end > to || block_start < from)
					zero_user_segments(page,
						to, block_end,
						block_start, from);
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue;
		}
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
	return err;
}
EXPORT_SYMBOL(__block_write_begin);

static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;

	bh = head = page_buffers(page);
	blocksize = bh->b_size;

	block_start = 0;
	do {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}
		clear_buffer_new(bh);

		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return 0;
}

/*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
 *
 * The filesystem needs to handle block truncation upon failure.
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
		unsigned flags, struct page **pagep, get_block_t *get_block)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;
	int status;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	status = __block_write_begin(page, pos, len, get_block);
	if (unlikely(status)) {
		unlock_page(page);
		page_cache_release(page);
		page = NULL;
	}

	*pagep = page;
	return status;
}
EXPORT_SYMBOL(block_write_begin);
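/*
 * A minimal sketch of a ->write_begin built on block_write_begin(), assuming
 * a hypothetical myfs_get_block() for block mapping and a hypothetical
 * myfs_write_failed() that truncates blocks instantiated beyond i_size when
 * the operation fails (the truncation duty mentioned above):
 *
 *	static int myfs_write_begin(struct file *file,
 *				    struct address_space *mapping,
 *				    loff_t pos, unsigned len, unsigned flags,
 *				    struct page **pagep, void **fsdata)
 *	{
 *		int ret;
 *
 *		ret = block_write_begin(mapping, pos, len, flags, pagep,
 *					myfs_get_block);
 *		if (ret)
 *			myfs_write_failed(mapping, pos + len);
 *		return ret;
 *	}
 *
 * The matching ->write_end is typically generic_write_end() below.
 */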
int block_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	unsigned start;

	start = pos & (PAGE_CACHE_SIZE - 1);

	if (unlikely(copied < len)) {
		/*
		 * The buffers that were written will now be uptodate, so we
		 * don't have to worry about a readpage reading them and
		 * overwriting a partial write. However if we have encountered
		 * a short write and only partially written into a buffer, it
		 * will not be marked uptodate, so a readpage might come in and
		 * destroy our partial write.
		 *
		 * Do the simplest thing, and just treat any short write to a
		 * non-uptodate page as a zero-length write, and force the
		 * caller to redo the whole thing.
		 */
		if (!PageUptodate(page))
			copied = 0;

		page_zero_new_buffers(page, start+copied, start+len);
	}
	flush_dcache_page(page);

	/* This could be a short (even 0-length) commit */
	__block_commit_write(inode, page, start, start+copied);

	return copied;
}
EXPORT_SYMBOL(block_write_end);

int generic_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	int i_size_changed = 0;

	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 *
	 * But it's important to update i_size while still holding page lock:
	 * page writeout could otherwise come in and zero beyond i_size.
	 */
	if (pos+copied > inode->i_size) {
		i_size_write(inode, pos+copied);
		i_size_changed = 1;
	}

	unlock_page(page);
	page_cache_release(page);

	/*
	 * Don't mark the inode dirty under page lock. First, it unnecessarily
	 * makes the holding time of page lock longer. Second, it forces lock
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	if (i_size_changed)
		mark_inode_dirty(inode);

	return copied;
}
EXPORT_SYMBOL(generic_write_end);
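/*
 * Illustrative sketch (not part of this file): the matching ->write_end for
 * the foo_write_begin() sketch above is usually just generic_write_end(),
 * which commits the copied bytes and updates i_size under the page lock.
 * A filesystem that maintains i_size itself would call block_write_end()
 * directly instead.  The foo_* names are hypothetical.
 *
 *	static int foo_write_end(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned copied,
 *			struct page *page, void *fsdata)
 *	{
 *		return generic_write_end(file, mapping, pos, len, copied,
 *					 page, fsdata);
 *	}
 */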
/*
 * block_is_partially_uptodate checks whether buffers within a page are
 * uptodate or not.
 *
 * Returns true if all buffers which correspond to a file portion
 * we want to read are uptodate.
 */
int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
					unsigned long from)
{
	unsigned block_start, block_end, blocksize;
	unsigned to;
	struct buffer_head *bh, *head;
	int ret = 1;

	if (!page_has_buffers(page))
		return 0;

	head = page_buffers(page);
	blocksize = head->b_size;
	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
	to = from + to;
	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
		return 0;

	bh = head;
	block_start = 0;
	do {
		block_end = block_start + blocksize;
		if (block_end > from && block_start < to) {
			if (!buffer_uptodate(bh)) {
				ret = 0;
				break;
			}
			if (block_end >= to)
				break;
		}
		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	return ret;
}
EXPORT_SYMBOL(block_is_partially_uptodate);

/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	sector_t iblock, lblock;
	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
	unsigned int blocksize, bbits;
	int nr, i;
	int fully_mapped = 1;

	head = create_page_buffers(page, inode, 0);
	blocksize = head->b_size;
	bbits = block_size_bits(blocksize);

	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
	bh = head;
	nr = 0;
	i = 0;

	do {
		if (buffer_uptodate(bh))
			continue;

		if (!buffer_mapped(bh)) {
			int err = 0;

			fully_mapped = 0;
			if (iblock < lblock) {
				WARN_ON(bh->b_size != blocksize);
				err = get_block(inode, iblock, bh, 0);
				if (err)
					SetPageError(page);
			}
			if (!buffer_mapped(bh)) {
				zero_user(page, i * blocksize, blocksize);
				if (!err)
					set_buffer_uptodate(bh);
				continue;
			}
			/*
			 * get_block() might have updated the buffer
			 * synchronously
			 */
			if (buffer_uptodate(bh))
				continue;
		}
		arr[nr++] = bh;
	} while (i++, iblock++, (bh = bh->b_this_page) != head);

	if (fully_mapped)
		SetPageMappedToDisk(page);

	if (!nr) {
		/*
		 * All buffers are uptodate - we can set the page uptodate
		 * as well. But not if get_block() returned an error.
		 */
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}

	/* Stage two: lock the buffers */
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		lock_buffer(bh);
		mark_buffer_async_read(bh);
	}

	/*
	 * Stage 3: start the IO. Check for uptodateness
	 * inside the buffer lock in case another process reading
	 * the underlying blockdev brought it uptodate (the sct fix).
	 */
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		if (buffer_uptodate(bh))
			end_buffer_async_read(bh, 1);
		else
			submit_bh(READ, bh);
	}
	return 0;
}
EXPORT_SYMBOL(block_read_full_page);
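/*
 * Illustrative sketch (not part of this file): ->readpage for such a
 * filesystem is a one-liner on top of block_read_full_page(), and the
 * helpers above slot together in address_space_operations.  The foo_*
 * callbacks and foo_get_block are hypothetical names.
 *
 *	static int foo_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, foo_get_block);
 *	}
 *
 *	static const struct address_space_operations foo_aops = {
 *		.readpage		= foo_readpage,
 *		.write_begin		= foo_write_begin,
 *		.write_end		= foo_write_end,
 *		.is_partially_uptodate	= block_is_partially_uptodate,
 *	};
 */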
/* utility function for filesystems that need to do work on expanding
 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
 * deal with the hole.
 */
int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	void *fsdata;
	int err;

	err = inode_newsize_ok(inode, size);
	if (err)
		goto out;

	err = pagecache_write_begin(NULL, mapping, size, 0,
				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
				&page, &fsdata);
	if (err)
		goto out;

	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
	BUG_ON(err > 0);

out:
	return err;
}
EXPORT_SYMBOL(generic_cont_expand_simple);
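/*
 * Illustrative sketch (not part of this file): generic_cont_expand_simple()
 * is typically called from a filesystem's ->setattr when a truncate grows
 * the file, before the new size is committed:
 *
 *	if ((attr->ia_valid & ATTR_SIZE) &&
 *	    attr->ia_size > i_size_read(inode)) {
 *		err = generic_cont_expand_simple(inode, attr->ia_size);
 *		if (err)
 *			return err;
 *	}
 */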
static int cont_expand_zero(struct file *file, struct address_space *mapping,
			loff_t pos, loff_t *bytes)
{
	struct inode *inode = mapping->host;
	unsigned blocksize = 1 << inode->i_blkbits;
	struct page *page;
	void *fsdata;
	pgoff_t index, curidx;
	loff_t curpos;
	unsigned zerofrom, offset, len;
	int err = 0;

	index = pos >> PAGE_CACHE_SHIFT;
	offset = pos & ~PAGE_CACHE_MASK;

	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
		zerofrom = curpos & ~PAGE_CACHE_MASK;
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		len = PAGE_CACHE_SIZE - zerofrom;

		err = pagecache_write_begin(file, mapping, curpos, len,
						AOP_FLAG_UNINTERRUPTIBLE,
						&page, &fsdata);
		if (err)
			goto out;
		zero_user(page, zerofrom, len);
		err = pagecache_write_end(file, mapping, curpos, len, len,
						page, fsdata);
		if (err < 0)
			goto out;
		BUG_ON(err != len);
		err = 0;

		balance_dirty_pages_ratelimited(mapping);
	}

	/* page covers the boundary, find the boundary offset */
	if (index == curidx) {
		zerofrom = curpos & ~PAGE_CACHE_MASK;
		/* if we will expand the thing last block will be filled */
		if (offset <= zerofrom) {
			goto out;
		}
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		len = offset - zerofrom;

		err = pagecache_write_begin(file, mapping, curpos, len,
						AOP_FLAG_UNINTERRUPTIBLE,
						&page, &fsdata);
		if (err)
			goto out;
		zero_user(page, zerofrom, len);
		err = pagecache_write_end(file, mapping, curpos, len, len,
						page, fsdata);
		if (err < 0)
			goto out;
		BUG_ON(err != len);
		err = 0;
	}
out:
	return err;
}

/*
 * For moronic filesystems that do not allow holes in files.
 * We may have to extend the file.
 */
int cont_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata,
			get_block_t *get_block, loff_t *bytes)
{
	struct inode *inode = mapping->host;
	unsigned blocksize = 1 << inode->i_blkbits;
	unsigned zerofrom;
	int err;

	err = cont_expand_zero(file, mapping, pos, bytes);
	if (err)
		return err;

	zerofrom = *bytes & ~PAGE_CACHE_MASK;
	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
		*bytes |= (blocksize-1);
		(*bytes)++;
	}

	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
}
EXPORT_SYMBOL(cont_write_begin);
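/*
 * Illustrative sketch (not part of this file): a filesystem that cannot
 * represent holes keeps a per-inode "end of allocated data" marker and
 * passes its address as @bytes, so cont_write_begin() can zero-fill the
 * gap between the old tail and the new write position.  "foo_i()",
 * "i_alloc_size" and foo_get_block are hypothetical names.
 *
 *	static int foo_cont_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		return cont_write_begin(file, mapping, pos, len, flags,
 *					pagep, fsdata, foo_get_block,
 *					&foo_i(mapping->host)->i_alloc_size);
 *	}
 */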
int block_commit_write(struct page *page, unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	__block_commit_write(inode,page,from,to);
	return 0;
}
EXPORT_SYMBOL(block_commit_write);

/*
 * block_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
 * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 *
 * Direct callers of this function should protect against filesystem freezing
 * using sb_start_write() - sb_end_write() functions.
 */
int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
			 get_block_t get_block)
{
	struct page *page = vmf->page;
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
	unsigned long end;
	loff_t size;
	int ret;

	lock_page(page);
	size = i_size_read(inode);
	if ((page->mapping != inode->i_mapping) ||
	    (page_offset(page) > size)) {
		/* We overload EFAULT to mean page got truncated */
		ret = -EFAULT;
		goto out_unlock;
	}

	/* page is wholly or partially inside EOF */
	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
		end = size & ~PAGE_CACHE_MASK;
	else
		end = PAGE_CACHE_SIZE;

	ret = __block_write_begin(page, 0, end, get_block);
	if (!ret)
		ret = block_commit_write(page, 0, end);

	if (unlikely(ret < 0))
		goto out_unlock;
	set_page_dirty(page);
	wait_on_page_writeback(page);
	return 0;
out_unlock:
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL(__block_page_mkwrite);

int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
		   get_block_t get_block)
{
	int ret;
	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;

	sb_start_pagefault(sb);

	/*
	 * Update file times before taking page lock. We may end up failing the
	 * fault so this update may be superfluous but who really cares...
	 */
	file_update_time(vma->vm_file);

	ret = __block_page_mkwrite(vma, vmf, get_block);
	sb_end_pagefault(sb);
	return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL(block_page_mkwrite);
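/*
 * Illustrative sketch (not part of this file): a filesystem exposes this
 * through the vm_operations_struct it installs in ->mmap for shared
 * writable mappings.  The foo_* names and foo_get_block are hypothetical.
 *
 *	static int foo_page_mkwrite(struct vm_area_struct *vma,
 *				    struct vm_fault *vmf)
 *	{
 *		return block_page_mkwrite(vma, vmf, foo_get_block);
 *	}
 *
 *	static const struct vm_operations_struct foo_file_vm_ops = {
 *		.fault		= filemap_fault,
 *		.page_mkwrite	= foo_page_mkwrite,
 *	};
 */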
/*
 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
 * immediately, while under the page lock.  So it needs a special end_io
 * handler which does not touch the bh after unlocking it.
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}

/*
 * Attach the singly-linked list of buffers created by nobh_write_begin, to
 * the page (converting it to circular linked list and taking care of page
 * dirty races).
 */
static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
{
	struct buffer_head *bh;

	BUG_ON(!PageLocked(page));

	spin_lock(&page->mapping->private_lock);
	bh = head;
	do {
		if (PageDirty(page))
			set_buffer_dirty(bh);
		if (!bh->b_this_page)
			bh->b_this_page = head;
		bh = bh->b_this_page;
	} while (bh != head);
	attach_page_buffers(page, head);
	spin_unlock(&page->mapping->private_lock);
}

/*
 * On entry, the page is fully not uptodate.
 * On exit the page is fully uptodate in the areas outside (from,to)
 * The filesystem needs to handle block truncation upon failure.
 */
int nobh_write_begin(struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata,
			get_block_t *get_block)
{
	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head *head, *bh;
	struct page *page;
	pgoff_t index;
	unsigned from, to;
	unsigned block_in_page;
	unsigned block_start, block_end;
	sector_t block_in_file;
	int nr_reads = 0;
	int ret = 0;
	int is_mapped_to_disk = 1;

	index = pos >> PAGE_CACHE_SHIFT;
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;
	*fsdata = NULL;

	if (page_has_buffers(page)) {
		ret = __block_write_begin(page, pos, len, get_block);
		if (unlikely(ret))
			goto out_release;
		return ret;
	}

	if (PageMappedToDisk(page))
		return 0;

	/*
	 * Allocate buffers so that we can keep track of state, and potentially
	 * attach them to the page if an error occurs. In the common case of
	 * no error, they will just be freed again without ever being attached
	 * to the page (which is all OK, because we're under the page lock).
	 *
	 * Be careful: the buffer linked list is a NULL terminated one, rather
	 * than the circular one we're used to.
	 */
	head = alloc_page_buffers(page, blocksize, 0);
	if (!head) {
		ret = -ENOMEM;
		goto out_release;
	}

	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);

	/*
	 * We loop across all blocks in the page, whether or not they are
	 * part of the affected region.  This is so we can discover if the
	 * page is fully mapped-to-disk.
	 */
	for (block_start = 0, block_in_page = 0, bh = head;
		  block_start < PAGE_CACHE_SIZE;
		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
		int create;

		block_end = block_start + blocksize;
		bh->b_state = 0;
		create = 1;
		if (block_start >= to)
			create = 0;
		ret = get_block(inode, block_in_file + block_in_page,
					bh, create);
		if (ret)
			goto failed;
		if (!buffer_mapped(bh))
			is_mapped_to_disk = 0;
		if (buffer_new(bh))
			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
		if (PageUptodate(page)) {
			set_buffer_uptodate(bh);
			continue;
		}
		if (buffer_new(bh) || !buffer_mapped(bh)) {
			zero_user_segments(page, block_start, from,
							to, block_end);
			continue;
		}
		if (buffer_uptodate(bh))
			continue;	/* reiserfs does this */
		if (block_start < from || block_end > to) {
			lock_buffer(bh);
			bh->b_end_io = end_buffer_read_nobh;
			submit_bh(READ, bh);
			nr_reads++;
		}
	}

	if (nr_reads) {
		/*
		 * The page is locked, so these buffers are protected from
		 * any VM or truncate activity.  Hence we don't need to care
		 * for the buffer_head refcounts.
		 */
		for (bh = head; bh; bh = bh->b_this_page) {
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				ret = -EIO;
		}
		if (ret)
			goto failed;
	}

	if (is_mapped_to_disk)
		SetPageMappedToDisk(page);

	*fsdata = head; /* to be released by nobh_write_end */

	return 0;

failed:
	BUG_ON(!ret);
	/*
	 * Error recovery is a bit difficult. We need to zero out blocks that
	 * were newly allocated, and dirty them to ensure they get written out.
	 * Buffers need to be attached to the page at this point, otherwise
	 * the handling of potential IO errors during writeout would be hard
	 * (could try doing synchronous writeout, but what if that fails too?)
	 */
	attach_nobh_buffers(page, head);
	page_zero_new_buffers(page, from, to);

out_release:
	unlock_page(page);
	page_cache_release(page);
	*pagep = NULL;

	return ret;
}
EXPORT_SYMBOL(nobh_write_begin);

int nobh_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *head = fsdata;
	struct buffer_head *bh;
	BUG_ON(fsdata != NULL && page_has_buffers(page));

	if (unlikely(copied < len) && head)
		attach_nobh_buffers(page, head);
	if (page_has_buffers(page))
		return generic_write_end(file, mapping, pos, len,
					copied, page, fsdata);

	SetPageUptodate(page);
	set_page_dirty(page);
	if (pos+copied > inode->i_size) {
		i_size_write(inode, pos+copied);
		mark_inode_dirty(inode);
	}

	unlock_page(page);
	page_cache_release(page);

	while (head) {
		bh = head;
		head = head->b_this_page;
		free_buffer_head(bh);
	}

	return copied;
}
EXPORT_SYMBOL(nobh_write_end);

/*
 * nobh_writepage() - based on block_write_full_page() except
 * that it tries to operate without attaching bufferheads to
 * the page.
 */
int nobh_writepage(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc)
{
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;
	int ret;

	/* Is the page fully inside i_size? */
	if (page->index < end_index)
		goto out;

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_CACHE_SIZE-1);
	if (page->index >= end_index+1 || !offset) {
		/*
		 * The page may have dirty, unmapped buffers.  For example,
		 * they may have been added in ext3_writepage().  Make them
		 * freeable here, so the page does not leak.
		 */
#if 0
		/* Not really sure about this  - do we need this ? */
		if (page->mapping->a_ops->invalidatepage)
			page->mapping->a_ops->invalidatepage(page, offset);
#endif
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
	 * in multiples of the page size.  For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
out:
	ret = mpage_writepage(page, get_block, wbc);
	if (ret == -EAGAIN)
		ret = __block_write_full_page(inode, page, get_block, wbc,
					      end_buffer_async_write);
	return ret;
}
EXPORT_SYMBOL(nobh_writepage);
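/*
 * Illustrative sketch (not part of this file): the nobh_* helpers are meant
 * to be used as a set, so that write_begin, write_end and writepage all
 * agree on not keeping buffer_heads attached to the page.  The foo_* names
 * and foo_get_block are hypothetical; nobh_write_end() already has the
 * ->write_end signature and can be wired up directly.
 *
 *	static int foo_nobh_write_begin(struct file *file,
 *			struct address_space *mapping, loff_t pos,
 *			unsigned len, unsigned flags,
 *			struct page **pagep, void **fsdata)
 *	{
 *		return nobh_write_begin(mapping, pos, len, flags,
 *					pagep, fsdata, foo_get_block);
 *	}
 *
 *	static int foo_nobh_writepage(struct page *page,
 *			struct writeback_control *wbc)
 *	{
 *		return nobh_writepage(page, foo_get_block, wbc);
 *	}
 */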
int nobh_truncate_page(struct address_space *mapping,
			loff_t from, get_block_t *get_block)
{
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize;
	sector_t iblock;
	unsigned length, pos;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head map_bh;
	int err;

	blocksize = 1 << inode->i_blkbits;
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	length = blocksize - length;
	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	page = grab_cache_page(mapping, index);
	err = -ENOMEM;
	if (!page)
		goto out;

	if (page_has_buffers(page)) {
has_buffers:
		unlock_page(page);
		page_cache_release(page);
		return block_truncate_page(mapping, from, get_block);
	}

	/* Find the buffer that contains "offset" */
	pos = blocksize;
	while (offset >= pos) {
		iblock++;
		pos += blocksize;
	}

	map_bh.b_size = blocksize;
	map_bh.b_state = 0;
	err = get_block(inode, iblock, &map_bh, 0);
	if (err)
		goto unlock;
	/* unmapped? It's a hole - nothing to do */
	if (!buffer_mapped(&map_bh))
		goto unlock;

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (!PageUptodate(page)) {
		err = mapping->a_ops->readpage(NULL, page);
		if (err) {
			page_cache_release(page);
			goto out;
		}
		lock_page(page);
		if (!PageUptodate(page)) {
			err = -EIO;
			goto unlock;
		}
		if (page_has_buffers(page))
			goto has_buffers;
	}
	zero_user(page, offset, length);
	set_page_dirty(page);
	err = 0;

unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return err;
}
EXPORT_SYMBOL(nobh_truncate_page);

int block_truncate_page(struct address_space *mapping,
			loff_t from, get_block_t *get_block)
{
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize;
	sector_t iblock;
	unsigned length, pos;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head *bh;
	int err;

	blocksize = 1 << inode->i_blkbits;
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	length = blocksize - length;
	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	page = grab_cache_page(mapping, index);
	err = -ENOMEM;
	if (!page)
		goto out;

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	err = 0;
	if (!buffer_mapped(bh)) {
		WARN_ON(bh->b_size != blocksize);
		err = get_block(inode, iblock, bh, 0);
		if (err)
			goto unlock;
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
			goto unlock;
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
		err = -EIO;
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
			goto unlock;
	}

	zero_user(page, offset, length);
	mark_buffer_dirty(bh);
	err = 0;

unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return err;
}
EXPORT_SYMBOL(block_truncate_page);
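/*
 * Illustrative sketch (not part of this file): block_truncate_page() is
 * normally called from the filesystem's truncate path to zero the tail of
 * the block that now contains i_size, before blocks beyond the new size
 * are freed.  "foo_get_block" and foo_free_blocks() are hypothetical.
 *
 *	static void foo_truncate(struct inode *inode)
 *	{
 *		block_truncate_page(inode->i_mapping, inode->i_size,
 *				    foo_get_block);
 *		foo_free_blocks(inode, inode->i_size);
 *	}
 */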
/*
 * The generic ->writepage function for buffer-backed address_spaces;
 * this form passes in the end_io handler used to finish the IO.
 */
int block_write_full_page_endio(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc, bh_end_io_t *handler)
{
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;

	/* Is the page fully inside i_size? */
	if (page->index < end_index)
		return __block_write_full_page(inode, page, get_block, wbc,
					       handler);

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_CACHE_SIZE-1);
	if (page->index >= end_index+1 || !offset) {
		/*
		 * The page may have dirty, unmapped buffers.  For example,
		 * they may have been added in ext3_writepage().  Make them
		 * freeable here, so the page does not leak.
		 */
		do_invalidatepage(page, 0);
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
	 * in multiples of the page size.  For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
	return __block_write_full_page(inode, page, get_block, wbc, handler);
}
EXPORT_SYMBOL(block_write_full_page_endio);

/*
 * The generic ->writepage function for buffer-backed address_spaces
 */
int block_write_full_page(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc)
{
	return block_write_full_page_endio(page, get_block, wbc,
					   end_buffer_async_write);
}
EXPORT_SYMBOL(block_write_full_page);

sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
			    get_block_t *get_block)
{
	struct buffer_head tmp;
	struct inode *inode = mapping->host;
	tmp.b_state = 0;
	tmp.b_blocknr = 0;
	tmp.b_size = 1 << inode->i_blkbits;
	get_block(inode, block, &tmp, 0);
	return tmp.b_blocknr;
}
EXPORT_SYMBOL(generic_block_bmap);
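/*
 * Illustrative sketch (not part of this file): the usual ->writepage and
 * ->bmap methods of a buffer-head based filesystem are thin wrappers around
 * the two helpers above; foo_get_block is a hypothetical get_block_t.
 *
 *	static int foo_writepage(struct page *page,
 *			struct writeback_control *wbc)
 *	{
 *		return block_write_full_page(page, foo_get_block, wbc);
 *	}
 *
 *	static sector_t foo_bmap(struct address_space *mapping, sector_t block)
 *	{
 *		return generic_block_bmap(mapping, block, foo_get_block);
 *	}
 */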
static void end_bio_bh_io_sync(struct bio *bio, int err)
{
	struct buffer_head *bh = bio->bi_private;

	if (err == -EOPNOTSUPP) {
		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
	}

	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
		set_bit(BH_Quiet, &bh->b_state);

	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
	bio_put(bio);
}

/*
 * This allows us to do IO even on the odd last sectors
 * of a device, even if the bh block size is some multiple
 * of the physical sector size.
 *
 * We'll just truncate the bio to the size of the device,
 * and clear the end of the buffer head manually.
 *
 * Truly out-of-range accesses will turn into actual IO
 * errors, this only handles the "we need to be able to
 * do IO at the final sector" case.
 */
static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
{
	sector_t maxsector;
	unsigned bytes;

	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
	if (!maxsector)
		return;

	/*
	 * If the *whole* IO is past the end of the device,
	 * let it through, and the IO layer will turn it into
	 * an EIO.
	 */
	if (unlikely(bio->bi_sector >= maxsector))
		return;

	maxsector -= bio->bi_sector;
	bytes = bio->bi_size;
	if (likely((bytes >> 9) <= maxsector))
		return;

	/* Uhhuh. We've got a bh that straddles the device size! */
	bytes = maxsector << 9;

	/* Truncate the bio.. */
	bio->bi_size = bytes;
	bio->bi_io_vec[0].bv_len = bytes;

	/* ..and clear the end of the buffer for reads */
	if ((rw & RW_MASK) == READ) {
		void *kaddr = kmap_atomic(bh->b_page);
		memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
		kunmap_atomic(kaddr);
	}
}

int submit_bh(int rw, struct buffer_head * bh)
{
	struct bio *bio;
	int ret = 0;

	BUG_ON(!buffer_locked(bh));
	BUG_ON(!buffer_mapped(bh));
	BUG_ON(!bh->b_end_io);
	BUG_ON(buffer_delay(bh));
	BUG_ON(buffer_unwritten(bh));

	/*
	 * Only clear out a write error when rewriting
	 */
	if (test_set_buffer_req(bh) && (rw & WRITE))
		clear_buffer_write_io_error(bh);

	/*
	 * from here on down, it's all bio -- do the initial mapping,
	 * submit_bio -> generic_make_request may further map this bio around
	 */
	bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	bio->bi_io_vec[0].bv_page = bh->b_page;
	bio->bi_io_vec[0].bv_len = bh->b_size;
	bio->bi_io_vec[0].bv_offset = bh_offset(bh);

	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = bh->b_size;

	bio->bi_end_io = end_bio_bh_io_sync;
	bio->bi_private = bh;

	/* Take care of bh's that straddle the end of the device */
	guard_bh_eod(rw, bio, bh);

	bio_get(bio);
	submit_bio(rw, bio);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;

	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL(submit_bh);

/**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
 * %READA option is described in the documentation for generic_make_request()
 * which ll_rw_block() calls.
 *
 * This function drops any buffer that it cannot get a lock on (with the
 * BH_Lock state bit), any buffer that appears to be clean when doing a write
 * request, and any buffer that appears to be up-to-date when doing a read
 * request.  Further it marks as clean buffers that are processed for
 * writing (the buffer cache won't assume that they are actually clean
 * until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to a simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.
 *
 * All of the buffers must be for the same device, and must also be a
 * multiple of the current approved size for the device.
 */
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
{
	int i;

	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];

		if (!trylock_buffer(bh))
			continue;
		if (rw == WRITE) {
			if (test_clear_buffer_dirty(bh)) {
				bh->b_end_io = end_buffer_write_sync;
				get_bh(bh);
				submit_bh(WRITE, bh);
				continue;
			}
		} else {
			if (!buffer_uptodate(bh)) {
				bh->b_end_io = end_buffer_read_sync;
				get_bh(bh);
				submit_bh(rw, bh);
				continue;
			}
		}
		unlock_buffer(bh);
	}
}
EXPORT_SYMBOL(ll_rw_block);

void write_dirty_buffer(struct buffer_head *bh, int rw)
{
	lock_buffer(bh);
	if (!test_clear_buffer_dirty(bh)) {
		unlock_buffer(bh);
		return;
	}
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	submit_bh(rw, bh);
}
EXPORT_SYMBOL(write_dirty_buffer);

/*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it.  The caller must have a ref on
 * the buffer_head.
 */
int __sync_dirty_buffer(struct buffer_head *bh, int rw)
{
	int ret = 0;

	WARN_ON(atomic_read(&bh->b_count) < 1);
	lock_buffer(bh);
	if (test_clear_buffer_dirty(bh)) {
		get_bh(bh);
		bh->b_end_io = end_buffer_write_sync;
		ret = submit_bh(rw, bh);
		wait_on_buffer(bh);
		if (!ret && !buffer_uptodate(bh))
			ret = -EIO;
	} else {
		unlock_buffer(bh);
	}
	return ret;
}
EXPORT_SYMBOL(__sync_dirty_buffer);

int sync_dirty_buffer(struct buffer_head *bh)
{
	return __sync_dirty_buffer(bh, WRITE_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);
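/*
 * Illustrative sketch (not part of this file): a filesystem that modifies
 * an on-disk structure in place typically dirties the buffer and, when the
 * change has to be durable, writes it out synchronously.  "sbh" is a
 * hypothetical buffer_head holding the superblock and "new_state" a
 * hypothetical value being stored into it.
 *
 *	((struct foo_super_block *)sbh->b_data)->s_state = new_state;
 *	mark_buffer_dirty(sbh);
 *	err = sync_dirty_buffer(sbh);
 *	if (err)
 *		printk(KERN_ERR "foo: superblock writeback failed\n");
 */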
/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and releases them if so.
 *
 * Exclusion against try_to_free_buffers may be obtained by either
 * locking the page or by holding its mapping's private_lock.
 *
 * If the page is dirty but all the buffers are clean then we need to
 * be sure to mark the page clean as well.  This is because the page
 * may be against a block device, and a later reattachment of buffers
 * to a dirty page will set *all* buffers dirty.  Which would corrupt
 * filesystem data on the same device.
 *
 * The same applies to regular filesystem pages: if all the buffers are
 * clean then we set the page clean and proceed.  To do that, we require
 * total exclusion from __set_page_dirty_buffers().  That is obtained with
 * private_lock.
 *
 * try_to_free_buffers() is non-blocking.
 */
static inline int buffer_busy(struct buffer_head *bh)
{
	return atomic_read(&bh->b_count) |
		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}

static int
drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
{
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh;

	bh = head;
	do {
		if (buffer_write_io_error(bh) && page->mapping)
			set_bit(AS_EIO, &page->mapping->flags);
		if (buffer_busy(bh))
			goto failed;
		bh = bh->b_this_page;
	} while (bh != head);

	do {
		struct buffer_head *next = bh->b_this_page;

		if (bh->b_assoc_map)
			__remove_assoc_queue(bh);
		bh = next;
	} while (bh != head);
	*buffers_to_free = head;
	__clear_page_buffers(page);
	return 1;
failed:
	return 0;
}

int try_to_free_buffers(struct page *page)
{
	struct address_space * const mapping = page->mapping;
	struct buffer_head *buffers_to_free = NULL;
	int ret = 0;

	BUG_ON(!PageLocked(page));
	if (PageWriteback(page))
		return 0;

	if (mapping == NULL) {		/* can this still happen? */
		ret = drop_buffers(page, &buffers_to_free);
		goto out;
	}

	spin_lock(&mapping->private_lock);
	ret = drop_buffers(page, &buffers_to_free);

	/*
	 * If the filesystem writes its buffers by hand (eg ext3)
	 * then we can have clean buffers against a dirty page.  We
	 * clean the page here; otherwise the VM will never notice
	 * that the filesystem did any IO at all.
	 *
	 * Also, during truncate, discard_buffer will have marked all
	 * the page's buffers clean.  We discover that here and clean
	 * the page also.
	 *
	 * private_lock must be held over this entire operation in order
	 * to synchronise against __set_page_dirty_buffers and prevent the
	 * dirty bit from being lost.
	 */
	if (ret)
		cancel_dirty_page(page, PAGE_CACHE_SIZE);
	spin_unlock(&mapping->private_lock);
out:
	if (buffers_to_free) {
		struct buffer_head *bh = buffers_to_free;

		do {
			struct buffer_head *next = bh->b_this_page;
			free_buffer_head(bh);
			bh = next;
		} while (bh != buffers_to_free);
	}
	return ret;
}
EXPORT_SYMBOL(try_to_free_buffers);

/*
 * There are no bdflush tunables left.  But distributions are
 * still running obsolete flush daemons, so we terminate them here.
 *
 * Use of bdflush() is deprecated and will be removed in a future kernel.
 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
 */
SYSCALL_DEFINE2(bdflush, int, func, long, data)
{
	static int msg_count;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (msg_count < 5) {
		msg_count++;
		printk(KERN_INFO
			"warning: process `%s' used the obsolete bdflush"
			" system call\n", current->comm);
		printk(KERN_INFO "Fix your initscripts?\n");
	}

	if (func == 1)
		do_exit(0);
	return 0;
}

/*
 * Buffer-head allocation
 */
static struct kmem_cache *bh_cachep __read_mostly;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static int max_buffer_heads;

int buffer_heads_over_limit;

struct bh_accounting {
	int nr;			/* Number of live bh's */
	int ratelimit;		/* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};

static void recalc_bh_state(void)
{
	int i;
	int tot = 0;

	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
		return;
	__this_cpu_write(bh_accounting.ratelimit, 0);
	for_each_online_cpu(i)
		tot += per_cpu(bh_accounting, i).nr;
	buffer_heads_over_limit = (tot > max_buffer_heads);
}

struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
	if (ret) {
		INIT_LIST_HEAD(&ret->b_assoc_buffers);
		preempt_disable();
		__this_cpu_inc(bh_accounting.nr);
		recalc_bh_state();
		preempt_enable();
	}
	return ret;
}
EXPORT_SYMBOL(alloc_buffer_head);

void free_buffer_head(struct buffer_head *bh)
{
	BUG_ON(!list_empty(&bh->b_assoc_buffers));
	kmem_cache_free(bh_cachep, bh);
	preempt_disable();
	__this_cpu_dec(bh_accounting.nr);
	recalc_bh_state();
	preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);

static void buffer_exit_cpu(int cpu)
{
	int i;
	struct bh_lru *b = &per_cpu(bh_lrus, cpu);

	for (i = 0; i < BH_LRU_SIZE; i++) {
		brelse(b->bhs[i]);
		b->bhs[i] = NULL;
	}
	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
	per_cpu(bh_accounting, cpu).nr = 0;
}

static int buffer_cpu_notify(struct notifier_block *self,
			      unsigned long action, void *hcpu)
{
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
		buffer_exit_cpu((unsigned long)hcpu);
	return NOTIFY_OK;
}

/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		if (!buffer_uptodate(bh))
			return 0;
		unlock_buffer(bh);
	}
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);

/**
 * bh_submit_read - Submit a locked buffer for reading
 * @bh: struct buffer_head
 *
 * Returns zero on success and -EIO on error.
 */
int bh_submit_read(struct buffer_head *bh)
{
	BUG_ON(!buffer_locked(bh));

	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}

	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return 0;
	return -EIO;
}
EXPORT_SYMBOL(bh_submit_read);

void __init buffer_init(void)
{
	int nrpages;

	bh_cachep = kmem_cache_create("buffer_head",
			sizeof(struct buffer_head), 0,
				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
				SLAB_MEM_SPREAD),
				NULL);

	/*
	 * Limit the bh occupancy to 10% of ZONE_NORMAL
	 */
	nrpages = (nr_free_buffer_pages() * 10) / 100;
	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
	hotcpu_notifier(buffer_cpu_notify, 0);
}
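/*
 * Illustrative sketch (not part of this file): bh_uptodate_or_lock() and
 * bh_submit_read() above are designed to be paired when a caller needs a
 * particular metadata block to be up to date before it can proceed; "bh"
 * is a buffer_head the caller already holds a reference on.
 *
 *	if (!bh_uptodate_or_lock(bh)) {
 *		err = bh_submit_read(bh);
 *		if (err)
 *			return err;
 *	}
 *
 * After this, bh->b_data contains valid data (or an error was returned).
 */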