xref: /linux/fs/buffer.c (revision c537b994505099b7197e7d3125b942ecbcc51eb6)
1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6 
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20 
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/capability.h>
29 #include <linux/blkdev.h>
30 #include <linux/file.h>
31 #include <linux/quotaops.h>
32 #include <linux/highmem.h>
33 #include <linux/module.h>
34 #include <linux/writeback.h>
35 #include <linux/hash.h>
36 #include <linux/suspend.h>
37 #include <linux/buffer_head.h>
38 #include <linux/task_io_accounting_ops.h>
39 #include <linux/bio.h>
40 #include <linux/notifier.h>
41 #include <linux/cpu.h>
42 #include <linux/bitops.h>
43 #include <linux/mpage.h>
44 #include <linux/bit_spinlock.h>
45 
46 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47 static void invalidate_bh_lrus(void);
48 
49 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
50 
51 inline void
52 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
53 {
54 	bh->b_end_io = handler;
55 	bh->b_private = private;
56 }
57 
58 static int sync_buffer(void *word)
59 {
60 	struct block_device *bd;
61 	struct buffer_head *bh
62 		= container_of(word, struct buffer_head, b_state);
63 
64 	smp_mb();
65 	bd = bh->b_bdev;
66 	if (bd)
67 		blk_run_address_space(bd->bd_inode->i_mapping);
68 	io_schedule();
69 	return 0;
70 }
71 
72 void fastcall __lock_buffer(struct buffer_head *bh)
73 {
74 	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
75 							TASK_UNINTERRUPTIBLE);
76 }
77 EXPORT_SYMBOL(__lock_buffer);
78 
79 void fastcall unlock_buffer(struct buffer_head *bh)
80 {
81 	smp_mb__before_clear_bit();
82 	clear_buffer_locked(bh);
83 	smp_mb__after_clear_bit();
84 	wake_up_bit(&bh->b_state, BH_Lock);
85 }
86 
87 /*
88  * Block until a buffer comes unlocked.  This doesn't stop it
89  * from becoming locked again - you have to lock it yourself
90  * if you want to preserve its state.
91  */
92 void __wait_on_buffer(struct buffer_head * bh)
93 {
94 	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
95 }
96 
97 static void
98 __clear_page_buffers(struct page *page)
99 {
100 	ClearPagePrivate(page);
101 	set_page_private(page, 0);
102 	page_cache_release(page);
103 }
104 
105 static void buffer_io_error(struct buffer_head *bh)
106 {
107 	char b[BDEVNAME_SIZE];
108 
109 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
110 			bdevname(bh->b_bdev, b),
111 			(unsigned long long)bh->b_blocknr);
112 }
113 
114 /*
115  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
116  * unlock the buffer. This is what ll_rw_block uses too.
117  */
118 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
119 {
120 	if (uptodate) {
121 		set_buffer_uptodate(bh);
122 	} else {
123 		/* This happens, due to failed READA attempts. */
124 		clear_buffer_uptodate(bh);
125 	}
126 	unlock_buffer(bh);
127 	put_bh(bh);
128 }
129 
130 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
131 {
132 	char b[BDEVNAME_SIZE];
133 
134 	if (uptodate) {
135 		set_buffer_uptodate(bh);
136 	} else {
137 		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
138 			buffer_io_error(bh);
139 			printk(KERN_WARNING "lost page write due to "
140 					"I/O error on %s\n",
141 				       bdevname(bh->b_bdev, b));
142 		}
143 		set_buffer_write_io_error(bh);
144 		clear_buffer_uptodate(bh);
145 	}
146 	unlock_buffer(bh);
147 	put_bh(bh);
148 }
149 
150 /*
151  * Write out and wait upon all the dirty data associated with a block
152  * device via its mapping.  Does not take the superblock lock.
153  */
154 int sync_blockdev(struct block_device *bdev)
155 {
156 	int ret = 0;
157 
158 	if (bdev)
159 		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
160 	return ret;
161 }
162 EXPORT_SYMBOL(sync_blockdev);
163 
164 /*
165  * Write out and wait upon all dirty data associated with this
166  * device.   Filesystem data as well as the underlying block
167  * device.  Takes the superblock lock.
168  */
169 int fsync_bdev(struct block_device *bdev)
170 {
171 	struct super_block *sb = get_super(bdev);
172 	if (sb) {
173 		int res = fsync_super(sb);
174 		drop_super(sb);
175 		return res;
176 	}
177 	return sync_blockdev(bdev);
178 }
179 
180 /**
181  * freeze_bdev  --  lock a filesystem and force it into a consistent state
182  * @bdev:	blockdevice to lock
183  *
184  * This takes the block device bd_mount_sem to make sure no new mounts
185  * happen on bdev until thaw_bdev() is called.
186  * If a superblock is found on this device, we take the s_umount semaphore
187  * on it to make sure nobody unmounts until the snapshot creation is done.
188  */
189 struct super_block *freeze_bdev(struct block_device *bdev)
190 {
191 	struct super_block *sb;
192 
193 	down(&bdev->bd_mount_sem);
194 	sb = get_super(bdev);
195 	if (sb && !(sb->s_flags & MS_RDONLY)) {
196 		sb->s_frozen = SB_FREEZE_WRITE;
197 		smp_wmb();
198 
199 		__fsync_super(sb);
200 
201 		sb->s_frozen = SB_FREEZE_TRANS;
202 		smp_wmb();
203 
204 		sync_blockdev(sb->s_bdev);
205 
206 		if (sb->s_op->write_super_lockfs)
207 			sb->s_op->write_super_lockfs(sb);
208 	}
209 
210 	sync_blockdev(bdev);
211 	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
212 }
213 EXPORT_SYMBOL(freeze_bdev);
214 
215 /**
216  * thaw_bdev  -- unlock filesystem
217  * @bdev:	blockdevice to unlock
218  * @sb:		associated superblock
219  *
220  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
221  */
222 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
223 {
224 	if (sb) {
225 		BUG_ON(sb->s_bdev != bdev);
226 
227 		if (sb->s_op->unlockfs)
228 			sb->s_op->unlockfs(sb);
229 		sb->s_frozen = SB_UNFROZEN;
230 		smp_wmb();
231 		wake_up(&sb->s_wait_unfrozen);
232 		drop_super(sb);
233 	}
234 
235 	up(&bdev->bd_mount_sem);
236 }
237 EXPORT_SYMBOL(thaw_bdev);
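
/*
 * Illustrative sketch (not part of buffer.c): how a snapshot-style caller
 * would bracket its work with freeze_bdev()/thaw_bdev().  The name
 * example_snapshot_bdev is an assumption made purely for illustration.
 */
static void example_snapshot_bdev(struct block_device *bdev)
{
	struct super_block *sb;

	sb = freeze_bdev(bdev);		/* block new writes, sync the fs */
	/*
	 * The device is now in a consistent state: take the snapshot
	 * here, before allowing any further writes.
	 */
	thaw_bdev(bdev, sb);		/* release bd_mount_sem, allow writes */
}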
238 
239 /*
240  * Various filesystems appear to want __find_get_block to be non-blocking.
241  * But it's the page lock which protects the buffers.  To get around this,
242  * we get exclusion from try_to_free_buffers with the blockdev mapping's
243  * private_lock.
244  *
245  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
246  * may be quite high.  This code could TryLock the page, and if that
247  * succeeds, there is no need to take private_lock. (But if
248  * private_lock is contended then so is mapping->tree_lock).
249  */
250 static struct buffer_head *
251 __find_get_block_slow(struct block_device *bdev, sector_t block)
252 {
253 	struct inode *bd_inode = bdev->bd_inode;
254 	struct address_space *bd_mapping = bd_inode->i_mapping;
255 	struct buffer_head *ret = NULL;
256 	pgoff_t index;
257 	struct buffer_head *bh;
258 	struct buffer_head *head;
259 	struct page *page;
260 	int all_mapped = 1;
261 
262 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
263 	page = find_get_page(bd_mapping, index);
264 	if (!page)
265 		goto out;
266 
267 	spin_lock(&bd_mapping->private_lock);
268 	if (!page_has_buffers(page))
269 		goto out_unlock;
270 	head = page_buffers(page);
271 	bh = head;
272 	do {
273 		if (bh->b_blocknr == block) {
274 			ret = bh;
275 			get_bh(bh);
276 			goto out_unlock;
277 		}
278 		if (!buffer_mapped(bh))
279 			all_mapped = 0;
280 		bh = bh->b_this_page;
281 	} while (bh != head);
282 
283 	/* we might be here because some of the buffers on this page are
284 	 * not mapped.  This is due to various races between
285 	 * file io on the block device and getblk.  It gets dealt with
286 	 * elsewhere, don't buffer_error if we had some unmapped buffers
287 	 */
288 	if (all_mapped) {
289 		printk("__find_get_block_slow() failed. "
290 			"block=%llu, b_blocknr=%llu\n",
291 			(unsigned long long)block,
292 			(unsigned long long)bh->b_blocknr);
293 		printk("b_state=0x%08lx, b_size=%zu\n",
294 			bh->b_state, bh->b_size);
295 		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
296 	}
297 out_unlock:
298 	spin_unlock(&bd_mapping->private_lock);
299 	page_cache_release(page);
300 out:
301 	return ret;
302 }
303 
304 /* If invalidate_buffers() trashes dirty buffers, it means some kind
305    of fs corruption is going on. Trashing dirty data always implies losing
306    information that was supposed to be just stored on the physical layer
307    by the user.
308 
309    Thus invalidate_buffers in general usage is not allowed to trash
310    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
311    be preserved.  These buffers are simply skipped.
312 
313    We also skip buffers which are still in use.  For example this can
314    happen if a userspace program is reading the block device.
315 
316    NOTE: if the user removed a removable-media disk while there was still
317    dirty data not synced to disk (due to a bug in the device driver or to
318    a user error), then by not destroying the dirty buffers we could also
319    corrupt the next media inserted; thus a parameter is necessary to
320    handle this case as safely as possible (trying not to corrupt the
321    newly inserted disk with data belonging to the old, now corrupted,
322    disk). For the ramdisk, on the other hand, the natural way to release
323    the ramdisk memory is to destroy the dirty buffers.
324 
325    These are two special cases. Normal usage implies that the device driver
326    issues a sync on the device (without waiting for I/O completion) and
327    then an invalidate_buffers call that doesn't trash dirty buffers.
328 
329    For handling cache coherency with the blkdev pagecache the 'update' case
330    has been introduced. It is needed to re-read from disk any pinned
331    buffer. NOTE: re-reading from disk is destructive, so we can do it only
332    when we assume nobody is changing the buffercache under our I/O and when
333    we think the disk contains more recent information than the buffercache.
334    The update == 1 pass marks the buffers we need to update; the update == 2
335    pass does the actual I/O. */
336 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
337 {
338 	struct address_space *mapping = bdev->bd_inode->i_mapping;
339 
340 	if (mapping->nrpages == 0)
341 		return;
342 
343 	invalidate_bh_lrus();
344 	/*
345 	 * FIXME: what about destroy_dirty_buffers?
346 	 * We really want to use invalidate_inode_pages2() for
347 	 * that, but not until that's cleaned up.
348 	 */
349 	invalidate_mapping_pages(mapping, 0, -1);
350 }
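
/*
 * Illustrative sketch (not part of buffer.c): the usual "flush this block
 * device" sequence, in the style of the BLKFLSBUF ioctl - write the dirty
 * data out first, then drop the clean cached copies without destroying
 * anything dirty.  example_flush_bdev is an assumed name.
 */
static int example_flush_bdev(struct block_device *bdev)
{
	int ret;

	ret = fsync_bdev(bdev);		/* write out and wait */
	invalidate_bdev(bdev, 0);	/* drop clean pagecache/buffer copies */
	return ret;
}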
351 
352 /*
353  * Kick pdflush then try to free up some ZONE_NORMAL memory.
354  */
355 static void free_more_memory(void)
356 {
357 	struct zone **zones;
358 	pg_data_t *pgdat;
359 
360 	wakeup_pdflush(1024);
361 	yield();
362 
363 	for_each_online_pgdat(pgdat) {
364 		zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
365 		if (*zones)
366 			try_to_free_pages(zones, GFP_NOFS);
367 	}
368 }
369 
370 /*
371  * I/O completion handler for block_read_full_page() - pages
372  * which come unlocked at the end of I/O.
373  */
374 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
375 {
376 	unsigned long flags;
377 	struct buffer_head *first;
378 	struct buffer_head *tmp;
379 	struct page *page;
380 	int page_uptodate = 1;
381 
382 	BUG_ON(!buffer_async_read(bh));
383 
384 	page = bh->b_page;
385 	if (uptodate) {
386 		set_buffer_uptodate(bh);
387 	} else {
388 		clear_buffer_uptodate(bh);
389 		if (printk_ratelimit())
390 			buffer_io_error(bh);
391 		SetPageError(page);
392 	}
393 
394 	/*
395 	 * Be _very_ careful from here on. Bad things can happen if
396 	 * two buffer heads end IO at almost the same time and both
397 	 * decide that the page is now completely done.
398 	 */
399 	first = page_buffers(page);
400 	local_irq_save(flags);
401 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
402 	clear_buffer_async_read(bh);
403 	unlock_buffer(bh);
404 	tmp = bh;
405 	do {
406 		if (!buffer_uptodate(tmp))
407 			page_uptodate = 0;
408 		if (buffer_async_read(tmp)) {
409 			BUG_ON(!buffer_locked(tmp));
410 			goto still_busy;
411 		}
412 		tmp = tmp->b_this_page;
413 	} while (tmp != bh);
414 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
415 	local_irq_restore(flags);
416 
417 	/*
418 	 * If none of the buffers had errors and they are all
419 	 * uptodate then we can set the page uptodate.
420 	 */
421 	if (page_uptodate && !PageError(page))
422 		SetPageUptodate(page);
423 	unlock_page(page);
424 	return;
425 
426 still_busy:
427 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
428 	local_irq_restore(flags);
429 	return;
430 }
431 
432 /*
433  * Completion handler for block_write_full_page() - pages which are unlocked
434  * during I/O, and which have PageWriteback cleared upon I/O completion.
435  */
436 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
437 {
438 	char b[BDEVNAME_SIZE];
439 	unsigned long flags;
440 	struct buffer_head *first;
441 	struct buffer_head *tmp;
442 	struct page *page;
443 
444 	BUG_ON(!buffer_async_write(bh));
445 
446 	page = bh->b_page;
447 	if (uptodate) {
448 		set_buffer_uptodate(bh);
449 	} else {
450 		if (printk_ratelimit()) {
451 			buffer_io_error(bh);
452 			printk(KERN_WARNING "lost page write due to "
453 					"I/O error on %s\n",
454 			       bdevname(bh->b_bdev, b));
455 		}
456 		set_bit(AS_EIO, &page->mapping->flags);
457 		set_buffer_write_io_error(bh);
458 		clear_buffer_uptodate(bh);
459 		SetPageError(page);
460 	}
461 
462 	first = page_buffers(page);
463 	local_irq_save(flags);
464 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
465 
466 	clear_buffer_async_write(bh);
467 	unlock_buffer(bh);
468 	tmp = bh->b_this_page;
469 	while (tmp != bh) {
470 		if (buffer_async_write(tmp)) {
471 			BUG_ON(!buffer_locked(tmp));
472 			goto still_busy;
473 		}
474 		tmp = tmp->b_this_page;
475 	}
476 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
477 	local_irq_restore(flags);
478 	end_page_writeback(page);
479 	return;
480 
481 still_busy:
482 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
483 	local_irq_restore(flags);
484 	return;
485 }
486 
487 /*
488  * If a page's buffers are under async read-in (end_buffer_async_read
489  * completion) then there is a possibility that another thread of
490  * control could lock one of the buffers after it has completed
491  * but while some of the other buffers have not completed.  This
492  * locked buffer would confuse end_buffer_async_read() into not unlocking
493  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
494  * that this buffer is not under async I/O.
495  *
496  * The page comes unlocked when it has no locked buffer_async buffers
497  * left.
498  *
499  * PageLocked prevents anyone from starting new async I/O reads against
500  * any of the buffers.
501  *
502  * PageWriteback is used to prevent simultaneous writeout of the same
503  * page.
504  *
505  * PageLocked prevents anyone from starting writeback of a page which is
506  * under read I/O (PageWriteback is only ever set against a locked page).
507  */
508 static void mark_buffer_async_read(struct buffer_head *bh)
509 {
510 	bh->b_end_io = end_buffer_async_read;
511 	set_buffer_async_read(bh);
512 }
513 
514 void mark_buffer_async_write(struct buffer_head *bh)
515 {
516 	bh->b_end_io = end_buffer_async_write;
517 	set_buffer_async_write(bh);
518 }
519 EXPORT_SYMBOL(mark_buffer_async_write);
520 
521 
522 /*
523  * fs/buffer.c contains helper functions for buffer-backed address space's
524  * fsync functions.  A common requirement for buffer-based filesystems is
525  * that certain data from the backing blockdev needs to be written out for
526  * a successful fsync().  For example, ext2 indirect blocks need to be
527  * written back and waited upon before fsync() returns.
528  *
529  * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
530  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
531  * management of a list of dependent buffers at ->i_mapping->private_list.
532  *
533  * Locking is a little subtle: try_to_free_buffers() will remove buffers
534  * from their controlling inode's queue when they are being freed.  But
535  * try_to_free_buffers() will be operating against the *blockdev* mapping
536  * at the time, not against the S_ISREG file which depends on those buffers.
537  * So the locking for private_list is via the private_lock in the address_space
538  * which backs the buffers.  Which is different from the address_space
539  * against which the buffers are listed.  So for a particular address_space,
540  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
541  * mapping->private_list will always be protected by the backing blockdev's
542  * ->private_lock.
543  *
544  * Which introduces a requirement: all buffers on an address_space's
545  * ->private_list must be from the same address_space: the blockdev's.
546  *
547  * address_spaces which do not place buffers at ->private_list via these
548  * utility functions are free to use private_lock and private_list for
549  * whatever they want.  The only requirement is that list_empty(private_list)
550  * be true at clear_inode() time.
551  *
552  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
553  * filesystems should do that.  invalidate_inode_buffers() should just go
554  * BUG_ON(!list_empty).
555  *
556  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
557  * take an address_space, not an inode.  And it should be called
558  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
559  * queued up.
560  *
561  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
562  * list if it is already on a list.  Because if the buffer is on a list,
563  * it *must* already be on the right one.  If not, the filesystem is being
564  * silly.  This will save a ton of locking.  But first we have to ensure
565  * that buffers are taken *off* the old inode's list when they are freed
566  * (presumably in truncate).  That requires careful auditing of all
567  * filesystems (do it inside bforget()).  It could also be done by bringing
568  * b_inode back.
569  */
570 
571 /*
572  * The buffer's backing address_space's private_lock must be held
573  */
574 static inline void __remove_assoc_queue(struct buffer_head *bh)
575 {
576 	list_del_init(&bh->b_assoc_buffers);
577 	WARN_ON(!bh->b_assoc_map);
578 	if (buffer_write_io_error(bh))
579 		set_bit(AS_EIO, &bh->b_assoc_map->flags);
580 	bh->b_assoc_map = NULL;
581 }
582 
583 int inode_has_buffers(struct inode *inode)
584 {
585 	return !list_empty(&inode->i_data.private_list);
586 }
587 
588 /*
589  * osync is designed to support O_SYNC io.  It waits synchronously for
590  * all already-submitted IO to complete, but does not queue any new
591  * writes to the disk.
592  *
593  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
594  * you dirty the buffers, and then use osync_inode_buffers to wait for
595  * completion.  Any other dirty buffers which are not yet queued for
596  * write will not be flushed to disk by the osync.
597  */
598 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
599 {
600 	struct buffer_head *bh;
601 	struct list_head *p;
602 	int err = 0;
603 
604 	spin_lock(lock);
605 repeat:
606 	list_for_each_prev(p, list) {
607 		bh = BH_ENTRY(p);
608 		if (buffer_locked(bh)) {
609 			get_bh(bh);
610 			spin_unlock(lock);
611 			wait_on_buffer(bh);
612 			if (!buffer_uptodate(bh))
613 				err = -EIO;
614 			brelse(bh);
615 			spin_lock(lock);
616 			goto repeat;
617 		}
618 	}
619 	spin_unlock(lock);
620 	return err;
621 }
622 
623 /**
624  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
625  *                        buffers
626  * @mapping: the mapping which wants those buffers written
627  *
628  * Starts I/O against the buffers at mapping->private_list, and waits upon
629  * that I/O.
630  *
631  * Basically, this is a convenience function for fsync().
632  * @mapping is a file or directory which needs those buffers to be written for
633  * a successful fsync().
634  */
635 int sync_mapping_buffers(struct address_space *mapping)
636 {
637 	struct address_space *buffer_mapping = mapping->assoc_mapping;
638 
639 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
640 		return 0;
641 
642 	return fsync_buffers_list(&buffer_mapping->private_lock,
643 					&mapping->private_list);
644 }
645 EXPORT_SYMBOL(sync_mapping_buffers);
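
/*
 * Illustrative sketch (not part of buffer.c): an ext2-style ->fsync method.
 * The name example_sync_file and the use of write_inode_now() for the inode
 * itself are assumptions for illustration; the point is that the metadata
 * buffers queued on ->private_list by mark_buffer_dirty_inode() are written
 * and waited upon here.
 */
static int example_sync_file(struct file *file, struct dentry *dentry,
				int datasync)
{
	struct inode *inode = dentry->d_inode;
	int err, err2;

	err = sync_mapping_buffers(inode->i_mapping);
	if (!(inode->i_state & I_DIRTY))
		return err;
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return err;

	err2 = write_inode_now(inode, 1);	/* write the inode itself */
	return err ? err : err2;
}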
646 
647 /*
648  * Called when we've recently written block `bblock', and it is known that
649  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
650  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
651  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
652  */
653 void write_boundary_block(struct block_device *bdev,
654 			sector_t bblock, unsigned blocksize)
655 {
656 	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
657 	if (bh) {
658 		if (buffer_dirty(bh))
659 			ll_rw_block(WRITE, 1, &bh);
660 		put_bh(bh);
661 	}
662 }
663 
664 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
665 {
666 	struct address_space *mapping = inode->i_mapping;
667 	struct address_space *buffer_mapping = bh->b_page->mapping;
668 
669 	mark_buffer_dirty(bh);
670 	if (!mapping->assoc_mapping) {
671 		mapping->assoc_mapping = buffer_mapping;
672 	} else {
673 		BUG_ON(mapping->assoc_mapping != buffer_mapping);
674 	}
675 	if (list_empty(&bh->b_assoc_buffers)) {
676 		spin_lock(&buffer_mapping->private_lock);
677 		list_move_tail(&bh->b_assoc_buffers,
678 				&mapping->private_list);
679 		bh->b_assoc_map = mapping;
680 		spin_unlock(&buffer_mapping->private_lock);
681 	}
682 }
683 EXPORT_SYMBOL(mark_buffer_dirty_inode);
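
/*
 * Illustrative sketch (not part of buffer.c): dirtying a freshly allocated
 * metadata block (say an indirect block) against its owning inode, so that
 * a later fsync() of that inode - via sync_mapping_buffers() above - writes
 * it out.  example_new_indirect is an assumed name.
 */
static void example_new_indirect(struct inode *inode, sector_t block)
{
	struct buffer_head *bh = sb_getblk(inode->i_sb, block);

	if (!bh)
		return;
	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);	/* fs-specific initialisation */
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty_inode(bh, inode);	/* queue on ->private_list */
	brelse(bh);
}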
684 
685 /*
686  * Add a page to the dirty page list.
687  *
688  * It is a sad fact of life that this function is called from several places
689  * deeply under spinlocking.  It may not sleep.
690  *
691  * If the page has buffers, the uptodate buffers are set dirty, to preserve
692  * dirty-state coherency between the page and the buffers.  If the page does
693  * not have buffers then when they are later attached they will all be set
694  * dirty.
695  *
696  * The buffers are dirtied before the page is dirtied.  There's a small race
697  * window in which a writepage caller may see the page cleanness but not the
698  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
699  * before the buffers, a concurrent writepage caller could clear the page dirty
700  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
701  * page on the dirty page list.
702  *
703  * We use private_lock to lock against try_to_free_buffers while using the
704  * page's buffer list.  Also use this to protect against clean buffers being
705  * added to the page after it was set dirty.
706  *
707  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
708  * address_space though.
709  */
710 int __set_page_dirty_buffers(struct page *page)
711 {
712 	struct address_space * const mapping = page_mapping(page);
713 
714 	if (unlikely(!mapping))
715 		return !TestSetPageDirty(page);
716 
717 	spin_lock(&mapping->private_lock);
718 	if (page_has_buffers(page)) {
719 		struct buffer_head *head = page_buffers(page);
720 		struct buffer_head *bh = head;
721 
722 		do {
723 			set_buffer_dirty(bh);
724 			bh = bh->b_this_page;
725 		} while (bh != head);
726 	}
727 	spin_unlock(&mapping->private_lock);
728 
729 	if (TestSetPageDirty(page))
730 		return 0;
731 
732 	write_lock_irq(&mapping->tree_lock);
733 	if (page->mapping) {	/* Race with truncate? */
734 		if (mapping_cap_account_dirty(mapping)) {
735 			__inc_zone_page_state(page, NR_FILE_DIRTY);
736 			task_io_account_write(PAGE_CACHE_SIZE);
737 		}
738 		radix_tree_tag_set(&mapping->page_tree,
739 				page_index(page), PAGECACHE_TAG_DIRTY);
740 	}
741 	write_unlock_irq(&mapping->tree_lock);
742 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
743 	return 1;
744 }
745 EXPORT_SYMBOL(__set_page_dirty_buffers);
746 
747 /*
748  * Write out and wait upon a list of buffers.
749  *
750  * We have conflicting pressures: we want to make sure that all
751  * initially dirty buffers get waited on, but that any subsequently
752  * dirtied buffers don't.  After all, we don't want fsync to last
753  * forever if somebody is actively writing to the file.
754  *
755  * Do this in two main stages: first we copy dirty buffers to a
756  * temporary inode list, queueing the writes as we go.  Then we clean
757  * up, waiting for those writes to complete.
758  *
759  * During this second stage, any subsequent updates to the file may end
760  * up refiling the buffer on the original inode's dirty list again, so
761  * there is a chance we will end up with a buffer queued for write but
762  * not yet completed on that list.  So, as a final cleanup we go through
763  * the osync code to catch these locked, dirty buffers without requeuing
764  * any newly dirty buffers for write.
765  */
766 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
767 {
768 	struct buffer_head *bh;
769 	struct list_head tmp;
770 	int err = 0, err2;
771 
772 	INIT_LIST_HEAD(&tmp);
773 
774 	spin_lock(lock);
775 	while (!list_empty(list)) {
776 		bh = BH_ENTRY(list->next);
777 		__remove_assoc_queue(bh);
778 		if (buffer_dirty(bh) || buffer_locked(bh)) {
779 			list_add(&bh->b_assoc_buffers, &tmp);
780 			if (buffer_dirty(bh)) {
781 				get_bh(bh);
782 				spin_unlock(lock);
783 				/*
784 				 * Ensure any pending I/O completes so that
785 				 * ll_rw_block() actually writes the current
786 				 * contents - it is a noop if I/O is still in
787 				 * flight on potentially older contents.
788 				 */
789 				ll_rw_block(SWRITE, 1, &bh);
790 				brelse(bh);
791 				spin_lock(lock);
792 			}
793 		}
794 	}
795 
796 	while (!list_empty(&tmp)) {
797 		bh = BH_ENTRY(tmp.prev);
798 		list_del_init(&bh->b_assoc_buffers);
799 		get_bh(bh);
800 		spin_unlock(lock);
801 		wait_on_buffer(bh);
802 		if (!buffer_uptodate(bh))
803 			err = -EIO;
804 		brelse(bh);
805 		spin_lock(lock);
806 	}
807 
808 	spin_unlock(lock);
809 	err2 = osync_buffers_list(lock, list);
810 	if (err)
811 		return err;
812 	else
813 		return err2;
814 }
815 
816 /*
817  * Invalidate any and all dirty buffers on a given inode.  We are
818  * probably unmounting the fs, but that doesn't mean we have already
819  * done a sync().  Just drop the buffers from the inode list.
820  *
821  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
822  * assumes that all the buffers are against the blockdev.  Not true
823  * for reiserfs.
824  */
825 void invalidate_inode_buffers(struct inode *inode)
826 {
827 	if (inode_has_buffers(inode)) {
828 		struct address_space *mapping = &inode->i_data;
829 		struct list_head *list = &mapping->private_list;
830 		struct address_space *buffer_mapping = mapping->assoc_mapping;
831 
832 		spin_lock(&buffer_mapping->private_lock);
833 		while (!list_empty(list))
834 			__remove_assoc_queue(BH_ENTRY(list->next));
835 		spin_unlock(&buffer_mapping->private_lock);
836 	}
837 }
838 
839 /*
840  * Remove any clean buffers from the inode's buffer list.  This is called
841  * when we're trying to free the inode itself.  Those buffers can pin it.
842  *
843  * Returns true if all buffers were removed.
844  */
845 int remove_inode_buffers(struct inode *inode)
846 {
847 	int ret = 1;
848 
849 	if (inode_has_buffers(inode)) {
850 		struct address_space *mapping = &inode->i_data;
851 		struct list_head *list = &mapping->private_list;
852 		struct address_space *buffer_mapping = mapping->assoc_mapping;
853 
854 		spin_lock(&buffer_mapping->private_lock);
855 		while (!list_empty(list)) {
856 			struct buffer_head *bh = BH_ENTRY(list->next);
857 			if (buffer_dirty(bh)) {
858 				ret = 0;
859 				break;
860 			}
861 			__remove_assoc_queue(bh);
862 		}
863 		spin_unlock(&buffer_mapping->private_lock);
864 	}
865 	return ret;
866 }
867 
868 /*
869  * Create the appropriate buffers when given a page for the data area and
870  * the size of each buffer.  Use the bh->b_this_page linked list to
871  * follow the buffers created.  Return NULL if unable to create more
872  * buffers.
873  *
874  * The retry flag is used to differentiate async IO (paging, swapping),
875  * which may not fail, from ordinary buffer allocations.
876  */
877 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
878 		int retry)
879 {
880 	struct buffer_head *bh, *head;
881 	long offset;
882 
883 try_again:
884 	head = NULL;
885 	offset = PAGE_SIZE;
886 	while ((offset -= size) >= 0) {
887 		bh = alloc_buffer_head(GFP_NOFS);
888 		if (!bh)
889 			goto no_grow;
890 
891 		bh->b_bdev = NULL;
892 		bh->b_this_page = head;
893 		bh->b_blocknr = -1;
894 		head = bh;
895 
896 		bh->b_state = 0;
897 		atomic_set(&bh->b_count, 0);
898 		bh->b_private = NULL;
899 		bh->b_size = size;
900 
901 		/* Link the buffer to its page */
902 		set_bh_page(bh, page, offset);
903 
904 		init_buffer(bh, NULL, NULL);
905 	}
906 	return head;
907 /*
908  * In case anything failed, we just free everything we got.
909  */
910 no_grow:
911 	if (head) {
912 		do {
913 			bh = head;
914 			head = head->b_this_page;
915 			free_buffer_head(bh);
916 		} while (head);
917 	}
918 
919 	/*
920 	 * Return failure for non-async IO requests.  Async IO requests
921 	 * are not allowed to fail, so we have to wait until buffer heads
922 	 * become available.  But we don't want tasks sleeping with
923 	 * partially complete buffers, so all were released above.
924 	 */
925 	if (!retry)
926 		return NULL;
927 
928 	/* We're _really_ low on memory. Now we just
929 	 * wait for old buffer heads to become free due to
930 	 * finishing IO.  Since this is an async request and
931 	 * the reserve list is empty, we're sure there are
932 	 * async buffer heads in use.
933 	 */
934 	free_more_memory();
935 	goto try_again;
936 }
937 EXPORT_SYMBOL_GPL(alloc_page_buffers);
938 
939 static inline void
940 link_dev_buffers(struct page *page, struct buffer_head *head)
941 {
942 	struct buffer_head *bh, *tail;
943 
944 	bh = head;
945 	do {
946 		tail = bh;
947 		bh = bh->b_this_page;
948 	} while (bh);
949 	tail->b_this_page = head;
950 	attach_page_buffers(page, head);
951 }
952 
953 /*
954  * Initialise the state of a blockdev page's buffers.
955  */
956 static void
957 init_page_buffers(struct page *page, struct block_device *bdev,
958 			sector_t block, int size)
959 {
960 	struct buffer_head *head = page_buffers(page);
961 	struct buffer_head *bh = head;
962 	int uptodate = PageUptodate(page);
963 
964 	do {
965 		if (!buffer_mapped(bh)) {
966 			init_buffer(bh, NULL, NULL);
967 			bh->b_bdev = bdev;
968 			bh->b_blocknr = block;
969 			if (uptodate)
970 				set_buffer_uptodate(bh);
971 			set_buffer_mapped(bh);
972 		}
973 		block++;
974 		bh = bh->b_this_page;
975 	} while (bh != head);
976 }
977 
978 /*
979  * Create the page-cache page that contains the requested block.
980  *
981  * This is used purely for blockdev mappings.
982  */
983 static struct page *
984 grow_dev_page(struct block_device *bdev, sector_t block,
985 		pgoff_t index, int size)
986 {
987 	struct inode *inode = bdev->bd_inode;
988 	struct page *page;
989 	struct buffer_head *bh;
990 
991 	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
992 	if (!page)
993 		return NULL;
994 
995 	BUG_ON(!PageLocked(page));
996 
997 	if (page_has_buffers(page)) {
998 		bh = page_buffers(page);
999 		if (bh->b_size == size) {
1000 			init_page_buffers(page, bdev, block, size);
1001 			return page;
1002 		}
1003 		if (!try_to_free_buffers(page))
1004 			goto failed;
1005 	}
1006 
1007 	/*
1008 	 * Allocate some buffers for this page
1009 	 */
1010 	bh = alloc_page_buffers(page, size, 0);
1011 	if (!bh)
1012 		goto failed;
1013 
1014 	/*
1015 	 * Link the page to the buffers and initialise them.  Take the
1016 	 * lock to be atomic wrt __find_get_block(), which does not
1017 	 * run under the page lock.
1018 	 */
1019 	spin_lock(&inode->i_mapping->private_lock);
1020 	link_dev_buffers(page, bh);
1021 	init_page_buffers(page, bdev, block, size);
1022 	spin_unlock(&inode->i_mapping->private_lock);
1023 	return page;
1024 
1025 failed:
1026 	BUG();
1027 	unlock_page(page);
1028 	page_cache_release(page);
1029 	return NULL;
1030 }
1031 
1032 /*
1033  * Create buffers for the specified block device block's page.  If
1034  * that page was dirty, the buffers are set dirty also.
1035  *
1036  * Except that's a bug.  Attaching dirty buffers to a dirty
1037  * blockdev's page can result in filesystem corruption, because
1038  * some of those buffers may be aliases of filesystem data.
1039  * grow_dev_page() will go BUG() if this happens.
1040  */
1041 static int
1042 grow_buffers(struct block_device *bdev, sector_t block, int size)
1043 {
1044 	struct page *page;
1045 	pgoff_t index;
1046 	int sizebits;
1047 
1048 	sizebits = -1;
1049 	do {
1050 		sizebits++;
1051 	} while ((size << sizebits) < PAGE_SIZE);
1052 
1053 	index = block >> sizebits;
1054 
1055 	/*
1056 	 * Check for a block which wants to lie outside our maximum possible
1057 	 * pagecache index.  (this comparison is done using sector_t types).
1058 	 */
1059 	if (unlikely(index != block >> sizebits)) {
1060 		char b[BDEVNAME_SIZE];
1061 
1062 		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1063 			"device %s\n",
1064 			__FUNCTION__, (unsigned long long)block,
1065 			bdevname(bdev, b));
1066 		return -EIO;
1067 	}
1068 	block = index << sizebits;
1069 	/* Create a page with the proper size buffers.. */
1070 	page = grow_dev_page(bdev, block, index, size);
1071 	if (!page)
1072 		return 0;
1073 	unlock_page(page);
1074 	page_cache_release(page);
1075 	return 1;
1076 }
1077 
1078 static struct buffer_head *
1079 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1080 {
1081 	/* Size must be multiple of hard sectorsize */
1082 	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1083 			(size < 512 || size > PAGE_SIZE))) {
1084 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1085 					size);
1086 		printk(KERN_ERR "hardsect size: %d\n",
1087 					bdev_hardsect_size(bdev));
1088 
1089 		dump_stack();
1090 		return NULL;
1091 	}
1092 
1093 	for (;;) {
1094 		struct buffer_head * bh;
1095 		int ret;
1096 
1097 		bh = __find_get_block(bdev, block, size);
1098 		if (bh)
1099 			return bh;
1100 
1101 		ret = grow_buffers(bdev, block, size);
1102 		if (ret < 0)
1103 			return NULL;
1104 		if (ret == 0)
1105 			free_more_memory();
1106 	}
1107 }
1108 
1109 /*
1110  * The relationship between dirty buffers and dirty pages:
1111  *
1112  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1113  * the page is tagged dirty in its radix tree.
1114  *
1115  * At all times, the dirtiness of the buffers represents the dirtiness of
1116  * subsections of the page.  If the page has buffers, the page dirty bit is
1117  * merely a hint about the true dirty state.
1118  *
1119  * When a page is set dirty in its entirety, all its buffers are marked dirty
1120  * (if the page has buffers).
1121  *
1122  * When a buffer is marked dirty, its page is dirtied, but the page's other
1123  * buffers are not.
1124  *
1125  * Also.  When blockdev buffers are explicitly read with bread(), they
1126  * individually become uptodate.  But their backing page remains not
1127  * uptodate - even if all of its buffers are uptodate.  A subsequent
1128  * block_read_full_page() against that page will discover all the uptodate
1129  * buffers, will set the page uptodate and will perform no I/O.
1130  */
1131 
1132 /**
1133  * mark_buffer_dirty - mark a buffer_head as needing writeout
1134  * @bh: the buffer_head to mark dirty
1135  *
1136  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1137  * backing page dirty, then tag the page as dirty in its address_space's radix
1138  * tree and then attach the address_space's inode to its superblock's dirty
1139  * inode list.
1140  *
1141  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1142  * mapping->tree_lock and the global inode_lock.
1143  */
1144 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1145 {
1146 	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1147 		__set_page_dirty_nobuffers(bh->b_page);
1148 }
1149 
1150 /*
1151  * Decrement a buffer_head's reference count.  If all buffers against a page
1152  * have zero reference count, are clean and unlocked, and if the page is clean
1153  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1154  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1155  * a page but it ends up not being freed, and buffers may later be reattached).
1156  */
1157 void __brelse(struct buffer_head * buf)
1158 {
1159 	if (atomic_read(&buf->b_count)) {
1160 		put_bh(buf);
1161 		return;
1162 	}
1163 	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1164 	WARN_ON(1);
1165 }
1166 
1167 /*
1168  * bforget() is like brelse(), except it discards any
1169  * potentially dirty data.
1170  */
1171 void __bforget(struct buffer_head *bh)
1172 {
1173 	clear_buffer_dirty(bh);
1174 	if (!list_empty(&bh->b_assoc_buffers)) {
1175 		struct address_space *buffer_mapping = bh->b_page->mapping;
1176 
1177 		spin_lock(&buffer_mapping->private_lock);
1178 		list_del_init(&bh->b_assoc_buffers);
1179 		bh->b_assoc_map = NULL;
1180 		spin_unlock(&buffer_mapping->private_lock);
1181 	}
1182 	__brelse(bh);
1183 }
1184 
1185 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1186 {
1187 	lock_buffer(bh);
1188 	if (buffer_uptodate(bh)) {
1189 		unlock_buffer(bh);
1190 		return bh;
1191 	} else {
1192 		get_bh(bh);
1193 		bh->b_end_io = end_buffer_read_sync;
1194 		submit_bh(READ, bh);
1195 		wait_on_buffer(bh);
1196 		if (buffer_uptodate(bh))
1197 			return bh;
1198 	}
1199 	brelse(bh);
1200 	return NULL;
1201 }
1202 
1203 /*
1204  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1205  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1206  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1207  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1208  * CPU's LRUs at the same time.
1209  *
1210  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1211  * sb_find_get_block().
1212  *
1213  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1214  * a local interrupt disable for that.
1215  */
1216 
1217 #define BH_LRU_SIZE	8
1218 
1219 struct bh_lru {
1220 	struct buffer_head *bhs[BH_LRU_SIZE];
1221 };
1222 
1223 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1224 
1225 #ifdef CONFIG_SMP
1226 #define bh_lru_lock()	local_irq_disable()
1227 #define bh_lru_unlock()	local_irq_enable()
1228 #else
1229 #define bh_lru_lock()	preempt_disable()
1230 #define bh_lru_unlock()	preempt_enable()
1231 #endif
1232 
1233 static inline void check_irqs_on(void)
1234 {
1235 #ifdef irqs_disabled
1236 	BUG_ON(irqs_disabled());
1237 #endif
1238 }
1239 
1240 /*
1241  * The LRU management algorithm is dopey-but-simple.  Sorry.
1242  */
1243 static void bh_lru_install(struct buffer_head *bh)
1244 {
1245 	struct buffer_head *evictee = NULL;
1246 	struct bh_lru *lru;
1247 
1248 	check_irqs_on();
1249 	bh_lru_lock();
1250 	lru = &__get_cpu_var(bh_lrus);
1251 	if (lru->bhs[0] != bh) {
1252 		struct buffer_head *bhs[BH_LRU_SIZE];
1253 		int in;
1254 		int out = 0;
1255 
1256 		get_bh(bh);
1257 		bhs[out++] = bh;
1258 		for (in = 0; in < BH_LRU_SIZE; in++) {
1259 			struct buffer_head *bh2 = lru->bhs[in];
1260 
1261 			if (bh2 == bh) {
1262 				__brelse(bh2);
1263 			} else {
1264 				if (out >= BH_LRU_SIZE) {
1265 					BUG_ON(evictee != NULL);
1266 					evictee = bh2;
1267 				} else {
1268 					bhs[out++] = bh2;
1269 				}
1270 			}
1271 		}
1272 		while (out < BH_LRU_SIZE)
1273 			bhs[out++] = NULL;
1274 		memcpy(lru->bhs, bhs, sizeof(bhs));
1275 	}
1276 	bh_lru_unlock();
1277 
1278 	if (evictee)
1279 		__brelse(evictee);
1280 }
1281 
1282 /*
1283  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1284  */
1285 static struct buffer_head *
1286 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1287 {
1288 	struct buffer_head *ret = NULL;
1289 	struct bh_lru *lru;
1290 	unsigned int i;
1291 
1292 	check_irqs_on();
1293 	bh_lru_lock();
1294 	lru = &__get_cpu_var(bh_lrus);
1295 	for (i = 0; i < BH_LRU_SIZE; i++) {
1296 		struct buffer_head *bh = lru->bhs[i];
1297 
1298 		if (bh && bh->b_bdev == bdev &&
1299 				bh->b_blocknr == block && bh->b_size == size) {
1300 			if (i) {
1301 				while (i) {
1302 					lru->bhs[i] = lru->bhs[i - 1];
1303 					i--;
1304 				}
1305 				lru->bhs[0] = bh;
1306 			}
1307 			get_bh(bh);
1308 			ret = bh;
1309 			break;
1310 		}
1311 	}
1312 	bh_lru_unlock();
1313 	return ret;
1314 }
1315 
1316 /*
1317  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1318  * it in the LRU and mark it as accessed.  If it is not present then return
1319  * NULL
1320  */
1321 struct buffer_head *
1322 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1323 {
1324 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1325 
1326 	if (bh == NULL) {
1327 		bh = __find_get_block_slow(bdev, block);
1328 		if (bh)
1329 			bh_lru_install(bh);
1330 	}
1331 	if (bh)
1332 		touch_buffer(bh);
1333 	return bh;
1334 }
1335 EXPORT_SYMBOL(__find_get_block);
1336 
1337 /*
1338  * __getblk will locate (and, if necessary, create) the buffer_head
1339  * which corresponds to the passed block_device, block and size. The
1340  * returned buffer has its reference count incremented.
1341  *
1342  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1343  * illegal block number, __getblk() will happily return a buffer_head
1344  * which represents the non-existent block.  Very weird.
1345  *
1346  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1347  * attempt is failing.  FIXME, perhaps?
1348  */
1349 struct buffer_head *
1350 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1351 {
1352 	struct buffer_head *bh = __find_get_block(bdev, block, size);
1353 
1354 	might_sleep();
1355 	if (bh == NULL)
1356 		bh = __getblk_slow(bdev, block, size);
1357 	return bh;
1358 }
1359 EXPORT_SYMBOL(__getblk);
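
/*
 * Illustrative sketch (not part of buffer.c): rewriting a whole block in
 * the blockdev cache.  Because every byte of the block is overwritten there
 * is no need to read it first with __bread(); __getblk() plus
 * mark_buffer_dirty() is enough.  example_write_block is an assumed name.
 */
static void example_write_block(struct block_device *bdev, sector_t block,
				unsigned size, const void *data)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	lock_buffer(bh);
	memcpy(bh->b_data, data, size);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);		/* writeback will get to it later */
	brelse(bh);
}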
1360 
1361 /*
1362  * Do async read-ahead on a buffer..
1363  */
1364 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1365 {
1366 	struct buffer_head *bh = __getblk(bdev, block, size);
1367 	if (likely(bh)) {
1368 		ll_rw_block(READA, 1, &bh);
1369 		brelse(bh);
1370 	}
1371 }
1372 EXPORT_SYMBOL(__breadahead);
1373 
1374 /**
1375  *  __bread() - reads a specified block and returns the bh
1376  *  @bdev: the block_device to read from
1377  *  @block: number of block
1378  *  @size: size (in bytes) to read
1379  *
1380  *  Reads a specified block, and returns buffer head that contains it.
1381  *  It returns NULL if the block was unreadable.
1382  */
1383 struct buffer_head *
1384 __bread(struct block_device *bdev, sector_t block, unsigned size)
1385 {
1386 	struct buffer_head *bh = __getblk(bdev, block, size);
1387 
1388 	if (likely(bh) && !buffer_uptodate(bh))
1389 		bh = __bread_slow(bh);
1390 	return bh;
1391 }
1392 EXPORT_SYMBOL(__bread);
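
/*
 * Illustrative sketch (not part of buffer.c): the common read/use/release
 * pattern, here through the sb_bread() wrapper around __bread().
 * example_read_block is an assumed name.
 */
static int example_read_block(struct super_block *sb, sector_t block)
{
	struct buffer_head *bh = sb_bread(sb, block);

	if (!bh)
		return -EIO;	/* the block was unreadable */
	/* ... examine or copy bh->b_data here ... */
	brelse(bh);
	return 0;
}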
1393 
1394 /*
1395  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1396  * This doesn't race because it runs in each cpu either in irq
1397  * or with preempt disabled.
1398  */
1399 static void invalidate_bh_lru(void *arg)
1400 {
1401 	struct bh_lru *b = &get_cpu_var(bh_lrus);
1402 	int i;
1403 
1404 	for (i = 0; i < BH_LRU_SIZE; i++) {
1405 		brelse(b->bhs[i]);
1406 		b->bhs[i] = NULL;
1407 	}
1408 	put_cpu_var(bh_lrus);
1409 }
1410 
1411 static void invalidate_bh_lrus(void)
1412 {
1413 	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1414 }
1415 
1416 void set_bh_page(struct buffer_head *bh,
1417 		struct page *page, unsigned long offset)
1418 {
1419 	bh->b_page = page;
1420 	BUG_ON(offset >= PAGE_SIZE);
1421 	if (PageHighMem(page))
1422 		/*
1423 		 * This catches illegal uses and preserves the offset:
1424 		 */
1425 		bh->b_data = (char *)(0 + offset);
1426 	else
1427 		bh->b_data = page_address(page) + offset;
1428 }
1429 EXPORT_SYMBOL(set_bh_page);
1430 
1431 /*
1432  * Called when truncating a buffer on a page completely.
1433  */
1434 static void discard_buffer(struct buffer_head * bh)
1435 {
1436 	lock_buffer(bh);
1437 	clear_buffer_dirty(bh);
1438 	bh->b_bdev = NULL;
1439 	clear_buffer_mapped(bh);
1440 	clear_buffer_req(bh);
1441 	clear_buffer_new(bh);
1442 	clear_buffer_delay(bh);
1443 	clear_buffer_unwritten(bh);
1444 	unlock_buffer(bh);
1445 }
1446 
1447 /**
1448  * block_invalidatepage - invalidate part or all of a buffer-backed page
1449  *
1450  * @page: the page which is affected
1451  * @offset: the index of the truncation point
1452  *
1453  * block_invalidatepage() is called when all or part of the page has become
1454  * invalidated by a truncate operation.
1455  *
1456  * block_invalidatepage() does not have to release all buffers, but it must
1457  * ensure that no dirty buffer is left outside @offset and that no I/O
1458  * is underway against any of the blocks which are outside the truncation
1459  * point.  Because the caller is about to free (and possibly reuse) those
1460  * blocks on-disk.
1461  */
1462 void block_invalidatepage(struct page *page, unsigned long offset)
1463 {
1464 	struct buffer_head *head, *bh, *next;
1465 	unsigned int curr_off = 0;
1466 
1467 	BUG_ON(!PageLocked(page));
1468 	if (!page_has_buffers(page))
1469 		goto out;
1470 
1471 	head = page_buffers(page);
1472 	bh = head;
1473 	do {
1474 		unsigned int next_off = curr_off + bh->b_size;
1475 		next = bh->b_this_page;
1476 
1477 		/*
1478 		 * is this block fully invalidated?
1479 		 */
1480 		if (offset <= curr_off)
1481 			discard_buffer(bh);
1482 		curr_off = next_off;
1483 		bh = next;
1484 	} while (bh != head);
1485 
1486 	/*
1487 	 * We release buffers only if the entire page is being invalidated.
1488 	 * The get_block cached value has been unconditionally invalidated,
1489 	 * so real IO is not possible anymore.
1490 	 */
1491 	if (offset == 0)
1492 		try_to_release_page(page, 0);
1493 out:
1494 	return;
1495 }
1496 EXPORT_SYMBOL(block_invalidatepage);
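
/*
 * Illustrative sketch (not part of buffer.c): a simple buffer-backed
 * filesystem can wire block_invalidatepage() straight into its
 * address_space_operations (truncate falls back to it anyway when no
 * ->invalidatepage method is supplied).  example_aops is an assumed,
 * deliberately incomplete table.
 */
static const struct address_space_operations example_aops = {
	.invalidatepage	= block_invalidatepage,
};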
1497 
1498 /*
1499  * We attach and possibly dirty the buffers atomically wrt
1500  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1501  * is already excluded via the page lock.
1502  */
1503 void create_empty_buffers(struct page *page,
1504 			unsigned long blocksize, unsigned long b_state)
1505 {
1506 	struct buffer_head *bh, *head, *tail;
1507 
1508 	head = alloc_page_buffers(page, blocksize, 1);
1509 	bh = head;
1510 	do {
1511 		bh->b_state |= b_state;
1512 		tail = bh;
1513 		bh = bh->b_this_page;
1514 	} while (bh);
1515 	tail->b_this_page = head;
1516 
1517 	spin_lock(&page->mapping->private_lock);
1518 	if (PageUptodate(page) || PageDirty(page)) {
1519 		bh = head;
1520 		do {
1521 			if (PageDirty(page))
1522 				set_buffer_dirty(bh);
1523 			if (PageUptodate(page))
1524 				set_buffer_uptodate(bh);
1525 			bh = bh->b_this_page;
1526 		} while (bh != head);
1527 	}
1528 	attach_page_buffers(page, head);
1529 	spin_unlock(&page->mapping->private_lock);
1530 }
1531 EXPORT_SYMBOL(create_empty_buffers);
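
/*
 * Illustrative sketch (not part of buffer.c): a filesystem making sure a
 * page has buffer_heads attached before walking them, just as
 * __block_write_full_page() does below.  example_ensure_buffers is an
 * assumed name.
 */
static void example_ensure_buffers(struct page *page, struct inode *inode)
{
	if (!page_has_buffers(page))
		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
}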
1532 
1533 /*
1534  * We are taking a block for data and we don't want any output from any
1535  * buffer-cache aliases starting from the return from this function and
1536  * until the moment when something explicitly marks the buffer
1537  * dirty (hopefully that will not happen until we free that block ;-)
1538  * We don't even need to mark it not-uptodate - nobody can expect
1539  * anything from a newly allocated buffer anyway. We used to use
1540  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1541  * don't want to mark the alias unmapped, for example - it would confuse
1542  * anyone who might pick it up with bread() afterwards...
1543  *
1544  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1545  * be writeout I/O going on against recently-freed buffers.  We don't
1546  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1547  * only if we really need to.  That happens here.
1548  */
1549 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1550 {
1551 	struct buffer_head *old_bh;
1552 
1553 	might_sleep();
1554 
1555 	old_bh = __find_get_block_slow(bdev, block);
1556 	if (old_bh) {
1557 		clear_buffer_dirty(old_bh);
1558 		wait_on_buffer(old_bh);
1559 		clear_buffer_req(old_bh);
1560 		__brelse(old_bh);
1561 	}
1562 }
1563 EXPORT_SYMBOL(unmap_underlying_metadata);
1564 
1565 /*
1566  * NOTE! All mapped/uptodate combinations are valid:
1567  *
1568  *	Mapped	Uptodate	Meaning
1569  *
1570  *	No	No		"unknown" - must do get_block()
1571  *	No	Yes		"hole" - zero-filled
1572  *	Yes	No		"allocated" - allocated on disk, not read in
1573  *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1574  *
1575  * "Dirty" is valid only with the last case (mapped+uptodate).
1576  */
1577 
1578 /*
1579  * While block_write_full_page is writing back the dirty buffers under
1580  * the page lock, whoever dirtied the buffers may decide to clean them
1581  * again at any time.  We handle that by only looking at the buffer
1582  * state inside lock_buffer().
1583  *
1584  * If block_write_full_page() is called for regular writeback
1585  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1586  * locked buffer.   This only can happen if someone has written the buffer
1587  * directly, with submit_bh().  At the address_space level PageWriteback
1588  * prevents this contention from occurring.
1589  */
1590 static int __block_write_full_page(struct inode *inode, struct page *page,
1591 			get_block_t *get_block, struct writeback_control *wbc)
1592 {
1593 	int err;
1594 	sector_t block;
1595 	sector_t last_block;
1596 	struct buffer_head *bh, *head;
1597 	const unsigned blocksize = 1 << inode->i_blkbits;
1598 	int nr_underway = 0;
1599 
1600 	BUG_ON(!PageLocked(page));
1601 
1602 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1603 
1604 	if (!page_has_buffers(page)) {
1605 		create_empty_buffers(page, blocksize,
1606 					(1 << BH_Dirty)|(1 << BH_Uptodate));
1607 	}
1608 
1609 	/*
1610 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1611 	 * here, and the (potentially unmapped) buffers may become dirty at
1612 	 * any time.  If a buffer becomes dirty here after we've inspected it
1613 	 * then we just miss that fact, and the page stays dirty.
1614 	 *
1615 	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1616 	 * handle that here by just cleaning them.
1617 	 */
1618 
1619 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1620 	head = page_buffers(page);
1621 	bh = head;
1622 
1623 	/*
1624 	 * Get all the dirty buffers mapped to disk addresses and
1625 	 * handle any aliases from the underlying blockdev's mapping.
1626 	 */
1627 	do {
1628 		if (block > last_block) {
1629 			/*
1630 			 * mapped buffers outside i_size will occur, because
1631 			 * this page can be outside i_size when there is a
1632 			 * truncate in progress.
1633 			 */
1634 			/*
1635 			 * The buffer was zeroed by block_write_full_page()
1636 			 */
1637 			clear_buffer_dirty(bh);
1638 			set_buffer_uptodate(bh);
1639 		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1640 			WARN_ON(bh->b_size != blocksize);
1641 			err = get_block(inode, block, bh, 1);
1642 			if (err)
1643 				goto recover;
1644 			if (buffer_new(bh)) {
1645 				/* blockdev mappings never come here */
1646 				clear_buffer_new(bh);
1647 				unmap_underlying_metadata(bh->b_bdev,
1648 							bh->b_blocknr);
1649 			}
1650 		}
1651 		bh = bh->b_this_page;
1652 		block++;
1653 	} while (bh != head);
1654 
1655 	do {
1656 		if (!buffer_mapped(bh))
1657 			continue;
1658 		/*
1659 		 * If it's a fully non-blocking write attempt and we cannot
1660 		 * lock the buffer then redirty the page.  Note that this can
1661 		 * potentially cause a busy-wait loop from pdflush and kswapd
1662 		 * activity, but those code paths have their own higher-level
1663 		 * throttling.
1664 		 */
1665 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1666 			lock_buffer(bh);
1667 		} else if (test_set_buffer_locked(bh)) {
1668 			redirty_page_for_writepage(wbc, page);
1669 			continue;
1670 		}
1671 		if (test_clear_buffer_dirty(bh)) {
1672 			mark_buffer_async_write(bh);
1673 		} else {
1674 			unlock_buffer(bh);
1675 		}
1676 	} while ((bh = bh->b_this_page) != head);
1677 
1678 	/*
1679 	 * The page and its buffers are protected by PageWriteback(), so we can
1680 	 * drop the bh refcounts early.
1681 	 */
1682 	BUG_ON(PageWriteback(page));
1683 	set_page_writeback(page);
1684 
1685 	do {
1686 		struct buffer_head *next = bh->b_this_page;
1687 		if (buffer_async_write(bh)) {
1688 			submit_bh(WRITE, bh);
1689 			nr_underway++;
1690 		}
1691 		bh = next;
1692 	} while (bh != head);
1693 	unlock_page(page);
1694 
1695 	err = 0;
1696 done:
1697 	if (nr_underway == 0) {
1698 		/*
1699 		 * The page was marked dirty, but the buffers were
1700 		 * clean.  Someone wrote them back by hand with
1701 		 * ll_rw_block/submit_bh.  A rare case.
1702 		 */
1703 		int uptodate = 1;
1704 		do {
1705 			if (!buffer_uptodate(bh)) {
1706 				uptodate = 0;
1707 				break;
1708 			}
1709 			bh = bh->b_this_page;
1710 		} while (bh != head);
1711 		if (uptodate)
1712 			SetPageUptodate(page);
1713 		end_page_writeback(page);
1714 		/*
1715 		 * The page and buffer_heads can be released at any time from
1716 		 * here on.
1717 		 */
1718 		wbc->pages_skipped++;	/* We didn't write this page */
1719 	}
1720 	return err;
1721 
1722 recover:
1723 	/*
1724 	 * ENOSPC, or some other error.  We may already have added some
1725 	 * blocks to the file, so we need to write these out to avoid
1726 	 * exposing stale data.
1727 	 * The page is currently locked and not marked for writeback
1728 	 */
1729 	bh = head;
1730 	/* Recovery: lock and submit the mapped buffers */
1731 	do {
1732 		if (buffer_mapped(bh) && buffer_dirty(bh)) {
1733 			lock_buffer(bh);
1734 			mark_buffer_async_write(bh);
1735 		} else {
1736 			/*
1737 			 * The buffer may have been set dirty during
1738 			 * attachment to a dirty page.
1739 			 */
1740 			clear_buffer_dirty(bh);
1741 		}
1742 	} while ((bh = bh->b_this_page) != head);
1743 	SetPageError(page);
1744 	BUG_ON(PageWriteback(page));
1745 	set_page_writeback(page);
1746 	do {
1747 		struct buffer_head *next = bh->b_this_page;
1748 		if (buffer_async_write(bh)) {
1749 			clear_buffer_dirty(bh);
1750 			submit_bh(WRITE, bh);
1751 			nr_underway++;
1752 		}
1753 		bh = next;
1754 	} while (bh != head);
1755 	unlock_page(page);
1756 	goto done;
1757 }
1758 
1759 static int __block_prepare_write(struct inode *inode, struct page *page,
1760 		unsigned from, unsigned to, get_block_t *get_block)
1761 {
1762 	unsigned block_start, block_end;
1763 	sector_t block;
1764 	int err = 0;
1765 	unsigned blocksize, bbits;
1766 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1767 
1768 	BUG_ON(!PageLocked(page));
1769 	BUG_ON(from > PAGE_CACHE_SIZE);
1770 	BUG_ON(to > PAGE_CACHE_SIZE);
1771 	BUG_ON(from > to);
1772 
1773 	blocksize = 1 << inode->i_blkbits;
1774 	if (!page_has_buffers(page))
1775 		create_empty_buffers(page, blocksize, 0);
1776 	head = page_buffers(page);
1777 
1778 	bbits = inode->i_blkbits;
1779 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1780 
1781 	for(bh = head, block_start = 0; bh != head || !block_start;
1782 	    block++, block_start=block_end, bh = bh->b_this_page) {
1783 		block_end = block_start + blocksize;
1784 		if (block_end <= from || block_start >= to) {
1785 			if (PageUptodate(page)) {
1786 				if (!buffer_uptodate(bh))
1787 					set_buffer_uptodate(bh);
1788 			}
1789 			continue;
1790 		}
1791 		if (buffer_new(bh))
1792 			clear_buffer_new(bh);
1793 		if (!buffer_mapped(bh)) {
1794 			WARN_ON(bh->b_size != blocksize);
1795 			err = get_block(inode, block, bh, 1);
1796 			if (err)
1797 				break;
1798 			if (buffer_new(bh)) {
1799 				unmap_underlying_metadata(bh->b_bdev,
1800 							bh->b_blocknr);
1801 				if (PageUptodate(page)) {
1802 					set_buffer_uptodate(bh);
1803 					continue;
1804 				}
1805 				if (block_end > to || block_start < from) {
1806 					void *kaddr;
1807 
1808 					kaddr = kmap_atomic(page, KM_USER0);
1809 					if (block_end > to)
1810 						memset(kaddr+to, 0,
1811 							block_end-to);
1812 					if (block_start < from)
1813 						memset(kaddr+block_start,
1814 							0, from-block_start);
1815 					flush_dcache_page(page);
1816 					kunmap_atomic(kaddr, KM_USER0);
1817 				}
1818 				continue;
1819 			}
1820 		}
1821 		if (PageUptodate(page)) {
1822 			if (!buffer_uptodate(bh))
1823 				set_buffer_uptodate(bh);
1824 			continue;
1825 		}
1826 		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1827 		    !buffer_unwritten(bh) &&
1828 		     (block_start < from || block_end > to)) {
1829 			ll_rw_block(READ, 1, &bh);
1830 			*wait_bh++=bh;
1831 		}
1832 	}
1833 	/*
1834 	 * If we issued read requests - let them complete.
1835 	 */
1836 	while(wait_bh > wait) {
1837 		wait_on_buffer(*--wait_bh);
1838 		if (!buffer_uptodate(*wait_bh))
1839 			err = -EIO;
1840 	}
1841 	if (!err) {
1842 		bh = head;
1843 		do {
1844 			if (buffer_new(bh))
1845 				clear_buffer_new(bh);
1846 		} while ((bh = bh->b_this_page) != head);
1847 		return 0;
1848 	}
1849 	/* Error case: */
1850 	/*
1851 	 * Zero out any newly allocated blocks to avoid exposing stale
1852 	 * data.  If BH_New is set, we know that the block was newly
1853 	 * allocated in the above loop.
1854 	 */
1855 	bh = head;
1856 	block_start = 0;
1857 	do {
1858 		block_end = block_start+blocksize;
1859 		if (block_end <= from)
1860 			goto next_bh;
1861 		if (block_start >= to)
1862 			break;
1863 		if (buffer_new(bh)) {
1864 			void *kaddr;
1865 
1866 			clear_buffer_new(bh);
1867 			kaddr = kmap_atomic(page, KM_USER0);
1868 			memset(kaddr+block_start, 0, bh->b_size);
1869 			flush_dcache_page(page);
1870 			kunmap_atomic(kaddr, KM_USER0);
1871 			set_buffer_uptodate(bh);
1872 			mark_buffer_dirty(bh);
1873 		}
1874 next_bh:
1875 		block_start = block_end;
1876 		bh = bh->b_this_page;
1877 	} while (bh != head);
1878 	return err;
1879 }
1880 
1881 static int __block_commit_write(struct inode *inode, struct page *page,
1882 		unsigned from, unsigned to)
1883 {
1884 	unsigned block_start, block_end;
1885 	int partial = 0;
1886 	unsigned blocksize;
1887 	struct buffer_head *bh, *head;
1888 
1889 	blocksize = 1 << inode->i_blkbits;
1890 
1891 	for(bh = head = page_buffers(page), block_start = 0;
1892 	    bh != head || !block_start;
1893 	    block_start=block_end, bh = bh->b_this_page) {
1894 		block_end = block_start + blocksize;
1895 		if (block_end <= from || block_start >= to) {
1896 			if (!buffer_uptodate(bh))
1897 				partial = 1;
1898 		} else {
1899 			set_buffer_uptodate(bh);
1900 			mark_buffer_dirty(bh);
1901 		}
1902 	}
1903 
1904 	/*
1905 	 * If this is a partial write which happened to make all buffers
1906 	 * uptodate then we can optimize away a bogus readpage() for
1907 	 * the next read(). Here we 'discover' whether the page went
1908 	 * uptodate as a result of this (potentially partial) write.
1909 	 */
1910 	if (!partial)
1911 		SetPageUptodate(page);
1912 	return 0;
1913 }
1914 
1915 /*
1916  * Generic "read page" function for block devices that have the normal
1917  * get_block functionality. This is most of the block device filesystems.
1918  * Reads the page asynchronously --- the unlock_buffer() and
1919  * set/clear_buffer_uptodate() functions propagate buffer state into the
1920  * page struct once IO has completed.
1921  */
1922 int block_read_full_page(struct page *page, get_block_t *get_block)
1923 {
1924 	struct inode *inode = page->mapping->host;
1925 	sector_t iblock, lblock;
1926 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1927 	unsigned int blocksize;
1928 	int nr, i;
1929 	int fully_mapped = 1;
1930 
1931 	BUG_ON(!PageLocked(page));
1932 	blocksize = 1 << inode->i_blkbits;
1933 	if (!page_has_buffers(page))
1934 		create_empty_buffers(page, blocksize, 0);
1935 	head = page_buffers(page);
1936 
1937 	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1938 	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
1939 	bh = head;
1940 	nr = 0;
1941 	i = 0;
1942 
1943 	do {
1944 		if (buffer_uptodate(bh))
1945 			continue;
1946 
1947 		if (!buffer_mapped(bh)) {
1948 			int err = 0;
1949 
1950 			fully_mapped = 0;
1951 			if (iblock < lblock) {
1952 				WARN_ON(bh->b_size != blocksize);
1953 				err = get_block(inode, iblock, bh, 0);
1954 				if (err)
1955 					SetPageError(page);
1956 			}
1957 			if (!buffer_mapped(bh)) {
1958 				void *kaddr = kmap_atomic(page, KM_USER0);
1959 				memset(kaddr + i * blocksize, 0, blocksize);
1960 				flush_dcache_page(page);
1961 				kunmap_atomic(kaddr, KM_USER0);
1962 				if (!err)
1963 					set_buffer_uptodate(bh);
1964 				continue;
1965 			}
1966 			/*
1967 			 * get_block() might have updated the buffer
1968 			 * synchronously
1969 			 */
1970 			if (buffer_uptodate(bh))
1971 				continue;
1972 		}
1973 		arr[nr++] = bh;
1974 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
1975 
1976 	if (fully_mapped)
1977 		SetPageMappedToDisk(page);
1978 
1979 	if (!nr) {
1980 		/*
1981 		 * All buffers are uptodate - we can set the page uptodate
1982 		 * as well. But not if get_block() returned an error.
1983 		 */
1984 		if (!PageError(page))
1985 			SetPageUptodate(page);
1986 		unlock_page(page);
1987 		return 0;
1988 	}
1989 
1990 	/* Stage two: lock the buffers */
1991 	for (i = 0; i < nr; i++) {
1992 		bh = arr[i];
1993 		lock_buffer(bh);
1994 		mark_buffer_async_read(bh);
1995 	}
1996 
1997 	/*
1998 	 * Stage 3: start the IO.  Check for uptodateness
1999 	 * inside the buffer lock in case another process reading
2000 	 * the underlying blockdev brought it uptodate (the sct fix).
2001 	 */
2002 	for (i = 0; i < nr; i++) {
2003 		bh = arr[i];
2004 		if (buffer_uptodate(bh))
2005 			end_buffer_async_read(bh, 1);
2006 		else
2007 			submit_bh(READ, bh);
2008 	}
2009 	return 0;
2010 }
2011 
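/*
 * Illustrative sketch (not from this file; the myfs_* names are
 * hypothetical): a filesystem normally exposes block_read_full_page()
 * through a thin ->readpage wrapper that supplies its own get_block
 * routine:
 *
 *	static int myfs_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, myfs_get_block);
 *	}
 */
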
2012 /* utility function for filesystems that need to do work on expanding
2013  * truncates.  Uses prepare/commit_write to allow the filesystem to
2014  * deal with the hole.
2015  */
2016 static int __generic_cont_expand(struct inode *inode, loff_t size,
2017 				 pgoff_t index, unsigned int offset)
2018 {
2019 	struct address_space *mapping = inode->i_mapping;
2020 	struct page *page;
2021 	unsigned long limit;
2022 	int err;
2023 
2024 	err = -EFBIG;
2025 	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2026 	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2027 		send_sig(SIGXFSZ, current, 0);
2028 		goto out;
2029 	}
2030 	if (size > inode->i_sb->s_maxbytes)
2031 		goto out;
2032 
2033 	err = -ENOMEM;
2034 	page = grab_cache_page(mapping, index);
2035 	if (!page)
2036 		goto out;
2037 	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2038 	if (err) {
2039 		/*
2040 		 * ->prepare_write() may have instantiated a few blocks
2041 		 * outside i_size.  Trim these off again.
2042 		 */
2043 		unlock_page(page);
2044 		page_cache_release(page);
2045 		vmtruncate(inode, inode->i_size);
2046 		goto out;
2047 	}
2048 
2049 	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2050 
2051 	unlock_page(page);
2052 	page_cache_release(page);
2053 	if (err > 0)
2054 		err = 0;
2055 out:
2056 	return err;
2057 }
2058 
2059 int generic_cont_expand(struct inode *inode, loff_t size)
2060 {
2061 	pgoff_t index;
2062 	unsigned int offset;
2063 
2064 	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2065 
2066 	/* ugh.  in prepare/commit_write, if from==to==start of block, we
2067 	 * skip the prepare.  make sure we never send an offset for the start
2068 	 * of a block
2069 	 */
2070 	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2071 		/* caller must handle this extra byte. */
2072 		offset++;
2073 	}
2074 	index = size >> PAGE_CACHE_SHIFT;
2075 
2076 	return __generic_cont_expand(inode, size, index, offset);
2077 }
2078 
2079 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2080 {
2081 	loff_t pos = size - 1;
2082 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2083 	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2084 
2085 	/* prepare/commit_write can handle even if from==to==start of block. */
2086 	return __generic_cont_expand(inode, size, index, offset);
2087 }
2088 
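/*
 * Illustrative sketch (hypothetical myfs_* names): a filesystem that has
 * to allocate and zero blocks when a truncate grows a file would call
 * the simple variant from its setattr/truncate-up path:
 *
 *	static int myfs_expand(struct inode *inode, loff_t new_size)
 *	{
 *		if (new_size <= inode->i_size)
 *			return 0;
 *		return generic_cont_expand_simple(inode, new_size);
 *	}
 */
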
2089 /*
2090  * For moronic filesystems that do not allow holes in files.
2091  * We may have to extend the file.
2092  */
2093 
2094 int cont_prepare_write(struct page *page, unsigned offset,
2095 		unsigned to, get_block_t *get_block, loff_t *bytes)
2096 {
2097 	struct address_space *mapping = page->mapping;
2098 	struct inode *inode = mapping->host;
2099 	struct page *new_page;
2100 	pgoff_t pgpos;
2101 	long status;
2102 	unsigned zerofrom;
2103 	unsigned blocksize = 1 << inode->i_blkbits;
2104 	void *kaddr;
2105 
2106 	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2107 		status = -ENOMEM;
2108 		new_page = grab_cache_page(mapping, pgpos);
2109 		if (!new_page)
2110 			goto out;
2111 		/* we might sleep */
2112 		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2113 			unlock_page(new_page);
2114 			page_cache_release(new_page);
2115 			continue;
2116 		}
2117 		zerofrom = *bytes & ~PAGE_CACHE_MASK;
2118 		if (zerofrom & (blocksize-1)) {
2119 			*bytes |= (blocksize-1);
2120 			(*bytes)++;
2121 		}
2122 		status = __block_prepare_write(inode, new_page, zerofrom,
2123 						PAGE_CACHE_SIZE, get_block);
2124 		if (status)
2125 			goto out_unmap;
2126 		kaddr = kmap_atomic(new_page, KM_USER0);
2127 		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2128 		flush_dcache_page(new_page);
2129 		kunmap_atomic(kaddr, KM_USER0);
2130 		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2131 		unlock_page(new_page);
2132 		page_cache_release(new_page);
2133 	}
2134 
2135 	if (page->index < pgpos) {
2136 		/* completely inside the area */
2137 		zerofrom = offset;
2138 	} else {
2139 		/* page covers the boundary, find the boundary offset */
2140 		zerofrom = *bytes & ~PAGE_CACHE_MASK;
2141 
2142 		/* if we will expand the file, the last block will be filled */
2143 		if (to > zerofrom && (zerofrom & (blocksize-1))) {
2144 			*bytes |= (blocksize-1);
2145 			(*bytes)++;
2146 		}
2147 
2148 		/* starting below the boundary? Nothing to zero out */
2149 		if (offset <= zerofrom)
2150 			zerofrom = offset;
2151 	}
2152 	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2153 	if (status)
2154 		goto out1;
2155 	if (zerofrom < offset) {
2156 		kaddr = kmap_atomic(page, KM_USER0);
2157 		memset(kaddr+zerofrom, 0, offset-zerofrom);
2158 		flush_dcache_page(page);
2159 		kunmap_atomic(kaddr, KM_USER0);
2160 		__block_commit_write(inode, page, zerofrom, offset);
2161 	}
2162 	return 0;
2163 out1:
2164 	ClearPageUptodate(page);
2165 	return status;
2166 
2167 out_unmap:
2168 	ClearPageUptodate(new_page);
2169 	unlock_page(new_page);
2170 	page_cache_release(new_page);
2171 out:
2172 	return status;
2173 }
2174 
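/*
 * Illustrative sketch (hypothetical myfs_* names, including the
 * MYFS_I()->i_initialised field): a no-holes filesystem plugs
 * cont_prepare_write() in as its ->prepare_write, passing a pointer to
 * its private "initialised up to here" byte count so the gap between
 * the old tail and the new write gets zero-filled:
 *
 *	static int myfs_prepare_write(struct file *file, struct page *page,
 *			unsigned from, unsigned to)
 *	{
 *		return cont_prepare_write(page, from, to, myfs_get_block,
 *				&MYFS_I(page->mapping->host)->i_initialised);
 *	}
 */
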
2175 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2176 			get_block_t *get_block)
2177 {
2178 	struct inode *inode = page->mapping->host;
2179 	int err = __block_prepare_write(inode, page, from, to, get_block);
2180 	if (err)
2181 		ClearPageUptodate(page);
2182 	return err;
2183 }
2184 
2185 int block_commit_write(struct page *page, unsigned from, unsigned to)
2186 {
2187 	struct inode *inode = page->mapping->host;
2188 	__block_commit_write(inode,page,from,to);
2189 	return 0;
2190 }
2191 
2192 int generic_commit_write(struct file *file, struct page *page,
2193 		unsigned from, unsigned to)
2194 {
2195 	struct inode *inode = page->mapping->host;
2196 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2197 	__block_commit_write(inode,page,from,to);
2198 	/*
2199 	 * No need to use i_size_read() here, the i_size
2200 	 * cannot change under us because we hold i_mutex.
2201 	 */
2202 	if (pos > inode->i_size) {
2203 		i_size_write(inode, pos);
2204 		mark_inode_dirty(inode);
2205 	}
2206 	return 0;
2207 }
2208 
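/*
 * Illustrative sketch (hypothetical myfs_* names): the prepare/commit
 * helpers above are normally reached through a filesystem's
 * address_space_operations, with block_prepare_write() wrapped so that
 * the filesystem's get_block routine is supplied:
 *
 *	static int myfs_prepare_write(struct file *file, struct page *page,
 *			unsigned from, unsigned to)
 *	{
 *		return block_prepare_write(page, from, to, myfs_get_block);
 *	}
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.readpage	= myfs_readpage,
 *		.writepage	= myfs_writepage,
 *		.sync_page	= block_sync_page,
 *		.prepare_write	= myfs_prepare_write,
 *		.commit_write	= generic_commit_write,
 *	};
 */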
2209 
2210 /*
2211  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2212  * immediately, while under the page lock.  So it needs a special end_io
2213  * handler which does not touch the bh after unlocking it.
2214  *
2215  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2216  * a race there is benign: unlock_buffer() only uses the bh's address for
2217  * hashing after unlocking the buffer, so it doesn't actually touch the bh
2218  * itself.
2219  */
2220 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2221 {
2222 	if (uptodate) {
2223 		set_buffer_uptodate(bh);
2224 	} else {
2225 		/* This happens, due to failed READA attempts. */
2226 		clear_buffer_uptodate(bh);
2227 	}
2228 	unlock_buffer(bh);
2229 }
2230 
2231 /*
2232  * On entry, the page is fully not uptodate.
2233  * On exit the page is fully uptodate in the areas outside (from,to)
2234  */
2235 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2236 			get_block_t *get_block)
2237 {
2238 	struct inode *inode = page->mapping->host;
2239 	const unsigned blkbits = inode->i_blkbits;
2240 	const unsigned blocksize = 1 << blkbits;
2241 	struct buffer_head map_bh;
2242 	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2243 	unsigned block_in_page;
2244 	unsigned block_start;
2245 	sector_t block_in_file;
2246 	char *kaddr;
2247 	int nr_reads = 0;
2248 	int i;
2249 	int ret = 0;
2250 	int is_mapped_to_disk = 1;
2251 
2252 	if (PageMappedToDisk(page))
2253 		return 0;
2254 
2255 	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2256 	map_bh.b_page = page;
2257 
2258 	/*
2259 	 * We loop across all blocks in the page, whether or not they are
2260 	 * part of the affected region.  This is so we can discover if the
2261 	 * page is fully mapped-to-disk.
2262 	 */
2263 	for (block_start = 0, block_in_page = 0;
2264 		  block_start < PAGE_CACHE_SIZE;
2265 		  block_in_page++, block_start += blocksize) {
2266 		unsigned block_end = block_start + blocksize;
2267 		int create;
2268 
2269 		map_bh.b_state = 0;
2270 		create = 1;
2271 		if (block_start >= to)
2272 			create = 0;
2273 		map_bh.b_size = blocksize;
2274 		ret = get_block(inode, block_in_file + block_in_page,
2275 					&map_bh, create);
2276 		if (ret)
2277 			goto failed;
2278 		if (!buffer_mapped(&map_bh))
2279 			is_mapped_to_disk = 0;
2280 		if (buffer_new(&map_bh))
2281 			unmap_underlying_metadata(map_bh.b_bdev,
2282 							map_bh.b_blocknr);
2283 		if (PageUptodate(page))
2284 			continue;
2285 		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2286 			kaddr = kmap_atomic(page, KM_USER0);
2287 			if (block_start < from)
2288 				memset(kaddr+block_start, 0, from-block_start);
2289 			if (block_end > to)
2290 				memset(kaddr + to, 0, block_end - to);
2291 			flush_dcache_page(page);
2292 			kunmap_atomic(kaddr, KM_USER0);
2293 			continue;
2294 		}
2295 		if (buffer_uptodate(&map_bh))
2296 			continue;	/* reiserfs does this */
2297 		if (block_start < from || block_end > to) {
2298 			struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2299 
2300 			if (!bh) {
2301 				ret = -ENOMEM;
2302 				goto failed;
2303 			}
2304 			bh->b_state = map_bh.b_state;
2305 			atomic_set(&bh->b_count, 0);
2306 			bh->b_this_page = NULL;
2307 			bh->b_page = page;
2308 			bh->b_blocknr = map_bh.b_blocknr;
2309 			bh->b_size = blocksize;
2310 			bh->b_data = (char *)(long)block_start;
2311 			bh->b_bdev = map_bh.b_bdev;
2312 			bh->b_private = NULL;
2313 			read_bh[nr_reads++] = bh;
2314 		}
2315 	}
2316 
2317 	if (nr_reads) {
2318 		struct buffer_head *bh;
2319 
2320 		/*
2321 		 * The page is locked, so these buffers are protected from
2322 		 * any VM or truncate activity.  Hence we don't need to care
2323 		 * for the buffer_head refcounts.
2324 		 */
2325 		for (i = 0; i < nr_reads; i++) {
2326 			bh = read_bh[i];
2327 			lock_buffer(bh);
2328 			bh->b_end_io = end_buffer_read_nobh;
2329 			submit_bh(READ, bh);
2330 		}
2331 		for (i = 0; i < nr_reads; i++) {
2332 			bh = read_bh[i];
2333 			wait_on_buffer(bh);
2334 			if (!buffer_uptodate(bh))
2335 				ret = -EIO;
2336 			free_buffer_head(bh);
2337 			read_bh[i] = NULL;
2338 		}
2339 		if (ret)
2340 			goto failed;
2341 	}
2342 
2343 	if (is_mapped_to_disk)
2344 		SetPageMappedToDisk(page);
2345 
2346 	return 0;
2347 
2348 failed:
2349 	for (i = 0; i < nr_reads; i++) {
2350 		if (read_bh[i])
2351 			free_buffer_head(read_bh[i]);
2352 	}
2353 
2354 	/*
2355 	 * Error recovery is pretty slack.  Clear the page and mark it dirty
2356 	 * so we'll later zero out any blocks which _were_ allocated.
2357 	 */
2358 	kaddr = kmap_atomic(page, KM_USER0);
2359 	memset(kaddr, 0, PAGE_CACHE_SIZE);
2360 	flush_dcache_page(page);
2361 	kunmap_atomic(kaddr, KM_USER0);
2362 	SetPageUptodate(page);
2363 	set_page_dirty(page);
2364 	return ret;
2365 }
2366 EXPORT_SYMBOL(nobh_prepare_write);
2367 
2368 /*
2369  * Make sure any changes to nobh_commit_write() are reflected in
2370  * nobh_truncate_page(), since it doesn't call commit_write().
2371  */
2372 int nobh_commit_write(struct file *file, struct page *page,
2373 		unsigned from, unsigned to)
2374 {
2375 	struct inode *inode = page->mapping->host;
2376 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2377 
2378 	SetPageUptodate(page);
2379 	set_page_dirty(page);
2380 	if (pos > inode->i_size) {
2381 		i_size_write(inode, pos);
2382 		mark_inode_dirty(inode);
2383 	}
2384 	return 0;
2385 }
2386 EXPORT_SYMBOL(nobh_commit_write);
2387 
2388 /*
2389  * nobh_writepage() - based on block_write_full_page() except
2390  * that it tries to operate without attaching bufferheads to
2391  * the page.
2392  */
2393 int nobh_writepage(struct page *page, get_block_t *get_block,
2394 			struct writeback_control *wbc)
2395 {
2396 	struct inode * const inode = page->mapping->host;
2397 	loff_t i_size = i_size_read(inode);
2398 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2399 	unsigned offset;
2400 	void *kaddr;
2401 	int ret;
2402 
2403 	/* Is the page fully inside i_size? */
2404 	if (page->index < end_index)
2405 		goto out;
2406 
2407 	/* Is the page fully outside i_size? (truncate in progress) */
2408 	offset = i_size & (PAGE_CACHE_SIZE-1);
2409 	if (page->index >= end_index+1 || !offset) {
2410 		/*
2411 		 * The page may have dirty, unmapped buffers.  For example,
2412 		 * they may have been added in ext3_writepage().  Make them
2413 		 * freeable here, so the page does not leak.
2414 		 */
2415 #if 0
2416 		/* Not really sure about this - do we need this? */
2417 		if (page->mapping->a_ops->invalidatepage)
2418 			page->mapping->a_ops->invalidatepage(page, offset);
2419 #endif
2420 		unlock_page(page);
2421 		return 0; /* don't care */
2422 	}
2423 
2424 	/*
2425 	 * The page straddles i_size.  It must be zeroed out on each and every
2426 	 * writepage invocation because it may be mmapped.  "A file is mapped
2427 	 * in multiples of the page size.  For a file that is not a multiple of
2428 	 * the  page size, the remaining memory is zeroed when mapped, and
2429 	 * writes to that region are not written out to the file."
2430 	 */
2431 	kaddr = kmap_atomic(page, KM_USER0);
2432 	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2433 	flush_dcache_page(page);
2434 	kunmap_atomic(kaddr, KM_USER0);
2435 out:
2436 	ret = mpage_writepage(page, get_block, wbc);
2437 	if (ret == -EAGAIN)
2438 		ret = __block_write_full_page(inode, page, get_block, wbc);
2439 	return ret;
2440 }
2441 EXPORT_SYMBOL(nobh_writepage);
2442 
2443 /*
2444  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2445  */
2446 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2447 {
2448 	struct inode *inode = mapping->host;
2449 	unsigned blocksize = 1 << inode->i_blkbits;
2450 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2451 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2452 	unsigned to;
2453 	struct page *page;
2454 	const struct address_space_operations *a_ops = mapping->a_ops;
2455 	char *kaddr;
2456 	int ret = 0;
2457 
2458 	if ((offset & (blocksize - 1)) == 0)
2459 		goto out;
2460 
2461 	ret = -ENOMEM;
2462 	page = grab_cache_page(mapping, index);
2463 	if (!page)
2464 		goto out;
2465 
2466 	to = (offset + blocksize) & ~(blocksize - 1);
2467 	ret = a_ops->prepare_write(NULL, page, offset, to);
2468 	if (ret == 0) {
2469 		kaddr = kmap_atomic(page, KM_USER0);
2470 		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2471 		flush_dcache_page(page);
2472 		kunmap_atomic(kaddr, KM_USER0);
2473 		/*
2474 		 * It would be more correct to call aops->commit_write()
2475 		 * here, but this is more efficient.
2476 		 */
2477 		SetPageUptodate(page);
2478 		set_page_dirty(page);
2479 	}
2480 	unlock_page(page);
2481 	page_cache_release(page);
2482 out:
2483 	return ret;
2484 }
2485 EXPORT_SYMBOL(nobh_truncate_page);
2486 
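/*
 * Illustrative sketch (hypothetical myfs_* names): a filesystem opting
 * out of buffer_heads for data pages wires the nobh helpers in as a set:
 *
 *	static int myfs_nobh_prepare_write(struct file *file,
 *			struct page *page, unsigned from, unsigned to)
 *	{
 *		return nobh_prepare_write(page, from, to, myfs_get_block);
 *	}
 *
 *	static int myfs_nobh_writepage(struct page *page,
 *			struct writeback_control *wbc)
 *	{
 *		return nobh_writepage(page, myfs_get_block, wbc);
 *	}
 *
 * with .commit_write = nobh_commit_write in the same aops, and
 * nobh_truncate_page(inode->i_mapping, inode->i_size) called from the
 * filesystem's truncate path.
 */
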
2487 int block_truncate_page(struct address_space *mapping,
2488 			loff_t from, get_block_t *get_block)
2489 {
2490 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2491 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2492 	unsigned blocksize;
2493 	sector_t iblock;
2494 	unsigned length, pos;
2495 	struct inode *inode = mapping->host;
2496 	struct page *page;
2497 	struct buffer_head *bh;
2498 	void *kaddr;
2499 	int err;
2500 
2501 	blocksize = 1 << inode->i_blkbits;
2502 	length = offset & (blocksize - 1);
2503 
2504 	/* Block boundary? Nothing to do */
2505 	if (!length)
2506 		return 0;
2507 
2508 	length = blocksize - length;
2509 	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2510 
2511 	page = grab_cache_page(mapping, index);
2512 	err = -ENOMEM;
2513 	if (!page)
2514 		goto out;
2515 
2516 	if (!page_has_buffers(page))
2517 		create_empty_buffers(page, blocksize, 0);
2518 
2519 	/* Find the buffer that contains "offset" */
2520 	bh = page_buffers(page);
2521 	pos = blocksize;
2522 	while (offset >= pos) {
2523 		bh = bh->b_this_page;
2524 		iblock++;
2525 		pos += blocksize;
2526 	}
2527 
2528 	err = 0;
2529 	if (!buffer_mapped(bh)) {
2530 		WARN_ON(bh->b_size != blocksize);
2531 		err = get_block(inode, iblock, bh, 0);
2532 		if (err)
2533 			goto unlock;
2534 		/* unmapped? It's a hole - nothing to do */
2535 		if (!buffer_mapped(bh))
2536 			goto unlock;
2537 	}
2538 
2539 	/* Ok, it's mapped. Make sure it's up-to-date */
2540 	if (PageUptodate(page))
2541 		set_buffer_uptodate(bh);
2542 
2543 	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2544 		err = -EIO;
2545 		ll_rw_block(READ, 1, &bh);
2546 		wait_on_buffer(bh);
2547 		/* Uhhuh. Read error. Complain and punt. */
2548 		if (!buffer_uptodate(bh))
2549 			goto unlock;
2550 	}
2551 
2552 	kaddr = kmap_atomic(page, KM_USER0);
2553 	memset(kaddr + offset, 0, length);
2554 	flush_dcache_page(page);
2555 	kunmap_atomic(kaddr, KM_USER0);
2556 
2557 	mark_buffer_dirty(bh);
2558 	err = 0;
2559 
2560 unlock:
2561 	unlock_page(page);
2562 	page_cache_release(page);
2563 out:
2564 	return err;
2565 }
2566 
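/*
 * Illustrative sketch (hypothetical myfs_* names): block_truncate_page()
 * is normally called from the filesystem's truncate path to zero the
 * tail of the (possibly mmapped) last page before the block mappings
 * beyond the new i_size are released:
 *
 *	static void myfs_truncate(struct inode *inode)
 *	{
 *		block_truncate_page(inode->i_mapping, inode->i_size,
 *					myfs_get_block);
 *		myfs_free_blocks_beyond(inode, inode->i_size);
 *	}
 */
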
2567 /*
2568  * The generic ->writepage function for buffer-backed address_spaces
2569  */
2570 int block_write_full_page(struct page *page, get_block_t *get_block,
2571 			struct writeback_control *wbc)
2572 {
2573 	struct inode * const inode = page->mapping->host;
2574 	loff_t i_size = i_size_read(inode);
2575 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2576 	unsigned offset;
2577 	void *kaddr;
2578 
2579 	/* Is the page fully inside i_size? */
2580 	if (page->index < end_index)
2581 		return __block_write_full_page(inode, page, get_block, wbc);
2582 
2583 	/* Is the page fully outside i_size? (truncate in progress) */
2584 	offset = i_size & (PAGE_CACHE_SIZE-1);
2585 	if (page->index >= end_index+1 || !offset) {
2586 		/*
2587 		 * The page may have dirty, unmapped buffers.  For example,
2588 		 * they may have been added in ext3_writepage().  Make them
2589 		 * freeable here, so the page does not leak.
2590 		 */
2591 		do_invalidatepage(page, 0);
2592 		unlock_page(page);
2593 		return 0; /* don't care */
2594 	}
2595 
2596 	/*
2597 	 * The page straddles i_size.  It must be zeroed out on each and every
2598 	 * writepage invocation because it may be mmapped.  "A file is mapped
2599 	 * in multiples of the page size.  For a file that is not a multiple of
2600 	 * the  page size, the remaining memory is zeroed when mapped, and
2601 	 * writes to that region are not written out to the file."
2602 	 */
2603 	kaddr = kmap_atomic(page, KM_USER0);
2604 	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2605 	flush_dcache_page(page);
2606 	kunmap_atomic(kaddr, KM_USER0);
2607 	return __block_write_full_page(inode, page, get_block, wbc);
2608 }
2609 
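/*
 * Illustrative sketch (hypothetical myfs_* names): as with ->readpage,
 * the ->writepage method is usually just a wrapper that supplies the
 * filesystem's get_block routine:
 *
 *	static int myfs_writepage(struct page *page,
 *			struct writeback_control *wbc)
 *	{
 *		return block_write_full_page(page, myfs_get_block, wbc);
 *	}
 */
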
2610 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2611 			    get_block_t *get_block)
2612 {
2613 	struct buffer_head tmp;
2614 	struct inode *inode = mapping->host;
2615 	tmp.b_state = 0;
2616 	tmp.b_blocknr = 0;
2617 	tmp.b_size = 1 << inode->i_blkbits;
2618 	get_block(inode, block, &tmp, 0);
2619 	return tmp.b_blocknr;
2620 }
2621 
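/*
 * Illustrative sketch (hypothetical myfs_* name): generic_block_bmap()
 * backs the FIBMAP ioctl and is exposed as a filesystem's ->bmap method:
 *
 *	static sector_t myfs_bmap(struct address_space *mapping,
 *			sector_t block)
 *	{
 *		return generic_block_bmap(mapping, block, myfs_get_block);
 *	}
 */
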
2622 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2623 {
2624 	struct buffer_head *bh = bio->bi_private;
2625 
2626 	if (bio->bi_size)
2627 		return 1;
2628 
2629 	if (err == -EOPNOTSUPP) {
2630 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2631 		set_bit(BH_Eopnotsupp, &bh->b_state);
2632 	}
2633 
2634 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2635 	bio_put(bio);
2636 	return 0;
2637 }
2638 
2639 int submit_bh(int rw, struct buffer_head * bh)
2640 {
2641 	struct bio *bio;
2642 	int ret = 0;
2643 
2644 	BUG_ON(!buffer_locked(bh));
2645 	BUG_ON(!buffer_mapped(bh));
2646 	BUG_ON(!bh->b_end_io);
2647 
2648 	if (buffer_ordered(bh) && (rw == WRITE))
2649 		rw = WRITE_BARRIER;
2650 
2651 	/*
2652 	 * Only clear out a write error when rewriting, should this
2653 	 * include WRITE_SYNC as well?
2654 	 */
2655 	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2656 		clear_buffer_write_io_error(bh);
2657 
2658 	/*
2659 	 * from here on down, it's all bio -- do the initial mapping,
2660 	 * submit_bio -> generic_make_request may further map this bio around
2661 	 */
2662 	bio = bio_alloc(GFP_NOIO, 1);
2663 
2664 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2665 	bio->bi_bdev = bh->b_bdev;
2666 	bio->bi_io_vec[0].bv_page = bh->b_page;
2667 	bio->bi_io_vec[0].bv_len = bh->b_size;
2668 	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2669 
2670 	bio->bi_vcnt = 1;
2671 	bio->bi_idx = 0;
2672 	bio->bi_size = bh->b_size;
2673 
2674 	bio->bi_end_io = end_bio_bh_io_sync;
2675 	bio->bi_private = bh;
2676 
2677 	bio_get(bio);
2678 	submit_bio(rw, bio);
2679 
2680 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2681 		ret = -EOPNOTSUPP;
2682 
2683 	bio_put(bio);
2684 	return ret;
2685 }
2686 
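/*
 * Illustrative sketch: the usual pattern for a one-off synchronous read
 * of a mapped buffer (this mirrors what __bread() ends up doing; err is
 * the caller's status variable).  The caller already holds a reference
 * on the bh; end_buffer_read_sync() drops the extra reference taken
 * here and unlocks the buffer:
 *
 *	lock_buffer(bh);
 *	if (buffer_uptodate(bh)) {
 *		unlock_buffer(bh);
 *	} else {
 *		get_bh(bh);
 *		bh->b_end_io = end_buffer_read_sync;
 *		submit_bh(READ, bh);
 *		wait_on_buffer(bh);
 *		if (!buffer_uptodate(bh))
 *			err = -EIO;
 *	}
 */
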
2687 /**
2688  * ll_rw_block: low-level access to block devices (DEPRECATED)
2689  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2690  * @nr: number of &struct buffer_heads in the array
2691  * @bhs: array of pointers to &struct buffer_head
2692  *
2693  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2694  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2695  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2696  * are sent to disk. The fourth %READA option is described in the documentation
2697  * for generic_make_request() which ll_rw_block() calls.
2698  *
2699  * This function drops any buffer that it cannot get a lock on (with the
2700  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2701  * clean when doing a write request, and any buffer that appears to be
2702  * up-to-date when doing a read request.  Further it marks as clean buffers that
2703  * are processed for writing (the buffer cache won't assume that they are
2704  * actually clean until the buffer gets unlocked).
2705  *
2706  * ll_rw_block sets b_end_io to simple completion handler that marks
2707  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2708  * any waiters.
2709  *
2710  * All of the buffers must be for the same device, and must also be a
2711  * multiple of the current approved size for the device.
2712  */
2713 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2714 {
2715 	int i;
2716 
2717 	for (i = 0; i < nr; i++) {
2718 		struct buffer_head *bh = bhs[i];
2719 
2720 		if (rw == SWRITE)
2721 			lock_buffer(bh);
2722 		else if (test_set_buffer_locked(bh))
2723 			continue;
2724 
2725 		if (rw == WRITE || rw == SWRITE) {
2726 			if (test_clear_buffer_dirty(bh)) {
2727 				bh->b_end_io = end_buffer_write_sync;
2728 				get_bh(bh);
2729 				submit_bh(WRITE, bh);
2730 				continue;
2731 			}
2732 		} else {
2733 			if (!buffer_uptodate(bh)) {
2734 				bh->b_end_io = end_buffer_read_sync;
2735 				get_bh(bh);
2736 				submit_bh(rw, bh);
2737 				continue;
2738 			}
2739 		}
2740 		unlock_buffer(bh);
2741 	}
2742 }
2743 
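/*
 * Illustrative sketch: a typical ll_rw_block() caller batches the
 * submissions and then waits on each buffer, checking the result
 * (bhs[], nr, i and err are the caller's locals):
 *
 *	ll_rw_block(READ, nr, bhs);
 *	for (i = 0; i < nr; i++) {
 *		wait_on_buffer(bhs[i]);
 *		if (!buffer_uptodate(bhs[i]))
 *			err = -EIO;
 *	}
 */
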
2744 /*
2745  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2746  * and then start new I/O and then wait upon it.  The caller must have a ref on
2747  * the buffer_head.
2748  */
2749 int sync_dirty_buffer(struct buffer_head *bh)
2750 {
2751 	int ret = 0;
2752 
2753 	WARN_ON(atomic_read(&bh->b_count) < 1);
2754 	lock_buffer(bh);
2755 	if (test_clear_buffer_dirty(bh)) {
2756 		get_bh(bh);
2757 		bh->b_end_io = end_buffer_write_sync;
2758 		ret = submit_bh(WRITE, bh);
2759 		wait_on_buffer(bh);
2760 		if (buffer_eopnotsupp(bh)) {
2761 			clear_buffer_eopnotsupp(bh);
2762 			ret = -EOPNOTSUPP;
2763 		}
2764 		if (!ret && !buffer_uptodate(bh))
2765 			ret = -EIO;
2766 	} else {
2767 		unlock_buffer(bh);
2768 	}
2769 	return ret;
2770 }
2771 
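/*
 * Illustrative sketch: the classic sync_dirty_buffer() caller is a
 * filesystem flushing a modified metadata block and caring about the
 * result (new_super is a hypothetical in-memory copy):
 *
 *	memcpy(bh->b_data, &new_super, sizeof(new_super));
 *	mark_buffer_dirty(bh);
 *	err = sync_dirty_buffer(bh);
 *	if (err)
 *		printk(KERN_ERR "myfs: metadata write failed\n");
 */
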
2772 /*
2773  * try_to_free_buffers() checks if all the buffers on this particular page
2774  * are unused, and releases them if so.
2775  *
2776  * Exclusion against try_to_free_buffers may be obtained by either
2777  * locking the page or by holding its mapping's private_lock.
2778  *
2779  * If the page is dirty but all the buffers are clean then we need to
2780  * be sure to mark the page clean as well.  This is because the page
2781  * may be against a block device, and a later reattachment of buffers
2782  * to a dirty page will set *all* buffers dirty, which would corrupt
2783  * filesystem data on the same device.
2784  *
2785  * The same applies to regular filesystem pages: if all the buffers are
2786  * clean then we set the page clean and proceed.  To do that, we require
2787  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2788  * private_lock.
2789  *
2790  * try_to_free_buffers() is non-blocking.
2791  */
2792 static inline int buffer_busy(struct buffer_head *bh)
2793 {
2794 	return atomic_read(&bh->b_count) |
2795 		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2796 }
2797 
2798 static int
2799 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2800 {
2801 	struct buffer_head *head = page_buffers(page);
2802 	struct buffer_head *bh;
2803 
2804 	bh = head;
2805 	do {
2806 		if (buffer_write_io_error(bh) && page->mapping)
2807 			set_bit(AS_EIO, &page->mapping->flags);
2808 		if (buffer_busy(bh))
2809 			goto failed;
2810 		bh = bh->b_this_page;
2811 	} while (bh != head);
2812 
2813 	do {
2814 		struct buffer_head *next = bh->b_this_page;
2815 
2816 		if (!list_empty(&bh->b_assoc_buffers))
2817 			__remove_assoc_queue(bh);
2818 		bh = next;
2819 	} while (bh != head);
2820 	*buffers_to_free = head;
2821 	__clear_page_buffers(page);
2822 	return 1;
2823 failed:
2824 	return 0;
2825 }
2826 
2827 int try_to_free_buffers(struct page *page)
2828 {
2829 	struct address_space * const mapping = page->mapping;
2830 	struct buffer_head *buffers_to_free = NULL;
2831 	int ret = 0;
2832 
2833 	BUG_ON(!PageLocked(page));
2834 	if (PageWriteback(page))
2835 		return 0;
2836 
2837 	if (mapping == NULL) {		/* can this still happen? */
2838 		ret = drop_buffers(page, &buffers_to_free);
2839 		goto out;
2840 	}
2841 
2842 	spin_lock(&mapping->private_lock);
2843 	ret = drop_buffers(page, &buffers_to_free);
2844 
2845 	/*
2846 	 * If the filesystem writes its buffers by hand (eg ext3)
2847 	 * then we can have clean buffers against a dirty page.  We
2848 	 * clean the page here; otherwise the VM will never notice
2849 	 * that the filesystem did any IO at all.
2850 	 *
2851 	 * Also, during truncate, discard_buffer will have marked all
2852 	 * the page's buffers clean.  We discover that here and clean
2853 	 * the page also.
2854 	 *
2855 	 * private_lock must be held over this entire operation in order
2856 	 * to synchronise against __set_page_dirty_buffers and prevent the
2857 	 * dirty bit from being lost.
2858 	 */
2859 	if (ret)
2860 		cancel_dirty_page(page, PAGE_CACHE_SIZE);
2861 	spin_unlock(&mapping->private_lock);
2862 out:
2863 	if (buffers_to_free) {
2864 		struct buffer_head *bh = buffers_to_free;
2865 
2866 		do {
2867 			struct buffer_head *next = bh->b_this_page;
2868 			free_buffer_head(bh);
2869 			bh = next;
2870 		} while (bh != buffers_to_free);
2871 	}
2872 	return ret;
2873 }
2874 EXPORT_SYMBOL(try_to_free_buffers);
2875 
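/*
 * Illustrative sketch (hypothetical myfs_* names): a filesystem that has
 * to veto buffer freeing in some cases (e.g. journalled buffers still in
 * flight) supplies a ->releasepage which falls back to
 * try_to_free_buffers():
 *
 *	static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
 *	{
 *		if (myfs_page_has_pinned_buffers(page))
 *			return 0;
 *		return try_to_free_buffers(page);
 *	}
 */
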
2876 void block_sync_page(struct page *page)
2877 {
2878 	struct address_space *mapping;
2879 
2880 	smp_mb();
2881 	mapping = page_mapping(page);
2882 	if (mapping)
2883 		blk_run_backing_dev(mapping->backing_dev_info, page);
2884 }
2885 
2886 /*
2887  * There are no bdflush tunables left.  But distributions are
2888  * still running obsolete flush daemons, so we terminate them here.
2889  *
2890  * Use of bdflush() is deprecated and will be removed in a future kernel.
2891  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2892  */
2893 asmlinkage long sys_bdflush(int func, long data)
2894 {
2895 	static int msg_count;
2896 
2897 	if (!capable(CAP_SYS_ADMIN))
2898 		return -EPERM;
2899 
2900 	if (msg_count < 5) {
2901 		msg_count++;
2902 		printk(KERN_INFO
2903 			"warning: process `%s' used the obsolete bdflush"
2904 			" system call\n", current->comm);
2905 		printk(KERN_INFO "Fix your initscripts?\n");
2906 	}
2907 
2908 	if (func == 1)
2909 		do_exit(0);
2910 	return 0;
2911 }
2912 
2913 /*
2914  * Buffer-head allocation
2915  */
2916 static struct kmem_cache *bh_cachep;
2917 
2918 /*
2919  * Once the number of bh's in the machine exceeds this level, we start
2920  * stripping them in writeback.
2921  */
2922 static int max_buffer_heads;
2923 
2924 int buffer_heads_over_limit;
2925 
2926 struct bh_accounting {
2927 	int nr;			/* Number of live bh's */
2928 	int ratelimit;		/* Limit cacheline bouncing */
2929 };
2930 
2931 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2932 
2933 static void recalc_bh_state(void)
2934 {
2935 	int i;
2936 	int tot = 0;
2937 
2938 	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2939 		return;
2940 	__get_cpu_var(bh_accounting).ratelimit = 0;
2941 	for_each_online_cpu(i)
2942 		tot += per_cpu(bh_accounting, i).nr;
2943 	buffer_heads_over_limit = (tot > max_buffer_heads);
2944 }
2945 
2946 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
2947 {
2948 	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
2949 	if (ret) {
2950 		get_cpu_var(bh_accounting).nr++;
2951 		recalc_bh_state();
2952 		put_cpu_var(bh_accounting);
2953 	}
2954 	return ret;
2955 }
2956 EXPORT_SYMBOL(alloc_buffer_head);
2957 
2958 void free_buffer_head(struct buffer_head *bh)
2959 {
2960 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
2961 	kmem_cache_free(bh_cachep, bh);
2962 	get_cpu_var(bh_accounting).nr--;
2963 	recalc_bh_state();
2964 	put_cpu_var(bh_accounting);
2965 }
2966 EXPORT_SYMBOL(free_buffer_head);
2967 
2968 static void
2969 init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
2970 {
2971 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2972 			    SLAB_CTOR_CONSTRUCTOR) {
2973 		struct buffer_head * bh = (struct buffer_head *)data;
2974 
2975 		memset(bh, 0, sizeof(*bh));
2976 		INIT_LIST_HEAD(&bh->b_assoc_buffers);
2977 	}
2978 }
2979 
2980 static void buffer_exit_cpu(int cpu)
2981 {
2982 	int i;
2983 	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
2984 
2985 	for (i = 0; i < BH_LRU_SIZE; i++) {
2986 		brelse(b->bhs[i]);
2987 		b->bhs[i] = NULL;
2988 	}
2989 	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
2990 	per_cpu(bh_accounting, cpu).nr = 0;
2991 	put_cpu_var(bh_accounting);
2992 }
2993 
2994 static int buffer_cpu_notify(struct notifier_block *self,
2995 			      unsigned long action, void *hcpu)
2996 {
2997 	if (action == CPU_DEAD)
2998 		buffer_exit_cpu((unsigned long)hcpu);
2999 	return NOTIFY_OK;
3000 }
3001 
3002 void __init buffer_init(void)
3003 {
3004 	int nrpages;
3005 
3006 	bh_cachep = kmem_cache_create("buffer_head",
3007 					sizeof(struct buffer_head), 0,
3008 					(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3009 					SLAB_MEM_SPREAD),
3010 					init_buffer_head,
3011 					NULL);
3012 
3013 	/*
3014 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3015 	 */
3016 	nrpages = (nr_free_buffer_pages() * 10) / 100;
3017 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3018 	hotcpu_notifier(buffer_cpu_notify, 0);
3019 }
3020 
3021 EXPORT_SYMBOL(__bforget);
3022 EXPORT_SYMBOL(__brelse);
3023 EXPORT_SYMBOL(__wait_on_buffer);
3024 EXPORT_SYMBOL(block_commit_write);
3025 EXPORT_SYMBOL(block_prepare_write);
3026 EXPORT_SYMBOL(block_read_full_page);
3027 EXPORT_SYMBOL(block_sync_page);
3028 EXPORT_SYMBOL(block_truncate_page);
3029 EXPORT_SYMBOL(block_write_full_page);
3030 EXPORT_SYMBOL(cont_prepare_write);
3031 EXPORT_SYMBOL(end_buffer_read_sync);
3032 EXPORT_SYMBOL(end_buffer_write_sync);
3033 EXPORT_SYMBOL(file_fsync);
3034 EXPORT_SYMBOL(fsync_bdev);
3035 EXPORT_SYMBOL(generic_block_bmap);
3036 EXPORT_SYMBOL(generic_commit_write);
3037 EXPORT_SYMBOL(generic_cont_expand);
3038 EXPORT_SYMBOL(generic_cont_expand_simple);
3039 EXPORT_SYMBOL(init_buffer);
3040 EXPORT_SYMBOL(invalidate_bdev);
3041 EXPORT_SYMBOL(ll_rw_block);
3042 EXPORT_SYMBOL(mark_buffer_dirty);
3043 EXPORT_SYMBOL(submit_bh);
3044 EXPORT_SYMBOL(sync_dirty_buffer);
3045 EXPORT_SYMBOL(unlock_buffer);
3046