xref: /linux/fs/buffer.c (revision cd354f1ae75e6466a7e31b727faede57a1f89ca5)
1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6 
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20 
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/smp_lock.h>
28 #include <linux/capability.h>
29 #include <linux/blkdev.h>
30 #include <linux/file.h>
31 #include <linux/quotaops.h>
32 #include <linux/highmem.h>
33 #include <linux/module.h>
34 #include <linux/writeback.h>
35 #include <linux/hash.h>
36 #include <linux/suspend.h>
37 #include <linux/buffer_head.h>
38 #include <linux/task_io_accounting_ops.h>
39 #include <linux/bio.h>
40 #include <linux/notifier.h>
41 #include <linux/cpu.h>
42 #include <linux/bitops.h>
43 #include <linux/mpage.h>
44 #include <linux/bit_spinlock.h>
45 
46 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47 static void invalidate_bh_lrus(void);
48 
49 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
50 
51 inline void
52 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
53 {
54 	bh->b_end_io = handler;
55 	bh->b_private = private;
56 }
57 
58 static int sync_buffer(void *word)
59 {
60 	struct block_device *bd;
61 	struct buffer_head *bh
62 		= container_of(word, struct buffer_head, b_state);
63 
64 	smp_mb();
65 	bd = bh->b_bdev;
66 	if (bd)
67 		blk_run_address_space(bd->bd_inode->i_mapping);
68 	io_schedule();
69 	return 0;
70 }
71 
72 void fastcall __lock_buffer(struct buffer_head *bh)
73 {
74 	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
75 							TASK_UNINTERRUPTIBLE);
76 }
77 EXPORT_SYMBOL(__lock_buffer);
78 
79 void fastcall unlock_buffer(struct buffer_head *bh)
80 {
81 	smp_mb__before_clear_bit();
82 	clear_buffer_locked(bh);
83 	smp_mb__after_clear_bit();
84 	wake_up_bit(&bh->b_state, BH_Lock);
85 }
86 
87 /*
88  * Block until a buffer comes unlocked.  This doesn't stop it
89  * from becoming locked again - you have to lock it yourself
90  * if you want to preserve its state.
91  */
92 void __wait_on_buffer(struct buffer_head * bh)
93 {
94 	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
95 }
96 
97 static void
98 __clear_page_buffers(struct page *page)
99 {
100 	ClearPagePrivate(page);
101 	set_page_private(page, 0);
102 	page_cache_release(page);
103 }
104 
105 static void buffer_io_error(struct buffer_head *bh)
106 {
107 	char b[BDEVNAME_SIZE];
108 
109 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
110 			bdevname(bh->b_bdev, b),
111 			(unsigned long long)bh->b_blocknr);
112 }
113 
114 /*
115  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
116  * unlock the buffer. This is what ll_rw_block uses too.
117  */
118 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
119 {
120 	if (uptodate) {
121 		set_buffer_uptodate(bh);
122 	} else {
123 		/* This happens, due to failed READA attempts. */
124 		clear_buffer_uptodate(bh);
125 	}
126 	unlock_buffer(bh);
127 	put_bh(bh);
128 }
129 
130 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
131 {
132 	char b[BDEVNAME_SIZE];
133 
134 	if (uptodate) {
135 		set_buffer_uptodate(bh);
136 	} else {
137 		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
138 			buffer_io_error(bh);
139 			printk(KERN_WARNING "lost page write due to "
140 					"I/O error on %s\n",
141 				       bdevname(bh->b_bdev, b));
142 		}
143 		set_buffer_write_io_error(bh);
144 		clear_buffer_uptodate(bh);
145 	}
146 	unlock_buffer(bh);
147 	put_bh(bh);
148 }
149 
150 /*
151  * Write out and wait upon all the dirty data associated with a block
152  * device via its mapping.  Does not take the superblock lock.
153  */
154 int sync_blockdev(struct block_device *bdev)
155 {
156 	int ret = 0;
157 
158 	if (bdev)
159 		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
160 	return ret;
161 }
162 EXPORT_SYMBOL(sync_blockdev);
163 
164 /*
165  * Write out and wait upon all dirty data associated with this
166  * device.   Filesystem data as well as the underlying block
167  * device.  Takes the superblock lock.
168  */
169 int fsync_bdev(struct block_device *bdev)
170 {
171 	struct super_block *sb = get_super(bdev);
172 	if (sb) {
173 		int res = fsync_super(sb);
174 		drop_super(sb);
175 		return res;
176 	}
177 	return sync_blockdev(bdev);
178 }
179 
180 /**
181  * freeze_bdev  --  lock a filesystem and force it into a consistent state
182  * @bdev:	blockdevice to lock
183  *
184  * This takes the block device bd_mount_sem to make sure no new mounts
185  * happen on bdev until thaw_bdev() is called.
186  * If a superblock is found on this device, we take the s_umount semaphore
187  * on it to make sure nobody unmounts until the snapshot creation is done.
188  */
189 struct super_block *freeze_bdev(struct block_device *bdev)
190 {
191 	struct super_block *sb;
192 
193 	down(&bdev->bd_mount_sem);
194 	sb = get_super(bdev);
195 	if (sb && !(sb->s_flags & MS_RDONLY)) {
196 		sb->s_frozen = SB_FREEZE_WRITE;
197 		smp_wmb();
198 
199 		__fsync_super(sb);
200 
201 		sb->s_frozen = SB_FREEZE_TRANS;
202 		smp_wmb();
203 
204 		sync_blockdev(sb->s_bdev);
205 
206 		if (sb->s_op->write_super_lockfs)
207 			sb->s_op->write_super_lockfs(sb);
208 	}
209 
210 	sync_blockdev(bdev);
211 	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
212 }
213 EXPORT_SYMBOL(freeze_bdev);
214 
215 /**
216  * thaw_bdev  -- unlock filesystem
217  * @bdev:	blockdevice to unlock
218  * @sb:		associated superblock
219  *
220  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
221  */
222 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
223 {
224 	if (sb) {
225 		BUG_ON(sb->s_bdev != bdev);
226 
227 		if (sb->s_op->unlockfs)
228 			sb->s_op->unlockfs(sb);
229 		sb->s_frozen = SB_UNFROZEN;
230 		smp_wmb();
231 		wake_up(&sb->s_wait_unfrozen);
232 		drop_super(sb);
233 	}
234 
235 	up(&bdev->bd_mount_sem);
236 }
237 EXPORT_SYMBOL(thaw_bdev);
238 
239 /*
240  * Various filesystems appear to want __find_get_block to be non-blocking.
241  * But it's the page lock which protects the buffers.  To get around this,
242  * we get exclusion from try_to_free_buffers with the blockdev mapping's
243  * private_lock.
244  *
245  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
246  * may be quite high.  This code could TryLock the page, and if that
247  * succeeds, there is no need to take private_lock. (But if
248  * private_lock is contended then so is mapping->tree_lock).
249  */
250 static struct buffer_head *
251 __find_get_block_slow(struct block_device *bdev, sector_t block)
252 {
253 	struct inode *bd_inode = bdev->bd_inode;
254 	struct address_space *bd_mapping = bd_inode->i_mapping;
255 	struct buffer_head *ret = NULL;
256 	pgoff_t index;
257 	struct buffer_head *bh;
258 	struct buffer_head *head;
259 	struct page *page;
260 	int all_mapped = 1;
261 
262 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
263 	page = find_get_page(bd_mapping, index);
264 	if (!page)
265 		goto out;
266 
267 	spin_lock(&bd_mapping->private_lock);
268 	if (!page_has_buffers(page))
269 		goto out_unlock;
270 	head = page_buffers(page);
271 	bh = head;
272 	do {
273 		if (bh->b_blocknr == block) {
274 			ret = bh;
275 			get_bh(bh);
276 			goto out_unlock;
277 		}
278 		if (!buffer_mapped(bh))
279 			all_mapped = 0;
280 		bh = bh->b_this_page;
281 	} while (bh != head);
282 
283 	/* we might be here because some of the buffers on this page are
284 	 * not mapped.  This is due to various races between
285 	 * file io on the block device and getblk.  It gets dealt with
286 	 * elsewhere, don't buffer_error if we had some unmapped buffers
287 	 */
288 	if (all_mapped) {
289 		printk("__find_get_block_slow() failed. "
290 			"block=%llu, b_blocknr=%llu\n",
291 			(unsigned long long)block,
292 			(unsigned long long)bh->b_blocknr);
293 		printk("b_state=0x%08lx, b_size=%zu\n",
294 			bh->b_state, bh->b_size);
295 		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
296 	}
297 out_unlock:
298 	spin_unlock(&bd_mapping->private_lock);
299 	page_cache_release(page);
300 out:
301 	return ret;
302 }
303 
304 /* If invalidate_buffers() will trash dirty buffers, it means some kind
305    of fs corruption is going on. Trashing dirty data always imply losing
306    information that was supposed to be just stored on the physical layer
307    by the user.
308 
309    Thus invalidate_buffers in general usage is not allwowed to trash
310    dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
311    be preserved.  These buffers are simply skipped.
312 
313    We also skip buffers which are still in use.  For example this can
314    happen if a userspace program is reading the block device.
315 
316    NOTE: In the case where the user removed a removable-media-disk even if
317    there's still dirty data not synced on disk (due a bug in the device driver
318    or due an error of the user), by not destroying the dirty buffers we could
319    generate corruption also on the next media inserted, thus a parameter is
320    necessary to handle this case in the most safe way possible (trying
321    to not corrupt also the new disk inserted with the data belonging to
322    the old now corrupted disk). Also for the ramdisk the natural thing
323    to do in order to release the ramdisk memory is to destroy dirty buffers.
324 
325    These are two special cases. Normal usage imply the device driver
326    to issue a sync on the device (without waiting I/O completion) and
327    then an invalidate_buffers call that doesn't trash dirty buffers.
328 
329    For handling cache coherency with the blkdev pagecache the 'update' case
330    is been introduced. It is needed to re-read from disk any pinned
331    buffer. NOTE: re-reading from disk is destructive so we can do it only
332    when we assume nobody is changing the buffercache under our I/O and when
333    we think the disk contains more recent information than the buffercache.
334    The update == 1 pass marks the buffers we need to update, the update == 2
335    pass does the actual I/O. */
336 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
337 {
338 	struct address_space *mapping = bdev->bd_inode->i_mapping;
339 
340 	if (mapping->nrpages == 0)
341 		return;
342 
343 	invalidate_bh_lrus();
344 	/*
345 	 * FIXME: what about destroy_dirty_buffers?
346 	 * We really want to use invalidate_inode_pages2() for
347 	 * that, but not until that's cleaned up.
348 	 */
349 	invalidate_mapping_pages(mapping, 0, -1);
350 }
351 
352 /*
353  * Kick pdflush then try to free up some ZONE_NORMAL memory.
354  */
355 static void free_more_memory(void)
356 {
357 	struct zone **zones;
358 	pg_data_t *pgdat;
359 
360 	wakeup_pdflush(1024);
361 	yield();
362 
363 	for_each_online_pgdat(pgdat) {
364 		zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
365 		if (*zones)
366 			try_to_free_pages(zones, GFP_NOFS);
367 	}
368 }
369 
370 /*
371  * I/O completion handler for block_read_full_page() - pages
372  * which come unlocked at the end of I/O.
373  */
374 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
375 {
376 	unsigned long flags;
377 	struct buffer_head *first;
378 	struct buffer_head *tmp;
379 	struct page *page;
380 	int page_uptodate = 1;
381 
382 	BUG_ON(!buffer_async_read(bh));
383 
384 	page = bh->b_page;
385 	if (uptodate) {
386 		set_buffer_uptodate(bh);
387 	} else {
388 		clear_buffer_uptodate(bh);
389 		if (printk_ratelimit())
390 			buffer_io_error(bh);
391 		SetPageError(page);
392 	}
393 
394 	/*
395 	 * Be _very_ careful from here on. Bad things can happen if
396 	 * two buffer heads end IO at almost the same time and both
397 	 * decide that the page is now completely done.
398 	 */
399 	first = page_buffers(page);
400 	local_irq_save(flags);
401 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
402 	clear_buffer_async_read(bh);
403 	unlock_buffer(bh);
404 	tmp = bh;
405 	do {
406 		if (!buffer_uptodate(tmp))
407 			page_uptodate = 0;
408 		if (buffer_async_read(tmp)) {
409 			BUG_ON(!buffer_locked(tmp));
410 			goto still_busy;
411 		}
412 		tmp = tmp->b_this_page;
413 	} while (tmp != bh);
414 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
415 	local_irq_restore(flags);
416 
417 	/*
418 	 * If none of the buffers had errors and they are all
419 	 * uptodate then we can set the page uptodate.
420 	 */
421 	if (page_uptodate && !PageError(page))
422 		SetPageUptodate(page);
423 	unlock_page(page);
424 	return;
425 
426 still_busy:
427 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
428 	local_irq_restore(flags);
429 	return;
430 }
431 
432 /*
433  * Completion handler for block_write_full_page() - pages which are unlocked
434  * during I/O, and which have PageWriteback cleared upon I/O completion.
435  */
436 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
437 {
438 	char b[BDEVNAME_SIZE];
439 	unsigned long flags;
440 	struct buffer_head *first;
441 	struct buffer_head *tmp;
442 	struct page *page;
443 
444 	BUG_ON(!buffer_async_write(bh));
445 
446 	page = bh->b_page;
447 	if (uptodate) {
448 		set_buffer_uptodate(bh);
449 	} else {
450 		if (printk_ratelimit()) {
451 			buffer_io_error(bh);
452 			printk(KERN_WARNING "lost page write due to "
453 					"I/O error on %s\n",
454 			       bdevname(bh->b_bdev, b));
455 		}
456 		set_bit(AS_EIO, &page->mapping->flags);
457 		set_buffer_write_io_error(bh);
458 		clear_buffer_uptodate(bh);
459 		SetPageError(page);
460 	}
461 
462 	first = page_buffers(page);
463 	local_irq_save(flags);
464 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
465 
466 	clear_buffer_async_write(bh);
467 	unlock_buffer(bh);
468 	tmp = bh->b_this_page;
469 	while (tmp != bh) {
470 		if (buffer_async_write(tmp)) {
471 			BUG_ON(!buffer_locked(tmp));
472 			goto still_busy;
473 		}
474 		tmp = tmp->b_this_page;
475 	}
476 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
477 	local_irq_restore(flags);
478 	end_page_writeback(page);
479 	return;
480 
481 still_busy:
482 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
483 	local_irq_restore(flags);
484 	return;
485 }
486 
487 /*
488  * If a page's buffers are under async readin (end_buffer_async_read
489  * completion) then there is a possibility that another thread of
490  * control could lock one of the buffers after it has completed
491  * but while some of the other buffers have not completed.  This
492  * locked buffer would confuse end_buffer_async_read() into not unlocking
493  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
494  * that this buffer is not under async I/O.
495  *
496  * The page comes unlocked when it has no locked buffer_async buffers
497  * left.
498  *
499  * PageLocked prevents anyone starting new async I/O reads any of
500  * the buffers.
501  *
502  * PageWriteback is used to prevent simultaneous writeout of the same
503  * page.
504  *
505  * PageLocked prevents anyone from starting writeback of a page which is
506  * under read I/O (PageWriteback is only ever set against a locked page).
507  */
508 static void mark_buffer_async_read(struct buffer_head *bh)
509 {
510 	bh->b_end_io = end_buffer_async_read;
511 	set_buffer_async_read(bh);
512 }
513 
514 void mark_buffer_async_write(struct buffer_head *bh)
515 {
516 	bh->b_end_io = end_buffer_async_write;
517 	set_buffer_async_write(bh);
518 }
519 EXPORT_SYMBOL(mark_buffer_async_write);
520 
521 
522 /*
523  * fs/buffer.c contains helper functions for buffer-backed address space's
524  * fsync functions.  A common requirement for buffer-based filesystems is
525  * that certain data from the backing blockdev needs to be written out for
526  * a successful fsync().  For example, ext2 indirect blocks need to be
527  * written back and waited upon before fsync() returns.
528  *
529  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
530  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
531  * management of a list of dependent buffers at ->i_mapping->private_list.
532  *
533  * Locking is a little subtle: try_to_free_buffers() will remove buffers
534  * from their controlling inode's queue when they are being freed.  But
535  * try_to_free_buffers() will be operating against the *blockdev* mapping
536  * at the time, not against the S_ISREG file which depends on those buffers.
537  * So the locking for private_list is via the private_lock in the address_space
538  * which backs the buffers.  Which is different from the address_space
539  * against which the buffers are listed.  So for a particular address_space,
540  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
541  * mapping->private_list will always be protected by the backing blockdev's
542  * ->private_lock.
543  *
544  * Which introduces a requirement: all buffers on an address_space's
545  * ->private_list must be from the same address_space: the blockdev's.
546  *
547  * address_spaces which do not place buffers at ->private_list via these
548  * utility functions are free to use private_lock and private_list for
549  * whatever they want.  The only requirement is that list_empty(private_list)
550  * be true at clear_inode() time.
551  *
552  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
553  * filesystems should do that.  invalidate_inode_buffers() should just go
554  * BUG_ON(!list_empty).
555  *
556  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
557  * take an address_space, not an inode.  And it should be called
558  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
559  * queued up.
560  *
561  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
562  * list if it is already on a list.  Because if the buffer is on a list,
563  * it *must* already be on the right one.  If not, the filesystem is being
564  * silly.  This will save a ton of locking.  But first we have to ensure
565  * that buffers are taken *off* the old inode's list when they are freed
566  * (presumably in truncate).  That requires careful auditing of all
567  * filesystems (do it inside bforget()).  It could also be done by bringing
568  * b_inode back.
569  */
570 
571 /*
572  * The buffer's backing address_space's private_lock must be held
573  */
574 static inline void __remove_assoc_queue(struct buffer_head *bh)
575 {
576 	list_del_init(&bh->b_assoc_buffers);
577 	WARN_ON(!bh->b_assoc_map);
578 	if (buffer_write_io_error(bh))
579 		set_bit(AS_EIO, &bh->b_assoc_map->flags);
580 	bh->b_assoc_map = NULL;
581 }
582 
583 int inode_has_buffers(struct inode *inode)
584 {
585 	return !list_empty(&inode->i_data.private_list);
586 }
587 
588 /*
589  * osync is designed to support O_SYNC io.  It waits synchronously for
590  * all already-submitted IO to complete, but does not queue any new
591  * writes to the disk.
592  *
593  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
594  * you dirty the buffers, and then use osync_inode_buffers to wait for
595  * completion.  Any other dirty buffers which are not yet queued for
596  * write will not be flushed to disk by the osync.
597  */
598 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
599 {
600 	struct buffer_head *bh;
601 	struct list_head *p;
602 	int err = 0;
603 
604 	spin_lock(lock);
605 repeat:
606 	list_for_each_prev(p, list) {
607 		bh = BH_ENTRY(p);
608 		if (buffer_locked(bh)) {
609 			get_bh(bh);
610 			spin_unlock(lock);
611 			wait_on_buffer(bh);
612 			if (!buffer_uptodate(bh))
613 				err = -EIO;
614 			brelse(bh);
615 			spin_lock(lock);
616 			goto repeat;
617 		}
618 	}
619 	spin_unlock(lock);
620 	return err;
621 }
622 
623 /**
624  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
625  *                        buffers
626  * @mapping: the mapping which wants those buffers written
627  *
628  * Starts I/O against the buffers at mapping->private_list, and waits upon
629  * that I/O.
630  *
631  * Basically, this is a convenience function for fsync().
632  * @mapping is a file or directory which needs those buffers to be written for
633  * a successful fsync().
634  */
635 int sync_mapping_buffers(struct address_space *mapping)
636 {
637 	struct address_space *buffer_mapping = mapping->assoc_mapping;
638 
639 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
640 		return 0;
641 
642 	return fsync_buffers_list(&buffer_mapping->private_lock,
643 					&mapping->private_list);
644 }
645 EXPORT_SYMBOL(sync_mapping_buffers);
646 
647 /*
648  * Called when we've recently written block `bblock', and it is known that
649  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
650  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
651  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
652  */
653 void write_boundary_block(struct block_device *bdev,
654 			sector_t bblock, unsigned blocksize)
655 {
656 	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
657 	if (bh) {
658 		if (buffer_dirty(bh))
659 			ll_rw_block(WRITE, 1, &bh);
660 		put_bh(bh);
661 	}
662 }
663 
664 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
665 {
666 	struct address_space *mapping = inode->i_mapping;
667 	struct address_space *buffer_mapping = bh->b_page->mapping;
668 
669 	mark_buffer_dirty(bh);
670 	if (!mapping->assoc_mapping) {
671 		mapping->assoc_mapping = buffer_mapping;
672 	} else {
673 		BUG_ON(mapping->assoc_mapping != buffer_mapping);
674 	}
675 	if (list_empty(&bh->b_assoc_buffers)) {
676 		spin_lock(&buffer_mapping->private_lock);
677 		list_move_tail(&bh->b_assoc_buffers,
678 				&mapping->private_list);
679 		bh->b_assoc_map = mapping;
680 		spin_unlock(&buffer_mapping->private_lock);
681 	}
682 }
683 EXPORT_SYMBOL(mark_buffer_dirty_inode);
684 
685 /*
686  * Add a page to the dirty page list.
687  *
688  * It is a sad fact of life that this function is called from several places
689  * deeply under spinlocking.  It may not sleep.
690  *
691  * If the page has buffers, the uptodate buffers are set dirty, to preserve
692  * dirty-state coherency between the page and the buffers.  It the page does
693  * not have buffers then when they are later attached they will all be set
694  * dirty.
695  *
696  * The buffers are dirtied before the page is dirtied.  There's a small race
697  * window in which a writepage caller may see the page cleanness but not the
698  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
699  * before the buffers, a concurrent writepage caller could clear the page dirty
700  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
701  * page on the dirty page list.
702  *
703  * We use private_lock to lock against try_to_free_buffers while using the
704  * page's buffer list.  Also use this to protect against clean buffers being
705  * added to the page after it was set dirty.
706  *
707  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
708  * address_space though.
709  */
710 int __set_page_dirty_buffers(struct page *page)
711 {
712 	struct address_space * const mapping = page_mapping(page);
713 
714 	if (unlikely(!mapping))
715 		return !TestSetPageDirty(page);
716 
717 	spin_lock(&mapping->private_lock);
718 	if (page_has_buffers(page)) {
719 		struct buffer_head *head = page_buffers(page);
720 		struct buffer_head *bh = head;
721 
722 		do {
723 			set_buffer_dirty(bh);
724 			bh = bh->b_this_page;
725 		} while (bh != head);
726 	}
727 	spin_unlock(&mapping->private_lock);
728 
729 	if (TestSetPageDirty(page))
730 		return 0;
731 
732 	write_lock_irq(&mapping->tree_lock);
733 	if (page->mapping) {	/* Race with truncate? */
734 		if (mapping_cap_account_dirty(mapping)) {
735 			__inc_zone_page_state(page, NR_FILE_DIRTY);
736 			task_io_account_write(PAGE_CACHE_SIZE);
737 		}
738 		radix_tree_tag_set(&mapping->page_tree,
739 				page_index(page), PAGECACHE_TAG_DIRTY);
740 	}
741 	write_unlock_irq(&mapping->tree_lock);
742 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
743 	return 1;
744 }
745 EXPORT_SYMBOL(__set_page_dirty_buffers);
746 
747 /*
748  * Write out and wait upon a list of buffers.
749  *
750  * We have conflicting pressures: we want to make sure that all
751  * initially dirty buffers get waited on, but that any subsequently
752  * dirtied buffers don't.  After all, we don't want fsync to last
753  * forever if somebody is actively writing to the file.
754  *
755  * Do this in two main stages: first we copy dirty buffers to a
756  * temporary inode list, queueing the writes as we go.  Then we clean
757  * up, waiting for those writes to complete.
758  *
759  * During this second stage, any subsequent updates to the file may end
760  * up refiling the buffer on the original inode's dirty list again, so
761  * there is a chance we will end up with a buffer queued for write but
762  * not yet completed on that list.  So, as a final cleanup we go through
763  * the osync code to catch these locked, dirty buffers without requeuing
764  * any newly dirty buffers for write.
765  */
766 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
767 {
768 	struct buffer_head *bh;
769 	struct list_head tmp;
770 	int err = 0, err2;
771 
772 	INIT_LIST_HEAD(&tmp);
773 
774 	spin_lock(lock);
775 	while (!list_empty(list)) {
776 		bh = BH_ENTRY(list->next);
777 		__remove_assoc_queue(bh);
778 		if (buffer_dirty(bh) || buffer_locked(bh)) {
779 			list_add(&bh->b_assoc_buffers, &tmp);
780 			if (buffer_dirty(bh)) {
781 				get_bh(bh);
782 				spin_unlock(lock);
783 				/*
784 				 * Ensure any pending I/O completes so that
785 				 * ll_rw_block() actually writes the current
786 				 * contents - it is a noop if I/O is still in
787 				 * flight on potentially older contents.
788 				 */
789 				ll_rw_block(SWRITE, 1, &bh);
790 				brelse(bh);
791 				spin_lock(lock);
792 			}
793 		}
794 	}
795 
796 	while (!list_empty(&tmp)) {
797 		bh = BH_ENTRY(tmp.prev);
798 		list_del_init(&bh->b_assoc_buffers);
799 		get_bh(bh);
800 		spin_unlock(lock);
801 		wait_on_buffer(bh);
802 		if (!buffer_uptodate(bh))
803 			err = -EIO;
804 		brelse(bh);
805 		spin_lock(lock);
806 	}
807 
808 	spin_unlock(lock);
809 	err2 = osync_buffers_list(lock, list);
810 	if (err)
811 		return err;
812 	else
813 		return err2;
814 }
815 
816 /*
817  * Invalidate any and all dirty buffers on a given inode.  We are
818  * probably unmounting the fs, but that doesn't mean we have already
819  * done a sync().  Just drop the buffers from the inode list.
820  *
821  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
822  * assumes that all the buffers are against the blockdev.  Not true
823  * for reiserfs.
824  */
825 void invalidate_inode_buffers(struct inode *inode)
826 {
827 	if (inode_has_buffers(inode)) {
828 		struct address_space *mapping = &inode->i_data;
829 		struct list_head *list = &mapping->private_list;
830 		struct address_space *buffer_mapping = mapping->assoc_mapping;
831 
832 		spin_lock(&buffer_mapping->private_lock);
833 		while (!list_empty(list))
834 			__remove_assoc_queue(BH_ENTRY(list->next));
835 		spin_unlock(&buffer_mapping->private_lock);
836 	}
837 }
838 
839 /*
840  * Remove any clean buffers from the inode's buffer list.  This is called
841  * when we're trying to free the inode itself.  Those buffers can pin it.
842  *
843  * Returns true if all buffers were removed.
844  */
845 int remove_inode_buffers(struct inode *inode)
846 {
847 	int ret = 1;
848 
849 	if (inode_has_buffers(inode)) {
850 		struct address_space *mapping = &inode->i_data;
851 		struct list_head *list = &mapping->private_list;
852 		struct address_space *buffer_mapping = mapping->assoc_mapping;
853 
854 		spin_lock(&buffer_mapping->private_lock);
855 		while (!list_empty(list)) {
856 			struct buffer_head *bh = BH_ENTRY(list->next);
857 			if (buffer_dirty(bh)) {
858 				ret = 0;
859 				break;
860 			}
861 			__remove_assoc_queue(bh);
862 		}
863 		spin_unlock(&buffer_mapping->private_lock);
864 	}
865 	return ret;
866 }
867 
868 /*
869  * Create the appropriate buffers when given a page for data area and
870  * the size of each buffer.. Use the bh->b_this_page linked list to
871  * follow the buffers created.  Return NULL if unable to create more
872  * buffers.
873  *
874  * The retry flag is used to differentiate async IO (paging, swapping)
875  * which may not fail from ordinary buffer allocations.
876  */
877 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
878 		int retry)
879 {
880 	struct buffer_head *bh, *head;
881 	long offset;
882 
883 try_again:
884 	head = NULL;
885 	offset = PAGE_SIZE;
886 	while ((offset -= size) >= 0) {
887 		bh = alloc_buffer_head(GFP_NOFS);
888 		if (!bh)
889 			goto no_grow;
890 
891 		bh->b_bdev = NULL;
892 		bh->b_this_page = head;
893 		bh->b_blocknr = -1;
894 		head = bh;
895 
896 		bh->b_state = 0;
897 		atomic_set(&bh->b_count, 0);
898 		bh->b_private = NULL;
899 		bh->b_size = size;
900 
901 		/* Link the buffer to its page */
902 		set_bh_page(bh, page, offset);
903 
904 		init_buffer(bh, NULL, NULL);
905 	}
906 	return head;
907 /*
908  * In case anything failed, we just free everything we got.
909  */
910 no_grow:
911 	if (head) {
912 		do {
913 			bh = head;
914 			head = head->b_this_page;
915 			free_buffer_head(bh);
916 		} while (head);
917 	}
918 
919 	/*
920 	 * Return failure for non-async IO requests.  Async IO requests
921 	 * are not allowed to fail, so we have to wait until buffer heads
922 	 * become available.  But we don't want tasks sleeping with
923 	 * partially complete buffers, so all were released above.
924 	 */
925 	if (!retry)
926 		return NULL;
927 
928 	/* We're _really_ low on memory. Now we just
929 	 * wait for old buffer heads to become free due to
930 	 * finishing IO.  Since this is an async request and
931 	 * the reserve list is empty, we're sure there are
932 	 * async buffer heads in use.
933 	 */
934 	free_more_memory();
935 	goto try_again;
936 }
937 EXPORT_SYMBOL_GPL(alloc_page_buffers);
938 
939 static inline void
940 link_dev_buffers(struct page *page, struct buffer_head *head)
941 {
942 	struct buffer_head *bh, *tail;
943 
944 	bh = head;
945 	do {
946 		tail = bh;
947 		bh = bh->b_this_page;
948 	} while (bh);
949 	tail->b_this_page = head;
950 	attach_page_buffers(page, head);
951 }
952 
953 /*
954  * Initialise the state of a blockdev page's buffers.
955  */
956 static void
957 init_page_buffers(struct page *page, struct block_device *bdev,
958 			sector_t block, int size)
959 {
960 	struct buffer_head *head = page_buffers(page);
961 	struct buffer_head *bh = head;
962 	int uptodate = PageUptodate(page);
963 
964 	do {
965 		if (!buffer_mapped(bh)) {
966 			init_buffer(bh, NULL, NULL);
967 			bh->b_bdev = bdev;
968 			bh->b_blocknr = block;
969 			if (uptodate)
970 				set_buffer_uptodate(bh);
971 			set_buffer_mapped(bh);
972 		}
973 		block++;
974 		bh = bh->b_this_page;
975 	} while (bh != head);
976 }
977 
978 /*
979  * Create the page-cache page that contains the requested block.
980  *
981  * This is user purely for blockdev mappings.
982  */
983 static struct page *
984 grow_dev_page(struct block_device *bdev, sector_t block,
985 		pgoff_t index, int size)
986 {
987 	struct inode *inode = bdev->bd_inode;
988 	struct page *page;
989 	struct buffer_head *bh;
990 
991 	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
992 	if (!page)
993 		return NULL;
994 
995 	BUG_ON(!PageLocked(page));
996 
997 	if (page_has_buffers(page)) {
998 		bh = page_buffers(page);
999 		if (bh->b_size == size) {
1000 			init_page_buffers(page, bdev, block, size);
1001 			return page;
1002 		}
1003 		if (!try_to_free_buffers(page))
1004 			goto failed;
1005 	}
1006 
1007 	/*
1008 	 * Allocate some buffers for this page
1009 	 */
1010 	bh = alloc_page_buffers(page, size, 0);
1011 	if (!bh)
1012 		goto failed;
1013 
1014 	/*
1015 	 * Link the page to the buffers and initialise them.  Take the
1016 	 * lock to be atomic wrt __find_get_block(), which does not
1017 	 * run under the page lock.
1018 	 */
1019 	spin_lock(&inode->i_mapping->private_lock);
1020 	link_dev_buffers(page, bh);
1021 	init_page_buffers(page, bdev, block, size);
1022 	spin_unlock(&inode->i_mapping->private_lock);
1023 	return page;
1024 
1025 failed:
1026 	BUG();
1027 	unlock_page(page);
1028 	page_cache_release(page);
1029 	return NULL;
1030 }
1031 
1032 /*
1033  * Create buffers for the specified block device block's page.  If
1034  * that page was dirty, the buffers are set dirty also.
1035  *
1036  * Except that's a bug.  Attaching dirty buffers to a dirty
1037  * blockdev's page can result in filesystem corruption, because
1038  * some of those buffers may be aliases of filesystem data.
1039  * grow_dev_page() will go BUG() if this happens.
1040  */
1041 static int
1042 grow_buffers(struct block_device *bdev, sector_t block, int size)
1043 {
1044 	struct page *page;
1045 	pgoff_t index;
1046 	int sizebits;
1047 
1048 	sizebits = -1;
1049 	do {
1050 		sizebits++;
1051 	} while ((size << sizebits) < PAGE_SIZE);
1052 
1053 	index = block >> sizebits;
1054 
1055 	/*
1056 	 * Check for a block which wants to lie outside our maximum possible
1057 	 * pagecache index.  (this comparison is done using sector_t types).
1058 	 */
1059 	if (unlikely(index != block >> sizebits)) {
1060 		char b[BDEVNAME_SIZE];
1061 
1062 		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1063 			"device %s\n",
1064 			__FUNCTION__, (unsigned long long)block,
1065 			bdevname(bdev, b));
1066 		return -EIO;
1067 	}
1068 	block = index << sizebits;
1069 	/* Create a page with the proper size buffers.. */
1070 	page = grow_dev_page(bdev, block, index, size);
1071 	if (!page)
1072 		return 0;
1073 	unlock_page(page);
1074 	page_cache_release(page);
1075 	return 1;
1076 }
1077 
1078 static struct buffer_head *
1079 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1080 {
1081 	/* Size must be multiple of hard sectorsize */
1082 	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1083 			(size < 512 || size > PAGE_SIZE))) {
1084 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1085 					size);
1086 		printk(KERN_ERR "hardsect size: %d\n",
1087 					bdev_hardsect_size(bdev));
1088 
1089 		dump_stack();
1090 		return NULL;
1091 	}
1092 
1093 	for (;;) {
1094 		struct buffer_head * bh;
1095 		int ret;
1096 
1097 		bh = __find_get_block(bdev, block, size);
1098 		if (bh)
1099 			return bh;
1100 
1101 		ret = grow_buffers(bdev, block, size);
1102 		if (ret < 0)
1103 			return NULL;
1104 		if (ret == 0)
1105 			free_more_memory();
1106 	}
1107 }
1108 
1109 /*
1110  * The relationship between dirty buffers and dirty pages:
1111  *
1112  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1113  * the page is tagged dirty in its radix tree.
1114  *
1115  * At all times, the dirtiness of the buffers represents the dirtiness of
1116  * subsections of the page.  If the page has buffers, the page dirty bit is
1117  * merely a hint about the true dirty state.
1118  *
1119  * When a page is set dirty in its entirety, all its buffers are marked dirty
1120  * (if the page has buffers).
1121  *
1122  * When a buffer is marked dirty, its page is dirtied, but the page's other
1123  * buffers are not.
1124  *
1125  * Also.  When blockdev buffers are explicitly read with bread(), they
1126  * individually become uptodate.  But their backing page remains not
1127  * uptodate - even if all of its buffers are uptodate.  A subsequent
1128  * block_read_full_page() against that page will discover all the uptodate
1129  * buffers, will set the page uptodate and will perform no I/O.
1130  */
1131 
1132 /**
1133  * mark_buffer_dirty - mark a buffer_head as needing writeout
1134  * @bh: the buffer_head to mark dirty
1135  *
1136  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1137  * backing page dirty, then tag the page as dirty in its address_space's radix
1138  * tree and then attach the address_space's inode to its superblock's dirty
1139  * inode list.
1140  *
1141  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1142  * mapping->tree_lock and the global inode_lock.
1143  */
1144 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1145 {
1146 	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1147 		__set_page_dirty_nobuffers(bh->b_page);
1148 }
1149 
1150 /*
1151  * Decrement a buffer_head's reference count.  If all buffers against a page
1152  * have zero reference count, are clean and unlocked, and if the page is clean
1153  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1154  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1155  * a page but it ends up not being freed, and buffers may later be reattached).
1156  */
1157 void __brelse(struct buffer_head * buf)
1158 {
1159 	if (atomic_read(&buf->b_count)) {
1160 		put_bh(buf);
1161 		return;
1162 	}
1163 	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1164 	WARN_ON(1);
1165 }
1166 
1167 /*
1168  * bforget() is like brelse(), except it discards any
1169  * potentially dirty data.
1170  */
1171 void __bforget(struct buffer_head *bh)
1172 {
1173 	clear_buffer_dirty(bh);
1174 	if (!list_empty(&bh->b_assoc_buffers)) {
1175 		struct address_space *buffer_mapping = bh->b_page->mapping;
1176 
1177 		spin_lock(&buffer_mapping->private_lock);
1178 		list_del_init(&bh->b_assoc_buffers);
1179 		bh->b_assoc_map = NULL;
1180 		spin_unlock(&buffer_mapping->private_lock);
1181 	}
1182 	__brelse(bh);
1183 }
1184 
1185 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1186 {
1187 	lock_buffer(bh);
1188 	if (buffer_uptodate(bh)) {
1189 		unlock_buffer(bh);
1190 		return bh;
1191 	} else {
1192 		get_bh(bh);
1193 		bh->b_end_io = end_buffer_read_sync;
1194 		submit_bh(READ, bh);
1195 		wait_on_buffer(bh);
1196 		if (buffer_uptodate(bh))
1197 			return bh;
1198 	}
1199 	brelse(bh);
1200 	return NULL;
1201 }
1202 
1203 /*
1204  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1205  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1206  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1207  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1208  * CPU's LRUs at the same time.
1209  *
1210  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1211  * sb_find_get_block().
1212  *
1213  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1214  * a local interrupt disable for that.
1215  */
1216 
1217 #define BH_LRU_SIZE	8
1218 
1219 struct bh_lru {
1220 	struct buffer_head *bhs[BH_LRU_SIZE];
1221 };
1222 
1223 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1224 
1225 #ifdef CONFIG_SMP
1226 #define bh_lru_lock()	local_irq_disable()
1227 #define bh_lru_unlock()	local_irq_enable()
1228 #else
1229 #define bh_lru_lock()	preempt_disable()
1230 #define bh_lru_unlock()	preempt_enable()
1231 #endif
1232 
1233 static inline void check_irqs_on(void)
1234 {
1235 #ifdef irqs_disabled
1236 	BUG_ON(irqs_disabled());
1237 #endif
1238 }
1239 
1240 /*
1241  * The LRU management algorithm is dopey-but-simple.  Sorry.
1242  */
1243 static void bh_lru_install(struct buffer_head *bh)
1244 {
1245 	struct buffer_head *evictee = NULL;
1246 	struct bh_lru *lru;
1247 
1248 	check_irqs_on();
1249 	bh_lru_lock();
1250 	lru = &__get_cpu_var(bh_lrus);
1251 	if (lru->bhs[0] != bh) {
1252 		struct buffer_head *bhs[BH_LRU_SIZE];
1253 		int in;
1254 		int out = 0;
1255 
1256 		get_bh(bh);
1257 		bhs[out++] = bh;
1258 		for (in = 0; in < BH_LRU_SIZE; in++) {
1259 			struct buffer_head *bh2 = lru->bhs[in];
1260 
1261 			if (bh2 == bh) {
1262 				__brelse(bh2);
1263 			} else {
1264 				if (out >= BH_LRU_SIZE) {
1265 					BUG_ON(evictee != NULL);
1266 					evictee = bh2;
1267 				} else {
1268 					bhs[out++] = bh2;
1269 				}
1270 			}
1271 		}
1272 		while (out < BH_LRU_SIZE)
1273 			bhs[out++] = NULL;
1274 		memcpy(lru->bhs, bhs, sizeof(bhs));
1275 	}
1276 	bh_lru_unlock();
1277 
1278 	if (evictee)
1279 		__brelse(evictee);
1280 }
1281 
1282 /*
1283  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1284  */
1285 static struct buffer_head *
1286 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1287 {
1288 	struct buffer_head *ret = NULL;
1289 	struct bh_lru *lru;
1290 	unsigned int i;
1291 
1292 	check_irqs_on();
1293 	bh_lru_lock();
1294 	lru = &__get_cpu_var(bh_lrus);
1295 	for (i = 0; i < BH_LRU_SIZE; i++) {
1296 		struct buffer_head *bh = lru->bhs[i];
1297 
1298 		if (bh && bh->b_bdev == bdev &&
1299 				bh->b_blocknr == block && bh->b_size == size) {
1300 			if (i) {
1301 				while (i) {
1302 					lru->bhs[i] = lru->bhs[i - 1];
1303 					i--;
1304 				}
1305 				lru->bhs[0] = bh;
1306 			}
1307 			get_bh(bh);
1308 			ret = bh;
1309 			break;
1310 		}
1311 	}
1312 	bh_lru_unlock();
1313 	return ret;
1314 }
1315 
1316 /*
1317  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1318  * it in the LRU and mark it as accessed.  If it is not present then return
1319  * NULL
1320  */
1321 struct buffer_head *
1322 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1323 {
1324 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1325 
1326 	if (bh == NULL) {
1327 		bh = __find_get_block_slow(bdev, block);
1328 		if (bh)
1329 			bh_lru_install(bh);
1330 	}
1331 	if (bh)
1332 		touch_buffer(bh);
1333 	return bh;
1334 }
1335 EXPORT_SYMBOL(__find_get_block);
1336 
1337 /*
1338  * __getblk will locate (and, if necessary, create) the buffer_head
1339  * which corresponds to the passed block_device, block and size. The
1340  * returned buffer has its reference count incremented.
1341  *
1342  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1343  * illegal block number, __getblk() will happily return a buffer_head
1344  * which represents the non-existent block.  Very weird.
1345  *
1346  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1347  * attempt is failing.  FIXME, perhaps?
1348  */
1349 struct buffer_head *
1350 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1351 {
1352 	struct buffer_head *bh = __find_get_block(bdev, block, size);
1353 
1354 	might_sleep();
1355 	if (bh == NULL)
1356 		bh = __getblk_slow(bdev, block, size);
1357 	return bh;
1358 }
1359 EXPORT_SYMBOL(__getblk);
1360 
1361 /*
1362  * Do async read-ahead on a buffer..
1363  */
1364 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1365 {
1366 	struct buffer_head *bh = __getblk(bdev, block, size);
1367 	if (likely(bh)) {
1368 		ll_rw_block(READA, 1, &bh);
1369 		brelse(bh);
1370 	}
1371 }
1372 EXPORT_SYMBOL(__breadahead);
1373 
1374 /**
1375  *  __bread() - reads a specified block and returns the bh
1376  *  @bdev: the block_device to read from
1377  *  @block: number of block
1378  *  @size: size (in bytes) to read
1379  *
1380  *  Reads a specified block, and returns buffer head that contains it.
1381  *  It returns NULL if the block was unreadable.
1382  */
1383 struct buffer_head *
1384 __bread(struct block_device *bdev, sector_t block, unsigned size)
1385 {
1386 	struct buffer_head *bh = __getblk(bdev, block, size);
1387 
1388 	if (likely(bh) && !buffer_uptodate(bh))
1389 		bh = __bread_slow(bh);
1390 	return bh;
1391 }
1392 EXPORT_SYMBOL(__bread);
1393 
1394 /*
1395  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1396  * This doesn't race because it runs in each cpu either in irq
1397  * or with preempt disabled.
1398  */
1399 static void invalidate_bh_lru(void *arg)
1400 {
1401 	struct bh_lru *b = &get_cpu_var(bh_lrus);
1402 	int i;
1403 
1404 	for (i = 0; i < BH_LRU_SIZE; i++) {
1405 		brelse(b->bhs[i]);
1406 		b->bhs[i] = NULL;
1407 	}
1408 	put_cpu_var(bh_lrus);
1409 }
1410 
1411 static void invalidate_bh_lrus(void)
1412 {
1413 	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1414 }
1415 
1416 void set_bh_page(struct buffer_head *bh,
1417 		struct page *page, unsigned long offset)
1418 {
1419 	bh->b_page = page;
1420 	BUG_ON(offset >= PAGE_SIZE);
1421 	if (PageHighMem(page))
1422 		/*
1423 		 * This catches illegal uses and preserves the offset:
1424 		 */
1425 		bh->b_data = (char *)(0 + offset);
1426 	else
1427 		bh->b_data = page_address(page) + offset;
1428 }
1429 EXPORT_SYMBOL(set_bh_page);
1430 
1431 /*
1432  * Called when truncating a buffer on a page completely.
1433  */
1434 static void discard_buffer(struct buffer_head * bh)
1435 {
1436 	lock_buffer(bh);
1437 	clear_buffer_dirty(bh);
1438 	bh->b_bdev = NULL;
1439 	clear_buffer_mapped(bh);
1440 	clear_buffer_req(bh);
1441 	clear_buffer_new(bh);
1442 	clear_buffer_delay(bh);
1443 	clear_buffer_unwritten(bh);
1444 	unlock_buffer(bh);
1445 }
1446 
1447 /**
1448  * block_invalidatepage - invalidate part of all of a buffer-backed page
1449  *
1450  * @page: the page which is affected
1451  * @offset: the index of the truncation point
1452  *
1453  * block_invalidatepage() is called when all or part of the page has become
1454  * invalidatedby a truncate operation.
1455  *
1456  * block_invalidatepage() does not have to release all buffers, but it must
1457  * ensure that no dirty buffer is left outside @offset and that no I/O
1458  * is underway against any of the blocks which are outside the truncation
1459  * point.  Because the caller is about to free (and possibly reuse) those
1460  * blocks on-disk.
1461  */
1462 void block_invalidatepage(struct page *page, unsigned long offset)
1463 {
1464 	struct buffer_head *head, *bh, *next;
1465 	unsigned int curr_off = 0;
1466 
1467 	BUG_ON(!PageLocked(page));
1468 	if (!page_has_buffers(page))
1469 		goto out;
1470 
1471 	head = page_buffers(page);
1472 	bh = head;
1473 	do {
1474 		unsigned int next_off = curr_off + bh->b_size;
1475 		next = bh->b_this_page;
1476 
1477 		/*
1478 		 * is this block fully invalidated?
1479 		 */
1480 		if (offset <= curr_off)
1481 			discard_buffer(bh);
1482 		curr_off = next_off;
1483 		bh = next;
1484 	} while (bh != head);
1485 
1486 	/*
1487 	 * We release buffers only if the entire page is being invalidated.
1488 	 * The get_block cached value has been unconditionally invalidated,
1489 	 * so real IO is not possible anymore.
1490 	 */
1491 	if (offset == 0)
1492 		try_to_release_page(page, 0);
1493 out:
1494 	return;
1495 }
1496 EXPORT_SYMBOL(block_invalidatepage);
1497 
1498 /*
1499  * We attach and possibly dirty the buffers atomically wrt
1500  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1501  * is already excluded via the page lock.
1502  */
1503 void create_empty_buffers(struct page *page,
1504 			unsigned long blocksize, unsigned long b_state)
1505 {
1506 	struct buffer_head *bh, *head, *tail;
1507 
1508 	head = alloc_page_buffers(page, blocksize, 1);
1509 	bh = head;
1510 	do {
1511 		bh->b_state |= b_state;
1512 		tail = bh;
1513 		bh = bh->b_this_page;
1514 	} while (bh);
1515 	tail->b_this_page = head;
1516 
1517 	spin_lock(&page->mapping->private_lock);
1518 	if (PageUptodate(page) || PageDirty(page)) {
1519 		bh = head;
1520 		do {
1521 			if (PageDirty(page))
1522 				set_buffer_dirty(bh);
1523 			if (PageUptodate(page))
1524 				set_buffer_uptodate(bh);
1525 			bh = bh->b_this_page;
1526 		} while (bh != head);
1527 	}
1528 	attach_page_buffers(page, head);
1529 	spin_unlock(&page->mapping->private_lock);
1530 }
1531 EXPORT_SYMBOL(create_empty_buffers);
1532 
1533 /*
1534  * We are taking a block for data and we don't want any output from any
1535  * buffer-cache aliases starting from return from that function and
1536  * until the moment when something will explicitly mark the buffer
1537  * dirty (hopefully that will not happen until we will free that block ;-)
1538  * We don't even need to mark it not-uptodate - nobody can expect
1539  * anything from a newly allocated buffer anyway. We used to used
1540  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1541  * don't want to mark the alias unmapped, for example - it would confuse
1542  * anyone who might pick it with bread() afterwards...
1543  *
1544  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1545  * be writeout I/O going on against recently-freed buffers.  We don't
1546  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1547  * only if we really need to.  That happens here.
1548  */
1549 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1550 {
1551 	struct buffer_head *old_bh;
1552 
1553 	might_sleep();
1554 
1555 	old_bh = __find_get_block_slow(bdev, block);
1556 	if (old_bh) {
1557 		clear_buffer_dirty(old_bh);
1558 		wait_on_buffer(old_bh);
1559 		clear_buffer_req(old_bh);
1560 		__brelse(old_bh);
1561 	}
1562 }
1563 EXPORT_SYMBOL(unmap_underlying_metadata);
1564 
1565 /*
1566  * NOTE! All mapped/uptodate combinations are valid:
1567  *
1568  *	Mapped	Uptodate	Meaning
1569  *
1570  *	No	No		"unknown" - must do get_block()
1571  *	No	Yes		"hole" - zero-filled
1572  *	Yes	No		"allocated" - allocated on disk, not read in
1573  *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1574  *
1575  * "Dirty" is valid only with the last case (mapped+uptodate).
1576  */
1577 
1578 /*
1579  * While block_write_full_page is writing back the dirty buffers under
1580  * the page lock, whoever dirtied the buffers may decide to clean them
1581  * again at any time.  We handle that by only looking at the buffer
1582  * state inside lock_buffer().
1583  *
1584  * If block_write_full_page() is called for regular writeback
1585  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1586  * locked buffer.   This only can happen if someone has written the buffer
1587  * directly, with submit_bh().  At the address_space level PageWriteback
1588  * prevents this contention from occurring.
1589  */
1590 static int __block_write_full_page(struct inode *inode, struct page *page,
1591 			get_block_t *get_block, struct writeback_control *wbc)
1592 {
1593 	int err;
1594 	sector_t block;
1595 	sector_t last_block;
1596 	struct buffer_head *bh, *head;
1597 	const unsigned blocksize = 1 << inode->i_blkbits;
1598 	int nr_underway = 0;
1599 
1600 	BUG_ON(!PageLocked(page));
1601 
1602 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1603 
1604 	if (!page_has_buffers(page)) {
1605 		create_empty_buffers(page, blocksize,
1606 					(1 << BH_Dirty)|(1 << BH_Uptodate));
1607 	}
1608 
1609 	/*
1610 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1611 	 * here, and the (potentially unmapped) buffers may become dirty at
1612 	 * any time.  If a buffer becomes dirty here after we've inspected it
1613 	 * then we just miss that fact, and the page stays dirty.
1614 	 *
1615 	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1616 	 * handle that here by just cleaning them.
1617 	 */
1618 
1619 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1620 	head = page_buffers(page);
1621 	bh = head;
1622 
1623 	/*
1624 	 * Get all the dirty buffers mapped to disk addresses and
1625 	 * handle any aliases from the underlying blockdev's mapping.
1626 	 */
1627 	do {
1628 		if (block > last_block) {
1629 			/*
1630 			 * mapped buffers outside i_size will occur, because
1631 			 * this page can be outside i_size when there is a
1632 			 * truncate in progress.
1633 			 */
1634 			/*
1635 			 * The buffer was zeroed by block_write_full_page()
1636 			 */
1637 			clear_buffer_dirty(bh);
1638 			set_buffer_uptodate(bh);
1639 		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1640 			WARN_ON(bh->b_size != blocksize);
1641 			err = get_block(inode, block, bh, 1);
1642 			if (err)
1643 				goto recover;
1644 			if (buffer_new(bh)) {
1645 				/* blockdev mappings never come here */
1646 				clear_buffer_new(bh);
1647 				unmap_underlying_metadata(bh->b_bdev,
1648 							bh->b_blocknr);
1649 			}
1650 		}
1651 		bh = bh->b_this_page;
1652 		block++;
1653 	} while (bh != head);
1654 
1655 	do {
1656 		if (!buffer_mapped(bh))
1657 			continue;
1658 		/*
1659 		 * If it's a fully non-blocking write attempt and we cannot
1660 		 * lock the buffer then redirty the page.  Note that this can
1661 		 * potentially cause a busy-wait loop from pdflush and kswapd
1662 		 * activity, but those code paths have their own higher-level
1663 		 * throttling.
1664 		 */
1665 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1666 			lock_buffer(bh);
1667 		} else if (test_set_buffer_locked(bh)) {
1668 			redirty_page_for_writepage(wbc, page);
1669 			continue;
1670 		}
1671 		if (test_clear_buffer_dirty(bh)) {
1672 			mark_buffer_async_write(bh);
1673 		} else {
1674 			unlock_buffer(bh);
1675 		}
1676 	} while ((bh = bh->b_this_page) != head);
1677 
1678 	/*
1679 	 * The page and its buffers are protected by PageWriteback(), so we can
1680 	 * drop the bh refcounts early.
1681 	 */
1682 	BUG_ON(PageWriteback(page));
1683 	set_page_writeback(page);
1684 
1685 	do {
1686 		struct buffer_head *next = bh->b_this_page;
1687 		if (buffer_async_write(bh)) {
1688 			submit_bh(WRITE, bh);
1689 			nr_underway++;
1690 		}
1691 		bh = next;
1692 	} while (bh != head);
1693 	unlock_page(page);
1694 
1695 	err = 0;
1696 done:
1697 	if (nr_underway == 0) {
1698 		/*
1699 		 * The page was marked dirty, but the buffers were
1700 		 * clean.  Someone wrote them back by hand with
1701 		 * ll_rw_block/submit_bh.  A rare case.
1702 		 */
1703 		int uptodate = 1;
1704 		do {
1705 			if (!buffer_uptodate(bh)) {
1706 				uptodate = 0;
1707 				break;
1708 			}
1709 			bh = bh->b_this_page;
1710 		} while (bh != head);
1711 		if (uptodate)
1712 			SetPageUptodate(page);
1713 		end_page_writeback(page);
1714 		/*
1715 		 * The page and buffer_heads can be released at any time from
1716 		 * here on.
1717 		 */
1718 		wbc->pages_skipped++;	/* We didn't write this page */
1719 	}
1720 	return err;
1721 
1722 recover:
1723 	/*
1724 	 * ENOSPC, or some other error.  We may already have added some
1725 	 * blocks to the file, so we need to write these out to avoid
1726 	 * exposing stale data.
1727 	 * The page is currently locked and not marked for writeback
1728 	 */
1729 	bh = head;
1730 	/* Recovery: lock and submit the mapped buffers */
1731 	do {
1732 		if (buffer_mapped(bh) && buffer_dirty(bh)) {
1733 			lock_buffer(bh);
1734 			mark_buffer_async_write(bh);
1735 		} else {
1736 			/*
1737 			 * The buffer may have been set dirty during
1738 			 * attachment to a dirty page.
1739 			 */
1740 			clear_buffer_dirty(bh);
1741 		}
1742 	} while ((bh = bh->b_this_page) != head);
1743 	SetPageError(page);
1744 	BUG_ON(PageWriteback(page));
1745 	set_page_writeback(page);
1746 	unlock_page(page);
1747 	do {
1748 		struct buffer_head *next = bh->b_this_page;
1749 		if (buffer_async_write(bh)) {
1750 			clear_buffer_dirty(bh);
1751 			submit_bh(WRITE, bh);
1752 			nr_underway++;
1753 		}
1754 		bh = next;
1755 	} while (bh != head);
1756 	goto done;
1757 }
1758 
1759 static int __block_prepare_write(struct inode *inode, struct page *page,
1760 		unsigned from, unsigned to, get_block_t *get_block)
1761 {
1762 	unsigned block_start, block_end;
1763 	sector_t block;
1764 	int err = 0;
1765 	unsigned blocksize, bbits;
1766 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1767 
1768 	BUG_ON(!PageLocked(page));
1769 	BUG_ON(from > PAGE_CACHE_SIZE);
1770 	BUG_ON(to > PAGE_CACHE_SIZE);
1771 	BUG_ON(from > to);
1772 
1773 	blocksize = 1 << inode->i_blkbits;
1774 	if (!page_has_buffers(page))
1775 		create_empty_buffers(page, blocksize, 0);
1776 	head = page_buffers(page);
1777 
1778 	bbits = inode->i_blkbits;
1779 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1780 
1781 	for(bh = head, block_start = 0; bh != head || !block_start;
1782 	    block++, block_start=block_end, bh = bh->b_this_page) {
1783 		block_end = block_start + blocksize;
1784 		if (block_end <= from || block_start >= to) {
1785 			if (PageUptodate(page)) {
1786 				if (!buffer_uptodate(bh))
1787 					set_buffer_uptodate(bh);
1788 			}
1789 			continue;
1790 		}
1791 		if (buffer_new(bh))
1792 			clear_buffer_new(bh);
1793 		if (!buffer_mapped(bh)) {
1794 			WARN_ON(bh->b_size != blocksize);
1795 			err = get_block(inode, block, bh, 1);
1796 			if (err)
1797 				break;
1798 			if (buffer_new(bh)) {
1799 				unmap_underlying_metadata(bh->b_bdev,
1800 							bh->b_blocknr);
1801 				if (PageUptodate(page)) {
1802 					set_buffer_uptodate(bh);
1803 					continue;
1804 				}
1805 				if (block_end > to || block_start < from) {
1806 					void *kaddr;
1807 
1808 					kaddr = kmap_atomic(page, KM_USER0);
1809 					if (block_end > to)
1810 						memset(kaddr+to, 0,
1811 							block_end-to);
1812 					if (block_start < from)
1813 						memset(kaddr+block_start,
1814 							0, from-block_start);
1815 					flush_dcache_page(page);
1816 					kunmap_atomic(kaddr, KM_USER0);
1817 				}
1818 				continue;
1819 			}
1820 		}
1821 		if (PageUptodate(page)) {
1822 			if (!buffer_uptodate(bh))
1823 				set_buffer_uptodate(bh);
1824 			continue;
1825 		}
1826 		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1827 		    !buffer_unwritten(bh) &&
1828 		     (block_start < from || block_end > to)) {
1829 			ll_rw_block(READ, 1, &bh);
1830 			*wait_bh++=bh;
1831 		}
1832 	}
1833 	/*
1834 	 * If we issued read requests - let them complete.
1835 	 */
1836 	while(wait_bh > wait) {
1837 		wait_on_buffer(*--wait_bh);
1838 		if (!buffer_uptodate(*wait_bh))
1839 			err = -EIO;
1840 	}
1841 	if (!err) {
1842 		bh = head;
1843 		do {
1844 			if (buffer_new(bh))
1845 				clear_buffer_new(bh);
1846 		} while ((bh = bh->b_this_page) != head);
1847 		return 0;
1848 	}
1849 	/* Error case: */
1850 	/*
1851 	 * Zero out any newly allocated blocks to avoid exposing stale
1852 	 * data.  If BH_New is set, we know that the block was newly
1853 	 * allocated in the above loop.
1854 	 */
1855 	bh = head;
1856 	block_start = 0;
1857 	do {
1858 		block_end = block_start+blocksize;
1859 		if (block_end <= from)
1860 			goto next_bh;
1861 		if (block_start >= to)
1862 			break;
1863 		if (buffer_new(bh)) {
1864 			void *kaddr;
1865 
1866 			clear_buffer_new(bh);
1867 			kaddr = kmap_atomic(page, KM_USER0);
1868 			memset(kaddr+block_start, 0, bh->b_size);
1869 			flush_dcache_page(page);
1870 			kunmap_atomic(kaddr, KM_USER0);
1871 			set_buffer_uptodate(bh);
1872 			mark_buffer_dirty(bh);
1873 		}
1874 next_bh:
1875 		block_start = block_end;
1876 		bh = bh->b_this_page;
1877 	} while (bh != head);
1878 	return err;
1879 }
1880 
1881 static int __block_commit_write(struct inode *inode, struct page *page,
1882 		unsigned from, unsigned to)
1883 {
1884 	unsigned block_start, block_end;
1885 	int partial = 0;
1886 	unsigned blocksize;
1887 	struct buffer_head *bh, *head;
1888 
1889 	blocksize = 1 << inode->i_blkbits;
1890 
1891 	for(bh = head = page_buffers(page), block_start = 0;
1892 	    bh != head || !block_start;
1893 	    block_start=block_end, bh = bh->b_this_page) {
1894 		block_end = block_start + blocksize;
1895 		if (block_end <= from || block_start >= to) {
1896 			if (!buffer_uptodate(bh))
1897 				partial = 1;
1898 		} else {
1899 			set_buffer_uptodate(bh);
1900 			mark_buffer_dirty(bh);
1901 		}
1902 	}
1903 
1904 	/*
1905 	 * If this is a partial write which happened to make all buffers
1906 	 * uptodate then we can optimize away a bogus readpage() for
1907 	 * the next read(). Here we 'discover' whether the page went
1908 	 * uptodate as a result of this (potentially partial) write.
1909 	 */
1910 	if (!partial)
1911 		SetPageUptodate(page);
1912 	return 0;
1913 }
1914 
1915 /*
1916  * Generic "read page" function for block devices that have the normal
1917  * get_block functionality. This is most of the block device filesystems.
1918  * Reads the page asynchronously --- the unlock_buffer() and
1919  * set/clear_buffer_uptodate() functions propagate buffer state into the
1920  * page struct once IO has completed.
1921  */
1922 int block_read_full_page(struct page *page, get_block_t *get_block)
1923 {
1924 	struct inode *inode = page->mapping->host;
1925 	sector_t iblock, lblock;
1926 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1927 	unsigned int blocksize;
1928 	int nr, i;
1929 	int fully_mapped = 1;
1930 
1931 	BUG_ON(!PageLocked(page));
1932 	blocksize = 1 << inode->i_blkbits;
1933 	if (!page_has_buffers(page))
1934 		create_empty_buffers(page, blocksize, 0);
1935 	head = page_buffers(page);
1936 
1937 	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1938 	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
1939 	bh = head;
1940 	nr = 0;
1941 	i = 0;
1942 
1943 	do {
1944 		if (buffer_uptodate(bh))
1945 			continue;
1946 
1947 		if (!buffer_mapped(bh)) {
1948 			int err = 0;
1949 
1950 			fully_mapped = 0;
1951 			if (iblock < lblock) {
1952 				WARN_ON(bh->b_size != blocksize);
1953 				err = get_block(inode, iblock, bh, 0);
1954 				if (err)
1955 					SetPageError(page);
1956 			}
1957 			if (!buffer_mapped(bh)) {
1958 				void *kaddr = kmap_atomic(page, KM_USER0);
1959 				memset(kaddr + i * blocksize, 0, blocksize);
1960 				flush_dcache_page(page);
1961 				kunmap_atomic(kaddr, KM_USER0);
1962 				if (!err)
1963 					set_buffer_uptodate(bh);
1964 				continue;
1965 			}
1966 			/*
1967 			 * get_block() might have updated the buffer
1968 			 * synchronously
1969 			 */
1970 			if (buffer_uptodate(bh))
1971 				continue;
1972 		}
1973 		arr[nr++] = bh;
1974 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
1975 
1976 	if (fully_mapped)
1977 		SetPageMappedToDisk(page);
1978 
1979 	if (!nr) {
1980 		/*
1981 		 * All buffers are uptodate - we can set the page uptodate
1982 		 * as well. But not if get_block() returned an error.
1983 		 */
1984 		if (!PageError(page))
1985 			SetPageUptodate(page);
1986 		unlock_page(page);
1987 		return 0;
1988 	}
1989 
1990 	/* Stage two: lock the buffers */
1991 	for (i = 0; i < nr; i++) {
1992 		bh = arr[i];
1993 		lock_buffer(bh);
1994 		mark_buffer_async_read(bh);
1995 	}
1996 
1997 	/*
1998 	 * Stage 3: start the IO.  Check for uptodateness
1999 	 * inside the buffer lock in case another process reading
2000 	 * the underlying blockdev brought it uptodate (the sct fix).
2001 	 */
2002 	for (i = 0; i < nr; i++) {
2003 		bh = arr[i];
2004 		if (buffer_uptodate(bh))
2005 			end_buffer_async_read(bh, 1);
2006 		else
2007 			submit_bh(READ, bh);
2008 	}
2009 	return 0;
2010 }
2011 
2012 /* utility function for filesystems that need to do work on expanding
2013  * truncates.  Uses prepare/commit_write to allow the filesystem to
2014  * deal with the hole.
2015  */
2016 static int __generic_cont_expand(struct inode *inode, loff_t size,
2017 				 pgoff_t index, unsigned int offset)
2018 {
2019 	struct address_space *mapping = inode->i_mapping;
2020 	struct page *page;
2021 	unsigned long limit;
2022 	int err;
2023 
2024 	err = -EFBIG;
2025         limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2026 	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2027 		send_sig(SIGXFSZ, current, 0);
2028 		goto out;
2029 	}
2030 	if (size > inode->i_sb->s_maxbytes)
2031 		goto out;
2032 
2033 	err = -ENOMEM;
2034 	page = grab_cache_page(mapping, index);
2035 	if (!page)
2036 		goto out;
2037 	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2038 	if (err) {
2039 		/*
2040 		 * ->prepare_write() may have instantiated a few blocks
2041 		 * outside i_size.  Trim these off again.
2042 		 */
2043 		unlock_page(page);
2044 		page_cache_release(page);
2045 		vmtruncate(inode, inode->i_size);
2046 		goto out;
2047 	}
2048 
2049 	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2050 
2051 	unlock_page(page);
2052 	page_cache_release(page);
2053 	if (err > 0)
2054 		err = 0;
2055 out:
2056 	return err;
2057 }
2058 
2059 int generic_cont_expand(struct inode *inode, loff_t size)
2060 {
2061 	pgoff_t index;
2062 	unsigned int offset;
2063 
2064 	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2065 
2066 	/* ugh.  in prepare/commit_write, if from==to==start of block, we
2067 	** skip the prepare.  make sure we never send an offset for the start
2068 	** of a block
2069 	*/
2070 	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2071 		/* caller must handle this extra byte. */
2072 		offset++;
2073 	}
2074 	index = size >> PAGE_CACHE_SHIFT;
2075 
2076 	return __generic_cont_expand(inode, size, index, offset);
2077 }
2078 
2079 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2080 {
2081 	loff_t pos = size - 1;
2082 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2083 	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2084 
2085 	/* prepare/commit_write can handle even if from==to==start of block. */
2086 	return __generic_cont_expand(inode, size, index, offset);
2087 }
2088 
2089 /*
2090  * For moronic filesystems that do not allow holes in file.
2091  * We may have to extend the file.
2092  */
2093 
2094 int cont_prepare_write(struct page *page, unsigned offset,
2095 		unsigned to, get_block_t *get_block, loff_t *bytes)
2096 {
2097 	struct address_space *mapping = page->mapping;
2098 	struct inode *inode = mapping->host;
2099 	struct page *new_page;
2100 	pgoff_t pgpos;
2101 	long status;
2102 	unsigned zerofrom;
2103 	unsigned blocksize = 1 << inode->i_blkbits;
2104 	void *kaddr;
2105 
2106 	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2107 		status = -ENOMEM;
2108 		new_page = grab_cache_page(mapping, pgpos);
2109 		if (!new_page)
2110 			goto out;
2111 		/* we might sleep */
2112 		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2113 			unlock_page(new_page);
2114 			page_cache_release(new_page);
2115 			continue;
2116 		}
2117 		zerofrom = *bytes & ~PAGE_CACHE_MASK;
2118 		if (zerofrom & (blocksize-1)) {
2119 			*bytes |= (blocksize-1);
2120 			(*bytes)++;
2121 		}
2122 		status = __block_prepare_write(inode, new_page, zerofrom,
2123 						PAGE_CACHE_SIZE, get_block);
2124 		if (status)
2125 			goto out_unmap;
2126 		kaddr = kmap_atomic(new_page, KM_USER0);
2127 		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2128 		flush_dcache_page(new_page);
2129 		kunmap_atomic(kaddr, KM_USER0);
2130 		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2131 		unlock_page(new_page);
2132 		page_cache_release(new_page);
2133 	}
2134 
2135 	if (page->index < pgpos) {
2136 		/* completely inside the area */
2137 		zerofrom = offset;
2138 	} else {
2139 		/* page covers the boundary, find the boundary offset */
2140 		zerofrom = *bytes & ~PAGE_CACHE_MASK;
2141 
2142 		/* if we will expand the thing last block will be filled */
2143 		if (to > zerofrom && (zerofrom & (blocksize-1))) {
2144 			*bytes |= (blocksize-1);
2145 			(*bytes)++;
2146 		}
2147 
2148 		/* starting below the boundary? Nothing to zero out */
2149 		if (offset <= zerofrom)
2150 			zerofrom = offset;
2151 	}
2152 	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2153 	if (status)
2154 		goto out1;
2155 	if (zerofrom < offset) {
2156 		kaddr = kmap_atomic(page, KM_USER0);
2157 		memset(kaddr+zerofrom, 0, offset-zerofrom);
2158 		flush_dcache_page(page);
2159 		kunmap_atomic(kaddr, KM_USER0);
2160 		__block_commit_write(inode, page, zerofrom, offset);
2161 	}
2162 	return 0;
2163 out1:
2164 	ClearPageUptodate(page);
2165 	return status;
2166 
2167 out_unmap:
2168 	ClearPageUptodate(new_page);
2169 	unlock_page(new_page);
2170 	page_cache_release(new_page);
2171 out:
2172 	return status;
2173 }
2174 
2175 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2176 			get_block_t *get_block)
2177 {
2178 	struct inode *inode = page->mapping->host;
2179 	int err = __block_prepare_write(inode, page, from, to, get_block);
2180 	if (err)
2181 		ClearPageUptodate(page);
2182 	return err;
2183 }
2184 
2185 int block_commit_write(struct page *page, unsigned from, unsigned to)
2186 {
2187 	struct inode *inode = page->mapping->host;
2188 	__block_commit_write(inode,page,from,to);
2189 	return 0;
2190 }
2191 
2192 int generic_commit_write(struct file *file, struct page *page,
2193 		unsigned from, unsigned to)
2194 {
2195 	struct inode *inode = page->mapping->host;
2196 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2197 	__block_commit_write(inode,page,from,to);
2198 	/*
2199 	 * No need to use i_size_read() here, the i_size
2200 	 * cannot change under us because we hold i_mutex.
2201 	 */
2202 	if (pos > inode->i_size) {
2203 		i_size_write(inode, pos);
2204 		mark_inode_dirty(inode);
2205 	}
2206 	return 0;
2207 }
2208 
2209 
2210 /*
2211  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2212  * immediately, while under the page lock.  So it needs a special end_io
2213  * handler which does not touch the bh after unlocking it.
2214  *
2215  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2216  * a race there is benign: unlock_buffer() only use the bh's address for
2217  * hashing after unlocking the buffer, so it doesn't actually touch the bh
2218  * itself.
2219  */
2220 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2221 {
2222 	if (uptodate) {
2223 		set_buffer_uptodate(bh);
2224 	} else {
2225 		/* This happens, due to failed READA attempts. */
2226 		clear_buffer_uptodate(bh);
2227 	}
2228 	unlock_buffer(bh);
2229 }
2230 
2231 /*
2232  * On entry, the page is fully not uptodate.
2233  * On exit the page is fully uptodate in the areas outside (from,to)
2234  */
2235 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2236 			get_block_t *get_block)
2237 {
2238 	struct inode *inode = page->mapping->host;
2239 	const unsigned blkbits = inode->i_blkbits;
2240 	const unsigned blocksize = 1 << blkbits;
2241 	struct buffer_head map_bh;
2242 	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2243 	unsigned block_in_page;
2244 	unsigned block_start;
2245 	sector_t block_in_file;
2246 	char *kaddr;
2247 	int nr_reads = 0;
2248 	int i;
2249 	int ret = 0;
2250 	int is_mapped_to_disk = 1;
2251 	int dirtied_it = 0;
2252 
2253 	if (PageMappedToDisk(page))
2254 		return 0;
2255 
2256 	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2257 	map_bh.b_page = page;
2258 
2259 	/*
2260 	 * We loop across all blocks in the page, whether or not they are
2261 	 * part of the affected region.  This is so we can discover if the
2262 	 * page is fully mapped-to-disk.
2263 	 */
2264 	for (block_start = 0, block_in_page = 0;
2265 		  block_start < PAGE_CACHE_SIZE;
2266 		  block_in_page++, block_start += blocksize) {
2267 		unsigned block_end = block_start + blocksize;
2268 		int create;
2269 
2270 		map_bh.b_state = 0;
2271 		create = 1;
2272 		if (block_start >= to)
2273 			create = 0;
2274 		map_bh.b_size = blocksize;
2275 		ret = get_block(inode, block_in_file + block_in_page,
2276 					&map_bh, create);
2277 		if (ret)
2278 			goto failed;
2279 		if (!buffer_mapped(&map_bh))
2280 			is_mapped_to_disk = 0;
2281 		if (buffer_new(&map_bh))
2282 			unmap_underlying_metadata(map_bh.b_bdev,
2283 							map_bh.b_blocknr);
2284 		if (PageUptodate(page))
2285 			continue;
2286 		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2287 			kaddr = kmap_atomic(page, KM_USER0);
2288 			if (block_start < from) {
2289 				memset(kaddr+block_start, 0, from-block_start);
2290 				dirtied_it = 1;
2291 			}
2292 			if (block_end > to) {
2293 				memset(kaddr + to, 0, block_end - to);
2294 				dirtied_it = 1;
2295 			}
2296 			flush_dcache_page(page);
2297 			kunmap_atomic(kaddr, KM_USER0);
2298 			continue;
2299 		}
2300 		if (buffer_uptodate(&map_bh))
2301 			continue;	/* reiserfs does this */
2302 		if (block_start < from || block_end > to) {
2303 			struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2304 
2305 			if (!bh) {
2306 				ret = -ENOMEM;
2307 				goto failed;
2308 			}
2309 			bh->b_state = map_bh.b_state;
2310 			atomic_set(&bh->b_count, 0);
2311 			bh->b_this_page = NULL;
2312 			bh->b_page = page;
2313 			bh->b_blocknr = map_bh.b_blocknr;
2314 			bh->b_size = blocksize;
2315 			bh->b_data = (char *)(long)block_start;
2316 			bh->b_bdev = map_bh.b_bdev;
2317 			bh->b_private = NULL;
2318 			read_bh[nr_reads++] = bh;
2319 		}
2320 	}
2321 
2322 	if (nr_reads) {
2323 		struct buffer_head *bh;
2324 
2325 		/*
2326 		 * The page is locked, so these buffers are protected from
2327 		 * any VM or truncate activity.  Hence we don't need to care
2328 		 * for the buffer_head refcounts.
2329 		 */
2330 		for (i = 0; i < nr_reads; i++) {
2331 			bh = read_bh[i];
2332 			lock_buffer(bh);
2333 			bh->b_end_io = end_buffer_read_nobh;
2334 			submit_bh(READ, bh);
2335 		}
2336 		for (i = 0; i < nr_reads; i++) {
2337 			bh = read_bh[i];
2338 			wait_on_buffer(bh);
2339 			if (!buffer_uptodate(bh))
2340 				ret = -EIO;
2341 			free_buffer_head(bh);
2342 			read_bh[i] = NULL;
2343 		}
2344 		if (ret)
2345 			goto failed;
2346 	}
2347 
2348 	if (is_mapped_to_disk)
2349 		SetPageMappedToDisk(page);
2350 	SetPageUptodate(page);
2351 
2352 	/*
2353 	 * Setting the page dirty here isn't necessary for the prepare_write
2354 	 * function - commit_write will do that.  But if/when this function is
2355 	 * used within the pagefault handler to ensure that all mmapped pages
2356 	 * have backing space in the filesystem, we will need to dirty the page
2357 	 * if its contents were altered.
2358 	 */
2359 	if (dirtied_it)
2360 		set_page_dirty(page);
2361 
2362 	return 0;
2363 
2364 failed:
2365 	for (i = 0; i < nr_reads; i++) {
2366 		if (read_bh[i])
2367 			free_buffer_head(read_bh[i]);
2368 	}
2369 
2370 	/*
2371 	 * Error recovery is pretty slack.  Clear the page and mark it dirty
2372 	 * so we'll later zero out any blocks which _were_ allocated.
2373 	 */
2374 	kaddr = kmap_atomic(page, KM_USER0);
2375 	memset(kaddr, 0, PAGE_CACHE_SIZE);
2376 	flush_dcache_page(page);
2377 	kunmap_atomic(kaddr, KM_USER0);
2378 	SetPageUptodate(page);
2379 	set_page_dirty(page);
2380 	return ret;
2381 }
2382 EXPORT_SYMBOL(nobh_prepare_write);
2383 
2384 int nobh_commit_write(struct file *file, struct page *page,
2385 		unsigned from, unsigned to)
2386 {
2387 	struct inode *inode = page->mapping->host;
2388 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2389 
2390 	set_page_dirty(page);
2391 	if (pos > inode->i_size) {
2392 		i_size_write(inode, pos);
2393 		mark_inode_dirty(inode);
2394 	}
2395 	return 0;
2396 }
2397 EXPORT_SYMBOL(nobh_commit_write);
2398 
2399 /*
2400  * nobh_writepage() - based on block_full_write_page() except
2401  * that it tries to operate without attaching bufferheads to
2402  * the page.
2403  */
2404 int nobh_writepage(struct page *page, get_block_t *get_block,
2405 			struct writeback_control *wbc)
2406 {
2407 	struct inode * const inode = page->mapping->host;
2408 	loff_t i_size = i_size_read(inode);
2409 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2410 	unsigned offset;
2411 	void *kaddr;
2412 	int ret;
2413 
2414 	/* Is the page fully inside i_size? */
2415 	if (page->index < end_index)
2416 		goto out;
2417 
2418 	/* Is the page fully outside i_size? (truncate in progress) */
2419 	offset = i_size & (PAGE_CACHE_SIZE-1);
2420 	if (page->index >= end_index+1 || !offset) {
2421 		/*
2422 		 * The page may have dirty, unmapped buffers.  For example,
2423 		 * they may have been added in ext3_writepage().  Make them
2424 		 * freeable here, so the page does not leak.
2425 		 */
2426 #if 0
2427 		/* Not really sure about this  - do we need this ? */
2428 		if (page->mapping->a_ops->invalidatepage)
2429 			page->mapping->a_ops->invalidatepage(page, offset);
2430 #endif
2431 		unlock_page(page);
2432 		return 0; /* don't care */
2433 	}
2434 
2435 	/*
2436 	 * The page straddles i_size.  It must be zeroed out on each and every
2437 	 * writepage invocation because it may be mmapped.  "A file is mapped
2438 	 * in multiples of the page size.  For a file that is not a multiple of
2439 	 * the  page size, the remaining memory is zeroed when mapped, and
2440 	 * writes to that region are not written out to the file."
2441 	 */
2442 	kaddr = kmap_atomic(page, KM_USER0);
2443 	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2444 	flush_dcache_page(page);
2445 	kunmap_atomic(kaddr, KM_USER0);
2446 out:
2447 	ret = mpage_writepage(page, get_block, wbc);
2448 	if (ret == -EAGAIN)
2449 		ret = __block_write_full_page(inode, page, get_block, wbc);
2450 	return ret;
2451 }
2452 EXPORT_SYMBOL(nobh_writepage);
2453 
2454 /*
2455  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2456  */
2457 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2458 {
2459 	struct inode *inode = mapping->host;
2460 	unsigned blocksize = 1 << inode->i_blkbits;
2461 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2462 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2463 	unsigned to;
2464 	struct page *page;
2465 	const struct address_space_operations *a_ops = mapping->a_ops;
2466 	char *kaddr;
2467 	int ret = 0;
2468 
2469 	if ((offset & (blocksize - 1)) == 0)
2470 		goto out;
2471 
2472 	ret = -ENOMEM;
2473 	page = grab_cache_page(mapping, index);
2474 	if (!page)
2475 		goto out;
2476 
2477 	to = (offset + blocksize) & ~(blocksize - 1);
2478 	ret = a_ops->prepare_write(NULL, page, offset, to);
2479 	if (ret == 0) {
2480 		kaddr = kmap_atomic(page, KM_USER0);
2481 		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2482 		flush_dcache_page(page);
2483 		kunmap_atomic(kaddr, KM_USER0);
2484 		set_page_dirty(page);
2485 	}
2486 	unlock_page(page);
2487 	page_cache_release(page);
2488 out:
2489 	return ret;
2490 }
2491 EXPORT_SYMBOL(nobh_truncate_page);
2492 
2493 int block_truncate_page(struct address_space *mapping,
2494 			loff_t from, get_block_t *get_block)
2495 {
2496 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2497 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2498 	unsigned blocksize;
2499 	sector_t iblock;
2500 	unsigned length, pos;
2501 	struct inode *inode = mapping->host;
2502 	struct page *page;
2503 	struct buffer_head *bh;
2504 	void *kaddr;
2505 	int err;
2506 
2507 	blocksize = 1 << inode->i_blkbits;
2508 	length = offset & (blocksize - 1);
2509 
2510 	/* Block boundary? Nothing to do */
2511 	if (!length)
2512 		return 0;
2513 
2514 	length = blocksize - length;
2515 	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2516 
2517 	page = grab_cache_page(mapping, index);
2518 	err = -ENOMEM;
2519 	if (!page)
2520 		goto out;
2521 
2522 	if (!page_has_buffers(page))
2523 		create_empty_buffers(page, blocksize, 0);
2524 
2525 	/* Find the buffer that contains "offset" */
2526 	bh = page_buffers(page);
2527 	pos = blocksize;
2528 	while (offset >= pos) {
2529 		bh = bh->b_this_page;
2530 		iblock++;
2531 		pos += blocksize;
2532 	}
2533 
2534 	err = 0;
2535 	if (!buffer_mapped(bh)) {
2536 		WARN_ON(bh->b_size != blocksize);
2537 		err = get_block(inode, iblock, bh, 0);
2538 		if (err)
2539 			goto unlock;
2540 		/* unmapped? It's a hole - nothing to do */
2541 		if (!buffer_mapped(bh))
2542 			goto unlock;
2543 	}
2544 
2545 	/* Ok, it's mapped. Make sure it's up-to-date */
2546 	if (PageUptodate(page))
2547 		set_buffer_uptodate(bh);
2548 
2549 	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2550 		err = -EIO;
2551 		ll_rw_block(READ, 1, &bh);
2552 		wait_on_buffer(bh);
2553 		/* Uhhuh. Read error. Complain and punt. */
2554 		if (!buffer_uptodate(bh))
2555 			goto unlock;
2556 	}
2557 
2558 	kaddr = kmap_atomic(page, KM_USER0);
2559 	memset(kaddr + offset, 0, length);
2560 	flush_dcache_page(page);
2561 	kunmap_atomic(kaddr, KM_USER0);
2562 
2563 	mark_buffer_dirty(bh);
2564 	err = 0;
2565 
2566 unlock:
2567 	unlock_page(page);
2568 	page_cache_release(page);
2569 out:
2570 	return err;
2571 }
2572 
2573 /*
2574  * The generic ->writepage function for buffer-backed address_spaces
2575  */
2576 int block_write_full_page(struct page *page, get_block_t *get_block,
2577 			struct writeback_control *wbc)
2578 {
2579 	struct inode * const inode = page->mapping->host;
2580 	loff_t i_size = i_size_read(inode);
2581 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2582 	unsigned offset;
2583 	void *kaddr;
2584 
2585 	/* Is the page fully inside i_size? */
2586 	if (page->index < end_index)
2587 		return __block_write_full_page(inode, page, get_block, wbc);
2588 
2589 	/* Is the page fully outside i_size? (truncate in progress) */
2590 	offset = i_size & (PAGE_CACHE_SIZE-1);
2591 	if (page->index >= end_index+1 || !offset) {
2592 		/*
2593 		 * The page may have dirty, unmapped buffers.  For example,
2594 		 * they may have been added in ext3_writepage().  Make them
2595 		 * freeable here, so the page does not leak.
2596 		 */
2597 		do_invalidatepage(page, 0);
2598 		unlock_page(page);
2599 		return 0; /* don't care */
2600 	}
2601 
2602 	/*
2603 	 * The page straddles i_size.  It must be zeroed out on each and every
2604 	 * writepage invokation because it may be mmapped.  "A file is mapped
2605 	 * in multiples of the page size.  For a file that is not a multiple of
2606 	 * the  page size, the remaining memory is zeroed when mapped, and
2607 	 * writes to that region are not written out to the file."
2608 	 */
2609 	kaddr = kmap_atomic(page, KM_USER0);
2610 	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2611 	flush_dcache_page(page);
2612 	kunmap_atomic(kaddr, KM_USER0);
2613 	return __block_write_full_page(inode, page, get_block, wbc);
2614 }
2615 
2616 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2617 			    get_block_t *get_block)
2618 {
2619 	struct buffer_head tmp;
2620 	struct inode *inode = mapping->host;
2621 	tmp.b_state = 0;
2622 	tmp.b_blocknr = 0;
2623 	tmp.b_size = 1 << inode->i_blkbits;
2624 	get_block(inode, block, &tmp, 0);
2625 	return tmp.b_blocknr;
2626 }
2627 
2628 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2629 {
2630 	struct buffer_head *bh = bio->bi_private;
2631 
2632 	if (bio->bi_size)
2633 		return 1;
2634 
2635 	if (err == -EOPNOTSUPP) {
2636 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2637 		set_bit(BH_Eopnotsupp, &bh->b_state);
2638 	}
2639 
2640 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2641 	bio_put(bio);
2642 	return 0;
2643 }
2644 
2645 int submit_bh(int rw, struct buffer_head * bh)
2646 {
2647 	struct bio *bio;
2648 	int ret = 0;
2649 
2650 	BUG_ON(!buffer_locked(bh));
2651 	BUG_ON(!buffer_mapped(bh));
2652 	BUG_ON(!bh->b_end_io);
2653 
2654 	if (buffer_ordered(bh) && (rw == WRITE))
2655 		rw = WRITE_BARRIER;
2656 
2657 	/*
2658 	 * Only clear out a write error when rewriting, should this
2659 	 * include WRITE_SYNC as well?
2660 	 */
2661 	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2662 		clear_buffer_write_io_error(bh);
2663 
2664 	/*
2665 	 * from here on down, it's all bio -- do the initial mapping,
2666 	 * submit_bio -> generic_make_request may further map this bio around
2667 	 */
2668 	bio = bio_alloc(GFP_NOIO, 1);
2669 
2670 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2671 	bio->bi_bdev = bh->b_bdev;
2672 	bio->bi_io_vec[0].bv_page = bh->b_page;
2673 	bio->bi_io_vec[0].bv_len = bh->b_size;
2674 	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2675 
2676 	bio->bi_vcnt = 1;
2677 	bio->bi_idx = 0;
2678 	bio->bi_size = bh->b_size;
2679 
2680 	bio->bi_end_io = end_bio_bh_io_sync;
2681 	bio->bi_private = bh;
2682 
2683 	bio_get(bio);
2684 	submit_bio(rw, bio);
2685 
2686 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2687 		ret = -EOPNOTSUPP;
2688 
2689 	bio_put(bio);
2690 	return ret;
2691 }
2692 
2693 /**
2694  * ll_rw_block: low-level access to block devices (DEPRECATED)
2695  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2696  * @nr: number of &struct buffer_heads in the array
2697  * @bhs: array of pointers to &struct buffer_head
2698  *
2699  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2700  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2701  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2702  * are sent to disk. The fourth %READA option is described in the documentation
2703  * for generic_make_request() which ll_rw_block() calls.
2704  *
2705  * This function drops any buffer that it cannot get a lock on (with the
2706  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2707  * clean when doing a write request, and any buffer that appears to be
2708  * up-to-date when doing read request.  Further it marks as clean buffers that
2709  * are processed for writing (the buffer cache won't assume that they are
2710  * actually clean until the buffer gets unlocked).
2711  *
2712  * ll_rw_block sets b_end_io to simple completion handler that marks
2713  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2714  * any waiters.
2715  *
2716  * All of the buffers must be for the same device, and must also be a
2717  * multiple of the current approved size for the device.
2718  */
2719 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2720 {
2721 	int i;
2722 
2723 	for (i = 0; i < nr; i++) {
2724 		struct buffer_head *bh = bhs[i];
2725 
2726 		if (rw == SWRITE)
2727 			lock_buffer(bh);
2728 		else if (test_set_buffer_locked(bh))
2729 			continue;
2730 
2731 		if (rw == WRITE || rw == SWRITE) {
2732 			if (test_clear_buffer_dirty(bh)) {
2733 				bh->b_end_io = end_buffer_write_sync;
2734 				get_bh(bh);
2735 				submit_bh(WRITE, bh);
2736 				continue;
2737 			}
2738 		} else {
2739 			if (!buffer_uptodate(bh)) {
2740 				bh->b_end_io = end_buffer_read_sync;
2741 				get_bh(bh);
2742 				submit_bh(rw, bh);
2743 				continue;
2744 			}
2745 		}
2746 		unlock_buffer(bh);
2747 	}
2748 }
2749 
2750 /*
2751  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2752  * and then start new I/O and then wait upon it.  The caller must have a ref on
2753  * the buffer_head.
2754  */
2755 int sync_dirty_buffer(struct buffer_head *bh)
2756 {
2757 	int ret = 0;
2758 
2759 	WARN_ON(atomic_read(&bh->b_count) < 1);
2760 	lock_buffer(bh);
2761 	if (test_clear_buffer_dirty(bh)) {
2762 		get_bh(bh);
2763 		bh->b_end_io = end_buffer_write_sync;
2764 		ret = submit_bh(WRITE, bh);
2765 		wait_on_buffer(bh);
2766 		if (buffer_eopnotsupp(bh)) {
2767 			clear_buffer_eopnotsupp(bh);
2768 			ret = -EOPNOTSUPP;
2769 		}
2770 		if (!ret && !buffer_uptodate(bh))
2771 			ret = -EIO;
2772 	} else {
2773 		unlock_buffer(bh);
2774 	}
2775 	return ret;
2776 }
2777 
2778 /*
2779  * try_to_free_buffers() checks if all the buffers on this particular page
2780  * are unused, and releases them if so.
2781  *
2782  * Exclusion against try_to_free_buffers may be obtained by either
2783  * locking the page or by holding its mapping's private_lock.
2784  *
2785  * If the page is dirty but all the buffers are clean then we need to
2786  * be sure to mark the page clean as well.  This is because the page
2787  * may be against a block device, and a later reattachment of buffers
2788  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2789  * filesystem data on the same device.
2790  *
2791  * The same applies to regular filesystem pages: if all the buffers are
2792  * clean then we set the page clean and proceed.  To do that, we require
2793  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2794  * private_lock.
2795  *
2796  * try_to_free_buffers() is non-blocking.
2797  */
2798 static inline int buffer_busy(struct buffer_head *bh)
2799 {
2800 	return atomic_read(&bh->b_count) |
2801 		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2802 }
2803 
2804 static int
2805 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2806 {
2807 	struct buffer_head *head = page_buffers(page);
2808 	struct buffer_head *bh;
2809 
2810 	bh = head;
2811 	do {
2812 		if (buffer_write_io_error(bh) && page->mapping)
2813 			set_bit(AS_EIO, &page->mapping->flags);
2814 		if (buffer_busy(bh))
2815 			goto failed;
2816 		bh = bh->b_this_page;
2817 	} while (bh != head);
2818 
2819 	do {
2820 		struct buffer_head *next = bh->b_this_page;
2821 
2822 		if (!list_empty(&bh->b_assoc_buffers))
2823 			__remove_assoc_queue(bh);
2824 		bh = next;
2825 	} while (bh != head);
2826 	*buffers_to_free = head;
2827 	__clear_page_buffers(page);
2828 	return 1;
2829 failed:
2830 	return 0;
2831 }
2832 
2833 int try_to_free_buffers(struct page *page)
2834 {
2835 	struct address_space * const mapping = page->mapping;
2836 	struct buffer_head *buffers_to_free = NULL;
2837 	int ret = 0;
2838 
2839 	BUG_ON(!PageLocked(page));
2840 	if (PageWriteback(page))
2841 		return 0;
2842 
2843 	if (mapping == NULL) {		/* can this still happen? */
2844 		ret = drop_buffers(page, &buffers_to_free);
2845 		goto out;
2846 	}
2847 
2848 	spin_lock(&mapping->private_lock);
2849 	ret = drop_buffers(page, &buffers_to_free);
2850 
2851 	/*
2852 	 * If the filesystem writes its buffers by hand (eg ext3)
2853 	 * then we can have clean buffers against a dirty page.  We
2854 	 * clean the page here; otherwise the VM will never notice
2855 	 * that the filesystem did any IO at all.
2856 	 *
2857 	 * Also, during truncate, discard_buffer will have marked all
2858 	 * the page's buffers clean.  We discover that here and clean
2859 	 * the page also.
2860 	 *
2861 	 * private_lock must be held over this entire operation in order
2862 	 * to synchronise against __set_page_dirty_buffers and prevent the
2863 	 * dirty bit from being lost.
2864 	 */
2865 	if (ret)
2866 		cancel_dirty_page(page, PAGE_CACHE_SIZE);
2867 	spin_unlock(&mapping->private_lock);
2868 out:
2869 	if (buffers_to_free) {
2870 		struct buffer_head *bh = buffers_to_free;
2871 
2872 		do {
2873 			struct buffer_head *next = bh->b_this_page;
2874 			free_buffer_head(bh);
2875 			bh = next;
2876 		} while (bh != buffers_to_free);
2877 	}
2878 	return ret;
2879 }
2880 EXPORT_SYMBOL(try_to_free_buffers);
2881 
2882 void block_sync_page(struct page *page)
2883 {
2884 	struct address_space *mapping;
2885 
2886 	smp_mb();
2887 	mapping = page_mapping(page);
2888 	if (mapping)
2889 		blk_run_backing_dev(mapping->backing_dev_info, page);
2890 }
2891 
2892 /*
2893  * There are no bdflush tunables left.  But distributions are
2894  * still running obsolete flush daemons, so we terminate them here.
2895  *
2896  * Use of bdflush() is deprecated and will be removed in a future kernel.
2897  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2898  */
2899 asmlinkage long sys_bdflush(int func, long data)
2900 {
2901 	static int msg_count;
2902 
2903 	if (!capable(CAP_SYS_ADMIN))
2904 		return -EPERM;
2905 
2906 	if (msg_count < 5) {
2907 		msg_count++;
2908 		printk(KERN_INFO
2909 			"warning: process `%s' used the obsolete bdflush"
2910 			" system call\n", current->comm);
2911 		printk(KERN_INFO "Fix your initscripts?\n");
2912 	}
2913 
2914 	if (func == 1)
2915 		do_exit(0);
2916 	return 0;
2917 }
2918 
2919 /*
2920  * Buffer-head allocation
2921  */
2922 static struct kmem_cache *bh_cachep;
2923 
2924 /*
2925  * Once the number of bh's in the machine exceeds this level, we start
2926  * stripping them in writeback.
2927  */
2928 static int max_buffer_heads;
2929 
2930 int buffer_heads_over_limit;
2931 
2932 struct bh_accounting {
2933 	int nr;			/* Number of live bh's */
2934 	int ratelimit;		/* Limit cacheline bouncing */
2935 };
2936 
2937 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
2938 
2939 static void recalc_bh_state(void)
2940 {
2941 	int i;
2942 	int tot = 0;
2943 
2944 	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
2945 		return;
2946 	__get_cpu_var(bh_accounting).ratelimit = 0;
2947 	for_each_online_cpu(i)
2948 		tot += per_cpu(bh_accounting, i).nr;
2949 	buffer_heads_over_limit = (tot > max_buffer_heads);
2950 }
2951 
2952 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
2953 {
2954 	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
2955 	if (ret) {
2956 		get_cpu_var(bh_accounting).nr++;
2957 		recalc_bh_state();
2958 		put_cpu_var(bh_accounting);
2959 	}
2960 	return ret;
2961 }
2962 EXPORT_SYMBOL(alloc_buffer_head);
2963 
2964 void free_buffer_head(struct buffer_head *bh)
2965 {
2966 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
2967 	kmem_cache_free(bh_cachep, bh);
2968 	get_cpu_var(bh_accounting).nr--;
2969 	recalc_bh_state();
2970 	put_cpu_var(bh_accounting);
2971 }
2972 EXPORT_SYMBOL(free_buffer_head);
2973 
2974 static void
2975 init_buffer_head(void *data, struct kmem_cache *cachep, unsigned long flags)
2976 {
2977 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2978 			    SLAB_CTOR_CONSTRUCTOR) {
2979 		struct buffer_head * bh = (struct buffer_head *)data;
2980 
2981 		memset(bh, 0, sizeof(*bh));
2982 		INIT_LIST_HEAD(&bh->b_assoc_buffers);
2983 	}
2984 }
2985 
2986 static void buffer_exit_cpu(int cpu)
2987 {
2988 	int i;
2989 	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
2990 
2991 	for (i = 0; i < BH_LRU_SIZE; i++) {
2992 		brelse(b->bhs[i]);
2993 		b->bhs[i] = NULL;
2994 	}
2995 	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
2996 	per_cpu(bh_accounting, cpu).nr = 0;
2997 	put_cpu_var(bh_accounting);
2998 }
2999 
3000 static int buffer_cpu_notify(struct notifier_block *self,
3001 			      unsigned long action, void *hcpu)
3002 {
3003 	if (action == CPU_DEAD)
3004 		buffer_exit_cpu((unsigned long)hcpu);
3005 	return NOTIFY_OK;
3006 }
3007 
3008 void __init buffer_init(void)
3009 {
3010 	int nrpages;
3011 
3012 	bh_cachep = kmem_cache_create("buffer_head",
3013 					sizeof(struct buffer_head), 0,
3014 					(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3015 					SLAB_MEM_SPREAD),
3016 					init_buffer_head,
3017 					NULL);
3018 
3019 	/*
3020 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3021 	 */
3022 	nrpages = (nr_free_buffer_pages() * 10) / 100;
3023 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3024 	hotcpu_notifier(buffer_cpu_notify, 0);
3025 }
3026 
3027 EXPORT_SYMBOL(__bforget);
3028 EXPORT_SYMBOL(__brelse);
3029 EXPORT_SYMBOL(__wait_on_buffer);
3030 EXPORT_SYMBOL(block_commit_write);
3031 EXPORT_SYMBOL(block_prepare_write);
3032 EXPORT_SYMBOL(block_read_full_page);
3033 EXPORT_SYMBOL(block_sync_page);
3034 EXPORT_SYMBOL(block_truncate_page);
3035 EXPORT_SYMBOL(block_write_full_page);
3036 EXPORT_SYMBOL(cont_prepare_write);
3037 EXPORT_SYMBOL(end_buffer_read_sync);
3038 EXPORT_SYMBOL(end_buffer_write_sync);
3039 EXPORT_SYMBOL(file_fsync);
3040 EXPORT_SYMBOL(fsync_bdev);
3041 EXPORT_SYMBOL(generic_block_bmap);
3042 EXPORT_SYMBOL(generic_commit_write);
3043 EXPORT_SYMBOL(generic_cont_expand);
3044 EXPORT_SYMBOL(generic_cont_expand_simple);
3045 EXPORT_SYMBOL(init_buffer);
3046 EXPORT_SYMBOL(invalidate_bdev);
3047 EXPORT_SYMBOL(ll_rw_block);
3048 EXPORT_SYMBOL(mark_buffer_dirty);
3049 EXPORT_SYMBOL(submit_bh);
3050 EXPORT_SYMBOL(sync_dirty_buffer);
3051 EXPORT_SYMBOL(unlock_buffer);
3052