xref: /linux/fs/buffer.c (revision d67b569f5f620c0fb95d5212642746b7ba9d29e4)
1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6 
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20 
21 #include <linux/config.h>
22 #include <linux/kernel.h>
23 #include <linux/syscalls.h>
24 #include <linux/fs.h>
25 #include <linux/mm.h>
26 #include <linux/percpu.h>
27 #include <linux/slab.h>
28 #include <linux/smp_lock.h>
29 #include <linux/blkdev.h>
30 #include <linux/file.h>
31 #include <linux/quotaops.h>
32 #include <linux/highmem.h>
33 #include <linux/module.h>
34 #include <linux/writeback.h>
35 #include <linux/hash.h>
36 #include <linux/suspend.h>
37 #include <linux/buffer_head.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 
44 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
45 static void invalidate_bh_lrus(void);
46 
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 
49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 {
52 	bh->b_end_io = handler;
53 	bh->b_private = private;
54 }
55 
56 static int sync_buffer(void *word)
57 {
58 	struct block_device *bd;
59 	struct buffer_head *bh
60 		= container_of(word, struct buffer_head, b_state);
61 
62 	smp_mb();
63 	bd = bh->b_bdev;
64 	if (bd)
65 		blk_run_address_space(bd->bd_inode->i_mapping);
66 	io_schedule();
67 	return 0;
68 }
69 
70 void fastcall __lock_buffer(struct buffer_head *bh)
71 {
72 	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73 							TASK_UNINTERRUPTIBLE);
74 }
75 EXPORT_SYMBOL(__lock_buffer);
76 
77 void fastcall unlock_buffer(struct buffer_head *bh)
78 {
79 	clear_buffer_locked(bh);
80 	smp_mb__after_clear_bit();
81 	wake_up_bit(&bh->b_state, BH_Lock);
82 }
83 
84 /*
85  * Block until a buffer comes unlocked.  This doesn't stop it
86  * from becoming locked again - you have to lock it yourself
87  * if you want to preserve its state.
88  */
89 void __wait_on_buffer(struct buffer_head * bh)
90 {
91 	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
92 }
93 
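/*
 * Example (illustrative, not part of the original listing): the usual way a
 * caller pairs ll_rw_block() with __wait_on_buffer()/wait_on_buffer() above.
 * The buffer_head is assumed to have been obtained elsewhere, for instance
 * via __getblk().
 */
static int example_read_and_wait(struct buffer_head *bh)
{
	ll_rw_block(READ, 1, &bh);	/* queue the read; bh is unlocked at I/O completion */
	wait_on_buffer(bh);		/* sleep until BH_Lock clears */
	return buffer_uptodate(bh) ? 0 : -EIO;
}
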
94 static void
95 __clear_page_buffers(struct page *page)
96 {
97 	ClearPagePrivate(page);
98 	page->private = 0;
99 	page_cache_release(page);
100 }
101 
102 static void buffer_io_error(struct buffer_head *bh)
103 {
104 	char b[BDEVNAME_SIZE];
105 
106 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
107 			bdevname(bh->b_bdev, b),
108 			(unsigned long long)bh->b_blocknr);
109 }
110 
111 /*
112  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
113  * unlock the buffer. This is what ll_rw_block uses too.
114  */
115 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
116 {
117 	if (uptodate) {
118 		set_buffer_uptodate(bh);
119 	} else {
120 		/* This happens, due to failed READA attempts. */
121 		clear_buffer_uptodate(bh);
122 	}
123 	unlock_buffer(bh);
124 	put_bh(bh);
125 }
126 
127 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
128 {
129 	char b[BDEVNAME_SIZE];
130 
131 	if (uptodate) {
132 		set_buffer_uptodate(bh);
133 	} else {
134 		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
135 			buffer_io_error(bh);
136 			printk(KERN_WARNING "lost page write due to "
137 					"I/O error on %s\n",
138 				       bdevname(bh->b_bdev, b));
139 		}
140 		set_buffer_write_io_error(bh);
141 		clear_buffer_uptodate(bh);
142 	}
143 	unlock_buffer(bh);
144 	put_bh(bh);
145 }
146 
147 /*
148  * Write out and wait upon all the dirty data associated with a block
149  * device via its mapping.  Does not take the superblock lock.
150  */
151 int sync_blockdev(struct block_device *bdev)
152 {
153 	int ret = 0;
154 
155 	if (bdev) {
156 		int err;
157 
158 		ret = filemap_fdatawrite(bdev->bd_inode->i_mapping);
159 		err = filemap_fdatawait(bdev->bd_inode->i_mapping);
160 		if (!ret)
161 			ret = err;
162 	}
163 	return ret;
164 }
165 EXPORT_SYMBOL(sync_blockdev);
166 
167 /*
168  * Write out and wait upon all dirty data associated with this
169  * superblock.  Filesystem data as well as the underlying block
170  * device.  Takes the superblock lock.
171  */
172 int fsync_super(struct super_block *sb)
173 {
174 	sync_inodes_sb(sb, 0);
175 	DQUOT_SYNC(sb);
176 	lock_super(sb);
177 	if (sb->s_dirt && sb->s_op->write_super)
178 		sb->s_op->write_super(sb);
179 	unlock_super(sb);
180 	if (sb->s_op->sync_fs)
181 		sb->s_op->sync_fs(sb, 1);
182 	sync_blockdev(sb->s_bdev);
183 	sync_inodes_sb(sb, 1);
184 
185 	return sync_blockdev(sb->s_bdev);
186 }
187 
188 /*
189  * Write out and wait upon all dirty data associated with this
190  * device.   Filesystem data as well as the underlying block
191  * device.  Takes the superblock lock.
192  */
193 int fsync_bdev(struct block_device *bdev)
194 {
195 	struct super_block *sb = get_super(bdev);
196 	if (sb) {
197 		int res = fsync_super(sb);
198 		drop_super(sb);
199 		return res;
200 	}
201 	return sync_blockdev(bdev);
202 }
203 
204 /**
205  * freeze_bdev  --  lock a filesystem and force it into a consistent state
206  * @bdev:	blockdevice to lock
207  *
208  * This takes the block device bd_mount_sem to make sure no new mounts
209  * happen on bdev until thaw_bdev() is called.
210  * If a superblock is found on this device, we take the s_umount semaphore
211  * on it to make sure nobody unmounts until the snapshot creation is done.
212  */
213 struct super_block *freeze_bdev(struct block_device *bdev)
214 {
215 	struct super_block *sb;
216 
217 	down(&bdev->bd_mount_sem);
218 	sb = get_super(bdev);
219 	if (sb && !(sb->s_flags & MS_RDONLY)) {
220 		sb->s_frozen = SB_FREEZE_WRITE;
221 		smp_wmb();
222 
223 		sync_inodes_sb(sb, 0);
224 		DQUOT_SYNC(sb);
225 
226 		lock_super(sb);
227 		if (sb->s_dirt && sb->s_op->write_super)
228 			sb->s_op->write_super(sb);
229 		unlock_super(sb);
230 
231 		if (sb->s_op->sync_fs)
232 			sb->s_op->sync_fs(sb, 1);
233 
234 		sync_blockdev(sb->s_bdev);
235 		sync_inodes_sb(sb, 1);
236 
237 		sb->s_frozen = SB_FREEZE_TRANS;
238 		smp_wmb();
239 
240 		sync_blockdev(sb->s_bdev);
241 
242 		if (sb->s_op->write_super_lockfs)
243 			sb->s_op->write_super_lockfs(sb);
244 	}
245 
246 	sync_blockdev(bdev);
247 	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
248 }
249 EXPORT_SYMBOL(freeze_bdev);
250 
251 /**
252  * thaw_bdev  -- unlock filesystem
253  * @bdev:	blockdevice to unlock
254  * @sb:		associated superblock
255  *
256  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
257  */
258 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
259 {
260 	if (sb) {
261 		BUG_ON(sb->s_bdev != bdev);
262 
263 		if (sb->s_op->unlockfs)
264 			sb->s_op->unlockfs(sb);
265 		sb->s_frozen = SB_UNFROZEN;
266 		smp_wmb();
267 		wake_up(&sb->s_wait_unfrozen);
268 		drop_super(sb);
269 	}
270 
271 	up(&bdev->bd_mount_sem);
272 }
273 EXPORT_SYMBOL(thaw_bdev);
274 
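/*
 * Example (illustrative, not part of the original listing): how a snapshot
 * or backup path might bracket its work with freeze_bdev()/thaw_bdev().
 * example_take_snapshot() is a hypothetical device-level operation.
 */
static void example_take_snapshot(struct block_device *bdev);	/* hypothetical */

static void example_snapshot(struct block_device *bdev)
{
	struct super_block *sb;

	sb = freeze_bdev(bdev);		/* blocks new writes, flushes dirty data */
	example_take_snapshot(bdev);	/* copy the now-quiescent device */
	thaw_bdev(bdev, sb);		/* sb may be NULL if nothing was mounted */
}
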
275 /*
276  * sync everything.  Start out by waking pdflush, because that writes back
277  * all queues in parallel.
278  */
279 static void do_sync(unsigned long wait)
280 {
281 	wakeup_pdflush(0);
282 	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
283 	DQUOT_SYNC(NULL);
284 	sync_supers();		/* Write the superblocks */
285 	sync_filesystems(0);	/* Start syncing the filesystems */
286 	sync_filesystems(wait);	/* Waitingly sync the filesystems */
287 	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
288 	if (!wait)
289 		printk("Emergency Sync complete\n");
290 	if (unlikely(laptop_mode))
291 		laptop_sync_completion();
292 }
293 
294 asmlinkage long sys_sync(void)
295 {
296 	do_sync(1);
297 	return 0;
298 }
299 
300 void emergency_sync(void)
301 {
302 	pdflush_operation(do_sync, 0);
303 }
304 
305 /*
306  * Generic function to fsync a file.
307  *
308  * filp may be NULL if called via the msync of a vma.
309  */
310 
311 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
312 {
313 	struct inode * inode = dentry->d_inode;
314 	struct super_block * sb;
315 	int ret, err;
316 
317 	/* sync the inode to buffers */
318 	ret = write_inode_now(inode, 0);
319 
320 	/* sync the superblock to buffers */
321 	sb = inode->i_sb;
322 	lock_super(sb);
323 	if (sb->s_op->write_super)
324 		sb->s_op->write_super(sb);
325 	unlock_super(sb);
326 
327 	/* .. finally sync the buffers to disk */
328 	err = sync_blockdev(sb->s_bdev);
329 	if (!ret)
330 		ret = err;
331 	return ret;
332 }
333 
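/*
 * Example (illustrative, not part of the original listing): a simple
 * filesystem with no ordering requirements of its own can point its fsync
 * method straight at the generic helper above.
 */
static struct file_operations example_file_ops = {
	/* .read, .write, .mmap etc. omitted for brevity */
	.fsync	= file_fsync,		/* inode -> superblock -> blockdev */
};
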
334 static long do_fsync(unsigned int fd, int datasync)
335 {
336 	struct file * file;
337 	struct address_space *mapping;
338 	int ret, err;
339 
340 	ret = -EBADF;
341 	file = fget(fd);
342 	if (!file)
343 		goto out;
344 
345 	ret = -EINVAL;
346 	if (!file->f_op || !file->f_op->fsync) {
347 		/* Why?  We can still call filemap_fdatawrite */
348 		goto out_putf;
349 	}
350 
351 	mapping = file->f_mapping;
352 
353 	current->flags |= PF_SYNCWRITE;
354 	ret = filemap_fdatawrite(mapping);
355 
356 	/*
357 	 * We need to protect against concurrent writers,
358 	 * which could cause livelocks in fsync_buffers_list
359 	 */
360 	down(&mapping->host->i_sem);
361 	err = file->f_op->fsync(file, file->f_dentry, datasync);
362 	if (!ret)
363 		ret = err;
364 	up(&mapping->host->i_sem);
365 	err = filemap_fdatawait(mapping);
366 	if (!ret)
367 		ret = err;
368 	current->flags &= ~PF_SYNCWRITE;
369 
370 out_putf:
371 	fput(file);
372 out:
373 	return ret;
374 }
375 
376 asmlinkage long sys_fsync(unsigned int fd)
377 {
378 	return do_fsync(fd, 0);
379 }
380 
381 asmlinkage long sys_fdatasync(unsigned int fd)
382 {
383 	return do_fsync(fd, 1);
384 }
385 
386 /*
387  * Various filesystems appear to want __find_get_block to be non-blocking.
388  * But it's the page lock which protects the buffers.  To get around this,
389  * we get exclusion from try_to_free_buffers with the blockdev mapping's
390  * private_lock.
391  *
392  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
393  * may be quite high.  This code could TryLock the page, and if that
394  * succeeds, there is no need to take private_lock. (But if
395  * private_lock is contended then so is mapping->tree_lock).
396  */
397 static struct buffer_head *
398 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
399 {
400 	struct inode *bd_inode = bdev->bd_inode;
401 	struct address_space *bd_mapping = bd_inode->i_mapping;
402 	struct buffer_head *ret = NULL;
403 	pgoff_t index;
404 	struct buffer_head *bh;
405 	struct buffer_head *head;
406 	struct page *page;
407 	int all_mapped = 1;
408 
409 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
410 	page = find_get_page(bd_mapping, index);
411 	if (!page)
412 		goto out;
413 
414 	spin_lock(&bd_mapping->private_lock);
415 	if (!page_has_buffers(page))
416 		goto out_unlock;
417 	head = page_buffers(page);
418 	bh = head;
419 	do {
420 		if (bh->b_blocknr == block) {
421 			ret = bh;
422 			get_bh(bh);
423 			goto out_unlock;
424 		}
425 		if (!buffer_mapped(bh))
426 			all_mapped = 0;
427 		bh = bh->b_this_page;
428 	} while (bh != head);
429 
430 	/* we might be here because some of the buffers on this page are
431 	 * not mapped.  This is due to various races between
432 	 * file io on the block device and getblk.  It gets dealt with
433 	 * elsewhere, don't buffer_error if we had some unmapped buffers
434 	 */
435 	if (all_mapped) {
436 		printk("__find_get_block_slow() failed. "
437 			"block=%llu, b_blocknr=%llu\n",
438 			(unsigned long long)block, (unsigned long long)bh->b_blocknr);
439 		printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
440 		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
441 	}
442 out_unlock:
443 	spin_unlock(&bd_mapping->private_lock);
444 	page_cache_release(page);
445 out:
446 	return ret;
447 }
448 
449 /* If invalidate_buffers() will trash dirty buffers, it means some kind
450    of fs corruption is going on. Trashing dirty data always implies losing
451    information that was supposed to be just stored on the physical layer
452    by the user.
453 
454    Thus invalidate_buffers in general usage is not allowed to trash
455    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
456    be preserved.  These buffers are simply skipped.
457 
458    We also skip buffers which are still in use.  For example this can
459    happen if a userspace program is reading the block device.
460 
461    NOTE: in the case where the user removes a removable-media disk even if
462    there is still dirty data that was never synced to disk (due to a bug in
463    the device driver or to an error by the user), not destroying the dirty
464    buffers could also corrupt the next medium that is inserted; thus a
465    parameter is necessary to handle this case in the safest way possible
466    (trying not to corrupt the newly inserted disk with data belonging
467    to the old, now corrupted one). Also, for the ramdisk the natural thing
468    to do in order to release the ramdisk memory is to destroy dirty buffers.
469 
470    These are two special cases. Normal usage is for the device driver
471    to issue a sync on the device (without waiting for I/O completion) and
472    then an invalidate_buffers call that doesn't trash dirty buffers.
473 
474    For handling cache coherency with the blkdev pagecache the 'update' case
475    has been introduced. It is needed so that any pinned buffer can be
476    re-read from disk. NOTE: re-reading from disk is destructive so we can do
477    it only when we assume nobody is changing the buffercache under our I/O
478    and when we think the disk contains more recent information than the
479    buffercache.  The update == 1 pass marks the buffers we need to update,
480    the update == 2 pass does the actual I/O. */
481 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
482 {
483 	invalidate_bh_lrus();
484 	/*
485 	 * FIXME: what about destroy_dirty_buffers?
486 	 * We really want to use invalidate_inode_pages2() for
487 	 * that, but not until that's cleaned up.
488 	 */
489 	invalidate_inode_pages(bdev->bd_inode->i_mapping);
490 }
491 
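/*
 * Example (illustrative, not part of the original listing): the "normal
 * usage" described above, roughly as a removable-media driver might do it.
 * Note that sync_blockdev() also waits upon the I/O, which is stricter than
 * the minimum described in the comment.  example_media_changed() is a
 * hypothetical driver callback.
 */
static void example_media_changed(struct block_device *bdev)
{
	sync_blockdev(bdev);		/* write back (and wait upon) dirty data */
	invalidate_bdev(bdev, 0);	/* 0: do not destroy remaining dirty buffers */
}
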
492 /*
493  * Kick pdflush then try to free up some ZONE_NORMAL memory.
494  */
495 static void free_more_memory(void)
496 {
497 	struct zone **zones;
498 	pg_data_t *pgdat;
499 
500 	wakeup_pdflush(1024);
501 	yield();
502 
503 	for_each_pgdat(pgdat) {
504 		zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones;
505 		if (*zones)
506 			try_to_free_pages(zones, GFP_NOFS);
507 	}
508 }
509 
510 /*
511  * I/O completion handler for block_read_full_page() - pages
512  * which come unlocked at the end of I/O.
513  */
514 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
515 {
516 	static DEFINE_SPINLOCK(page_uptodate_lock);
517 	unsigned long flags;
518 	struct buffer_head *tmp;
519 	struct page *page;
520 	int page_uptodate = 1;
521 
522 	BUG_ON(!buffer_async_read(bh));
523 
524 	page = bh->b_page;
525 	if (uptodate) {
526 		set_buffer_uptodate(bh);
527 	} else {
528 		clear_buffer_uptodate(bh);
529 		if (printk_ratelimit())
530 			buffer_io_error(bh);
531 		SetPageError(page);
532 	}
533 
534 	/*
535 	 * Be _very_ careful from here on. Bad things can happen if
536 	 * two buffer heads end IO at almost the same time and both
537 	 * decide that the page is now completely done.
538 	 */
539 	spin_lock_irqsave(&page_uptodate_lock, flags);
540 	clear_buffer_async_read(bh);
541 	unlock_buffer(bh);
542 	tmp = bh;
543 	do {
544 		if (!buffer_uptodate(tmp))
545 			page_uptodate = 0;
546 		if (buffer_async_read(tmp)) {
547 			BUG_ON(!buffer_locked(tmp));
548 			goto still_busy;
549 		}
550 		tmp = tmp->b_this_page;
551 	} while (tmp != bh);
552 	spin_unlock_irqrestore(&page_uptodate_lock, flags);
553 
554 	/*
555 	 * If none of the buffers had errors and they are all
556 	 * uptodate then we can set the page uptodate.
557 	 */
558 	if (page_uptodate && !PageError(page))
559 		SetPageUptodate(page);
560 	unlock_page(page);
561 	return;
562 
563 still_busy:
564 	spin_unlock_irqrestore(&page_uptodate_lock, flags);
565 	return;
566 }
567 
568 /*
569  * Completion handler for block_write_full_page() - pages which are unlocked
570  * during I/O, and which have PageWriteback cleared upon I/O completion.
571  */
572 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
573 {
574 	char b[BDEVNAME_SIZE];
575 	static DEFINE_SPINLOCK(page_uptodate_lock);
576 	unsigned long flags;
577 	struct buffer_head *tmp;
578 	struct page *page;
579 
580 	BUG_ON(!buffer_async_write(bh));
581 
582 	page = bh->b_page;
583 	if (uptodate) {
584 		set_buffer_uptodate(bh);
585 	} else {
586 		if (printk_ratelimit()) {
587 			buffer_io_error(bh);
588 			printk(KERN_WARNING "lost page write due to "
589 					"I/O error on %s\n",
590 			       bdevname(bh->b_bdev, b));
591 		}
592 		set_bit(AS_EIO, &page->mapping->flags);
593 		clear_buffer_uptodate(bh);
594 		SetPageError(page);
595 	}
596 
597 	spin_lock_irqsave(&page_uptodate_lock, flags);
598 	clear_buffer_async_write(bh);
599 	unlock_buffer(bh);
600 	tmp = bh->b_this_page;
601 	while (tmp != bh) {
602 		if (buffer_async_write(tmp)) {
603 			BUG_ON(!buffer_locked(tmp));
604 			goto still_busy;
605 		}
606 		tmp = tmp->b_this_page;
607 	}
608 	spin_unlock_irqrestore(&page_uptodate_lock, flags);
609 	end_page_writeback(page);
610 	return;
611 
612 still_busy:
613 	spin_unlock_irqrestore(&page_uptodate_lock, flags);
614 	return;
615 }
616 
617 /*
618  * If a page's buffers are under async readin (end_buffer_async_read
619  * completion) then there is a possibility that another thread of
620  * control could lock one of the buffers after it has completed
621  * but while some of the other buffers have not completed.  This
622  * locked buffer would confuse end_buffer_async_read() into not unlocking
623  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
624  * that this buffer is not under async I/O.
625  *
626  * The page comes unlocked when it has no locked buffer_async buffers
627  * left.
628  *
629  * PageLocked prevents anyone starting new async I/O reads any of
630  * the buffers.
631  *
632  * PageWriteback is used to prevent simultaneous writeout of the same
633  * page.
634  *
635  * PageLocked prevents anyone from starting writeback of a page which is
636  * under read I/O (PageWriteback is only ever set against a locked page).
637  */
638 static void mark_buffer_async_read(struct buffer_head *bh)
639 {
640 	bh->b_end_io = end_buffer_async_read;
641 	set_buffer_async_read(bh);
642 }
643 
644 void mark_buffer_async_write(struct buffer_head *bh)
645 {
646 	bh->b_end_io = end_buffer_async_write;
647 	set_buffer_async_write(bh);
648 }
649 EXPORT_SYMBOL(mark_buffer_async_write);
650 
651 
652 /*
653  * fs/buffer.c contains helper functions for buffer-backed address space's
654  * fsync functions.  A common requirement for buffer-based filesystems is
655  * that certain data from the backing blockdev needs to be written out for
656  * a successful fsync().  For example, ext2 indirect blocks need to be
657  * written back and waited upon before fsync() returns.
658  *
659  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
660  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
661  * management of a list of dependent buffers at ->i_mapping->private_list.
662  *
663  * Locking is a little subtle: try_to_free_buffers() will remove buffers
664  * from their controlling inode's queue when they are being freed.  But
665  * try_to_free_buffers() will be operating against the *blockdev* mapping
666  * at the time, not against the S_ISREG file which depends on those buffers.
667  * So the locking for private_list is via the private_lock in the address_space
668  * which backs the buffers.  Which is different from the address_space
669  * against which the buffers are listed.  So for a particular address_space,
670  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
671  * mapping->private_list will always be protected by the backing blockdev's
672  * ->private_lock.
673  *
674  * Which introduces a requirement: all buffers on an address_space's
675  * ->private_list must be from the same address_space: the blockdev's.
676  *
677  * address_spaces which do not place buffers at ->private_list via these
678  * utility functions are free to use private_lock and private_list for
679  * whatever they want.  The only requirement is that list_empty(private_list)
680  * be true at clear_inode() time.
681  *
682  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
683  * filesystems should do that.  invalidate_inode_buffers() should just go
684  * BUG_ON(!list_empty).
685  *
686  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
687  * take an address_space, not an inode.  And it should be called
688  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
689  * queued up.
690  *
691  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
692  * list if it is already on a list.  Because if the buffer is on a list,
693  * it *must* already be on the right one.  If not, the filesystem is being
694  * silly.  This will save a ton of locking.  But first we have to ensure
695  * that buffers are taken *off* the old inode's list when they are freed
696  * (presumably in truncate).  That requires careful auditing of all
697  * filesystems (do it inside bforget()).  It could also be done by bringing
698  * b_inode back.
699  */
700 
701 /*
702  * The buffer's backing address_space's private_lock must be held
703  */
704 static inline void __remove_assoc_queue(struct buffer_head *bh)
705 {
706 	list_del_init(&bh->b_assoc_buffers);
707 }
708 
709 int inode_has_buffers(struct inode *inode)
710 {
711 	return !list_empty(&inode->i_data.private_list);
712 }
713 
714 /*
715  * osync is designed to support O_SYNC io.  It waits synchronously for
716  * all already-submitted IO to complete, but does not queue any new
717  * writes to the disk.
718  *
719  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
720  * you dirty the buffers, and then use osync_inode_buffers to wait for
721  * completion.  Any other dirty buffers which are not yet queued for
722  * write will not be flushed to disk by the osync.
723  */
724 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
725 {
726 	struct buffer_head *bh;
727 	struct list_head *p;
728 	int err = 0;
729 
730 	spin_lock(lock);
731 repeat:
732 	list_for_each_prev(p, list) {
733 		bh = BH_ENTRY(p);
734 		if (buffer_locked(bh)) {
735 			get_bh(bh);
736 			spin_unlock(lock);
737 			wait_on_buffer(bh);
738 			if (!buffer_uptodate(bh))
739 				err = -EIO;
740 			brelse(bh);
741 			spin_lock(lock);
742 			goto repeat;
743 		}
744 	}
745 	spin_unlock(lock);
746 	return err;
747 }
748 
749 /**
750  * sync_mapping_buffers - write out and wait upon a mapping's "associated"
751  *                        buffers
752  * @mapping: the mapping which wants those buffers written
753  *
754  * Starts I/O against the buffers at mapping->private_list, and waits upon
755  * that I/O.
756  *
757  * Basically, this is a convenience function for fsync().
758  * @mapping is a file or directory which needs those buffers to be written for
759  * a successful fsync().
760  */
761 int sync_mapping_buffers(struct address_space *mapping)
762 {
763 	struct address_space *buffer_mapping = mapping->assoc_mapping;
764 
765 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
766 		return 0;
767 
768 	return fsync_buffers_list(&buffer_mapping->private_lock,
769 					&mapping->private_list);
770 }
771 EXPORT_SYMBOL(sync_mapping_buffers);
772 
773 /*
774  * Called when we've recently written block `bblock', and it is known that
775  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
776  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
777  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
778  */
779 void write_boundary_block(struct block_device *bdev,
780 			sector_t bblock, unsigned blocksize)
781 {
782 	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
783 	if (bh) {
784 		if (buffer_dirty(bh))
785 			ll_rw_block(WRITE, 1, &bh);
786 		put_bh(bh);
787 	}
788 }
789 
790 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
791 {
792 	struct address_space *mapping = inode->i_mapping;
793 	struct address_space *buffer_mapping = bh->b_page->mapping;
794 
795 	mark_buffer_dirty(bh);
796 	if (!mapping->assoc_mapping) {
797 		mapping->assoc_mapping = buffer_mapping;
798 	} else {
799 		if (mapping->assoc_mapping != buffer_mapping)
800 			BUG();
801 	}
802 	if (list_empty(&bh->b_assoc_buffers)) {
803 		spin_lock(&buffer_mapping->private_lock);
804 		list_move_tail(&bh->b_assoc_buffers,
805 				&mapping->private_list);
806 		spin_unlock(&buffer_mapping->private_lock);
807 	}
808 }
809 EXPORT_SYMBOL(mark_buffer_dirty_inode);
810 
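/*
 * Example (illustrative, not part of the original listing), loosely modelled
 * on how ext2-style filesystems use the helpers above: metadata buffers such
 * as indirect blocks are queued on the inode's ->private_list with
 * mark_buffer_dirty_inode() as they are dirtied, and the filesystem's
 * ->fsync method then writes out and waits upon that list via
 * sync_mapping_buffers().  The example_* names are hypothetical.
 */
static void example_dirty_indirect(struct inode *inode, struct buffer_head *bh)
{
	/* ... modify the indirect block at bh->b_data ... */
	mark_buffer_dirty_inode(bh, inode);	/* dirty it and remember it for fsync */
}

static int example_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;

	/* write out and wait upon the associated buffers queued above */
	return sync_mapping_buffers(inode->i_mapping);
}
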
811 /*
812  * Add a page to the dirty page list.
813  *
814  * It is a sad fact of life that this function is called from several places
815  * deeply under spinlocking.  It may not sleep.
816  *
817  * If the page has buffers, the uptodate buffers are set dirty, to preserve
818  * dirty-state coherency between the page and the buffers.  If the page does
819  * not have buffers then when they are later attached they will all be set
820  * dirty.
821  *
822  * The buffers are dirtied before the page is dirtied.  There's a small race
823  * window in which a writepage caller may see the page cleanness but not the
824  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
825  * before the buffers, a concurrent writepage caller could clear the page dirty
826  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
827  * page on the dirty page list.
828  *
829  * We use private_lock to lock against try_to_free_buffers while using the
830  * page's buffer list.  Also use this to protect against clean buffers being
831  * added to the page after it was set dirty.
832  *
833  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
834  * address_space though.
835  */
836 int __set_page_dirty_buffers(struct page *page)
837 {
838 	struct address_space * const mapping = page->mapping;
839 
840 	spin_lock(&mapping->private_lock);
841 	if (page_has_buffers(page)) {
842 		struct buffer_head *head = page_buffers(page);
843 		struct buffer_head *bh = head;
844 
845 		do {
846 			set_buffer_dirty(bh);
847 			bh = bh->b_this_page;
848 		} while (bh != head);
849 	}
850 	spin_unlock(&mapping->private_lock);
851 
852 	if (!TestSetPageDirty(page)) {
853 		write_lock_irq(&mapping->tree_lock);
854 		if (page->mapping) {	/* Race with truncate? */
855 			if (mapping_cap_account_dirty(mapping))
856 				inc_page_state(nr_dirty);
857 			radix_tree_tag_set(&mapping->page_tree,
858 						page_index(page),
859 						PAGECACHE_TAG_DIRTY);
860 		}
861 		write_unlock_irq(&mapping->tree_lock);
862 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
863 	}
864 
865 	return 0;
866 }
867 EXPORT_SYMBOL(__set_page_dirty_buffers);
868 
869 /*
870  * Write out and wait upon a list of buffers.
871  *
872  * We have conflicting pressures: we want to make sure that all
873  * initially dirty buffers get waited on, but that any subsequently
874  * dirtied buffers don't.  After all, we don't want fsync to last
875  * forever if somebody is actively writing to the file.
876  *
877  * Do this in two main stages: first we copy dirty buffers to a
878  * temporary inode list, queueing the writes as we go.  Then we clean
879  * up, waiting for those writes to complete.
880  *
881  * During this second stage, any subsequent updates to the file may end
882  * up refiling the buffer on the original inode's dirty list again, so
883  * there is a chance we will end up with a buffer queued for write but
884  * not yet completed on that list.  So, as a final cleanup we go through
885  * the osync code to catch these locked, dirty buffers without requeuing
886  * any newly dirty buffers for write.
887  */
888 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
889 {
890 	struct buffer_head *bh;
891 	struct list_head tmp;
892 	int err = 0, err2;
893 
894 	INIT_LIST_HEAD(&tmp);
895 
896 	spin_lock(lock);
897 	while (!list_empty(list)) {
898 		bh = BH_ENTRY(list->next);
899 		list_del_init(&bh->b_assoc_buffers);
900 		if (buffer_dirty(bh) || buffer_locked(bh)) {
901 			list_add(&bh->b_assoc_buffers, &tmp);
902 			if (buffer_dirty(bh)) {
903 				get_bh(bh);
904 				spin_unlock(lock);
905 				/*
906 				 * Ensure any pending I/O completes so that
907 				 * ll_rw_block() actually writes the current
908 				 * contents - it is a noop if I/O is still in
909 				 * flight on potentially older contents.
910 				 */
911 				wait_on_buffer(bh);
912 				ll_rw_block(WRITE, 1, &bh);
913 				brelse(bh);
914 				spin_lock(lock);
915 			}
916 		}
917 	}
918 
919 	while (!list_empty(&tmp)) {
920 		bh = BH_ENTRY(tmp.prev);
921 		__remove_assoc_queue(bh);
922 		get_bh(bh);
923 		spin_unlock(lock);
924 		wait_on_buffer(bh);
925 		if (!buffer_uptodate(bh))
926 			err = -EIO;
927 		brelse(bh);
928 		spin_lock(lock);
929 	}
930 
931 	spin_unlock(lock);
932 	err2 = osync_buffers_list(lock, list);
933 	if (err)
934 		return err;
935 	else
936 		return err2;
937 }
938 
939 /*
940  * Invalidate any and all dirty buffers on a given inode.  We are
941  * probably unmounting the fs, but that doesn't mean we have already
942  * done a sync().  Just drop the buffers from the inode list.
943  *
944  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
945  * assumes that all the buffers are against the blockdev.  Not true
946  * for reiserfs.
947  */
948 void invalidate_inode_buffers(struct inode *inode)
949 {
950 	if (inode_has_buffers(inode)) {
951 		struct address_space *mapping = &inode->i_data;
952 		struct list_head *list = &mapping->private_list;
953 		struct address_space *buffer_mapping = mapping->assoc_mapping;
954 
955 		spin_lock(&buffer_mapping->private_lock);
956 		while (!list_empty(list))
957 			__remove_assoc_queue(BH_ENTRY(list->next));
958 		spin_unlock(&buffer_mapping->private_lock);
959 	}
960 }
961 
962 /*
963  * Remove any clean buffers from the inode's buffer list.  This is called
964  * when we're trying to free the inode itself.  Those buffers can pin it.
965  *
966  * Returns true if all buffers were removed.
967  */
968 int remove_inode_buffers(struct inode *inode)
969 {
970 	int ret = 1;
971 
972 	if (inode_has_buffers(inode)) {
973 		struct address_space *mapping = &inode->i_data;
974 		struct list_head *list = &mapping->private_list;
975 		struct address_space *buffer_mapping = mapping->assoc_mapping;
976 
977 		spin_lock(&buffer_mapping->private_lock);
978 		while (!list_empty(list)) {
979 			struct buffer_head *bh = BH_ENTRY(list->next);
980 			if (buffer_dirty(bh)) {
981 				ret = 0;
982 				break;
983 			}
984 			__remove_assoc_queue(bh);
985 		}
986 		spin_unlock(&buffer_mapping->private_lock);
987 	}
988 	return ret;
989 }
990 
991 /*
992  * Create the appropriate buffers when given a page for data area and
993  * the size of each buffer.. Use the bh->b_this_page linked list to
994  * follow the buffers created.  Return NULL if unable to create more
995  * buffers.
996  *
997  * The retry flag is used to differentiate async IO (paging, swapping)
998  * which may not fail from ordinary buffer allocations.
999  */
1000 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
1001 		int retry)
1002 {
1003 	struct buffer_head *bh, *head;
1004 	long offset;
1005 
1006 try_again:
1007 	head = NULL;
1008 	offset = PAGE_SIZE;
1009 	while ((offset -= size) >= 0) {
1010 		bh = alloc_buffer_head(GFP_NOFS);
1011 		if (!bh)
1012 			goto no_grow;
1013 
1014 		bh->b_bdev = NULL;
1015 		bh->b_this_page = head;
1016 		bh->b_blocknr = -1;
1017 		head = bh;
1018 
1019 		bh->b_state = 0;
1020 		atomic_set(&bh->b_count, 0);
1021 		bh->b_size = size;
1022 
1023 		/* Link the buffer to its page */
1024 		set_bh_page(bh, page, offset);
1025 
1026 		bh->b_end_io = NULL;
1027 	}
1028 	return head;
1029 /*
1030  * In case anything failed, we just free everything we got.
1031  */
1032 no_grow:
1033 	if (head) {
1034 		do {
1035 			bh = head;
1036 			head = head->b_this_page;
1037 			free_buffer_head(bh);
1038 		} while (head);
1039 	}
1040 
1041 	/*
1042 	 * Return failure for non-async IO requests.  Async IO requests
1043 	 * are not allowed to fail, so we have to wait until buffer heads
1044 	 * become available.  But we don't want tasks sleeping with
1045 	 * partially complete buffers, so all were released above.
1046 	 */
1047 	if (!retry)
1048 		return NULL;
1049 
1050 	/* We're _really_ low on memory. Now we just
1051 	 * wait for old buffer heads to become free due to
1052 	 * finishing IO.  Since this is an async request and
1053 	 * the reserve list is empty, we're sure there are
1054 	 * async buffer heads in use.
1055 	 */
1056 	free_more_memory();
1057 	goto try_again;
1058 }
1059 EXPORT_SYMBOL_GPL(alloc_page_buffers);
1060 
1061 static inline void
1062 link_dev_buffers(struct page *page, struct buffer_head *head)
1063 {
1064 	struct buffer_head *bh, *tail;
1065 
1066 	bh = head;
1067 	do {
1068 		tail = bh;
1069 		bh = bh->b_this_page;
1070 	} while (bh);
1071 	tail->b_this_page = head;
1072 	attach_page_buffers(page, head);
1073 }
1074 
1075 /*
1076  * Initialise the state of a blockdev page's buffers.
1077  */
1078 static void
1079 init_page_buffers(struct page *page, struct block_device *bdev,
1080 			sector_t block, int size)
1081 {
1082 	struct buffer_head *head = page_buffers(page);
1083 	struct buffer_head *bh = head;
1084 	int uptodate = PageUptodate(page);
1085 
1086 	do {
1087 		if (!buffer_mapped(bh)) {
1088 			init_buffer(bh, NULL, NULL);
1089 			bh->b_bdev = bdev;
1090 			bh->b_blocknr = block;
1091 			if (uptodate)
1092 				set_buffer_uptodate(bh);
1093 			set_buffer_mapped(bh);
1094 		}
1095 		block++;
1096 		bh = bh->b_this_page;
1097 	} while (bh != head);
1098 }
1099 
1100 /*
1101  * Create the page-cache page that contains the requested block.
1102  *
1103  * This is used purely for blockdev mappings.
1104  */
1105 static struct page *
1106 grow_dev_page(struct block_device *bdev, sector_t block,
1107 		pgoff_t index, int size)
1108 {
1109 	struct inode *inode = bdev->bd_inode;
1110 	struct page *page;
1111 	struct buffer_head *bh;
1112 
1113 	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1114 	if (!page)
1115 		return NULL;
1116 
1117 	if (!PageLocked(page))
1118 		BUG();
1119 
1120 	if (page_has_buffers(page)) {
1121 		bh = page_buffers(page);
1122 		if (bh->b_size == size) {
1123 			init_page_buffers(page, bdev, block, size);
1124 			return page;
1125 		}
1126 		if (!try_to_free_buffers(page))
1127 			goto failed;
1128 	}
1129 
1130 	/*
1131 	 * Allocate some buffers for this page
1132 	 */
1133 	bh = alloc_page_buffers(page, size, 0);
1134 	if (!bh)
1135 		goto failed;
1136 
1137 	/*
1138 	 * Link the page to the buffers and initialise them.  Take the
1139 	 * lock to be atomic wrt __find_get_block(), which does not
1140 	 * run under the page lock.
1141 	 */
1142 	spin_lock(&inode->i_mapping->private_lock);
1143 	link_dev_buffers(page, bh);
1144 	init_page_buffers(page, bdev, block, size);
1145 	spin_unlock(&inode->i_mapping->private_lock);
1146 	return page;
1147 
1148 failed:
1149 	BUG();
1150 	unlock_page(page);
1151 	page_cache_release(page);
1152 	return NULL;
1153 }
1154 
1155 /*
1156  * Create buffers for the specified block device block's page.  If
1157  * that page was dirty, the buffers are set dirty also.
1158  *
1159  * Except that's a bug.  Attaching dirty buffers to a dirty
1160  * blockdev's page can result in filesystem corruption, because
1161  * some of those buffers may be aliases of filesystem data.
1162  * grow_dev_page() will go BUG() if this happens.
1163  */
1164 static inline int
1165 grow_buffers(struct block_device *bdev, sector_t block, int size)
1166 {
1167 	struct page *page;
1168 	pgoff_t index;
1169 	int sizebits;
1170 
1171 	sizebits = -1;
1172 	do {
1173 		sizebits++;
1174 	} while ((size << sizebits) < PAGE_SIZE);
1175 
1176 	index = block >> sizebits;
1177 	block = index << sizebits;
1178 
1179 	/* Create a page with the proper size buffers.. */
1180 	page = grow_dev_page(bdev, block, index, size);
1181 	if (!page)
1182 		return 0;
1183 	unlock_page(page);
1184 	page_cache_release(page);
1185 	return 1;
1186 }
1187 
1188 static struct buffer_head *
1189 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1190 {
1191 	/* Size must be multiple of hard sectorsize */
1192 	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1193 			(size < 512 || size > PAGE_SIZE))) {
1194 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1195 					size);
1196 		printk(KERN_ERR "hardsect size: %d\n",
1197 					bdev_hardsect_size(bdev));
1198 
1199 		dump_stack();
1200 		return NULL;
1201 	}
1202 
1203 	for (;;) {
1204 		struct buffer_head * bh;
1205 
1206 		bh = __find_get_block(bdev, block, size);
1207 		if (bh)
1208 			return bh;
1209 
1210 		if (!grow_buffers(bdev, block, size))
1211 			free_more_memory();
1212 	}
1213 }
1214 
1215 /*
1216  * The relationship between dirty buffers and dirty pages:
1217  *
1218  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1219  * the page is tagged dirty in its radix tree.
1220  *
1221  * At all times, the dirtiness of the buffers represents the dirtiness of
1222  * subsections of the page.  If the page has buffers, the page dirty bit is
1223  * merely a hint about the true dirty state.
1224  *
1225  * When a page is set dirty in its entirety, all its buffers are marked dirty
1226  * (if the page has buffers).
1227  *
1228  * When a buffer is marked dirty, its page is dirtied, but the page's other
1229  * buffers are not.
1230  *
1231  * Also.  When blockdev buffers are explicitly read with bread(), they
1232  * individually become uptodate.  But their backing page remains not
1233  * uptodate - even if all of its buffers are uptodate.  A subsequent
1234  * block_read_full_page() against that page will discover all the uptodate
1235  * buffers, will set the page uptodate and will perform no I/O.
1236  */
1237 
1238 /**
1239  * mark_buffer_dirty - mark a buffer_head as needing writeout
1240  * @bh: the buffer_head to mark dirty
1241  *
1242  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1243  * backing page dirty, then tag the page as dirty in its address_space's radix
1244  * tree and then attach the address_space's inode to its superblock's dirty
1245  * inode list.
1246  *
1247  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1248  * mapping->tree_lock and the global inode_lock.
1249  */
1250 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1251 {
1252 	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1253 		__set_page_dirty_nobuffers(bh->b_page);
1254 }
1255 
1256 /*
1257  * Decrement a buffer_head's reference count.  If all buffers against a page
1258  * have zero reference count, are clean and unlocked, and if the page is clean
1259  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1260  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1261  * a page but it ends up not being freed, and buffers may later be reattached).
1262  */
1263 void __brelse(struct buffer_head * buf)
1264 {
1265 	if (atomic_read(&buf->b_count)) {
1266 		put_bh(buf);
1267 		return;
1268 	}
1269 	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1270 	WARN_ON(1);
1271 }
1272 
1273 /*
1274  * bforget() is like brelse(), except it discards any
1275  * potentially dirty data.
1276  */
1277 void __bforget(struct buffer_head *bh)
1278 {
1279 	clear_buffer_dirty(bh);
1280 	if (!list_empty(&bh->b_assoc_buffers)) {
1281 		struct address_space *buffer_mapping = bh->b_page->mapping;
1282 
1283 		spin_lock(&buffer_mapping->private_lock);
1284 		list_del_init(&bh->b_assoc_buffers);
1285 		spin_unlock(&buffer_mapping->private_lock);
1286 	}
1287 	__brelse(bh);
1288 }
1289 
1290 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1291 {
1292 	lock_buffer(bh);
1293 	if (buffer_uptodate(bh)) {
1294 		unlock_buffer(bh);
1295 		return bh;
1296 	} else {
1297 		get_bh(bh);
1298 		bh->b_end_io = end_buffer_read_sync;
1299 		submit_bh(READ, bh);
1300 		wait_on_buffer(bh);
1301 		if (buffer_uptodate(bh))
1302 			return bh;
1303 	}
1304 	brelse(bh);
1305 	return NULL;
1306 }
1307 
1308 /*
1309  * Per-cpu buffer LRU implementation, to reduce the cost of __find_get_block().
1310  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1311  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1312  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1313  * CPU's LRUs at the same time.
1314  *
1315  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1316  * sb_find_get_block().
1317  *
1318  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1319  * a local interrupt disable for that.
1320  */
1321 
1322 #define BH_LRU_SIZE	8
1323 
1324 struct bh_lru {
1325 	struct buffer_head *bhs[BH_LRU_SIZE];
1326 };
1327 
1328 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1329 
1330 #ifdef CONFIG_SMP
1331 #define bh_lru_lock()	local_irq_disable()
1332 #define bh_lru_unlock()	local_irq_enable()
1333 #else
1334 #define bh_lru_lock()	preempt_disable()
1335 #define bh_lru_unlock()	preempt_enable()
1336 #endif
1337 
1338 static inline void check_irqs_on(void)
1339 {
1340 #ifdef irqs_disabled
1341 	BUG_ON(irqs_disabled());
1342 #endif
1343 }
1344 
1345 /*
1346  * The LRU management algorithm is dopey-but-simple.  Sorry.
1347  */
1348 static void bh_lru_install(struct buffer_head *bh)
1349 {
1350 	struct buffer_head *evictee = NULL;
1351 	struct bh_lru *lru;
1352 
1353 	check_irqs_on();
1354 	bh_lru_lock();
1355 	lru = &__get_cpu_var(bh_lrus);
1356 	if (lru->bhs[0] != bh) {
1357 		struct buffer_head *bhs[BH_LRU_SIZE];
1358 		int in;
1359 		int out = 0;
1360 
1361 		get_bh(bh);
1362 		bhs[out++] = bh;
1363 		for (in = 0; in < BH_LRU_SIZE; in++) {
1364 			struct buffer_head *bh2 = lru->bhs[in];
1365 
1366 			if (bh2 == bh) {
1367 				__brelse(bh2);
1368 			} else {
1369 				if (out >= BH_LRU_SIZE) {
1370 					BUG_ON(evictee != NULL);
1371 					evictee = bh2;
1372 				} else {
1373 					bhs[out++] = bh2;
1374 				}
1375 			}
1376 		}
1377 		while (out < BH_LRU_SIZE)
1378 			bhs[out++] = NULL;
1379 		memcpy(lru->bhs, bhs, sizeof(bhs));
1380 	}
1381 	bh_lru_unlock();
1382 
1383 	if (evictee)
1384 		__brelse(evictee);
1385 }
1386 
1387 /*
1388  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1389  */
1390 static inline struct buffer_head *
1391 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1392 {
1393 	struct buffer_head *ret = NULL;
1394 	struct bh_lru *lru;
1395 	int i;
1396 
1397 	check_irqs_on();
1398 	bh_lru_lock();
1399 	lru = &__get_cpu_var(bh_lrus);
1400 	for (i = 0; i < BH_LRU_SIZE; i++) {
1401 		struct buffer_head *bh = lru->bhs[i];
1402 
1403 		if (bh && bh->b_bdev == bdev &&
1404 				bh->b_blocknr == block && bh->b_size == size) {
1405 			if (i) {
1406 				while (i) {
1407 					lru->bhs[i] = lru->bhs[i - 1];
1408 					i--;
1409 				}
1410 				lru->bhs[0] = bh;
1411 			}
1412 			get_bh(bh);
1413 			ret = bh;
1414 			break;
1415 		}
1416 	}
1417 	bh_lru_unlock();
1418 	return ret;
1419 }
1420 
1421 /*
1422  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1423  * it in the LRU and mark it as accessed.  If it is not present then return
1424  * NULL
1425  */
1426 struct buffer_head *
1427 __find_get_block(struct block_device *bdev, sector_t block, int size)
1428 {
1429 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1430 
1431 	if (bh == NULL) {
1432 		bh = __find_get_block_slow(bdev, block, size);
1433 		if (bh)
1434 			bh_lru_install(bh);
1435 	}
1436 	if (bh)
1437 		touch_buffer(bh);
1438 	return bh;
1439 }
1440 EXPORT_SYMBOL(__find_get_block);
1441 
1442 /*
1443  * __getblk will locate (and, if necessary, create) the buffer_head
1444  * which corresponds to the passed block_device, block and size. The
1445  * returned buffer has its reference count incremented.
1446  *
1447  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1448  * illegal block number, __getblk() will happily return a buffer_head
1449  * which represents the non-existent block.  Very weird.
1450  *
1451  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1452  * attempt is failing.  FIXME, perhaps?
1453  */
1454 struct buffer_head *
1455 __getblk(struct block_device *bdev, sector_t block, int size)
1456 {
1457 	struct buffer_head *bh = __find_get_block(bdev, block, size);
1458 
1459 	might_sleep();
1460 	if (bh == NULL)
1461 		bh = __getblk_slow(bdev, block, size);
1462 	return bh;
1463 }
1464 EXPORT_SYMBOL(__getblk);
1465 
1466 /*
1467  * Do async read-ahead on a buffer..
1468  */
1469 void __breadahead(struct block_device *bdev, sector_t block, int size)
1470 {
1471 	struct buffer_head *bh = __getblk(bdev, block, size);
1472 	ll_rw_block(READA, 1, &bh);
1473 	brelse(bh);
1474 }
1475 EXPORT_SYMBOL(__breadahead);
1476 
1477 /**
1478  *  __bread() - reads a specified block and returns the bh
1479  *  @bdev: the block_device to read from
1480  *  @block: number of block
1481  *  @size: size (in bytes) to read
1482  *
1483  *  Reads a specified block, and returns buffer head that contains it.
1484  *  It returns NULL if the block was unreadable.
1485  */
1486 struct buffer_head *
1487 __bread(struct block_device *bdev, sector_t block, int size)
1488 {
1489 	struct buffer_head *bh = __getblk(bdev, block, size);
1490 
1491 	if (!buffer_uptodate(bh))
1492 		bh = __bread_slow(bh);
1493 	return bh;
1494 }
1495 EXPORT_SYMBOL(__bread);
1496 
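/*
 * Example (illustrative, not part of the original listing): a typical
 * read-modify-write of a single metadata block through the buffer cache.
 * The block number and the modification are hypothetical; writeback of the
 * dirtied buffer is left to the normal dirty-page machinery described
 * earlier in this file.
 */
static int example_update_block(struct block_device *bdev,
				sector_t blocknr, int blocksize)
{
	struct buffer_head *bh;

	bh = __bread(bdev, blocknr, blocksize);	/* read (or find) the block */
	if (!bh)
		return -EIO;
	memset(bh->b_data, 0, bh->b_size);	/* hypothetical modification */
	mark_buffer_dirty(bh);			/* buffer, page and inode are now dirty */
	brelse(bh);				/* drop our reference */
	return 0;
}
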
1497 /*
1498  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1499  * This doesn't race because it runs in each cpu either in irq
1500  * or with preempt disabled.
1501  */
1502 static void invalidate_bh_lru(void *arg)
1503 {
1504 	struct bh_lru *b = &get_cpu_var(bh_lrus);
1505 	int i;
1506 
1507 	for (i = 0; i < BH_LRU_SIZE; i++) {
1508 		brelse(b->bhs[i]);
1509 		b->bhs[i] = NULL;
1510 	}
1511 	put_cpu_var(bh_lrus);
1512 }
1513 
1514 static void invalidate_bh_lrus(void)
1515 {
1516 	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1517 }
1518 
1519 void set_bh_page(struct buffer_head *bh,
1520 		struct page *page, unsigned long offset)
1521 {
1522 	bh->b_page = page;
1523 	if (offset >= PAGE_SIZE)
1524 		BUG();
1525 	if (PageHighMem(page))
1526 		/*
1527 		 * This catches illegal uses and preserves the offset:
1528 		 */
1529 		bh->b_data = (char *)(0 + offset);
1530 	else
1531 		bh->b_data = page_address(page) + offset;
1532 }
1533 EXPORT_SYMBOL(set_bh_page);
1534 
1535 /*
1536  * Called when truncating a buffer on a page completely.
1537  */
1538 static inline void discard_buffer(struct buffer_head * bh)
1539 {
1540 	lock_buffer(bh);
1541 	clear_buffer_dirty(bh);
1542 	bh->b_bdev = NULL;
1543 	clear_buffer_mapped(bh);
1544 	clear_buffer_req(bh);
1545 	clear_buffer_new(bh);
1546 	clear_buffer_delay(bh);
1547 	unlock_buffer(bh);
1548 }
1549 
1550 /**
1551  * try_to_release_page() - release old fs-specific metadata on a page
1552  *
1553  * @page: the page which the kernel is trying to free
1554  * @gfp_mask: memory allocation flags (and I/O mode)
1555  *
1556  * The address_space is to try to release any data against the page
1557  * (presumably at page->private).  If the release was successful, return `1'.
1558  * Otherwise return zero.
1559  *
1560  * The @gfp_mask argument specifies whether I/O may be performed to release
1561  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1562  *
1563  * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1564  */
1565 int try_to_release_page(struct page *page, int gfp_mask)
1566 {
1567 	struct address_space * const mapping = page->mapping;
1568 
1569 	BUG_ON(!PageLocked(page));
1570 	if (PageWriteback(page))
1571 		return 0;
1572 
1573 	if (mapping && mapping->a_ops->releasepage)
1574 		return mapping->a_ops->releasepage(page, gfp_mask);
1575 	return try_to_free_buffers(page);
1576 }
1577 EXPORT_SYMBOL(try_to_release_page);
1578 
1579 /**
1580  * block_invalidatepage - invalidate part or all of a buffer-backed page
1581  *
1582  * @page: the page which is affected
1583  * @offset: the index of the truncation point
1584  *
1585  * block_invalidatepage() is called when all or part of the page has become
1586  * invalidated by a truncate operation.
1587  *
1588  * block_invalidatepage() does not have to release all buffers, but it must
1589  * ensure that no dirty buffer is left outside @offset and that no I/O
1590  * is underway against any of the blocks which are outside the truncation
1591  * point.  Because the caller is about to free (and possibly reuse) those
1592  * blocks on-disk.
1593  */
1594 int block_invalidatepage(struct page *page, unsigned long offset)
1595 {
1596 	struct buffer_head *head, *bh, *next;
1597 	unsigned int curr_off = 0;
1598 	int ret = 1;
1599 
1600 	BUG_ON(!PageLocked(page));
1601 	if (!page_has_buffers(page))
1602 		goto out;
1603 
1604 	head = page_buffers(page);
1605 	bh = head;
1606 	do {
1607 		unsigned int next_off = curr_off + bh->b_size;
1608 		next = bh->b_this_page;
1609 
1610 		/*
1611 		 * is this block fully invalidated?
1612 		 */
1613 		if (offset <= curr_off)
1614 			discard_buffer(bh);
1615 		curr_off = next_off;
1616 		bh = next;
1617 	} while (bh != head);
1618 
1619 	/*
1620 	 * We release buffers only if the entire page is being invalidated.
1621 	 * The get_block cached value has been unconditionally invalidated,
1622 	 * so real IO is not possible anymore.
1623 	 */
1624 	if (offset == 0)
1625 		ret = try_to_release_page(page, 0);
1626 out:
1627 	return ret;
1628 }
1629 EXPORT_SYMBOL(block_invalidatepage);
1630 
1631 /*
1632  * We attach and possibly dirty the buffers atomically wrt
1633  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1634  * is already excluded via the page lock.
1635  */
1636 void create_empty_buffers(struct page *page,
1637 			unsigned long blocksize, unsigned long b_state)
1638 {
1639 	struct buffer_head *bh, *head, *tail;
1640 
1641 	head = alloc_page_buffers(page, blocksize, 1);
1642 	bh = head;
1643 	do {
1644 		bh->b_state |= b_state;
1645 		tail = bh;
1646 		bh = bh->b_this_page;
1647 	} while (bh);
1648 	tail->b_this_page = head;
1649 
1650 	spin_lock(&page->mapping->private_lock);
1651 	if (PageUptodate(page) || PageDirty(page)) {
1652 		bh = head;
1653 		do {
1654 			if (PageDirty(page))
1655 				set_buffer_dirty(bh);
1656 			if (PageUptodate(page))
1657 				set_buffer_uptodate(bh);
1658 			bh = bh->b_this_page;
1659 		} while (bh != head);
1660 	}
1661 	attach_page_buffers(page, head);
1662 	spin_unlock(&page->mapping->private_lock);
1663 }
1664 EXPORT_SYMBOL(create_empty_buffers);
1665 
1666 /*
1667  * We are taking a block for data and we don't want any output from any
1668  * buffer-cache aliases starting from return from that function and
1669  * until the moment when something will explicitly mark the buffer
1670  * dirty (hopefully that will not happen until we free that block ;-)
1671  * We don't even need to mark it not-uptodate - nobody can expect
1672  * anything from a newly allocated buffer anyway. We used to use
1673  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1674  * don't want to mark the alias unmapped, for example - it would confuse
1675  * anyone who might pick it with bread() afterwards...
1676  *
1677  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1678  * be writeout I/O going on against recently-freed buffers.  We don't
1679  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1680  * only if we really need to.  That happens here.
1681  */
1682 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1683 {
1684 	struct buffer_head *old_bh;
1685 
1686 	might_sleep();
1687 
1688 	old_bh = __find_get_block_slow(bdev, block, 0);
1689 	if (old_bh) {
1690 		clear_buffer_dirty(old_bh);
1691 		wait_on_buffer(old_bh);
1692 		clear_buffer_req(old_bh);
1693 		__brelse(old_bh);
1694 	}
1695 }
1696 EXPORT_SYMBOL(unmap_underlying_metadata);
1697 
1698 /*
1699  * NOTE! All mapped/uptodate combinations are valid:
1700  *
1701  *	Mapped	Uptodate	Meaning
1702  *
1703  *	No	No		"unknown" - must do get_block()
1704  *	No	Yes		"hole" - zero-filled
1705  *	Yes	No		"allocated" - allocated on disk, not read in
1706  *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1707  *
1708  * "Dirty" is valid only with the last case (mapped+uptodate).
1709  */
1710 
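/*
 * Example (illustrative, not part of the original listing): a minimal
 * get_block_t callback of the kind the generic writers below expect.
 * example_lookup_block() and example_allocate_block() are hypothetical; the
 * point is that a freshly allocated block is reported with set_buffer_new(),
 * so that the generic code can call unmap_underlying_metadata() on any
 * stale alias of it.
 */
static sector_t example_lookup_block(struct inode *inode, sector_t iblock);	/* hypothetical */
static sector_t example_allocate_block(struct inode *inode);			/* hypothetical */

static int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create)
{
	sector_t phys;

	phys = example_lookup_block(inode, iblock);	/* map file block -> disk block */
	if (!phys) {
		if (!create)
			return 0;			/* a hole: leave the buffer unmapped */
		phys = example_allocate_block(inode);	/* allocate a new disk block */
		if (!phys)
			return -ENOSPC;
		set_buffer_new(bh_result);		/* freshly allocated on disk */
	}
	map_bh(bh_result, inode->i_sb, phys);		/* sets b_bdev, b_blocknr, BH_Mapped */
	return 0;
}
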
1711 /*
1712  * While block_write_full_page is writing back the dirty buffers under
1713  * the page lock, whoever dirtied the buffers may decide to clean them
1714  * again at any time.  We handle that by only looking at the buffer
1715  * state inside lock_buffer().
1716  *
1717  * If block_write_full_page() is called for regular writeback
1718  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1719  * locked buffer.   This only can happen if someone has written the buffer
1720  * directly, with submit_bh().  At the address_space level PageWriteback
1721  * prevents this contention from occurring.
1722  */
1723 static int __block_write_full_page(struct inode *inode, struct page *page,
1724 			get_block_t *get_block, struct writeback_control *wbc)
1725 {
1726 	int err;
1727 	sector_t block;
1728 	sector_t last_block;
1729 	struct buffer_head *bh, *head;
1730 	int nr_underway = 0;
1731 
1732 	BUG_ON(!PageLocked(page));
1733 
1734 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1735 
1736 	if (!page_has_buffers(page)) {
1737 		create_empty_buffers(page, 1 << inode->i_blkbits,
1738 					(1 << BH_Dirty)|(1 << BH_Uptodate));
1739 	}
1740 
1741 	/*
1742 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1743 	 * here, and the (potentially unmapped) buffers may become dirty at
1744 	 * any time.  If a buffer becomes dirty here after we've inspected it
1745 	 * then we just miss that fact, and the page stays dirty.
1746 	 *
1747 	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1748 	 * handle that here by just cleaning them.
1749 	 */
1750 
1751 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1752 	head = page_buffers(page);
1753 	bh = head;
1754 
1755 	/*
1756 	 * Get all the dirty buffers mapped to disk addresses and
1757 	 * handle any aliases from the underlying blockdev's mapping.
1758 	 */
1759 	do {
1760 		if (block > last_block) {
1761 			/*
1762 			 * mapped buffers outside i_size will occur, because
1763 			 * this page can be outside i_size when there is a
1764 			 * truncate in progress.
1765 			 */
1766 			/*
1767 			 * The buffer was zeroed by block_write_full_page()
1768 			 */
1769 			clear_buffer_dirty(bh);
1770 			set_buffer_uptodate(bh);
1771 		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1772 			err = get_block(inode, block, bh, 1);
1773 			if (err)
1774 				goto recover;
1775 			if (buffer_new(bh)) {
1776 				/* blockdev mappings never come here */
1777 				clear_buffer_new(bh);
1778 				unmap_underlying_metadata(bh->b_bdev,
1779 							bh->b_blocknr);
1780 			}
1781 		}
1782 		bh = bh->b_this_page;
1783 		block++;
1784 	} while (bh != head);
1785 
1786 	do {
1787 		if (!buffer_mapped(bh))
1788 			continue;
1789 		/*
1790 		 * If it's a fully non-blocking write attempt and we cannot
1791 		 * lock the buffer then redirty the page.  Note that this can
1792 		 * potentially cause a busy-wait loop from pdflush and kswapd
1793 		 * activity, but those code paths have their own higher-level
1794 		 * throttling.
1795 		 */
1796 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1797 			lock_buffer(bh);
1798 		} else if (test_set_buffer_locked(bh)) {
1799 			redirty_page_for_writepage(wbc, page);
1800 			continue;
1801 		}
1802 		if (test_clear_buffer_dirty(bh)) {
1803 			mark_buffer_async_write(bh);
1804 		} else {
1805 			unlock_buffer(bh);
1806 		}
1807 	} while ((bh = bh->b_this_page) != head);
1808 
1809 	/*
1810 	 * The page and its buffers are protected by PageWriteback(), so we can
1811 	 * drop the bh refcounts early.
1812 	 */
1813 	BUG_ON(PageWriteback(page));
1814 	set_page_writeback(page);
1815 
1816 	do {
1817 		struct buffer_head *next = bh->b_this_page;
1818 		if (buffer_async_write(bh)) {
1819 			submit_bh(WRITE, bh);
1820 			nr_underway++;
1821 		}
1822 		bh = next;
1823 	} while (bh != head);
1824 	unlock_page(page);
1825 
1826 	err = 0;
1827 done:
1828 	if (nr_underway == 0) {
1829 		/*
1830 		 * The page was marked dirty, but the buffers were
1831 		 * clean.  Someone wrote them back by hand with
1832 		 * ll_rw_block/submit_bh.  A rare case.
1833 		 */
1834 		int uptodate = 1;
1835 		do {
1836 			if (!buffer_uptodate(bh)) {
1837 				uptodate = 0;
1838 				break;
1839 			}
1840 			bh = bh->b_this_page;
1841 		} while (bh != head);
1842 		if (uptodate)
1843 			SetPageUptodate(page);
1844 		end_page_writeback(page);
1845 		/*
1846 		 * The page and buffer_heads can be released at any time from
1847 		 * here on.
1848 		 */
1849 		wbc->pages_skipped++;	/* We didn't write this page */
1850 	}
1851 	return err;
1852 
1853 recover:
1854 	/*
1855 	 * ENOSPC, or some other error.  We may already have added some
1856 	 * blocks to the file, so we need to write these out to avoid
1857 	 * exposing stale data.
1858 	 * The page is currently locked and not marked for writeback
1859 	 */
1860 	bh = head;
1861 	/* Recovery: lock and submit the mapped buffers */
1862 	do {
1863 		if (buffer_mapped(bh) && buffer_dirty(bh)) {
1864 			lock_buffer(bh);
1865 			mark_buffer_async_write(bh);
1866 		} else {
1867 			/*
1868 			 * The buffer may have been set dirty during
1869 			 * attachment to a dirty page.
1870 			 */
1871 			clear_buffer_dirty(bh);
1872 		}
1873 	} while ((bh = bh->b_this_page) != head);
1874 	SetPageError(page);
1875 	BUG_ON(PageWriteback(page));
1876 	set_page_writeback(page);
1877 	unlock_page(page);
1878 	do {
1879 		struct buffer_head *next = bh->b_this_page;
1880 		if (buffer_async_write(bh)) {
1881 			clear_buffer_dirty(bh);
1882 			submit_bh(WRITE, bh);
1883 			nr_underway++;
1884 		}
1885 		bh = next;
1886 	} while (bh != head);
1887 	goto done;
1888 }
1889 
1890 static int __block_prepare_write(struct inode *inode, struct page *page,
1891 		unsigned from, unsigned to, get_block_t *get_block)
1892 {
1893 	unsigned block_start, block_end;
1894 	sector_t block;
1895 	int err = 0;
1896 	unsigned blocksize, bbits;
1897 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1898 
1899 	BUG_ON(!PageLocked(page));
1900 	BUG_ON(from > PAGE_CACHE_SIZE);
1901 	BUG_ON(to > PAGE_CACHE_SIZE);
1902 	BUG_ON(from > to);
1903 
1904 	blocksize = 1 << inode->i_blkbits;
1905 	if (!page_has_buffers(page))
1906 		create_empty_buffers(page, blocksize, 0);
1907 	head = page_buffers(page);
1908 
1909 	bbits = inode->i_blkbits;
1910 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1911 
1912 	for(bh = head, block_start = 0; bh != head || !block_start;
1913 	    block++, block_start=block_end, bh = bh->b_this_page) {
1914 		block_end = block_start + blocksize;
1915 		if (block_end <= from || block_start >= to) {
1916 			if (PageUptodate(page)) {
1917 				if (!buffer_uptodate(bh))
1918 					set_buffer_uptodate(bh);
1919 			}
1920 			continue;
1921 		}
1922 		if (buffer_new(bh))
1923 			clear_buffer_new(bh);
1924 		if (!buffer_mapped(bh)) {
1925 			err = get_block(inode, block, bh, 1);
1926 			if (err)
1927 				break;
1928 			if (buffer_new(bh)) {
1929 				unmap_underlying_metadata(bh->b_bdev,
1930 							bh->b_blocknr);
1931 				if (PageUptodate(page)) {
1932 					set_buffer_uptodate(bh);
1933 					continue;
1934 				}
1935 				if (block_end > to || block_start < from) {
1936 					void *kaddr;
1937 
1938 					kaddr = kmap_atomic(page, KM_USER0);
1939 					if (block_end > to)
1940 						memset(kaddr+to, 0,
1941 							block_end-to);
1942 					if (block_start < from)
1943 						memset(kaddr+block_start,
1944 							0, from-block_start);
1945 					flush_dcache_page(page);
1946 					kunmap_atomic(kaddr, KM_USER0);
1947 				}
1948 				continue;
1949 			}
1950 		}
1951 		if (PageUptodate(page)) {
1952 			if (!buffer_uptodate(bh))
1953 				set_buffer_uptodate(bh);
1954 			continue;
1955 		}
1956 		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1957 		     (block_start < from || block_end > to)) {
1958 			ll_rw_block(READ, 1, &bh);
1959 			*wait_bh++=bh;
1960 		}
1961 	}
1962 	/*
1963 	 * If we issued read requests - let them complete.
1964 	 */
1965 	while(wait_bh > wait) {
1966 		wait_on_buffer(*--wait_bh);
1967 		if (!buffer_uptodate(*wait_bh))
1968 			err = -EIO;
1969 	}
1970 	if (!err) {
1971 		bh = head;
1972 		do {
1973 			if (buffer_new(bh))
1974 				clear_buffer_new(bh);
1975 		} while ((bh = bh->b_this_page) != head);
1976 		return 0;
1977 	}
1978 	/* Error case: */
1979 	/*
1980 	 * Zero out any newly allocated blocks to avoid exposing stale
1981 	 * data.  If BH_New is set, we know that the block was newly
1982 	 * allocated in the above loop.
1983 	 */
1984 	bh = head;
1985 	block_start = 0;
1986 	do {
1987 		block_end = block_start+blocksize;
1988 		if (block_end <= from)
1989 			goto next_bh;
1990 		if (block_start >= to)
1991 			break;
1992 		if (buffer_new(bh)) {
1993 			void *kaddr;
1994 
1995 			clear_buffer_new(bh);
1996 			kaddr = kmap_atomic(page, KM_USER0);
1997 			memset(kaddr+block_start, 0, bh->b_size);
1998 			kunmap_atomic(kaddr, KM_USER0);
1999 			set_buffer_uptodate(bh);
2000 			mark_buffer_dirty(bh);
2001 		}
2002 next_bh:
2003 		block_start = block_end;
2004 		bh = bh->b_this_page;
2005 	} while (bh != head);
2006 	return err;
2007 }
2008 
2009 static int __block_commit_write(struct inode *inode, struct page *page,
2010 		unsigned from, unsigned to)
2011 {
2012 	unsigned block_start, block_end;
2013 	int partial = 0;
2014 	unsigned blocksize;
2015 	struct buffer_head *bh, *head;
2016 
2017 	blocksize = 1 << inode->i_blkbits;
2018 
2019 	for(bh = head = page_buffers(page), block_start = 0;
2020 	    bh != head || !block_start;
2021 	    block_start=block_end, bh = bh->b_this_page) {
2022 		block_end = block_start + blocksize;
2023 		if (block_end <= from || block_start >= to) {
2024 			if (!buffer_uptodate(bh))
2025 				partial = 1;
2026 		} else {
2027 			set_buffer_uptodate(bh);
2028 			mark_buffer_dirty(bh);
2029 		}
2030 	}
2031 
2032 	/*
2033 	 * If this is a partial write which happened to make all buffers
2034 	 * uptodate then we can optimize away a bogus readpage() for
2035 	 * the next read(). Here we 'discover' whether the page went
2036 	 * uptodate as a result of this (potentially partial) write.
2037 	 */
2038 	if (!partial)
2039 		SetPageUptodate(page);
2040 	return 0;
2041 }
2042 
2043 /*
2044  * Generic "read page" function for block devices that have the normal
2045  * get_block functionality. This is most of the block device filesystems.
2046  * get_block functionality.  This covers most block device filesystems.
2047  * set/clear_buffer_uptodate() functions propagate buffer state into the
2048  * page struct once IO has completed.
2049  */
2050 int block_read_full_page(struct page *page, get_block_t *get_block)
2051 {
2052 	struct inode *inode = page->mapping->host;
2053 	sector_t iblock, lblock;
2054 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2055 	unsigned int blocksize;
2056 	int nr, i;
2057 	int fully_mapped = 1;
2058 
2059 	BUG_ON(!PageLocked(page));
2060 	blocksize = 1 << inode->i_blkbits;
2061 	if (!page_has_buffers(page))
2062 		create_empty_buffers(page, blocksize, 0);
2063 	head = page_buffers(page);
2064 
2065 	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2066 	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2067 	bh = head;
2068 	nr = 0;
2069 	i = 0;
2070 
2071 	do {
2072 		if (buffer_uptodate(bh))
2073 			continue;
2074 
2075 		if (!buffer_mapped(bh)) {
2076 			int err = 0;
2077 
2078 			fully_mapped = 0;
2079 			if (iblock < lblock) {
2080 				err = get_block(inode, iblock, bh, 0);
2081 				if (err)
2082 					SetPageError(page);
2083 			}
2084 			if (!buffer_mapped(bh)) {
2085 				void *kaddr = kmap_atomic(page, KM_USER0);
2086 				memset(kaddr + i * blocksize, 0, blocksize);
2087 				flush_dcache_page(page);
2088 				kunmap_atomic(kaddr, KM_USER0);
2089 				if (!err)
2090 					set_buffer_uptodate(bh);
2091 				continue;
2092 			}
2093 			/*
2094 			 * get_block() might have updated the buffer
2095 			 * synchronously
2096 			 */
2097 			if (buffer_uptodate(bh))
2098 				continue;
2099 		}
2100 		arr[nr++] = bh;
2101 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2102 
2103 	if (fully_mapped)
2104 		SetPageMappedToDisk(page);
2105 
2106 	if (!nr) {
2107 		/*
2108 		 * All buffers are uptodate - we can set the page uptodate
2109 		 * as well. But not if get_block() returned an error.
2110 		 */
2111 		if (!PageError(page))
2112 			SetPageUptodate(page);
2113 		unlock_page(page);
2114 		return 0;
2115 	}
2116 
2117 	/* Stage two: lock the buffers */
2118 	for (i = 0; i < nr; i++) {
2119 		bh = arr[i];
2120 		lock_buffer(bh);
2121 		mark_buffer_async_read(bh);
2122 	}
2123 
2124 	/*
2125 	 * Stage 3: start the IO.  Check for uptodateness
2126 	 * inside the buffer lock in case another process reading
2127 	 * the underlying blockdev brought it uptodate (the sct fix).
2128 	 */
2129 	for (i = 0; i < nr; i++) {
2130 		bh = arr[i];
2131 		if (buffer_uptodate(bh))
2132 			end_buffer_async_read(bh, 1);
2133 		else
2134 			submit_bh(READ, bh);
2135 	}
2136 	return 0;
2137 }
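
/*
 * Example: a filesystem with a conventional get_block can usually use
 * block_read_full_page() directly for its ->readpage().  A minimal sketch,
 * assuming a hypothetical myfs_get_block():
 */
#if 0
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}
#endif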
2138 
2139 /* utility function for filesystems that need to do work on expanding
2140  * truncates.  Uses prepare/commit_write to allow the filesystem to
2141  * deal with the hole.
2142  */
2143 int generic_cont_expand(struct inode *inode, loff_t size)
2144 {
2145 	struct address_space *mapping = inode->i_mapping;
2146 	struct page *page;
2147 	unsigned long index, offset, limit;
2148 	int err;
2149 
2150 	err = -EFBIG;
2151 	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2152 	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2153 		send_sig(SIGXFSZ, current, 0);
2154 		goto out;
2155 	}
2156 	if (size > inode->i_sb->s_maxbytes)
2157 		goto out;
2158 
2159 	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
2160 
2161 	/*
2162 	 * Ugh.  In prepare/commit_write, if from==to==start of block, we skip
2163 	 * the prepare.  Make sure we never send an offset for the start of a block.
2164 	 */
2165 	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2166 		offset++;
2167 	}
2168 	index = size >> PAGE_CACHE_SHIFT;
2169 	err = -ENOMEM;
2170 	page = grab_cache_page(mapping, index);
2171 	if (!page)
2172 		goto out;
2173 	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2174 	if (!err) {
2175 		err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2176 	}
2177 	unlock_page(page);
2178 	page_cache_release(page);
2179 	if (err > 0)
2180 		err = 0;
2181 out:
2182 	return err;
2183 }
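
/*
 * Example: generic_cont_expand() is typically called from a filesystem's
 * ->setattr() when a truncate grows the file.  A minimal sketch; the name
 * myfs_setattr() is hypothetical:
 */
#if 0
static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int err = 0;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
		err = generic_cont_expand(inode, attr->ia_size);
	if (!err)
		err = inode_setattr(inode, attr);
	return err;
}
#endif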
2184 
2185 /*
2186  * For moronic filesystems that do not allow holes in files.
2187  * We may have to extend the file.
2188  */
2189 
2190 int cont_prepare_write(struct page *page, unsigned offset,
2191 		unsigned to, get_block_t *get_block, loff_t *bytes)
2192 {
2193 	struct address_space *mapping = page->mapping;
2194 	struct inode *inode = mapping->host;
2195 	struct page *new_page;
2196 	pgoff_t pgpos;
2197 	long status;
2198 	unsigned zerofrom;
2199 	unsigned blocksize = 1 << inode->i_blkbits;
2200 	void *kaddr;
2201 
2202 	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2203 		status = -ENOMEM;
2204 		new_page = grab_cache_page(mapping, pgpos);
2205 		if (!new_page)
2206 			goto out;
2207 		/* we might sleep */
2208 		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2209 			unlock_page(new_page);
2210 			page_cache_release(new_page);
2211 			continue;
2212 		}
2213 		zerofrom = *bytes & ~PAGE_CACHE_MASK;
2214 		if (zerofrom & (blocksize-1)) {
2215 			*bytes |= (blocksize-1);
2216 			(*bytes)++;
2217 		}
2218 		status = __block_prepare_write(inode, new_page, zerofrom,
2219 						PAGE_CACHE_SIZE, get_block);
2220 		if (status)
2221 			goto out_unmap;
2222 		kaddr = kmap_atomic(new_page, KM_USER0);
2223 		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2224 		flush_dcache_page(new_page);
2225 		kunmap_atomic(kaddr, KM_USER0);
2226 		generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2227 		unlock_page(new_page);
2228 		page_cache_release(new_page);
2229 	}
2230 
2231 	if (page->index < pgpos) {
2232 		/* completely inside the area */
2233 		zerofrom = offset;
2234 	} else {
2235 		/* page covers the boundary, find the boundary offset */
2236 		zerofrom = *bytes & ~PAGE_CACHE_MASK;
2237 
2238 		/* if we will expand the thing last block will be filled */
2239 		if (to > zerofrom && (zerofrom & (blocksize-1))) {
2240 			*bytes |= (blocksize-1);
2241 			(*bytes)++;
2242 		}
2243 
2244 		/* starting below the boundary? Nothing to zero out */
2245 		if (offset <= zerofrom)
2246 			zerofrom = offset;
2247 	}
2248 	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2249 	if (status)
2250 		goto out1;
2251 	if (zerofrom < offset) {
2252 		kaddr = kmap_atomic(page, KM_USER0);
2253 		memset(kaddr+zerofrom, 0, offset-zerofrom);
2254 		flush_dcache_page(page);
2255 		kunmap_atomic(kaddr, KM_USER0);
2256 		__block_commit_write(inode, page, zerofrom, offset);
2257 	}
2258 	return 0;
2259 out1:
2260 	ClearPageUptodate(page);
2261 	return status;
2262 
2263 out_unmap:
2264 	ClearPageUptodate(new_page);
2265 	unlock_page(new_page);
2266 	page_cache_release(new_page);
2267 out:
2268 	return status;
2269 }
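
/*
 * Example: such a filesystem wraps cont_prepare_write() in its
 * ->prepare_write(), keeping the "zeroed so far" watermark in per-inode
 * state.  A minimal sketch; myfs_get_block(), MYFS_I() and the mmu_private
 * field are hypothetical:
 */
#if 0
static int myfs_prepare_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	return cont_prepare_write(page, from, to, myfs_get_block,
				&MYFS_I(page->mapping->host)->mmu_private);
}
#endif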
2270 
2271 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2272 			get_block_t *get_block)
2273 {
2274 	struct inode *inode = page->mapping->host;
2275 	int err = __block_prepare_write(inode, page, from, to, get_block);
2276 	if (err)
2277 		ClearPageUptodate(page);
2278 	return err;
2279 }
2280 
2281 int block_commit_write(struct page *page, unsigned from, unsigned to)
2282 {
2283 	struct inode *inode = page->mapping->host;
2284 	__block_commit_write(inode,page,from,to);
2285 	return 0;
2286 }
2287 
2288 int generic_commit_write(struct file *file, struct page *page,
2289 		unsigned from, unsigned to)
2290 {
2291 	struct inode *inode = page->mapping->host;
2292 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2293 	__block_commit_write(inode,page,from,to);
2294 	/*
2295 	 * No need to use i_size_read() here, the i_size
2296 	 * cannot change under us because we hold i_sem.
2297 	 */
2298 	if (pos > inode->i_size) {
2299 		i_size_write(inode, pos);
2300 		mark_inode_dirty(inode);
2301 	}
2302 	return 0;
2303 }
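
/*
 * Example: in the common case the prepare/commit pair is simply
 * block_prepare_write() wrapped with the filesystem's get_block, with
 * generic_commit_write() used as-is for ->commit_write().  A minimal
 * sketch, assuming a hypothetical myfs_get_block():
 */
#if 0
static int myfs_prepare_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, myfs_get_block);
}
#endif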
2304 
2305 
2306 /*
2307  * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2308  * immediately, while under the page lock.  So it needs a special end_io
2309  * handler which does not touch the bh after unlocking it.
2310  *
2311  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2312  * a race there is benign: unlock_buffer() only uses the bh's address for
2313  * hashing after unlocking the buffer, so it doesn't actually touch the bh
2314  * itself.
2315  */
2316 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2317 {
2318 	if (uptodate) {
2319 		set_buffer_uptodate(bh);
2320 	} else {
2321 		/* This happens, due to failed READA attempts. */
2322 		clear_buffer_uptodate(bh);
2323 	}
2324 	unlock_buffer(bh);
2325 }
2326 
2327 /*
2328  * On entry, the page is entirely non-uptodate.
2329  * On exit, the page is fully uptodate in the areas outside (from,to).
2330  */
2331 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2332 			get_block_t *get_block)
2333 {
2334 	struct inode *inode = page->mapping->host;
2335 	const unsigned blkbits = inode->i_blkbits;
2336 	const unsigned blocksize = 1 << blkbits;
2337 	struct buffer_head map_bh;
2338 	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2339 	unsigned block_in_page;
2340 	unsigned block_start;
2341 	sector_t block_in_file;
2342 	char *kaddr;
2343 	int nr_reads = 0;
2344 	int i;
2345 	int ret = 0;
2346 	int is_mapped_to_disk = 1;
2347 	int dirtied_it = 0;
2348 
2349 	if (PageMappedToDisk(page))
2350 		return 0;
2351 
2352 	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2353 	map_bh.b_page = page;
2354 
2355 	/*
2356 	 * We loop across all blocks in the page, whether or not they are
2357 	 * part of the affected region.  This is so we can discover if the
2358 	 * page is fully mapped-to-disk.
2359 	 */
2360 	for (block_start = 0, block_in_page = 0;
2361 		  block_start < PAGE_CACHE_SIZE;
2362 		  block_in_page++, block_start += blocksize) {
2363 		unsigned block_end = block_start + blocksize;
2364 		int create;
2365 
2366 		map_bh.b_state = 0;
2367 		create = 1;
2368 		if (block_start >= to)
2369 			create = 0;
2370 		ret = get_block(inode, block_in_file + block_in_page,
2371 					&map_bh, create);
2372 		if (ret)
2373 			goto failed;
2374 		if (!buffer_mapped(&map_bh))
2375 			is_mapped_to_disk = 0;
2376 		if (buffer_new(&map_bh))
2377 			unmap_underlying_metadata(map_bh.b_bdev,
2378 							map_bh.b_blocknr);
2379 		if (PageUptodate(page))
2380 			continue;
2381 		if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2382 			kaddr = kmap_atomic(page, KM_USER0);
2383 			if (block_start < from) {
2384 				memset(kaddr+block_start, 0, from-block_start);
2385 				dirtied_it = 1;
2386 			}
2387 			if (block_end > to) {
2388 				memset(kaddr + to, 0, block_end - to);
2389 				dirtied_it = 1;
2390 			}
2391 			flush_dcache_page(page);
2392 			kunmap_atomic(kaddr, KM_USER0);
2393 			continue;
2394 		}
2395 		if (buffer_uptodate(&map_bh))
2396 			continue;	/* reiserfs does this */
2397 		if (block_start < from || block_end > to) {
2398 			struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2399 
2400 			if (!bh) {
2401 				ret = -ENOMEM;
2402 				goto failed;
2403 			}
2404 			bh->b_state = map_bh.b_state;
2405 			atomic_set(&bh->b_count, 0);
2406 			bh->b_this_page = NULL;
2407 			bh->b_page = page;
2408 			bh->b_blocknr = map_bh.b_blocknr;
2409 			bh->b_size = blocksize;
2410 			bh->b_data = (char *)(long)block_start;
2411 			bh->b_bdev = map_bh.b_bdev;
2412 			bh->b_private = NULL;
2413 			read_bh[nr_reads++] = bh;
2414 		}
2415 	}
2416 
2417 	if (nr_reads) {
2418 		struct buffer_head *bh;
2419 
2420 		/*
2421 		 * The page is locked, so these buffers are protected from
2422 		 * any VM or truncate activity.  Hence we don't need to care
2423 		 * for the buffer_head refcounts.
2424 		 */
2425 		for (i = 0; i < nr_reads; i++) {
2426 			bh = read_bh[i];
2427 			lock_buffer(bh);
2428 			bh->b_end_io = end_buffer_read_nobh;
2429 			submit_bh(READ, bh);
2430 		}
2431 		for (i = 0; i < nr_reads; i++) {
2432 			bh = read_bh[i];
2433 			wait_on_buffer(bh);
2434 			if (!buffer_uptodate(bh))
2435 				ret = -EIO;
2436 			free_buffer_head(bh);
2437 			read_bh[i] = NULL;
2438 		}
2439 		if (ret)
2440 			goto failed;
2441 	}
2442 
2443 	if (is_mapped_to_disk)
2444 		SetPageMappedToDisk(page);
2445 	SetPageUptodate(page);
2446 
2447 	/*
2448 	 * Setting the page dirty here isn't necessary for the prepare_write
2449 	 * function - commit_write will do that.  But if/when this function is
2450 	 * used within the pagefault handler to ensure that all mmapped pages
2451 	 * have backing space in the filesystem, we will need to dirty the page
2452 	 * if its contents were altered.
2453 	 */
2454 	if (dirtied_it)
2455 		set_page_dirty(page);
2456 
2457 	return 0;
2458 
2459 failed:
2460 	for (i = 0; i < nr_reads; i++) {
2461 		if (read_bh[i])
2462 			free_buffer_head(read_bh[i]);
2463 	}
2464 
2465 	/*
2466 	 * Error recovery is pretty slack.  Clear the page and mark it dirty
2467 	 * so we'll later zero out any blocks which _were_ allocated.
2468 	 */
2469 	kaddr = kmap_atomic(page, KM_USER0);
2470 	memset(kaddr, 0, PAGE_CACHE_SIZE);
2471 	kunmap_atomic(kaddr, KM_USER0);
2472 	SetPageUptodate(page);
2473 	set_page_dirty(page);
2474 	return ret;
2475 }
2476 EXPORT_SYMBOL(nobh_prepare_write);
2477 
2478 int nobh_commit_write(struct file *file, struct page *page,
2479 		unsigned from, unsigned to)
2480 {
2481 	struct inode *inode = page->mapping->host;
2482 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2483 
2484 	set_page_dirty(page);
2485 	if (pos > inode->i_size) {
2486 		i_size_write(inode, pos);
2487 		mark_inode_dirty(inode);
2488 	}
2489 	return 0;
2490 }
2491 EXPORT_SYMBOL(nobh_commit_write);
2492 
2493 /*
2494  * nobh_writepage() - based on block_write_full_page() except
2495  * that it tries to operate without attaching bufferheads to
2496  * the page.
2497  */
2498 int nobh_writepage(struct page *page, get_block_t *get_block,
2499 			struct writeback_control *wbc)
2500 {
2501 	struct inode * const inode = page->mapping->host;
2502 	loff_t i_size = i_size_read(inode);
2503 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2504 	unsigned offset;
2505 	void *kaddr;
2506 	int ret;
2507 
2508 	/* Is the page fully inside i_size? */
2509 	if (page->index < end_index)
2510 		goto out;
2511 
2512 	/* Is the page fully outside i_size? (truncate in progress) */
2513 	offset = i_size & (PAGE_CACHE_SIZE-1);
2514 	if (page->index >= end_index+1 || !offset) {
2515 		/*
2516 		 * The page may have dirty, unmapped buffers.  For example,
2517 		 * they may have been added in ext3_writepage().  Make them
2518 		 * freeable here, so the page does not leak.
2519 		 */
2520 #if 0
2521 		/* Not really sure about this - do we need this? */
2522 		if (page->mapping->a_ops->invalidatepage)
2523 			page->mapping->a_ops->invalidatepage(page, offset);
2524 #endif
2525 		unlock_page(page);
2526 		return 0; /* don't care */
2527 	}
2528 
2529 	/*
2530 	 * The page straddles i_size.  It must be zeroed out on each and every
2531 	 * writepage invocation because it may be mmapped.  "A file is mapped
2532 	 * in multiples of the page size.  For a file that is not a multiple of
2533 	 * the  page size, the remaining memory is zeroed when mapped, and
2534 	 * writes to that region are not written out to the file."
2535 	 */
2536 	kaddr = kmap_atomic(page, KM_USER0);
2537 	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2538 	flush_dcache_page(page);
2539 	kunmap_atomic(kaddr, KM_USER0);
2540 out:
2541 	ret = mpage_writepage(page, get_block, wbc);
2542 	if (ret == -EAGAIN)
2543 		ret = __block_write_full_page(inode, page, get_block, wbc);
2544 	return ret;
2545 }
2546 EXPORT_SYMBOL(nobh_writepage);
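
/*
 * Example: the nobh entry points are used together.  A minimal sketch,
 * assuming a hypothetical myfs_get_block(); ->commit_write() would then
 * point at nobh_commit_write:
 */
#if 0
static int myfs_nobh_prepare_write(struct file *file, struct page *page,
				unsigned from, unsigned to)
{
	return nobh_prepare_write(page, from, to, myfs_get_block);
}

static int myfs_nobh_writepage(struct page *page,
				struct writeback_control *wbc)
{
	return nobh_writepage(page, myfs_get_block, wbc);
}
#endif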
2547 
2548 /*
2549  * This function assumes that ->prepare_write() uses nobh_prepare_write().
2550  */
2551 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2552 {
2553 	struct inode *inode = mapping->host;
2554 	unsigned blocksize = 1 << inode->i_blkbits;
2555 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2556 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2557 	unsigned to;
2558 	struct page *page;
2559 	struct address_space_operations *a_ops = mapping->a_ops;
2560 	char *kaddr;
2561 	int ret = 0;
2562 
2563 	if ((offset & (blocksize - 1)) == 0)
2564 		goto out;
2565 
2566 	ret = -ENOMEM;
2567 	page = grab_cache_page(mapping, index);
2568 	if (!page)
2569 		goto out;
2570 
2571 	to = (offset + blocksize) & ~(blocksize - 1);
2572 	ret = a_ops->prepare_write(NULL, page, offset, to);
2573 	if (ret == 0) {
2574 		kaddr = kmap_atomic(page, KM_USER0);
2575 		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2576 		flush_dcache_page(page);
2577 		kunmap_atomic(kaddr, KM_USER0);
2578 		set_page_dirty(page);
2579 	}
2580 	unlock_page(page);
2581 	page_cache_release(page);
2582 out:
2583 	return ret;
2584 }
2585 EXPORT_SYMBOL(nobh_truncate_page);
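
/*
 * Example: a filesystem using the nobh prepare/commit pair calls this from
 * its truncate path to zero the partial block at the new EOF.  A minimal
 * sketch; myfs_truncate() is a hypothetical name:
 */
#if 0
static void myfs_truncate(struct inode *inode)
{
	nobh_truncate_page(inode->i_mapping, inode->i_size);
	/* ... filesystem-specific block freeing goes here ... */
}
#endif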
2586 
2587 int block_truncate_page(struct address_space *mapping,
2588 			loff_t from, get_block_t *get_block)
2589 {
2590 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2591 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2592 	unsigned blocksize;
2593 	pgoff_t iblock;
2594 	unsigned length, pos;
2595 	struct inode *inode = mapping->host;
2596 	struct page *page;
2597 	struct buffer_head *bh;
2598 	void *kaddr;
2599 	int err;
2600 
2601 	blocksize = 1 << inode->i_blkbits;
2602 	length = offset & (blocksize - 1);
2603 
2604 	/* Block boundary? Nothing to do */
2605 	if (!length)
2606 		return 0;
2607 
2608 	length = blocksize - length;
2609 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2610 
2611 	page = grab_cache_page(mapping, index);
2612 	err = -ENOMEM;
2613 	if (!page)
2614 		goto out;
2615 
2616 	if (!page_has_buffers(page))
2617 		create_empty_buffers(page, blocksize, 0);
2618 
2619 	/* Find the buffer that contains "offset" */
2620 	bh = page_buffers(page);
2621 	pos = blocksize;
2622 	while (offset >= pos) {
2623 		bh = bh->b_this_page;
2624 		iblock++;
2625 		pos += blocksize;
2626 	}
2627 
2628 	err = 0;
2629 	if (!buffer_mapped(bh)) {
2630 		err = get_block(inode, iblock, bh, 0);
2631 		if (err)
2632 			goto unlock;
2633 		/* unmapped? It's a hole - nothing to do */
2634 		if (!buffer_mapped(bh))
2635 			goto unlock;
2636 	}
2637 
2638 	/* Ok, it's mapped. Make sure it's up-to-date */
2639 	if (PageUptodate(page))
2640 		set_buffer_uptodate(bh);
2641 
2642 	if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2643 		err = -EIO;
2644 		ll_rw_block(READ, 1, &bh);
2645 		wait_on_buffer(bh);
2646 		/* Uhhuh. Read error. Complain and punt. */
2647 		if (!buffer_uptodate(bh))
2648 			goto unlock;
2649 	}
2650 
2651 	kaddr = kmap_atomic(page, KM_USER0);
2652 	memset(kaddr + offset, 0, length);
2653 	flush_dcache_page(page);
2654 	kunmap_atomic(kaddr, KM_USER0);
2655 
2656 	mark_buffer_dirty(bh);
2657 	err = 0;
2658 
2659 unlock:
2660 	unlock_page(page);
2661 	page_cache_release(page);
2662 out:
2663 	return err;
2664 }
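
/*
 * Example: block_truncate_page() is the buffer_head-based counterpart,
 * likewise called from a filesystem's truncate path.  A minimal sketch,
 * assuming a hypothetical myfs_get_block():
 */
#if 0
static void myfs_truncate(struct inode *inode)
{
	block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);
	/* ... filesystem-specific block freeing goes here ... */
}
#endif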
2665 
2666 /*
2667  * The generic ->writepage function for buffer-backed address_spaces
2668  */
2669 int block_write_full_page(struct page *page, get_block_t *get_block,
2670 			struct writeback_control *wbc)
2671 {
2672 	struct inode * const inode = page->mapping->host;
2673 	loff_t i_size = i_size_read(inode);
2674 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2675 	unsigned offset;
2676 	void *kaddr;
2677 
2678 	/* Is the page fully inside i_size? */
2679 	if (page->index < end_index)
2680 		return __block_write_full_page(inode, page, get_block, wbc);
2681 
2682 	/* Is the page fully outside i_size? (truncate in progress) */
2683 	offset = i_size & (PAGE_CACHE_SIZE-1);
2684 	if (page->index >= end_index+1 || !offset) {
2685 		/*
2686 		 * The page may have dirty, unmapped buffers.  For example,
2687 		 * they may have been added in ext3_writepage().  Make them
2688 		 * freeable here, so the page does not leak.
2689 		 */
2690 		block_invalidatepage(page, 0);
2691 		unlock_page(page);
2692 		return 0; /* don't care */
2693 	}
2694 
2695 	/*
2696 	 * The page straddles i_size.  It must be zeroed out on each and every
2697  * writepage invocation because it may be mmapped.  "A file is mapped
2698 	 * in multiples of the page size.  For a file that is not a multiple of
2699 	 * the  page size, the remaining memory is zeroed when mapped, and
2700 	 * writes to that region are not written out to the file."
2701 	 */
2702 	kaddr = kmap_atomic(page, KM_USER0);
2703 	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2704 	flush_dcache_page(page);
2705 	kunmap_atomic(kaddr, KM_USER0);
2706 	return __block_write_full_page(inode, page, get_block, wbc);
2707 }
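
/*
 * Example: tying the generic helpers in this file together in a
 * filesystem's address_space_operations.  A minimal sketch; all myfs_*
 * names are hypothetical:
 */
#if 0
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}

static struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,	/* block_read_full_page() */
	.writepage	= myfs_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= myfs_prepare_write,	/* block_prepare_write() */
	.commit_write	= generic_commit_write,
	.bmap		= myfs_bmap,		/* generic_block_bmap(), below */
};
#endif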
2708 
2709 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2710 			    get_block_t *get_block)
2711 {
2712 	struct buffer_head tmp;
2713 	struct inode *inode = mapping->host;
2714 	tmp.b_state = 0;
2715 	tmp.b_blocknr = 0;
2716 	get_block(inode, block, &tmp, 0);
2717 	return tmp.b_blocknr;
2718 }
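
/*
 * Example: the usual ->bmap() implementation is a one-line wrapper around
 * generic_block_bmap().  A minimal sketch, assuming a hypothetical
 * myfs_get_block():
 */
#if 0
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}
#endif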
2719 
2720 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2721 {
2722 	struct buffer_head *bh = bio->bi_private;
2723 
2724 	if (bio->bi_size)
2725 		return 1;
2726 
2727 	if (err == -EOPNOTSUPP) {
2728 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2729 		set_bit(BH_Eopnotsupp, &bh->b_state);
2730 	}
2731 
2732 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2733 	bio_put(bio);
2734 	return 0;
2735 }
2736 
2737 int submit_bh(int rw, struct buffer_head * bh)
2738 {
2739 	struct bio *bio;
2740 	int ret = 0;
2741 
2742 	BUG_ON(!buffer_locked(bh));
2743 	BUG_ON(!buffer_mapped(bh));
2744 	BUG_ON(!bh->b_end_io);
2745 
2746 	if (buffer_ordered(bh) && (rw == WRITE))
2747 		rw = WRITE_BARRIER;
2748 
2749 	/*
2750 	 * Only clear out a write error when rewriting, should this
2751 	 * include WRITE_SYNC as well?
2752 	 */
2753 	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2754 		clear_buffer_write_io_error(bh);
2755 
2756 	/*
2757 	 * from here on down, it's all bio -- do the initial mapping,
2758 	 * submit_bio -> generic_make_request may further map this bio around
2759 	 */
2760 	bio = bio_alloc(GFP_NOIO, 1);
2761 
2762 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2763 	bio->bi_bdev = bh->b_bdev;
2764 	bio->bi_io_vec[0].bv_page = bh->b_page;
2765 	bio->bi_io_vec[0].bv_len = bh->b_size;
2766 	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2767 
2768 	bio->bi_vcnt = 1;
2769 	bio->bi_idx = 0;
2770 	bio->bi_size = bh->b_size;
2771 
2772 	bio->bi_end_io = end_bio_bh_io_sync;
2773 	bio->bi_private = bh;
2774 
2775 	bio_get(bio);
2776 	submit_bio(rw, bio);
2777 
2778 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2779 		ret = -EOPNOTSUPP;
2780 
2781 	bio_put(bio);
2782 	return ret;
2783 }
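
/*
 * Example: a caller submitting a single buffer itself (rather than via
 * ll_rw_block) locks it, installs a completion handler, submits and waits.
 * A minimal sketch; the helper name is hypothetical:
 */
#if 0
static int example_read_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}
	get_bh(bh);			/* end_buffer_read_sync drops a ref */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}
#endif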
2784 
2785 /**
2786  * ll_rw_block: low-level access to block devices (DEPRECATED)
2787  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2788  * @nr: number of &struct buffer_heads in the array
2789  * @bhs: array of pointers to &struct buffer_head
2790  *
2791  * ll_rw_block() takes an array of pointers to &struct buffer_heads,
2792  * and requests an I/O operation on them, either a %READ or a %WRITE.
2793  * The third %READA option is described in the documentation for
2794  * generic_make_request() which ll_rw_block() calls.
2795  *
2796  * This function drops any buffer that it cannot get a lock on (with the
2797  * BH_Lock state bit), any buffer that appears to be clean when doing a
2798  * write request, and any buffer that appears to be up-to-date when doing
2799  * write request, and any buffer that appears to be up-to-date when doing a
2800  * writing (the buffer cache won't assume that they are actually clean until
2801  * the buffer gets unlocked).
2802  *
2803  * ll_rw_block sets b_end_io to a simple completion handler that marks
2804  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2805  * any waiters.
2806  *
2807  * All of the buffers must be for the same device, and must also be a
2808  * multiple of the current approved size for the device.
2809  */
2810 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2811 {
2812 	int i;
2813 
2814 	for (i = 0; i < nr; i++) {
2815 		struct buffer_head *bh = bhs[i];
2816 
2817 		if (test_set_buffer_locked(bh))
2818 			continue;
2819 
2820 		get_bh(bh);
2821 		if (rw == WRITE) {
2822 			if (test_clear_buffer_dirty(bh)) {
2823 				bh->b_end_io = end_buffer_write_sync;
2824 				submit_bh(WRITE, bh);
2825 				continue;
2826 			}
2827 		} else {
2828 			if (!buffer_uptodate(bh)) {
2829 				bh->b_end_io = end_buffer_read_sync;
2830 				submit_bh(rw, bh);
2831 				continue;
2832 			}
2833 		}
2834 		unlock_buffer(bh);
2835 		put_bh(bh);
2836 	}
2837 }
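
/*
 * Example: callers of ll_rw_block() typically kick off a batch of reads and
 * then wait on each buffer.  A minimal sketch; the helper name is
 * hypothetical:
 */
#if 0
static int example_read_buffers(struct buffer_head *bhs[], int nr)
{
	int i, err = 0;

	ll_rw_block(READ, nr, bhs);
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			err = -EIO;
	}
	return err;
}
#endif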
2838 
2839 /*
2840  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2841  * and then start new I/O and then wait upon it.  The caller must have a ref on
2842  * the buffer_head.
2843  */
2844 int sync_dirty_buffer(struct buffer_head *bh)
2845 {
2846 	int ret = 0;
2847 
2848 	WARN_ON(atomic_read(&bh->b_count) < 1);
2849 	lock_buffer(bh);
2850 	if (test_clear_buffer_dirty(bh)) {
2851 		get_bh(bh);
2852 		bh->b_end_io = end_buffer_write_sync;
2853 		ret = submit_bh(WRITE, bh);
2854 		wait_on_buffer(bh);
2855 		if (buffer_eopnotsupp(bh)) {
2856 			clear_buffer_eopnotsupp(bh);
2857 			ret = -EOPNOTSUPP;
2858 		}
2859 		if (!ret && !buffer_uptodate(bh))
2860 			ret = -EIO;
2861 	} else {
2862 		unlock_buffer(bh);
2863 	}
2864 	return ret;
2865 }
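
/*
 * Example: the typical metadata-update pattern when a filesystem needs the
 * write on disk before proceeding.  A minimal sketch; the helper name is
 * hypothetical:
 */
#if 0
static int example_update_block(struct super_block *sb, sector_t blocknr)
{
	struct buffer_head *bh = sb_bread(sb, blocknr);
	int err;

	if (!bh)
		return -EIO;
	/* ... modify bh->b_data here ... */
	mark_buffer_dirty(bh);
	err = sync_dirty_buffer(bh);
	brelse(bh);
	return err;
}
#endif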
2866 
2867 /*
2868  * try_to_free_buffers() checks if all the buffers on this particular page
2869  * are unused, and releases them if so.
2870  *
2871  * Exclusion against try_to_free_buffers may be obtained by either
2872  * locking the page or by holding its mapping's private_lock.
2873  *
2874  * If the page is dirty but all the buffers are clean then we need to
2875  * be sure to mark the page clean as well.  This is because the page
2876  * may be against a block device, and a later reattachment of buffers
2877  * to a dirty page will set *all* buffers dirty.  Which would corrupt
2878  * filesystem data on the same device.
2879  *
2880  * The same applies to regular filesystem pages: if all the buffers are
2881  * clean then we set the page clean and proceed.  To do that, we require
2882  * total exclusion from __set_page_dirty_buffers().  That is obtained with
2883  * private_lock.
2884  *
2885  * try_to_free_buffers() is non-blocking.
2886  */
2887 static inline int buffer_busy(struct buffer_head *bh)
2888 {
2889 	return atomic_read(&bh->b_count) |
2890 		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2891 }
2892 
2893 static int
2894 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2895 {
2896 	struct buffer_head *head = page_buffers(page);
2897 	struct buffer_head *bh;
2898 
2899 	bh = head;
2900 	do {
2901 		if (buffer_write_io_error(bh) && page->mapping)
2902 			set_bit(AS_EIO, &page->mapping->flags);
2903 		if (buffer_busy(bh))
2904 			goto failed;
2905 		bh = bh->b_this_page;
2906 	} while (bh != head);
2907 
2908 	do {
2909 		struct buffer_head *next = bh->b_this_page;
2910 
2911 		if (!list_empty(&bh->b_assoc_buffers))
2912 			__remove_assoc_queue(bh);
2913 		bh = next;
2914 	} while (bh != head);
2915 	*buffers_to_free = head;
2916 	__clear_page_buffers(page);
2917 	return 1;
2918 failed:
2919 	return 0;
2920 }
2921 
2922 int try_to_free_buffers(struct page *page)
2923 {
2924 	struct address_space * const mapping = page->mapping;
2925 	struct buffer_head *buffers_to_free = NULL;
2926 	int ret = 0;
2927 
2928 	BUG_ON(!PageLocked(page));
2929 	if (PageWriteback(page))
2930 		return 0;
2931 
2932 	if (mapping == NULL) {		/* can this still happen? */
2933 		ret = drop_buffers(page, &buffers_to_free);
2934 		goto out;
2935 	}
2936 
2937 	spin_lock(&mapping->private_lock);
2938 	ret = drop_buffers(page, &buffers_to_free);
2939 	if (ret) {
2940 		/*
2941 		 * If the filesystem writes its buffers by hand (eg ext3)
2942 		 * then we can have clean buffers against a dirty page.  We
2943 		 * clean the page here; otherwise later reattachment of buffers
2944 		 * could encounter a non-uptodate page, which is unresolvable.
2945 		 * This only applies in the rare case where try_to_free_buffers
2946 		 * succeeds but the page is not freed.
2947 		 */
2948 		clear_page_dirty(page);
2949 	}
2950 	spin_unlock(&mapping->private_lock);
2951 out:
2952 	if (buffers_to_free) {
2953 		struct buffer_head *bh = buffers_to_free;
2954 
2955 		do {
2956 			struct buffer_head *next = bh->b_this_page;
2957 			free_buffer_head(bh);
2958 			bh = next;
2959 		} while (bh != buffers_to_free);
2960 	}
2961 	return ret;
2962 }
2963 EXPORT_SYMBOL(try_to_free_buffers);
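
/*
 * Example: filesystems with journalled or otherwise pinned buffers provide
 * a ->releasepage() that does its own checks and then falls back to
 * try_to_free_buffers().  A minimal sketch; myfs_releasepage() is a
 * hypothetical name and the (page, int gfp_mask) prototype is assumed to
 * match this kernel's address_space_operations:
 */
#if 0
static int myfs_releasepage(struct page *page, int gfp_mask)
{
	/* refuse here if the filesystem still needs these buffers, then: */
	return try_to_free_buffers(page);
}
#endif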
2964 
2965 int block_sync_page(struct page *page)
2966 {
2967 	struct address_space *mapping;
2968 
2969 	smp_mb();
2970 	mapping = page_mapping(page);
2971 	if (mapping)
2972 		blk_run_backing_dev(mapping->backing_dev_info, page);
2973 	return 0;
2974 }
2975 
2976 /*
2977  * There are no bdflush tunables left.  But distributions are
2978  * still running obsolete flush daemons, so we terminate them here.
2979  *
2980  * Use of bdflush() is deprecated and will be removed in a future kernel.
2981  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
2982  */
2983 asmlinkage long sys_bdflush(int func, long data)
2984 {
2985 	static int msg_count;
2986 
2987 	if (!capable(CAP_SYS_ADMIN))
2988 		return -EPERM;
2989 
2990 	if (msg_count < 5) {
2991 		msg_count++;
2992 		printk(KERN_INFO
2993 			"warning: process `%s' used the obsolete bdflush"
2994 			" system call\n", current->comm);
2995 		printk(KERN_INFO "Fix your initscripts?\n");
2996 	}
2997 
2998 	if (func == 1)
2999 		do_exit(0);
3000 	return 0;
3001 }
3002 
3003 /*
3004  * Buffer-head allocation
3005  */
3006 static kmem_cache_t *bh_cachep;
3007 
3008 /*
3009  * Once the number of bh's in the machine exceeds this level, we start
3010  * stripping them in writeback.
3011  */
3012 static int max_buffer_heads;
3013 
3014 int buffer_heads_over_limit;
3015 
3016 struct bh_accounting {
3017 	int nr;			/* Number of live bh's */
3018 	int ratelimit;		/* Limit cacheline bouncing */
3019 };
3020 
3021 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3022 
3023 static void recalc_bh_state(void)
3024 {
3025 	int i;
3026 	int tot = 0;
3027 
3028 	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3029 		return;
3030 	__get_cpu_var(bh_accounting).ratelimit = 0;
3031 	for_each_cpu(i)
3032 		tot += per_cpu(bh_accounting, i).nr;
3033 	buffer_heads_over_limit = (tot > max_buffer_heads);
3034 }
3035 
3036 struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
3037 {
3038 	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3039 	if (ret) {
3040 		preempt_disable();
3041 		__get_cpu_var(bh_accounting).nr++;
3042 		recalc_bh_state();
3043 		preempt_enable();
3044 	}
3045 	return ret;
3046 }
3047 EXPORT_SYMBOL(alloc_buffer_head);
3048 
3049 void free_buffer_head(struct buffer_head *bh)
3050 {
3051 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3052 	kmem_cache_free(bh_cachep, bh);
3053 	preempt_disable();
3054 	__get_cpu_var(bh_accounting).nr--;
3055 	recalc_bh_state();
3056 	preempt_enable();
3057 }
3058 EXPORT_SYMBOL(free_buffer_head);
3059 
3060 static void
3061 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3062 {
3063 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3064 			    SLAB_CTOR_CONSTRUCTOR) {
3065 		struct buffer_head * bh = (struct buffer_head *)data;
3066 
3067 		memset(bh, 0, sizeof(*bh));
3068 		INIT_LIST_HEAD(&bh->b_assoc_buffers);
3069 	}
3070 }
3071 
3072 #ifdef CONFIG_HOTPLUG_CPU
3073 static void buffer_exit_cpu(int cpu)
3074 {
3075 	int i;
3076 	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3077 
3078 	for (i = 0; i < BH_LRU_SIZE; i++) {
3079 		brelse(b->bhs[i]);
3080 		b->bhs[i] = NULL;
3081 	}
3082 }
3083 
3084 static int buffer_cpu_notify(struct notifier_block *self,
3085 			      unsigned long action, void *hcpu)
3086 {
3087 	if (action == CPU_DEAD)
3088 		buffer_exit_cpu((unsigned long)hcpu);
3089 	return NOTIFY_OK;
3090 }
3091 #endif /* CONFIG_HOTPLUG_CPU */
3092 
3093 void __init buffer_init(void)
3094 {
3095 	int nrpages;
3096 
3097 	bh_cachep = kmem_cache_create("buffer_head",
3098 			sizeof(struct buffer_head), 0,
3099 			SLAB_RECLAIM_ACCOUNT|SLAB_PANIC, init_buffer_head, NULL);
3100 
3101 	/*
3102 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3103 	 */
3104 	nrpages = (nr_free_buffer_pages() * 10) / 100;
3105 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3106 	hotcpu_notifier(buffer_cpu_notify, 0);
3107 }
3108 
3109 EXPORT_SYMBOL(__bforget);
3110 EXPORT_SYMBOL(__brelse);
3111 EXPORT_SYMBOL(__wait_on_buffer);
3112 EXPORT_SYMBOL(block_commit_write);
3113 EXPORT_SYMBOL(block_prepare_write);
3114 EXPORT_SYMBOL(block_read_full_page);
3115 EXPORT_SYMBOL(block_sync_page);
3116 EXPORT_SYMBOL(block_truncate_page);
3117 EXPORT_SYMBOL(block_write_full_page);
3118 EXPORT_SYMBOL(cont_prepare_write);
3119 EXPORT_SYMBOL(end_buffer_async_write);
3120 EXPORT_SYMBOL(end_buffer_read_sync);
3121 EXPORT_SYMBOL(end_buffer_write_sync);
3122 EXPORT_SYMBOL(file_fsync);
3123 EXPORT_SYMBOL(fsync_bdev);
3124 EXPORT_SYMBOL(generic_block_bmap);
3125 EXPORT_SYMBOL(generic_commit_write);
3126 EXPORT_SYMBOL(generic_cont_expand);
3127 EXPORT_SYMBOL(init_buffer);
3128 EXPORT_SYMBOL(invalidate_bdev);
3129 EXPORT_SYMBOL(ll_rw_block);
3130 EXPORT_SYMBOL(mark_buffer_dirty);
3131 EXPORT_SYMBOL(submit_bh);
3132 EXPORT_SYMBOL(sync_dirty_buffer);
3133 EXPORT_SYMBOL(unlock_buffer);
3134