xref: /linux/fs/buffer.c (revision f2ee442115c9b6219083c019939a9cc0c9abb2f8)
1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6 
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20 
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
44 #include <linux/cleancache.h>
45 
46 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47 
48 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
49 
50 inline void
51 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
52 {
53 	bh->b_end_io = handler;
54 	bh->b_private = private;
55 }
56 EXPORT_SYMBOL(init_buffer);
57 
/*
 * Action callback handed to wait_on_bit()/wait_on_bit_lock() below:
 * calls io_schedule() and always returns 0.  @word is not used.
 */
static int sleep_on_buffer(void *word)
{
	io_schedule();

	return 0;
}
63 
64 void __lock_buffer(struct buffer_head *bh)
65 {
66 	wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
67 							TASK_UNINTERRUPTIBLE);
68 }
69 EXPORT_SYMBOL(__lock_buffer);
70 
71 void unlock_buffer(struct buffer_head *bh)
72 {
73 	clear_bit_unlock(BH_Lock, &bh->b_state);
74 	smp_mb__after_clear_bit();
75 	wake_up_bit(&bh->b_state, BH_Lock);
76 }
77 EXPORT_SYMBOL(unlock_buffer);
78 
79 /*
80  * Block until a buffer comes unlocked.  This doesn't stop it
81  * from becoming locked again - you have to lock it yourself
82  * if you want to preserve its state.
83  */
84 void __wait_on_buffer(struct buffer_head * bh)
85 {
86 	wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
87 }
88 EXPORT_SYMBOL(__wait_on_buffer);
89 
/*
 * Detach the private data from @page: clear the Private flag, zero the
 * private word, then drop one page reference with page_cache_release().
 */
static void __clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);

	page_cache_release(page);
}
97 
98 
99 static int quiet_error(struct buffer_head *bh)
100 {
101 	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
102 		return 0;
103 	return 1;
104 }
105 
106 
107 static void buffer_io_error(struct buffer_head *bh)
108 {
109 	char b[BDEVNAME_SIZE];
110 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
111 			bdevname(bh->b_bdev, b),
112 			(unsigned long long)bh->b_blocknr);
113 }
114 
/*
 * Read-completion helper that never touches the bh once it has been
 * unlocked.
 *
 * Note: unlock_buffer() does use the bh's address after dropping the lock,
 * but only to compute the wait-queue hash - it does not dereference the bh
 * itself, so that race is benign.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (!uptodate)
		clear_buffer_uptodate(bh);	/* e.g. a failed READA attempt */
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
}
133 
/*
 * Default synchronous read-completion handler: record the uptodate state,
 * unlock the buffer, then drop one reference with put_bh().
 * ll_rw_block uses this handler as well.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	/* Update state and unlock first ... */
	__end_buffer_read_notouch(bh, uptodate);

	/* ... then release the reference. */
	put_bh(bh);
}
143 EXPORT_SYMBOL(end_buffer_read_sync);
144 
145 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
146 {
147 	char b[BDEVNAME_SIZE];
148 
149 	if (uptodate) {
150 		set_buffer_uptodate(bh);
151 	} else {
152 		if (!quiet_error(bh)) {
153 			buffer_io_error(bh);
154 			printk(KERN_WARNING "lost page write due to "
155 					"I/O error on %s\n",
156 				       bdevname(bh->b_bdev, b));
157 		}
158 		set_buffer_write_io_error(bh);
159 		clear_buffer_uptodate(bh);
160 	}
161 	unlock_buffer(bh);
162 	put_bh(bh);
163 }
164 EXPORT_SYMBOL(end_buffer_write_sync);
165 
166 /*
167  * Various filesystems appear to want __find_get_block to be non-blocking.
168  * But it's the page lock which protects the buffers.  To get around this,
169  * we get exclusion from try_to_free_buffers with the blockdev mapping's
170  * private_lock.
171  *
172  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
173  * may be quite high.  This code could TryLock the page, and if that
174  * succeeds, there is no need to take private_lock. (But if
175  * private_lock is contended then so is mapping->tree_lock).
176  */
177 static struct buffer_head *
178 __find_get_block_slow(struct block_device *bdev, sector_t block)
179 {
180 	struct inode *bd_inode = bdev->bd_inode;
181 	struct address_space *bd_mapping = bd_inode->i_mapping;
182 	struct buffer_head *ret = NULL;
183 	pgoff_t index;
184 	struct buffer_head *bh;
185 	struct buffer_head *head;
186 	struct page *page;
187 	int all_mapped = 1;
188 
189 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
190 	page = find_get_page(bd_mapping, index);
191 	if (!page)
192 		goto out;
193 
194 	spin_lock(&bd_mapping->private_lock);
195 	if (!page_has_buffers(page))
196 		goto out_unlock;
197 	head = page_buffers(page);
198 	bh = head;
199 	do {
200 		if (!buffer_mapped(bh))
201 			all_mapped = 0;
202 		else if (bh->b_blocknr == block) {
203 			ret = bh;
204 			get_bh(bh);
205 			goto out_unlock;
206 		}
207 		bh = bh->b_this_page;
208 	} while (bh != head);
209 
210 	/* we might be here because some of the buffers on this page are
211 	 * not mapped.  This is due to various races between
212 	 * file io on the block device and getblk.  It gets dealt with
213 	 * elsewhere, don't buffer_error if we had some unmapped buffers
214 	 */
215 	if (all_mapped) {
216 		char b[BDEVNAME_SIZE];
217 
218 		printk("__find_get_block_slow() failed. "
219 			"block=%llu, b_blocknr=%llu\n",
220 			(unsigned long long)block,
221 			(unsigned long long)bh->b_blocknr);
222 		printk("b_state=0x%08lx, b_size=%zu\n",
223 			bh->b_state, bh->b_size);
224 		printk("device %s blocksize: %d\n", bdevname(bdev, b),
225 			1 << bd_inode->i_blkbits);
226 	}
227 out_unlock:
228 	spin_unlock(&bd_mapping->private_lock);
229 	page_cache_release(page);
230 out:
231 	return ret;
232 }
233 
234 /* If invalidate_buffers() will trash dirty buffers, it means some kind
235    of fs corruption is going on. Trashing dirty data always imply losing
236    information that was supposed to be just stored on the physical layer
237    by the user.
238 
239    Thus invalidate_buffers in general usage is not allowed to trash
240    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
241    be preserved.  These buffers are simply skipped.
242 
243    We also skip buffers which are still in use.  For example this can
244    happen if a userspace program is reading the block device.
245 
246    NOTE: In the case where the user removed a removable-media-disk even if
247    there's still dirty data not synced on disk (due a bug in the device driver
248    or due an error of the user), by not destroying the dirty buffers we could
249    generate corruption also on the next media inserted, thus a parameter is
250    necessary to handle this case in the most safe way possible (trying
251    to not corrupt also the new disk inserted with the data belonging to
252    the old now corrupted disk). Also for the ramdisk the natural thing
253    to do in order to release the ramdisk memory is to destroy dirty buffers.
254 
255    These are two special cases. Normal usage imply the device driver
256    to issue a sync on the device (without waiting I/O completion) and
257    then an invalidate_buffers call that doesn't trash dirty buffers.
258 
259    For handling cache coherency with the blkdev pagecache the 'update' case
260    is been introduced. It is needed to re-read from disk any pinned
261    buffer. NOTE: re-reading from disk is destructive so we can do it only
262    when we assume nobody is changing the buffercache under our I/O and when
263    we think the disk contains more recent information than the buffercache.
264    The update == 1 pass marks the buffers we need to update, the update == 2
265    pass does the actual I/O. */
266 void invalidate_bdev(struct block_device *bdev)
267 {
268 	struct address_space *mapping = bdev->bd_inode->i_mapping;
269 
270 	if (mapping->nrpages == 0)
271 		return;
272 
273 	invalidate_bh_lrus();
274 	lru_add_drain_all();	/* make sure all lru add caches are flushed */
275 	invalidate_mapping_pages(mapping, 0, -1);
276 	/* 99% of the time, we don't need to flush the cleancache on the bdev.
277 	 * But, for the strange corners, lets be cautious
278 	 */
279 	cleancache_flush_inode(mapping);
280 }
281 EXPORT_SYMBOL(invalidate_bdev);
282 
283 /*
284  * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
285  */
286 static void free_more_memory(void)
287 {
288 	struct zone *zone;
289 	int nid;
290 
291 	wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
292 	yield();
293 
294 	for_each_online_node(nid) {
295 		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
296 						gfp_zone(GFP_NOFS), NULL,
297 						&zone);
298 		if (zone)
299 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
300 						GFP_NOFS, NULL);
301 	}
302 }
303 
304 /*
305  * I/O completion handler for block_read_full_page() - pages
306  * which come unlocked at the end of I/O.
307  */
308 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
309 {
310 	unsigned long flags;
311 	struct buffer_head *first;
312 	struct buffer_head *tmp;
313 	struct page *page;
314 	int page_uptodate = 1;
315 
316 	BUG_ON(!buffer_async_read(bh));
317 
318 	page = bh->b_page;
319 	if (uptodate) {
320 		set_buffer_uptodate(bh);
321 	} else {
322 		clear_buffer_uptodate(bh);
323 		if (!quiet_error(bh))
324 			buffer_io_error(bh);
325 		SetPageError(page);
326 	}
327 
328 	/*
329 	 * Be _very_ careful from here on. Bad things can happen if
330 	 * two buffer heads end IO at almost the same time and both
331 	 * decide that the page is now completely done.
332 	 */
333 	first = page_buffers(page);
334 	local_irq_save(flags);
335 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
336 	clear_buffer_async_read(bh);
337 	unlock_buffer(bh);
338 	tmp = bh;
339 	do {
340 		if (!buffer_uptodate(tmp))
341 			page_uptodate = 0;
342 		if (buffer_async_read(tmp)) {
343 			BUG_ON(!buffer_locked(tmp));
344 			goto still_busy;
345 		}
346 		tmp = tmp->b_this_page;
347 	} while (tmp != bh);
348 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
349 	local_irq_restore(flags);
350 
351 	/*
352 	 * If none of the buffers had errors and they are all
353 	 * uptodate then we can set the page uptodate.
354 	 */
355 	if (page_uptodate && !PageError(page))
356 		SetPageUptodate(page);
357 	unlock_page(page);
358 	return;
359 
360 still_busy:
361 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
362 	local_irq_restore(flags);
363 	return;
364 }
365 
366 /*
367  * Completion handler for block_write_full_page() - pages which are unlocked
368  * during I/O, and which have PageWriteback cleared upon I/O completion.
369  */
370 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
371 {
372 	char b[BDEVNAME_SIZE];
373 	unsigned long flags;
374 	struct buffer_head *first;
375 	struct buffer_head *tmp;
376 	struct page *page;
377 
378 	BUG_ON(!buffer_async_write(bh));
379 
380 	page = bh->b_page;
381 	if (uptodate) {
382 		set_buffer_uptodate(bh);
383 	} else {
384 		if (!quiet_error(bh)) {
385 			buffer_io_error(bh);
386 			printk(KERN_WARNING "lost page write due to "
387 					"I/O error on %s\n",
388 			       bdevname(bh->b_bdev, b));
389 		}
390 		set_bit(AS_EIO, &page->mapping->flags);
391 		set_buffer_write_io_error(bh);
392 		clear_buffer_uptodate(bh);
393 		SetPageError(page);
394 	}
395 
396 	first = page_buffers(page);
397 	local_irq_save(flags);
398 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
399 
400 	clear_buffer_async_write(bh);
401 	unlock_buffer(bh);
402 	tmp = bh->b_this_page;
403 	while (tmp != bh) {
404 		if (buffer_async_write(tmp)) {
405 			BUG_ON(!buffer_locked(tmp));
406 			goto still_busy;
407 		}
408 		tmp = tmp->b_this_page;
409 	}
410 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
411 	local_irq_restore(flags);
412 	end_page_writeback(page);
413 	return;
414 
415 still_busy:
416 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
417 	local_irq_restore(flags);
418 	return;
419 }
420 EXPORT_SYMBOL(end_buffer_async_write);
421 
422 /*
423  * If a page's buffers are under async readin (end_buffer_async_read
424  * completion) then there is a possibility that another thread of
425  * control could lock one of the buffers after it has completed
426  * but while some of the other buffers have not completed.  This
427  * locked buffer would confuse end_buffer_async_read() into not unlocking
428  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
429  * that this buffer is not under async I/O.
430  *
431  * The page comes unlocked when it has no locked buffer_async buffers
432  * left.
433  *
434  * PageLocked prevents anyone starting new async I/O reads any of
435  * the buffers.
436  *
437  * PageWriteback is used to prevent simultaneous writeout of the same
438  * page.
439  *
440  * PageLocked prevents anyone from starting writeback of a page which is
441  * under read I/O (PageWriteback is only ever set against a locked page).
442  */
443 static void mark_buffer_async_read(struct buffer_head *bh)
444 {
445 	bh->b_end_io = end_buffer_async_read;
446 	set_buffer_async_read(bh);
447 }
448 
449 static void mark_buffer_async_write_endio(struct buffer_head *bh,
450 					  bh_end_io_t *handler)
451 {
452 	bh->b_end_io = handler;
453 	set_buffer_async_write(bh);
454 }
455 
456 void mark_buffer_async_write(struct buffer_head *bh)
457 {
458 	mark_buffer_async_write_endio(bh, end_buffer_async_write);
459 }
460 EXPORT_SYMBOL(mark_buffer_async_write);
461 
462 
463 /*
464  * fs/buffer.c contains helper functions for buffer-backed address space's
465  * fsync functions.  A common requirement for buffer-based filesystems is
466  * that certain data from the backing blockdev needs to be written out for
467  * a successful fsync().  For example, ext2 indirect blocks need to be
468  * written back and waited upon before fsync() returns.
469  *
470  * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
471  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
472  * management of a list of dependent buffers at ->i_mapping->private_list.
473  *
474  * Locking is a little subtle: try_to_free_buffers() will remove buffers
475  * from their controlling inode's queue when they are being freed.  But
476  * try_to_free_buffers() will be operating against the *blockdev* mapping
477  * at the time, not against the S_ISREG file which depends on those buffers.
478  * So the locking for private_list is via the private_lock in the address_space
479  * which backs the buffers.  Which is different from the address_space
480  * against which the buffers are listed.  So for a particular address_space,
481  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
482  * mapping->private_list will always be protected by the backing blockdev's
483  * ->private_lock.
484  *
485  * Which introduces a requirement: all buffers on an address_space's
486  * ->private_list must be from the same address_space: the blockdev's.
487  *
488  * address_spaces which do not place buffers at ->private_list via these
489  * utility functions are free to use private_lock and private_list for
490  * whatever they want.  The only requirement is that list_empty(private_list)
491  * be true at clear_inode() time.
492  *
493  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
494  * filesystems should do that.  invalidate_inode_buffers() should just go
495  * BUG_ON(!list_empty).
496  *
497  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
498  * take an address_space, not an inode.  And it should be called
499  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
500  * queued up.
501  *
502  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
503  * list if it is already on a list.  Because if the buffer is on a list,
504  * it *must* already be on the right one.  If not, the filesystem is being
505  * silly.  This will save a ton of locking.  But first we have to ensure
506  * that buffers are taken *off* the old inode's list when they are freed
507  * (presumably in truncate).  That requires careful auditing of all
508  * filesystems (do it inside bforget()).  It could also be done by bringing
509  * b_inode back.
510  */
511 
512 /*
513  * The buffer's backing address_space's private_lock must be held
514  */
515 static void __remove_assoc_queue(struct buffer_head *bh)
516 {
517 	list_del_init(&bh->b_assoc_buffers);
518 	WARN_ON(!bh->b_assoc_map);
519 	if (buffer_write_io_error(bh))
520 		set_bit(AS_EIO, &bh->b_assoc_map->flags);
521 	bh->b_assoc_map = NULL;
522 }
523 
524 int inode_has_buffers(struct inode *inode)
525 {
526 	return !list_empty(&inode->i_data.private_list);
527 }
528 
529 /*
530  * osync is designed to support O_SYNC io.  It waits synchronously for
531  * all already-submitted IO to complete, but does not queue any new
532  * writes to the disk.
533  *
534  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
535  * you dirty the buffers, and then use osync_inode_buffers to wait for
536  * completion.  Any other dirty buffers which are not yet queued for
537  * write will not be flushed to disk by the osync.
538  */
539 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
540 {
541 	struct buffer_head *bh;
542 	struct list_head *p;
543 	int err = 0;
544 
545 	spin_lock(lock);
546 repeat:
547 	list_for_each_prev(p, list) {
548 		bh = BH_ENTRY(p);
549 		if (buffer_locked(bh)) {
550 			get_bh(bh);
551 			spin_unlock(lock);
552 			wait_on_buffer(bh);
553 			if (!buffer_uptodate(bh))
554 				err = -EIO;
555 			brelse(bh);
556 			spin_lock(lock);
557 			goto repeat;
558 		}
559 	}
560 	spin_unlock(lock);
561 	return err;
562 }
563 
564 static void do_thaw_one(struct super_block *sb, void *unused)
565 {
566 	char b[BDEVNAME_SIZE];
567 	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 		printk(KERN_WARNING "Emergency Thaw on %s\n",
569 		       bdevname(sb->s_bdev, b));
570 }
571 
572 static void do_thaw_all(struct work_struct *work)
573 {
574 	iterate_supers(do_thaw_one, NULL);
575 	kfree(work);
576 	printk(KERN_WARNING "Emergency Thaw complete\n");
577 }
578 
579 /**
580  * emergency_thaw_all -- forcibly thaw every frozen filesystem
581  *
582  * Used for emergency unfreeze of all filesystems via SysRq
583  */
584 void emergency_thaw_all(void)
585 {
586 	struct work_struct *work;
587 
588 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
589 	if (work) {
590 		INIT_WORK(work, do_thaw_all);
591 		schedule_work(work);
592 	}
593 }
594 
595 /**
596  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
597  * @mapping: the mapping which wants those buffers written
598  *
599  * Starts I/O against the buffers at mapping->private_list, and waits upon
600  * that I/O.
601  *
602  * Basically, this is a convenience function for fsync().
603  * @mapping is a file or directory which needs those buffers to be written for
604  * a successful fsync().
605  */
606 int sync_mapping_buffers(struct address_space *mapping)
607 {
608 	struct address_space *buffer_mapping = mapping->assoc_mapping;
609 
610 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
611 		return 0;
612 
613 	return fsync_buffers_list(&buffer_mapping->private_lock,
614 					&mapping->private_list);
615 }
616 EXPORT_SYMBOL(sync_mapping_buffers);
617 
618 /*
619  * Called when we've recently written block `bblock', and it is known that
620  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
621  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
622  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
623  */
624 void write_boundary_block(struct block_device *bdev,
625 			sector_t bblock, unsigned blocksize)
626 {
627 	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
628 	if (bh) {
629 		if (buffer_dirty(bh))
630 			ll_rw_block(WRITE, 1, &bh);
631 		put_bh(bh);
632 	}
633 }
634 
635 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
636 {
637 	struct address_space *mapping = inode->i_mapping;
638 	struct address_space *buffer_mapping = bh->b_page->mapping;
639 
640 	mark_buffer_dirty(bh);
641 	if (!mapping->assoc_mapping) {
642 		mapping->assoc_mapping = buffer_mapping;
643 	} else {
644 		BUG_ON(mapping->assoc_mapping != buffer_mapping);
645 	}
646 	if (!bh->b_assoc_map) {
647 		spin_lock(&buffer_mapping->private_lock);
648 		list_move_tail(&bh->b_assoc_buffers,
649 				&mapping->private_list);
650 		bh->b_assoc_map = mapping;
651 		spin_unlock(&buffer_mapping->private_lock);
652 	}
653 }
654 EXPORT_SYMBOL(mark_buffer_dirty_inode);
655 
656 /*
657  * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
658  * dirty.
659  *
660  * If warn is true, then emit a warning if the page is not uptodate and has
661  * not been truncated.
662  */
663 static void __set_page_dirty(struct page *page,
664 		struct address_space *mapping, int warn)
665 {
666 	spin_lock_irq(&mapping->tree_lock);
667 	if (page->mapping) {	/* Race with truncate? */
668 		WARN_ON_ONCE(warn && !PageUptodate(page));
669 		account_page_dirtied(page, mapping);
670 		radix_tree_tag_set(&mapping->page_tree,
671 				page_index(page), PAGECACHE_TAG_DIRTY);
672 	}
673 	spin_unlock_irq(&mapping->tree_lock);
674 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
675 }
676 
677 /*
678  * Add a page to the dirty page list.
679  *
680  * It is a sad fact of life that this function is called from several places
681  * deeply under spinlocking.  It may not sleep.
682  *
683  * If the page has buffers, the uptodate buffers are set dirty, to preserve
684  * dirty-state coherency between the page and the buffers.  It the page does
685  * not have buffers then when they are later attached they will all be set
686  * dirty.
687  *
688  * The buffers are dirtied before the page is dirtied.  There's a small race
689  * window in which a writepage caller may see the page cleanness but not the
690  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
691  * before the buffers, a concurrent writepage caller could clear the page dirty
692  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
693  * page on the dirty page list.
694  *
695  * We use private_lock to lock against try_to_free_buffers while using the
696  * page's buffer list.  Also use this to protect against clean buffers being
697  * added to the page after it was set dirty.
698  *
699  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
700  * address_space though.
701  */
702 int __set_page_dirty_buffers(struct page *page)
703 {
704 	int newly_dirty;
705 	struct address_space *mapping = page_mapping(page);
706 
707 	if (unlikely(!mapping))
708 		return !TestSetPageDirty(page);
709 
710 	spin_lock(&mapping->private_lock);
711 	if (page_has_buffers(page)) {
712 		struct buffer_head *head = page_buffers(page);
713 		struct buffer_head *bh = head;
714 
715 		do {
716 			set_buffer_dirty(bh);
717 			bh = bh->b_this_page;
718 		} while (bh != head);
719 	}
720 	newly_dirty = !TestSetPageDirty(page);
721 	spin_unlock(&mapping->private_lock);
722 
723 	if (newly_dirty)
724 		__set_page_dirty(page, mapping, 1);
725 	return newly_dirty;
726 }
727 EXPORT_SYMBOL(__set_page_dirty_buffers);
728 
729 /*
730  * Write out and wait upon a list of buffers.
731  *
732  * We have conflicting pressures: we want to make sure that all
733  * initially dirty buffers get waited on, but that any subsequently
734  * dirtied buffers don't.  After all, we don't want fsync to last
735  * forever if somebody is actively writing to the file.
736  *
737  * Do this in two main stages: first we copy dirty buffers to a
738  * temporary inode list, queueing the writes as we go.  Then we clean
739  * up, waiting for those writes to complete.
740  *
741  * During this second stage, any subsequent updates to the file may end
742  * up refiling the buffer on the original inode's dirty list again, so
743  * there is a chance we will end up with a buffer queued for write but
744  * not yet completed on that list.  So, as a final cleanup we go through
745  * the osync code to catch these locked, dirty buffers without requeuing
746  * any newly dirty buffers for write.
747  */
748 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
749 {
750 	struct buffer_head *bh;
751 	struct list_head tmp;
752 	struct address_space *mapping;
753 	int err = 0, err2;
754 	struct blk_plug plug;
755 
756 	INIT_LIST_HEAD(&tmp);
757 	blk_start_plug(&plug);
758 
759 	spin_lock(lock);
760 	while (!list_empty(list)) {
761 		bh = BH_ENTRY(list->next);
762 		mapping = bh->b_assoc_map;
763 		__remove_assoc_queue(bh);
764 		/* Avoid race with mark_buffer_dirty_inode() which does
765 		 * a lockless check and we rely on seeing the dirty bit */
766 		smp_mb();
767 		if (buffer_dirty(bh) || buffer_locked(bh)) {
768 			list_add(&bh->b_assoc_buffers, &tmp);
769 			bh->b_assoc_map = mapping;
770 			if (buffer_dirty(bh)) {
771 				get_bh(bh);
772 				spin_unlock(lock);
773 				/*
774 				 * Ensure any pending I/O completes so that
775 				 * write_dirty_buffer() actually writes the
776 				 * current contents - it is a noop if I/O is
777 				 * still in flight on potentially older
778 				 * contents.
779 				 */
780 				write_dirty_buffer(bh, WRITE_SYNC);
781 
782 				/*
783 				 * Kick off IO for the previous mapping. Note
784 				 * that we will not run the very last mapping,
785 				 * wait_on_buffer() will do that for us
786 				 * through sync_buffer().
787 				 */
788 				brelse(bh);
789 				spin_lock(lock);
790 			}
791 		}
792 	}
793 
794 	spin_unlock(lock);
795 	blk_finish_plug(&plug);
796 	spin_lock(lock);
797 
798 	while (!list_empty(&tmp)) {
799 		bh = BH_ENTRY(tmp.prev);
800 		get_bh(bh);
801 		mapping = bh->b_assoc_map;
802 		__remove_assoc_queue(bh);
803 		/* Avoid race with mark_buffer_dirty_inode() which does
804 		 * a lockless check and we rely on seeing the dirty bit */
805 		smp_mb();
806 		if (buffer_dirty(bh)) {
807 			list_add(&bh->b_assoc_buffers,
808 				 &mapping->private_list);
809 			bh->b_assoc_map = mapping;
810 		}
811 		spin_unlock(lock);
812 		wait_on_buffer(bh);
813 		if (!buffer_uptodate(bh))
814 			err = -EIO;
815 		brelse(bh);
816 		spin_lock(lock);
817 	}
818 
819 	spin_unlock(lock);
820 	err2 = osync_buffers_list(lock, list);
821 	if (err)
822 		return err;
823 	else
824 		return err2;
825 }
826 
827 /*
828  * Invalidate any and all dirty buffers on a given inode.  We are
829  * probably unmounting the fs, but that doesn't mean we have already
830  * done a sync().  Just drop the buffers from the inode list.
831  *
832  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
833  * assumes that all the buffers are against the blockdev.  Not true
834  * for reiserfs.
835  */
836 void invalidate_inode_buffers(struct inode *inode)
837 {
838 	if (inode_has_buffers(inode)) {
839 		struct address_space *mapping = &inode->i_data;
840 		struct list_head *list = &mapping->private_list;
841 		struct address_space *buffer_mapping = mapping->assoc_mapping;
842 
843 		spin_lock(&buffer_mapping->private_lock);
844 		while (!list_empty(list))
845 			__remove_assoc_queue(BH_ENTRY(list->next));
846 		spin_unlock(&buffer_mapping->private_lock);
847 	}
848 }
849 EXPORT_SYMBOL(invalidate_inode_buffers);
850 
851 /*
852  * Remove any clean buffers from the inode's buffer list.  This is called
853  * when we're trying to free the inode itself.  Those buffers can pin it.
854  *
855  * Returns true if all buffers were removed.
856  */
857 int remove_inode_buffers(struct inode *inode)
858 {
859 	int ret = 1;
860 
861 	if (inode_has_buffers(inode)) {
862 		struct address_space *mapping = &inode->i_data;
863 		struct list_head *list = &mapping->private_list;
864 		struct address_space *buffer_mapping = mapping->assoc_mapping;
865 
866 		spin_lock(&buffer_mapping->private_lock);
867 		while (!list_empty(list)) {
868 			struct buffer_head *bh = BH_ENTRY(list->next);
869 			if (buffer_dirty(bh)) {
870 				ret = 0;
871 				break;
872 			}
873 			__remove_assoc_queue(bh);
874 		}
875 		spin_unlock(&buffer_mapping->private_lock);
876 	}
877 	return ret;
878 }
879 
880 /*
881  * Create the appropriate buffers when given a page for data area and
882  * the size of each buffer.. Use the bh->b_this_page linked list to
883  * follow the buffers created.  Return NULL if unable to create more
884  * buffers.
885  *
886  * The retry flag is used to differentiate async IO (paging, swapping)
887  * which may not fail from ordinary buffer allocations.
888  */
889 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
890 		int retry)
891 {
892 	struct buffer_head *bh, *head;
893 	long offset;
894 
895 try_again:
896 	head = NULL;
897 	offset = PAGE_SIZE;
898 	while ((offset -= size) >= 0) {
899 		bh = alloc_buffer_head(GFP_NOFS);
900 		if (!bh)
901 			goto no_grow;
902 
903 		bh->b_bdev = NULL;
904 		bh->b_this_page = head;
905 		bh->b_blocknr = -1;
906 		head = bh;
907 
908 		bh->b_state = 0;
909 		atomic_set(&bh->b_count, 0);
910 		bh->b_size = size;
911 
912 		/* Link the buffer to its page */
913 		set_bh_page(bh, page, offset);
914 
915 		init_buffer(bh, NULL, NULL);
916 	}
917 	return head;
918 /*
919  * In case anything failed, we just free everything we got.
920  */
921 no_grow:
922 	if (head) {
923 		do {
924 			bh = head;
925 			head = head->b_this_page;
926 			free_buffer_head(bh);
927 		} while (head);
928 	}
929 
930 	/*
931 	 * Return failure for non-async IO requests.  Async IO requests
932 	 * are not allowed to fail, so we have to wait until buffer heads
933 	 * become available.  But we don't want tasks sleeping with
934 	 * partially complete buffers, so all were released above.
935 	 */
936 	if (!retry)
937 		return NULL;
938 
939 	/* We're _really_ low on memory. Now we just
940 	 * wait for old buffer heads to become free due to
941 	 * finishing IO.  Since this is an async request and
942 	 * the reserve list is empty, we're sure there are
943 	 * async buffer heads in use.
944 	 */
945 	free_more_memory();
946 	goto try_again;
947 }
948 EXPORT_SYMBOL_GPL(alloc_page_buffers);
949 
950 static inline void
951 link_dev_buffers(struct page *page, struct buffer_head *head)
952 {
953 	struct buffer_head *bh, *tail;
954 
955 	bh = head;
956 	do {
957 		tail = bh;
958 		bh = bh->b_this_page;
959 	} while (bh);
960 	tail->b_this_page = head;
961 	attach_page_buffers(page, head);
962 }
963 
964 /*
965  * Initialise the state of a blockdev page's buffers.
966  */
967 static void
968 init_page_buffers(struct page *page, struct block_device *bdev,
969 			sector_t block, int size)
970 {
971 	struct buffer_head *head = page_buffers(page);
972 	struct buffer_head *bh = head;
973 	int uptodate = PageUptodate(page);
974 
975 	do {
976 		if (!buffer_mapped(bh)) {
977 			init_buffer(bh, NULL, NULL);
978 			bh->b_bdev = bdev;
979 			bh->b_blocknr = block;
980 			if (uptodate)
981 				set_buffer_uptodate(bh);
982 			set_buffer_mapped(bh);
983 		}
984 		block++;
985 		bh = bh->b_this_page;
986 	} while (bh != head);
987 }
988 
989 /*
990  * Create the page-cache page that contains the requested block.
991  *
992  * This is used purely for blockdev mappings.
993  */
994 static struct page *
995 grow_dev_page(struct block_device *bdev, sector_t block,
996 		pgoff_t index, int size)
997 {
998 	struct inode *inode = bdev->bd_inode;
999 	struct page *page;
1000 	struct buffer_head *bh;
1001 
1002 	page = find_or_create_page(inode->i_mapping, index,
1003 		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1004 	if (!page)
1005 		return NULL;
1006 
1007 	BUG_ON(!PageLocked(page));
1008 
1009 	if (page_has_buffers(page)) {
1010 		bh = page_buffers(page);
1011 		if (bh->b_size == size) {
1012 			init_page_buffers(page, bdev, block, size);
1013 			return page;
1014 		}
1015 		if (!try_to_free_buffers(page))
1016 			goto failed;
1017 	}
1018 
1019 	/*
1020 	 * Allocate some buffers for this page
1021 	 */
1022 	bh = alloc_page_buffers(page, size, 0);
1023 	if (!bh)
1024 		goto failed;
1025 
1026 	/*
1027 	 * Link the page to the buffers and initialise them.  Take the
1028 	 * lock to be atomic wrt __find_get_block(), which does not
1029 	 * run under the page lock.
1030 	 */
1031 	spin_lock(&inode->i_mapping->private_lock);
1032 	link_dev_buffers(page, bh);
1033 	init_page_buffers(page, bdev, block, size);
1034 	spin_unlock(&inode->i_mapping->private_lock);
1035 	return page;
1036 
1037 failed:
1038 	BUG();
1039 	unlock_page(page);
1040 	page_cache_release(page);
1041 	return NULL;
1042 }
1043 
1044 /*
1045  * Create buffers for the specified block device block's page.  If
1046  * that page was dirty, the buffers are set dirty also.
1047  */
1048 static int
1049 grow_buffers(struct block_device *bdev, sector_t block, int size)
1050 {
1051 	struct page *page;
1052 	pgoff_t index;
1053 	int sizebits;
1054 
1055 	sizebits = -1;
1056 	do {
1057 		sizebits++;
1058 	} while ((size << sizebits) < PAGE_SIZE);
1059 
1060 	index = block >> sizebits;
1061 
1062 	/*
1063 	 * Check for a block which wants to lie outside our maximum possible
1064 	 * pagecache index.  (this comparison is done using sector_t types).
1065 	 */
1066 	if (unlikely(index != block >> sizebits)) {
1067 		char b[BDEVNAME_SIZE];
1068 
1069 		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1070 			"device %s\n",
1071 			__func__, (unsigned long long)block,
1072 			bdevname(bdev, b));
1073 		return -EIO;
1074 	}
1075 	block = index << sizebits;
1076 	/* Create a page with the proper size buffers.. */
1077 	page = grow_dev_page(bdev, block, index, size);
1078 	if (!page)
1079 		return 0;
1080 	unlock_page(page);
1081 	page_cache_release(page);
1082 	return 1;
1083 }
1084 
1085 static struct buffer_head *
1086 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1087 {
1088 	/* Size must be multiple of hard sectorsize */
1089 	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1090 			(size < 512 || size > PAGE_SIZE))) {
1091 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1092 					size);
1093 		printk(KERN_ERR "logical block size: %d\n",
1094 					bdev_logical_block_size(bdev));
1095 
1096 		dump_stack();
1097 		return NULL;
1098 	}
1099 
1100 	for (;;) {
1101 		struct buffer_head * bh;
1102 		int ret;
1103 
1104 		bh = __find_get_block(bdev, block, size);
1105 		if (bh)
1106 			return bh;
1107 
1108 		ret = grow_buffers(bdev, block, size);
1109 		if (ret < 0)
1110 			return NULL;
1111 		if (ret == 0)
1112 			free_more_memory();
1113 	}
1114 }
1115 
1116 /*
1117  * The relationship between dirty buffers and dirty pages:
1118  *
1119  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1120  * the page is tagged dirty in its radix tree.
1121  *
1122  * At all times, the dirtiness of the buffers represents the dirtiness of
1123  * subsections of the page.  If the page has buffers, the page dirty bit is
1124  * merely a hint about the true dirty state.
1125  *
1126  * When a page is set dirty in its entirety, all its buffers are marked dirty
1127  * (if the page has buffers).
1128  *
1129  * When a buffer is marked dirty, its page is dirtied, but the page's other
1130  * buffers are not.
1131  *
1132  * Also.  When blockdev buffers are explicitly read with bread(), they
1133  * individually become uptodate.  But their backing page remains not
1134  * uptodate - even if all of its buffers are uptodate.  A subsequent
1135  * block_read_full_page() against that page will discover all the uptodate
1136  * buffers, will set the page uptodate and will perform no I/O.
1137  */
1138 
1139 /**
1140  * mark_buffer_dirty - mark a buffer_head as needing writeout
1141  * @bh: the buffer_head to mark dirty
1142  *
1143  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1144  * backing page dirty, then tag the page as dirty in its address_space's radix
1145  * tree and then attach the address_space's inode to its superblock's dirty
1146  * inode list.
1147  *
1148  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1149  * mapping->tree_lock and mapping->host->i_lock.
1150  */
1151 void mark_buffer_dirty(struct buffer_head *bh)
1152 {
1153 	WARN_ON_ONCE(!buffer_uptodate(bh));
1154 
1155 	/*
1156 	 * Very *carefully* optimize the it-is-already-dirty case.
1157 	 *
1158 	 * Don't let the final "is it dirty" escape to before we
1159 	 * perhaps modified the buffer.
1160 	 */
1161 	if (buffer_dirty(bh)) {
1162 		smp_mb();
1163 		if (buffer_dirty(bh))
1164 			return;
1165 	}
1166 
1167 	if (!test_set_buffer_dirty(bh)) {
1168 		struct page *page = bh->b_page;
1169 		if (!TestSetPageDirty(page)) {
1170 			struct address_space *mapping = page_mapping(page);
1171 			if (mapping)
1172 				__set_page_dirty(page, mapping, 0);
1173 		}
1174 	}
1175 }
1176 EXPORT_SYMBOL(mark_buffer_dirty);
1177 
1178 /*
1179  * Decrement a buffer_head's reference count.  If all buffers against a page
1180  * have zero reference count, are clean and unlocked, and if the page is clean
1181  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1182  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1183  * a page but it ends up not being freed, and buffers may later be reattached).
1184  */
1185 void __brelse(struct buffer_head * buf)
1186 {
1187 	if (atomic_read(&buf->b_count)) {
1188 		put_bh(buf);
1189 		return;
1190 	}
1191 	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1192 }
1193 EXPORT_SYMBOL(__brelse);
1194 
1195 /*
1196  * bforget() is like brelse(), except it discards any
1197  * potentially dirty data.
1198  */
1199 void __bforget(struct buffer_head *bh)
1200 {
1201 	clear_buffer_dirty(bh);
1202 	if (bh->b_assoc_map) {
1203 		struct address_space *buffer_mapping = bh->b_page->mapping;
1204 
1205 		spin_lock(&buffer_mapping->private_lock);
1206 		list_del_init(&bh->b_assoc_buffers);
1207 		bh->b_assoc_map = NULL;
1208 		spin_unlock(&buffer_mapping->private_lock);
1209 	}
1210 	__brelse(bh);
1211 }
1212 EXPORT_SYMBOL(__bforget);
1213 
1214 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1215 {
1216 	lock_buffer(bh);
1217 	if (buffer_uptodate(bh)) {
1218 		unlock_buffer(bh);
1219 		return bh;
1220 	} else {
1221 		get_bh(bh);
1222 		bh->b_end_io = end_buffer_read_sync;
1223 		submit_bh(READ, bh);
1224 		wait_on_buffer(bh);
1225 		if (buffer_uptodate(bh))
1226 			return bh;
1227 	}
1228 	brelse(bh);
1229 	return NULL;
1230 }
1231 
1232 /*
1233  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1234  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1235  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1236  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1237  * CPU's LRUs at the same time.
1238  *
1239  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1240  * sb_find_get_block().
1241  *
1242  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1243  * a local interrupt disable for that.
1244  */
1245 
1246 #define BH_LRU_SIZE	8
1247 
1248 struct bh_lru {
1249 	struct buffer_head *bhs[BH_LRU_SIZE];
1250 };
1251 
1252 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1253 
1254 #ifdef CONFIG_SMP
1255 #define bh_lru_lock()	local_irq_disable()
1256 #define bh_lru_unlock()	local_irq_enable()
1257 #else
1258 #define bh_lru_lock()	preempt_disable()
1259 #define bh_lru_unlock()	preempt_enable()
1260 #endif
1261 
static inline void check_irqs_on(void)
{
	/* Only checkable where irqs_disabled is provided as a macro. */
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1268 
1269 /*
1270  * The LRU management algorithm is dopey-but-simple.  Sorry.
1271  */
1272 static void bh_lru_install(struct buffer_head *bh)
1273 {
1274 	struct buffer_head *evictee = NULL;
1275 
1276 	check_irqs_on();
1277 	bh_lru_lock();
1278 	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1279 		struct buffer_head *bhs[BH_LRU_SIZE];
1280 		int in;
1281 		int out = 0;
1282 
1283 		get_bh(bh);
1284 		bhs[out++] = bh;
1285 		for (in = 0; in < BH_LRU_SIZE; in++) {
1286 			struct buffer_head *bh2 =
1287 				__this_cpu_read(bh_lrus.bhs[in]);
1288 
1289 			if (bh2 == bh) {
1290 				__brelse(bh2);
1291 			} else {
1292 				if (out >= BH_LRU_SIZE) {
1293 					BUG_ON(evictee != NULL);
1294 					evictee = bh2;
1295 				} else {
1296 					bhs[out++] = bh2;
1297 				}
1298 			}
1299 		}
1300 		while (out < BH_LRU_SIZE)
1301 			bhs[out++] = NULL;
1302 		memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1303 	}
1304 	bh_lru_unlock();
1305 
1306 	if (evictee)
1307 		__brelse(evictee);
1308 }
1309 
1310 /*
1311  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1312  */
1313 static struct buffer_head *
1314 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1315 {
1316 	struct buffer_head *ret = NULL;
1317 	unsigned int i;
1318 
1319 	check_irqs_on();
1320 	bh_lru_lock();
1321 	for (i = 0; i < BH_LRU_SIZE; i++) {
1322 		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1323 
1324 		if (bh && bh->b_bdev == bdev &&
1325 				bh->b_blocknr == block && bh->b_size == size) {
1326 			if (i) {
1327 				while (i) {
1328 					__this_cpu_write(bh_lrus.bhs[i],
1329 						__this_cpu_read(bh_lrus.bhs[i - 1]));
1330 					i--;
1331 				}
1332 				__this_cpu_write(bh_lrus.bhs[0], bh);
1333 			}
1334 			get_bh(bh);
1335 			ret = bh;
1336 			break;
1337 		}
1338 	}
1339 	bh_lru_unlock();
1340 	return ret;
1341 }
1342 
1343 /*
1344  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1345  * it in the LRU and mark it as accessed.  If it is not present then return
1346  * NULL
1347  */
1348 struct buffer_head *
1349 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1350 {
1351 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1352 
1353 	if (bh == NULL) {
1354 		bh = __find_get_block_slow(bdev, block);
1355 		if (bh)
1356 			bh_lru_install(bh);
1357 	}
1358 	if (bh)
1359 		touch_buffer(bh);
1360 	return bh;
1361 }
1362 EXPORT_SYMBOL(__find_get_block);
1363 
1364 /*
1365  * __getblk will locate (and, if necessary, create) the buffer_head
1366  * which corresponds to the passed block_device, block and size. The
1367  * returned buffer has its reference count incremented.
1368  *
1369  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1370  * illegal block number, __getblk() will happily return a buffer_head
1371  * which represents the non-existent block.  Very weird.
1372  *
1373  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1374  * attempt is failing.  FIXME, perhaps?
1375  */
1376 struct buffer_head *
1377 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1378 {
1379 	struct buffer_head *bh = __find_get_block(bdev, block, size);
1380 
1381 	might_sleep();
1382 	if (bh == NULL)
1383 		bh = __getblk_slow(bdev, block, size);
1384 	return bh;
1385 }
1386 EXPORT_SYMBOL(__getblk);
1387 
1388 /*
1389  * Do async read-ahead on a buffer..
1390  */
1391 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1392 {
1393 	struct buffer_head *bh = __getblk(bdev, block, size);
1394 	if (likely(bh)) {
1395 		ll_rw_block(READA, 1, &bh);
1396 		brelse(bh);
1397 	}
1398 }
1399 EXPORT_SYMBOL(__breadahead);
1400 
1401 /**
1402  *  __bread() - reads a specified block and returns the bh
1403  *  @bdev: the block_device to read from
1404  *  @block: number of block
1405  *  @size: size (in bytes) to read
1406  *
1407  *  Reads a specified block, and returns buffer head that contains it.
1408  *  It returns NULL if the block was unreadable.
1409  */
1410 struct buffer_head *
1411 __bread(struct block_device *bdev, sector_t block, unsigned size)
1412 {
1413 	struct buffer_head *bh = __getblk(bdev, block, size);
1414 
1415 	if (likely(bh) && !buffer_uptodate(bh))
1416 		bh = __bread_slow(bh);
1417 	return bh;
1418 }
1419 EXPORT_SYMBOL(__bread);
1420 
1421 /*
1422  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1423  * This doesn't race because it runs in each cpu either in irq
1424  * or with preempt disabled.
1425  */
1426 static void invalidate_bh_lru(void *arg)
1427 {
1428 	struct bh_lru *b = &get_cpu_var(bh_lrus);
1429 	int i;
1430 
1431 	for (i = 0; i < BH_LRU_SIZE; i++) {
1432 		brelse(b->bhs[i]);
1433 		b->bhs[i] = NULL;
1434 	}
1435 	put_cpu_var(bh_lrus);
1436 }
1437 
1438 void invalidate_bh_lrus(void)
1439 {
	/*
	 * Hand invalidate_bh_lru() to on_each_cpu() with no argument.
	 * NOTE(review): the trailing 1 is presumably on_each_cpu()'s "wait"
	 * flag, i.e. do not return until every CPU has run the callback -
	 * confirm against on_each_cpu().
	 */
1440 	on_each_cpu(invalidate_bh_lru, NULL, 1);
1441 }
1442 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1443 
1444 void set_bh_page(struct buffer_head *bh,
1445 		struct page *page, unsigned long offset)
1446 {
1447 	bh->b_page = page;
1448 	BUG_ON(offset >= PAGE_SIZE);
1449 	if (PageHighMem(page))
1450 		/*
1451 		 * This catches illegal uses and preserves the offset:
1452 		 */
1453 		bh->b_data = (char *)(0 + offset);
1454 	else
1455 		bh->b_data = page_address(page) + offset;
1456 }
1457 EXPORT_SYMBOL(set_bh_page);
1458 
1459 /*
1460  * Called when truncating a buffer on a page completely.
1461  */
1462 static void discard_buffer(struct buffer_head * bh)
1463 {
1464 	lock_buffer(bh);
1465 	clear_buffer_dirty(bh);
1466 	bh->b_bdev = NULL;
1467 	clear_buffer_mapped(bh);
1468 	clear_buffer_req(bh);
1469 	clear_buffer_new(bh);
1470 	clear_buffer_delay(bh);
1471 	clear_buffer_unwritten(bh);
1472 	unlock_buffer(bh);
1473 }
1474 
1475 /**
1476  * block_invalidatepage - invalidate part or all of a buffer-backed page
1477  *
1478  * @page: the page which is affected
1479  * @offset: the index of the truncation point
1480  *
1481  * block_invalidatepage() is called when all or part of the page has become
1482  * invalidated by a truncate operation.
1483  *
1484  * block_invalidatepage() does not have to release all buffers, but it must
1485  * ensure that no dirty buffer is left outside @offset and that no I/O
1486  * is underway against any of the blocks which are outside the truncation
1487  * point.  Because the caller is about to free (and possibly reuse) those
1488  * blocks on-disk.
1489  */
1490 void block_invalidatepage(struct page *page, unsigned long offset)
1491 {
1492 	struct buffer_head *head, *bh, *next;
1493 	unsigned int curr_off = 0;
1494 
1495 	BUG_ON(!PageLocked(page));
1496 	if (!page_has_buffers(page))
1497 		goto out;
1498 
1499 	head = page_buffers(page);
1500 	bh = head;
1501 	do {
1502 		unsigned int next_off = curr_off + bh->b_size;
1503 		next = bh->b_this_page;
1504 
1505 		/*
1506 		 * is this block fully invalidated?
1507 		 */
1508 		if (offset <= curr_off)
1509 			discard_buffer(bh);
1510 		curr_off = next_off;
1511 		bh = next;
1512 	} while (bh != head);
1513 
1514 	/*
1515 	 * We release buffers only if the entire page is being invalidated.
1516 	 * The get_block cached value has been unconditionally invalidated,
1517 	 * so real IO is not possible anymore.
1518 	 */
1519 	if (offset == 0)
1520 		try_to_release_page(page, 0);
1521 out:
1522 	return;
1523 }
1524 EXPORT_SYMBOL(block_invalidatepage);
1525 
1526 /*
1527  * We attach and possibly dirty the buffers atomically wrt
1528  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1529  * is already excluded via the page lock.
1530  */
1531 void create_empty_buffers(struct page *page,
1532 			unsigned long blocksize, unsigned long b_state)
1533 {
1534 	struct buffer_head *bh, *head, *tail;
1535 
1536 	head = alloc_page_buffers(page, blocksize, 1);
1537 	bh = head;
1538 	do {
1539 		bh->b_state |= b_state;
1540 		tail = bh;
1541 		bh = bh->b_this_page;
1542 	} while (bh);
1543 	tail->b_this_page = head;
1544 
1545 	spin_lock(&page->mapping->private_lock);
1546 	if (PageUptodate(page) || PageDirty(page)) {
1547 		bh = head;
1548 		do {
1549 			if (PageDirty(page))
1550 				set_buffer_dirty(bh);
1551 			if (PageUptodate(page))
1552 				set_buffer_uptodate(bh);
1553 			bh = bh->b_this_page;
1554 		} while (bh != head);
1555 	}
1556 	attach_page_buffers(page, head);
1557 	spin_unlock(&page->mapping->private_lock);
1558 }
1559 EXPORT_SYMBOL(create_empty_buffers);
1560 
1561 /*
1562  * We are taking a block for data and we don't want any output from any
1563  * buffer-cache aliases starting from return from that function and
1564  * until the moment when something will explicitly mark the buffer
1565  * dirty (hopefully that will not happen until we will free that block ;-)
1566  * We don't even need to mark it not-uptodate - nobody can expect
1567  * anything from a newly allocated buffer anyway. We used to used
1568  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1569  * don't want to mark the alias unmapped, for example - it would confuse
1570  * anyone who might pick it with bread() afterwards...
1571  *
1572  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1573  * be writeout I/O going on against recently-freed buffers.  We don't
1574  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1575  * only if we really need to.  That happens here.
1576  */
1577 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1578 {
1579 	struct buffer_head *old_bh;
1580 
1581 	might_sleep();
1582 
1583 	old_bh = __find_get_block_slow(bdev, block);
1584 	if (old_bh) {
1585 		clear_buffer_dirty(old_bh);
1586 		wait_on_buffer(old_bh);
1587 		clear_buffer_req(old_bh);
1588 		__brelse(old_bh);
1589 	}
1590 }
1591 EXPORT_SYMBOL(unmap_underlying_metadata);
1592 
1593 /*
1594  * NOTE! All mapped/uptodate combinations are valid:
1595  *
1596  *	Mapped	Uptodate	Meaning
1597  *
1598  *	No	No		"unknown" - must do get_block()
1599  *	No	Yes		"hole" - zero-filled
1600  *	Yes	No		"allocated" - allocated on disk, not read in
1601  *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1602  *
1603  * "Dirty" is valid only with the last case (mapped+uptodate).
1604  */
1605 
1606 /*
1607  * While block_write_full_page is writing back the dirty buffers under
1608  * the page lock, whoever dirtied the buffers may decide to clean them
1609  * again at any time.  We handle that by only looking at the buffer
1610  * state inside lock_buffer().
1611  *
1612  * If block_write_full_page() is called for regular writeback
1613  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1614  * locked buffer.   This only can happen if someone has written the buffer
1615  * directly, with submit_bh().  At the address_space level PageWriteback
1616  * prevents this contention from occurring.
1617  *
1618  * If block_write_full_page() is called with wbc->sync_mode ==
1619  * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1620  * causes the writes to be flagged as synchronous writes.
1621  */
1622 static int __block_write_full_page(struct inode *inode, struct page *page,
1623 			get_block_t *get_block, struct writeback_control *wbc,
1624 			bh_end_io_t *handler)
1625 {
	/*
	 * Write out the dirty buffers of a locked page.  The normal path is
	 * three walks over the page's buffer ring:
	 *   1. map dirty buffers that are unmapped or delayed via @get_block,
	 *      and clean the buffers that lie beyond the last block of the
	 *      file;
	 *   2. lock each mapped buffer and flag the dirty ones for async
	 *      write with @handler as completion handler;
	 *   3. put the page under writeback and submit the flagged buffers.
	 * The page is unlocked before returning.  Returns 0, or the error
	 * from @get_block after going through the "recover" path.
	 */
1626 	int err;
1627 	sector_t block;
1628 	sector_t last_block;
1629 	struct buffer_head *bh, *head;
1630 	const unsigned blocksize = 1 << inode->i_blkbits;
1631 	int nr_underway = 0;
1632 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1633 			WRITE_SYNC : WRITE);
1634 
1635 	BUG_ON(!PageLocked(page));
1636 
	/* index of the block that holds the last byte of the file */
1637 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1638 
	/* A page without buffers gets a full set, created dirty + uptodate. */
1639 	if (!page_has_buffers(page)) {
1640 		create_empty_buffers(page, blocksize,
1641 					(1 << BH_Dirty)|(1 << BH_Uptodate));
1642 	}
1643 
1644 	/*
1645 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1646 	 * here, and the (potentially unmapped) buffers may become dirty at
1647 	 * any time.  If a buffer becomes dirty here after we've inspected it
1648 	 * then we just miss that fact, and the page stays dirty.
1649 	 *
1650 	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1651 	 * handle that here by just cleaning them.
1652 	 */
1653 
	/* file block number of the page's first buffer */
1654 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1655 	head = page_buffers(page);
1656 	bh = head;
1657 
1658 	/*
1659 	 * Get all the dirty buffers mapped to disk addresses and
1660 	 * handle any aliases from the underlying blockdev's mapping.
1661 	 */
1662 	do {
1663 		if (block > last_block) {
1664 			/*
1665 			 * mapped buffers outside i_size will occur, because
1666 			 * this page can be outside i_size when there is a
1667 			 * truncate in progress.
1668 			 */
1669 			/*
1670 			 * The buffer was zeroed by block_write_full_page()
1671 			 */
1672 			clear_buffer_dirty(bh);
1673 			set_buffer_uptodate(bh);
1674 		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1675 			   buffer_dirty(bh)) {
1676 			WARN_ON(bh->b_size != blocksize);
1677 			err = get_block(inode, block, bh, 1);
1678 			if (err)
1679 				goto recover;
1680 			clear_buffer_delay(bh);
1681 			if (buffer_new(bh)) {
1682 				/* blockdev mappings never come here */
1683 				clear_buffer_new(bh);
1684 				unmap_underlying_metadata(bh->b_bdev,
1685 							bh->b_blocknr);
1686 			}
1687 		}
1688 		bh = bh->b_this_page;
1689 		block++;
1690 	} while (bh != head);
1691 
	/*
	 * The walk above went all the way round, so bh == head again here.
	 * Second walk: lock the mapped buffers and pick the dirty ones.
	 */
1692 	do {
1693 		if (!buffer_mapped(bh))
1694 			continue;
1695 		/*
1696 		 * If it's a fully non-blocking write attempt and we cannot
1697 		 * lock the buffer then redirty the page.  Note that this can
1698 		 * potentially cause a busy-wait loop from writeback threads
1699 		 * and kswapd activity, but those code paths have their own
1700 		 * higher-level throttling.
1701 		 */
1702 		if (wbc->sync_mode != WB_SYNC_NONE) {
1703 			lock_buffer(bh);
1704 		} else if (!trylock_buffer(bh)) {
1705 			redirty_page_for_writepage(wbc, page);
1706 			continue;
1707 		}
		/*
		 * Dirty buffers stay locked and are flagged for async write;
		 * clean ones are simply unlocked again.
		 */
1708 		if (test_clear_buffer_dirty(bh)) {
1709 			mark_buffer_async_write_endio(bh, handler);
1710 		} else {
1711 			unlock_buffer(bh);
1712 		}
1713 	} while ((bh = bh->b_this_page) != head);
1714 
1715 	/*
1716 	 * The page and its buffers are protected by PageWriteback(), so we can
1717 	 * drop the bh refcounts early.
1718 	 */
1719 	BUG_ON(PageWriteback(page));
1720 	set_page_writeback(page);
1721 
	/*
	 * Third walk: submit every buffer flagged above.  The next pointer
	 * is read before submit_bh() is called on the buffer.
	 */
1722 	do {
1723 		struct buffer_head *next = bh->b_this_page;
1724 		if (buffer_async_write(bh)) {
1725 			submit_bh(write_op, bh);
1726 			nr_underway++;
1727 		}
1728 		bh = next;
1729 	} while (bh != head);
1730 	unlock_page(page);
1731 
1732 	err = 0;
1733 done:
	/* Shared exit: also reached from the recover path with err set. */
1734 	if (nr_underway == 0) {
1735 		/*
1736 		 * The page was marked dirty, but the buffers were
1737 		 * clean.  Someone wrote them back by hand with
1738 		 * ll_rw_block/submit_bh.  A rare case.
1739 		 */
1740 		end_page_writeback(page);
1741 
1742 		/*
1743 		 * The page and buffer_heads can be released at any time from
1744 		 * here on.
1745 		 */
1746 	}
1747 	return err;
1748 
1749 recover:
1750 	/*
1751 	 * ENOSPC, or some other error.  We may already have added some
1752 	 * blocks to the file, so we need to write these out to avoid
1753 	 * exposing stale data.
1754 	 * The page is currently locked and not marked for writeback
1755 	 */
1756 	bh = head;
1757 	/* Recovery: lock and submit the mapped buffers */
1758 	do {
1759 		if (buffer_mapped(bh) && buffer_dirty(bh) &&
1760 		    !buffer_delay(bh)) {
1761 			lock_buffer(bh);
1762 			mark_buffer_async_write_endio(bh, handler);
1763 		} else {
1764 			/*
1765 			 * The buffer may have been set dirty during
1766 			 * attachment to a dirty page.
1767 			 */
1768 			clear_buffer_dirty(bh);
1769 		}
1770 	} while ((bh = bh->b_this_page) != head);
	/* Record the failure on both the page and its mapping. */
1771 	SetPageError(page);
1772 	BUG_ON(PageWriteback(page));
1773 	mapping_set_error(page->mapping, err);
1774 	set_page_writeback(page);
	/*
	 * Submit what was flagged above; here the dirty bit is cleared at
	 * submission time rather than when the buffer was locked.
	 */
1775 	do {
1776 		struct buffer_head *next = bh->b_this_page;
1777 		if (buffer_async_write(bh)) {
1778 			clear_buffer_dirty(bh);
1779 			submit_bh(write_op, bh);
1780 			nr_underway++;
1781 		}
1782 		bh = next;
1783 	} while (bh != head);
1784 	unlock_page(page);
1785 	goto done;
1786 }
1787 
1788 /*
1789  * If a page has any new buffers, zero them out here, and mark them uptodate
1790  * and dirty so they'll be written out (in order to prevent uninitialised
1791  * block data from leaking). And clear the new bit.
1792  */
1793 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1794 {
1795 	unsigned int block_start, block_end;
1796 	struct buffer_head *head, *bh;
1797 
1798 	BUG_ON(!PageLocked(page));
1799 	if (!page_has_buffers(page))
1800 		return;
1801 
1802 	bh = head = page_buffers(page);
1803 	block_start = 0;
1804 	do {
1805 		block_end = block_start + bh->b_size;
1806 
1807 		if (buffer_new(bh)) {
1808 			if (block_end > from && block_start < to) {
1809 				if (!PageUptodate(page)) {
1810 					unsigned start, size;
1811 
1812 					start = max(from, block_start);
1813 					size = min(to, block_end) - start;
1814 
1815 					zero_user(page, start, size);
1816 					set_buffer_uptodate(bh);
1817 				}
1818 
1819 				clear_buffer_new(bh);
1820 				mark_buffer_dirty(bh);
1821 			}
1822 		}
1823 
1824 		block_start = block_end;
1825 		bh = bh->b_this_page;
1826 	} while (bh != head);
1827 }
1828 EXPORT_SYMBOL(page_zero_new_buffers);
1829 
1830 int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1831 		get_block_t *get_block)
1832 {
	/*
	 * Prepare the buffers of a locked page for a write of @len bytes at
	 * file position @pos.  Buffers that overlap the write are mapped via
	 * @get_block when needed, and partially covered buffers that are not
	 * uptodate are read in.
	 *
	 * Returns 0 on success, the error from @get_block, or -EIO when a
	 * read issued here did not leave its buffer uptodate.  On any error
	 * page_zero_new_buffers() is run over the write range.
	 */
	/* from/to: start and end of the write as byte offsets in the page */
1833 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1834 	unsigned to = from + len;
1835 	struct inode *inode = page->mapping->host;
1836 	unsigned block_start, block_end;
1837 	sector_t block;
1838 	int err = 0;
1839 	unsigned blocksize, bbits;
	/*
	 * wait[] collects the buffers that reads were issued for.
	 * NOTE(review): two slots presumably suffice because only the first
	 * and the last buffer of the range can be partially covered - confirm.
	 */
1840 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1841 
1842 	BUG_ON(!PageLocked(page));
1843 	BUG_ON(from > PAGE_CACHE_SIZE);
1844 	BUG_ON(to > PAGE_CACHE_SIZE);
1845 	BUG_ON(from > to);
1846 
1847 	blocksize = 1 << inode->i_blkbits;
1848 	if (!page_has_buffers(page))
1849 		create_empty_buffers(page, blocksize, 0);
1850 	head = page_buffers(page);
1851 
1852 	bbits = inode->i_blkbits;
	/* file block number of the page's first buffer */
1853 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1854 
	/*
	 * One pass round the buffer ring.  "!block_start" lets the first
	 * iteration run even though bh == head at that point.
	 */
1855 	for(bh = head, block_start = 0; bh != head || !block_start;
1856 	    block++, block_start=block_end, bh = bh->b_this_page) {
1857 		block_end = block_start + blocksize;
		/*
		 * Buffer entirely outside the write: only let it inherit the
		 * page's uptodate state.
		 */
1858 		if (block_end <= from || block_start >= to) {
1859 			if (PageUptodate(page)) {
1860 				if (!buffer_uptodate(bh))
1861 					set_buffer_uptodate(bh);
1862 			}
1863 			continue;
1864 		}
		/*
		 * Drop a stale new bit so that the buffer_new() test below
		 * only sees what this get_block() call reports.
		 */
1865 		if (buffer_new(bh))
1866 			clear_buffer_new(bh);
1867 		if (!buffer_mapped(bh)) {
1868 			WARN_ON(bh->b_size != blocksize);
1869 			err = get_block(inode, block, bh, 1);
1870 			if (err)
1871 				break;
1872 			if (buffer_new(bh)) {
1873 				unmap_underlying_metadata(bh->b_bdev,
1874 							bh->b_blocknr);
1875 				if (PageUptodate(page)) {
1876 					clear_buffer_new(bh);
1877 					set_buffer_uptodate(bh);
1878 					mark_buffer_dirty(bh);
1879 					continue;
1880 				}
				/*
				 * Zero the parts of the new block that the
				 * write does not cover: [to, block_end) and
				 * [block_start, from).
				 */
1881 				if (block_end > to || block_start < from)
1882 					zero_user_segments(page,
1883 						to, block_end,
1884 						block_start, from);
1885 				continue;
1886 			}
1887 		}
1888 		if (PageUptodate(page)) {
1889 			if (!buffer_uptodate(bh))
1890 				set_buffer_uptodate(bh);
1891 			continue;
1892 		}
		/*
		 * A partially covered buffer that is not uptodate (and is
		 * neither delayed nor unwritten) has to be read first.
		 */
1893 		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1894 		    !buffer_unwritten(bh) &&
1895 		     (block_start < from || block_end > to)) {
1896 			ll_rw_block(READ, 1, &bh);
1897 			*wait_bh++=bh;
1898 		}
1899 	}
1900 	/*
1901 	 * If we issued read requests - let them complete.
1902 	 */
1903 	while(wait_bh > wait) {
1904 		wait_on_buffer(*--wait_bh);
1905 		if (!buffer_uptodate(*wait_bh))
1906 			err = -EIO;
1907 	}
1908 	if (unlikely(err))
1909 		page_zero_new_buffers(page, from, to);
1910 	return err;
1911 }
1912 EXPORT_SYMBOL(__block_write_begin);
1913 
1914 static int __block_commit_write(struct inode *inode, struct page *page,
1915 		unsigned from, unsigned to)
1916 {
1917 	unsigned block_start, block_end;
1918 	int partial = 0;
1919 	unsigned blocksize;
1920 	struct buffer_head *bh, *head;
1921 
1922 	blocksize = 1 << inode->i_blkbits;
1923 
1924 	for(bh = head = page_buffers(page), block_start = 0;
1925 	    bh != head || !block_start;
1926 	    block_start=block_end, bh = bh->b_this_page) {
1927 		block_end = block_start + blocksize;
1928 		if (block_end <= from || block_start >= to) {
1929 			if (!buffer_uptodate(bh))
1930 				partial = 1;
1931 		} else {
1932 			set_buffer_uptodate(bh);
1933 			mark_buffer_dirty(bh);
1934 		}
1935 		clear_buffer_new(bh);
1936 	}
1937 
1938 	/*
1939 	 * If this is a partial write which happened to make all buffers
1940 	 * uptodate then we can optimize away a bogus readpage() for
1941 	 * the next read(). Here we 'discover' whether the page went
1942 	 * uptodate as a result of this (potentially partial) write.
1943 	 */
1944 	if (!partial)
1945 		SetPageUptodate(page);
1946 	return 0;
1947 }
1948 
1949 /*
1950  * block_write_begin takes care of the basic task of block allocation and
1951  * bringing partial write blocks uptodate first.
1952  *
1953  * The filesystem needs to handle block truncation upon failure.
1954  */
1955 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1956 		unsigned flags, struct page **pagep, get_block_t *get_block)
1957 {
1958 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1959 	struct page *page;
1960 	int status;
1961 
1962 	page = grab_cache_page_write_begin(mapping, index, flags);
1963 	if (!page)
1964 		return -ENOMEM;
1965 
1966 	status = __block_write_begin(page, pos, len, get_block);
1967 	if (unlikely(status)) {
1968 		unlock_page(page);
1969 		page_cache_release(page);
1970 		page = NULL;
1971 	}
1972 
1973 	*pagep = page;
1974 	return status;
1975 }
1976 EXPORT_SYMBOL(block_write_begin);
1977 
1978 int block_write_end(struct file *file, struct address_space *mapping,
1979 			loff_t pos, unsigned len, unsigned copied,
1980 			struct page *page, void *fsdata)
1981 {
1982 	struct inode *inode = mapping->host;
1983 	unsigned start;
1984 
1985 	start = pos & (PAGE_CACHE_SIZE - 1);
1986 
1987 	if (unlikely(copied < len)) {
1988 		/*
1989 		 * The buffers that were written will now be uptodate, so we
1990 		 * don't have to worry about a readpage reading them and
1991 		 * overwriting a partial write. However if we have encountered
1992 		 * a short write and only partially written into a buffer, it
1993 		 * will not be marked uptodate, so a readpage might come in and
1994 		 * destroy our partial write.
1995 		 *
1996 		 * Do the simplest thing, and just treat any short write to a
1997 		 * non uptodate page as a zero-length write, and force the
1998 		 * caller to redo the whole thing.
1999 		 */
2000 		if (!PageUptodate(page))
2001 			copied = 0;
2002 
2003 		page_zero_new_buffers(page, start+copied, start+len);
2004 	}
2005 	flush_dcache_page(page);
2006 
2007 	/* This could be a short (even 0-length) commit */
2008 	__block_commit_write(inode, page, start, start+copied);
2009 
2010 	return copied;
2011 }
2012 EXPORT_SYMBOL(block_write_end);
2013 
2014 int generic_write_end(struct file *file, struct address_space *mapping,
2015 			loff_t pos, unsigned len, unsigned copied,
2016 			struct page *page, void *fsdata)
2017 {
2018 	struct inode *inode = mapping->host;
2019 	int i_size_changed = 0;
2020 
2021 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2022 
2023 	/*
2024 	 * No need to use i_size_read() here, the i_size
2025 	 * cannot change under us because we hold i_mutex.
2026 	 *
2027 	 * But it's important to update i_size while still holding page lock:
2028 	 * page writeout could otherwise come in and zero beyond i_size.
2029 	 */
2030 	if (pos+copied > inode->i_size) {
2031 		i_size_write(inode, pos+copied);
2032 		i_size_changed = 1;
2033 	}
2034 
2035 	unlock_page(page);
2036 	page_cache_release(page);
2037 
2038 	/*
2039 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
2040 	 * makes the holding time of page lock longer. Second, it forces lock
2041 	 * ordering of page lock and transaction start for journaling
2042 	 * filesystems.
2043 	 */
2044 	if (i_size_changed)
2045 		mark_inode_dirty(inode);
2046 
2047 	return copied;
2048 }
2049 EXPORT_SYMBOL(generic_write_end);
2050 
2051 /*
2052  * block_is_partially_uptodate checks whether buffers within a page are
2053  * uptodate or not.
2054  *
2055  * Returns true if all buffers which correspond to a file portion
2056  * we want to read are uptodate.
2057  */
2058 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2059 					unsigned long from)
2060 {
2061 	struct inode *inode = page->mapping->host;
2062 	unsigned block_start, block_end, blocksize;
2063 	unsigned to;
2064 	struct buffer_head *bh, *head;
2065 	int ret = 1;
2066 
2067 	if (!page_has_buffers(page))
2068 		return 0;
2069 
2070 	blocksize = 1 << inode->i_blkbits;
2071 	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2072 	to = from + to;
2073 	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2074 		return 0;
2075 
2076 	head = page_buffers(page);
2077 	bh = head;
2078 	block_start = 0;
2079 	do {
2080 		block_end = block_start + blocksize;
2081 		if (block_end > from && block_start < to) {
2082 			if (!buffer_uptodate(bh)) {
2083 				ret = 0;
2084 				break;
2085 			}
2086 			if (block_end >= to)
2087 				break;
2088 		}
2089 		block_start = block_end;
2090 		bh = bh->b_this_page;
2091 	} while (bh != head);
2092 
2093 	return ret;
2094 }
2095 EXPORT_SYMBOL(block_is_partially_uptodate);
2096 
2097 /*
2098  * Generic "read page" function for block devices that have the normal
2099  * get_block functionality. This is most of the block device filesystems.
2100  * Reads the page asynchronously --- the unlock_buffer() and
2101  * set/clear_buffer_uptodate() functions propagate buffer state into the
2102  * page struct once IO has completed.
2103  */
2104 int block_read_full_page(struct page *page, get_block_t *get_block)
2105 {
2106 	struct inode *inode = page->mapping->host;
2107 	sector_t iblock, lblock;
2108 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2109 	unsigned int blocksize;
2110 	int nr, i;
2111 	int fully_mapped = 1;
2112 
2113 	BUG_ON(!PageLocked(page));
2114 	blocksize = 1 << inode->i_blkbits;
2115 	if (!page_has_buffers(page))
2116 		create_empty_buffers(page, blocksize, 0);
2117 	head = page_buffers(page);
2118 
2119 	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2120 	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2121 	bh = head;
2122 	nr = 0;
2123 	i = 0;
2124 
2125 	do {
2126 		if (buffer_uptodate(bh))
2127 			continue;
2128 
2129 		if (!buffer_mapped(bh)) {
2130 			int err = 0;
2131 
2132 			fully_mapped = 0;
2133 			if (iblock < lblock) {
2134 				WARN_ON(bh->b_size != blocksize);
2135 				err = get_block(inode, iblock, bh, 0);
2136 				if (err)
2137 					SetPageError(page);
2138 			}
2139 			if (!buffer_mapped(bh)) {
2140 				zero_user(page, i * blocksize, blocksize);
2141 				if (!err)
2142 					set_buffer_uptodate(bh);
2143 				continue;
2144 			}
2145 			/*
2146 			 * get_block() might have updated the buffer
2147 			 * synchronously
2148 			 */
2149 			if (buffer_uptodate(bh))
2150 				continue;
2151 		}
2152 		arr[nr++] = bh;
2153 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2154 
2155 	if (fully_mapped)
2156 		SetPageMappedToDisk(page);
2157 
2158 	if (!nr) {
2159 		/*
2160 		 * All buffers are uptodate - we can set the page uptodate
2161 		 * as well. But not if get_block() returned an error.
2162 		 */
2163 		if (!PageError(page))
2164 			SetPageUptodate(page);
2165 		unlock_page(page);
2166 		return 0;
2167 	}
2168 
2169 	/* Stage two: lock the buffers */
2170 	for (i = 0; i < nr; i++) {
2171 		bh = arr[i];
2172 		lock_buffer(bh);
2173 		mark_buffer_async_read(bh);
2174 	}
2175 
2176 	/*
2177 	 * Stage 3: start the IO.  Check for uptodateness
2178 	 * inside the buffer lock in case another process reading
2179 	 * the underlying blockdev brought it uptodate (the sct fix).
2180 	 */
2181 	for (i = 0; i < nr; i++) {
2182 		bh = arr[i];
2183 		if (buffer_uptodate(bh))
2184 			end_buffer_async_read(bh, 1);
2185 		else
2186 			submit_bh(READ, bh);
2187 	}
2188 	return 0;
2189 }
2190 EXPORT_SYMBOL(block_read_full_page);
2191 
2192 /* utility function for filesystems that need to do work on expanding
2193  * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2194  * deal with the hole.
2195  */
2196 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2197 {
2198 	struct address_space *mapping = inode->i_mapping;
2199 	struct page *page;
2200 	void *fsdata;
2201 	int err;
2202 
2203 	err = inode_newsize_ok(inode, size);
2204 	if (err)
2205 		goto out;
2206 
2207 	err = pagecache_write_begin(NULL, mapping, size, 0,
2208 				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2209 				&page, &fsdata);
2210 	if (err)
2211 		goto out;
2212 
2213 	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2214 	BUG_ON(err > 0);
2215 
2216 out:
2217 	return err;
2218 }
2219 EXPORT_SYMBOL(generic_cont_expand_simple);
2220 
2221 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2222 			    loff_t pos, loff_t *bytes)
2223 {
2224 	struct inode *inode = mapping->host;
2225 	unsigned blocksize = 1 << inode->i_blkbits;
2226 	struct page *page;
2227 	void *fsdata;
2228 	pgoff_t index, curidx;
2229 	loff_t curpos;
2230 	unsigned zerofrom, offset, len;
2231 	int err = 0;
2232 
2233 	index = pos >> PAGE_CACHE_SHIFT;
2234 	offset = pos & ~PAGE_CACHE_MASK;
2235 
2236 	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2237 		zerofrom = curpos & ~PAGE_CACHE_MASK;
2238 		if (zerofrom & (blocksize-1)) {
2239 			*bytes |= (blocksize-1);
2240 			(*bytes)++;
2241 		}
2242 		len = PAGE_CACHE_SIZE - zerofrom;
2243 
2244 		err = pagecache_write_begin(file, mapping, curpos, len,
2245 						AOP_FLAG_UNINTERRUPTIBLE,
2246 						&page, &fsdata);
2247 		if (err)
2248 			goto out;
2249 		zero_user(page, zerofrom, len);
2250 		err = pagecache_write_end(file, mapping, curpos, len, len,
2251 						page, fsdata);
2252 		if (err < 0)
2253 			goto out;
2254 		BUG_ON(err != len);
2255 		err = 0;
2256 
2257 		balance_dirty_pages_ratelimited(mapping);
2258 	}
2259 
2260 	/* page covers the boundary, find the boundary offset */
2261 	if (index == curidx) {
2262 		zerofrom = curpos & ~PAGE_CACHE_MASK;
2263 		/* if we will expand the thing last block will be filled */
2264 		if (offset <= zerofrom) {
2265 			goto out;
2266 		}
2267 		if (zerofrom & (blocksize-1)) {
2268 			*bytes |= (blocksize-1);
2269 			(*bytes)++;
2270 		}
2271 		len = offset - zerofrom;
2272 
2273 		err = pagecache_write_begin(file, mapping, curpos, len,
2274 						AOP_FLAG_UNINTERRUPTIBLE,
2275 						&page, &fsdata);
2276 		if (err)
2277 			goto out;
2278 		zero_user(page, zerofrom, len);
2279 		err = pagecache_write_end(file, mapping, curpos, len, len,
2280 						page, fsdata);
2281 		if (err < 0)
2282 			goto out;
2283 		BUG_ON(err != len);
2284 		err = 0;
2285 	}
2286 out:
2287 	return err;
2288 }
2289 
2290 /*
2291  * For moronic filesystems that do not allow holes in file.
2292  * We may have to extend the file.
2293  */
2294 int cont_write_begin(struct file *file, struct address_space *mapping,
2295 			loff_t pos, unsigned len, unsigned flags,
2296 			struct page **pagep, void **fsdata,
2297 			get_block_t *get_block, loff_t *bytes)
2298 {
2299 	struct inode *inode = mapping->host;
2300 	unsigned blocksize = 1 << inode->i_blkbits;
2301 	unsigned zerofrom;
2302 	int err;
2303 
2304 	err = cont_expand_zero(file, mapping, pos, bytes);
2305 	if (err)
2306 		return err;
2307 
2308 	zerofrom = *bytes & ~PAGE_CACHE_MASK;
2309 	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2310 		*bytes |= (blocksize-1);
2311 		(*bytes)++;
2312 	}
2313 
2314 	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2315 }
2316 EXPORT_SYMBOL(cont_write_begin);
2317 
2318 int block_commit_write(struct page *page, unsigned from, unsigned to)
2319 {
2320 	struct inode *inode = page->mapping->host;
2321 	__block_commit_write(inode,page,from,to);
2322 	return 0;
2323 }
2324 EXPORT_SYMBOL(block_commit_write);
2325 
2326 /*
2327  * block_page_mkwrite() is not allowed to change the file size as it gets
2328  * called from a page fault handler when a page is first dirtied. Hence we must
2329  * be careful to check for EOF conditions here. We set the page up correctly
2330  * for a written page which means we get ENOSPC checking when writing into
2331  * holes and correct delalloc and unwritten extent mapping on filesystems that
2332  * support these features.
2333  *
2334  * We are not allowed to take the i_mutex here so we have to play games to
2335  * protect against truncate races as the page could now be beyond EOF.  Because
2336  * truncate writes the inode size before removing pages, once we have the
2337  * page lock we can determine safely if the page is beyond EOF. If it is not
2338  * beyond EOF, then the page is guaranteed safe against truncation until we
2339  * unlock the page.
2340  *
2341  * Direct callers of this function should call vfs_check_frozen() so that page
2342  * fault does not busyloop until the fs is thawed.
2343  */
2344 int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2345 			 get_block_t get_block)
2346 {
2347 	struct page *page = vmf->page;
2348 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2349 	unsigned long end;
2350 	loff_t size;
2351 	int ret;
2352 
2353 	lock_page(page);
2354 	size = i_size_read(inode);
2355 	if ((page->mapping != inode->i_mapping) ||
2356 	    (page_offset(page) > size)) {
2357 		/* We overload EFAULT to mean page got truncated */
2358 		ret = -EFAULT;
2359 		goto out_unlock;
2360 	}
2361 
2362 	/* page is wholly or partially inside EOF */
2363 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2364 		end = size & ~PAGE_CACHE_MASK;
2365 	else
2366 		end = PAGE_CACHE_SIZE;
2367 
2368 	ret = __block_write_begin(page, 0, end, get_block);
2369 	if (!ret)
2370 		ret = block_commit_write(page, 0, end);
2371 
2372 	if (unlikely(ret < 0))
2373 		goto out_unlock;
2374 	/*
2375 	 * Freezing in progress? We check after the page is marked dirty and
2376 	 * with page lock held so if the test here fails, we are sure freezing
2377 	 * code will wait during syncing until the page fault is done - at that
2378 	 * point page will be dirty and unlocked so freezing code will write it
2379 	 * and writeprotect it again.
2380 	 */
2381 	set_page_dirty(page);
2382 	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
2383 		ret = -EAGAIN;
2384 		goto out_unlock;
2385 	}
2386 	wait_on_page_writeback(page);
2387 	return 0;
2388 out_unlock:
2389 	unlock_page(page);
2390 	return ret;
2391 }
2392 EXPORT_SYMBOL(__block_page_mkwrite);
2393 
2394 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2395 		   get_block_t get_block)
2396 {
2397 	int ret;
2398 	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
2399 
2400 	/*
2401 	 * This check is racy but catches the common case. The check in
2402 	 * __block_page_mkwrite() is reliable.
2403 	 */
2404 	vfs_check_frozen(sb, SB_FREEZE_WRITE);
2405 	ret = __block_page_mkwrite(vma, vmf, get_block);
2406 	return block_page_mkwrite_return(ret);
2407 }
2408 EXPORT_SYMBOL(block_page_mkwrite);
2409 
/*
 * End-of-IO handler for the prereads issued by nobh_write_begin().
 *
 * Those buffer_heads are freed immediately, while still under the page
 * lock, so completion must not touch the bh after unlocking it.  The work
 * is delegated to __end_buffer_read_notouch().
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}
2419 
2420 /*
2421  * Attach the singly-linked list of buffers created by nobh_write_begin, to
2422  * the page (converting it to circular linked list and taking care of page
2423  * dirty races).
2424  */
2425 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2426 {
2427 	struct buffer_head *bh;
2428 
2429 	BUG_ON(!PageLocked(page));
2430 
2431 	spin_lock(&page->mapping->private_lock);
2432 	bh = head;
2433 	do {
2434 		if (PageDirty(page))
2435 			set_buffer_dirty(bh);
2436 		if (!bh->b_this_page)
2437 			bh->b_this_page = head;
2438 		bh = bh->b_this_page;
2439 	} while (bh != head);
2440 	attach_page_buffers(page, head);
2441 	spin_unlock(&page->mapping->private_lock);
2442 }
2443 
2444 /*
2445  * On entry, the page is fully not uptodate.
2446  * On exit the page is fully uptodate in the areas outside (from,to)
2447  * The filesystem needs to handle block truncation upon failure.
2448  */
2449 int nobh_write_begin(struct address_space *mapping,
2450 			loff_t pos, unsigned len, unsigned flags,
2451 			struct page **pagep, void **fsdata,
2452 			get_block_t *get_block)
2453 {
2454 	struct inode *inode = mapping->host;
2455 	const unsigned blkbits = inode->i_blkbits;
2456 	const unsigned blocksize = 1 << blkbits;
2457 	struct buffer_head *head, *bh;
2458 	struct page *page;
2459 	pgoff_t index;
2460 	unsigned from, to;
2461 	unsigned block_in_page;
2462 	unsigned block_start, block_end;
2463 	sector_t block_in_file;
2464 	int nr_reads = 0;
2465 	int ret = 0;
2466 	int is_mapped_to_disk = 1;
2467 
2468 	index = pos >> PAGE_CACHE_SHIFT;
2469 	from = pos & (PAGE_CACHE_SIZE - 1);
2470 	to = from + len;
2471 
2472 	page = grab_cache_page_write_begin(mapping, index, flags);
2473 	if (!page)
2474 		return -ENOMEM;
2475 	*pagep = page;
2476 	*fsdata = NULL;
2477 
2478 	if (page_has_buffers(page)) {
2479 		ret = __block_write_begin(page, pos, len, get_block);
2480 		if (unlikely(ret))
2481 			goto out_release;
2482 		return ret;
2483 	}
2484 
2485 	if (PageMappedToDisk(page))
2486 		return 0;
2487 
2488 	/*
2489 	 * Allocate buffers so that we can keep track of state, and potentially
2490 	 * attach them to the page if an error occurs. In the common case of
2491 	 * no error, they will just be freed again without ever being attached
2492 	 * to the page (which is all OK, because we're under the page lock).
2493 	 *
2494 	 * Be careful: the buffer linked list is a NULL terminated one, rather
2495 	 * than the circular one we're used to.
2496 	 */
2497 	head = alloc_page_buffers(page, blocksize, 0);
2498 	if (!head) {
2499 		ret = -ENOMEM;
2500 		goto out_release;
2501 	}
2502 
2503 	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2504 
2505 	/*
2506 	 * We loop across all blocks in the page, whether or not they are
2507 	 * part of the affected region.  This is so we can discover if the
2508 	 * page is fully mapped-to-disk.
2509 	 */
2510 	for (block_start = 0, block_in_page = 0, bh = head;
2511 		  block_start < PAGE_CACHE_SIZE;
2512 		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2513 		int create;
2514 
2515 		block_end = block_start + blocksize;
2516 		bh->b_state = 0;
2517 		create = 1;
2518 		if (block_start >= to)
2519 			create = 0;
2520 		ret = get_block(inode, block_in_file + block_in_page,
2521 					bh, create);
2522 		if (ret)
2523 			goto failed;
2524 		if (!buffer_mapped(bh))
2525 			is_mapped_to_disk = 0;
2526 		if (buffer_new(bh))
2527 			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2528 		if (PageUptodate(page)) {
2529 			set_buffer_uptodate(bh);
2530 			continue;
2531 		}
2532 		if (buffer_new(bh) || !buffer_mapped(bh)) {
2533 			zero_user_segments(page, block_start, from,
2534 							to, block_end);
2535 			continue;
2536 		}
2537 		if (buffer_uptodate(bh))
2538 			continue;	/* reiserfs does this */
2539 		if (block_start < from || block_end > to) {
2540 			lock_buffer(bh);
2541 			bh->b_end_io = end_buffer_read_nobh;
2542 			submit_bh(READ, bh);
2543 			nr_reads++;
2544 		}
2545 	}
2546 
2547 	if (nr_reads) {
2548 		/*
2549 		 * The page is locked, so these buffers are protected from
2550 		 * any VM or truncate activity.  Hence we don't need to care
2551 		 * for the buffer_head refcounts.
2552 		 */
2553 		for (bh = head; bh; bh = bh->b_this_page) {
2554 			wait_on_buffer(bh);
2555 			if (!buffer_uptodate(bh))
2556 				ret = -EIO;
2557 		}
2558 		if (ret)
2559 			goto failed;
2560 	}
2561 
2562 	if (is_mapped_to_disk)
2563 		SetPageMappedToDisk(page);
2564 
2565 	*fsdata = head; /* to be released by nobh_write_end */
2566 
2567 	return 0;
2568 
2569 failed:
2570 	BUG_ON(!ret);
2571 	/*
2572 	 * Error recovery is a bit difficult. We need to zero out blocks that
2573 	 * were newly allocated, and dirty them to ensure they get written out.
2574 	 * Buffers need to be attached to the page at this point, otherwise
2575 	 * the handling of potential IO errors during writeout would be hard
2576 	 * (could try doing synchronous writeout, but what if that fails too?)
2577 	 */
2578 	attach_nobh_buffers(page, head);
2579 	page_zero_new_buffers(page, from, to);
2580 
2581 out_release:
2582 	unlock_page(page);
2583 	page_cache_release(page);
2584 	*pagep = NULL;
2585 
2586 	return ret;
2587 }
2588 EXPORT_SYMBOL(nobh_write_begin);
2589 
2590 int nobh_write_end(struct file *file, struct address_space *mapping,
2591 			loff_t pos, unsigned len, unsigned copied,
2592 			struct page *page, void *fsdata)
2593 {
2594 	struct inode *inode = page->mapping->host;
2595 	struct buffer_head *head = fsdata;
2596 	struct buffer_head *bh;
2597 	BUG_ON(fsdata != NULL && page_has_buffers(page));
2598 
2599 	if (unlikely(copied < len) && head)
2600 		attach_nobh_buffers(page, head);
2601 	if (page_has_buffers(page))
2602 		return generic_write_end(file, mapping, pos, len,
2603 					copied, page, fsdata);
2604 
2605 	SetPageUptodate(page);
2606 	set_page_dirty(page);
2607 	if (pos+copied > inode->i_size) {
2608 		i_size_write(inode, pos+copied);
2609 		mark_inode_dirty(inode);
2610 	}
2611 
2612 	unlock_page(page);
2613 	page_cache_release(page);
2614 
2615 	while (head) {
2616 		bh = head;
2617 		head = head->b_this_page;
2618 		free_buffer_head(bh);
2619 	}
2620 
2621 	return copied;
2622 }
2623 EXPORT_SYMBOL(nobh_write_end);
2624 
2625 /*
2626  * nobh_writepage() - based on block_full_write_page() except
2627  * that it tries to operate without attaching bufferheads to
2628  * the page.
2629  */
2630 int nobh_writepage(struct page *page, get_block_t *get_block,
2631 			struct writeback_control *wbc)
2632 {
2633 	struct inode * const inode = page->mapping->host;
2634 	loff_t i_size = i_size_read(inode);
2635 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2636 	unsigned offset;
2637 	int ret;
2638 
2639 	/* Is the page fully inside i_size? */
2640 	if (page->index < end_index)
2641 		goto out;
2642 
2643 	/* Is the page fully outside i_size? (truncate in progress) */
2644 	offset = i_size & (PAGE_CACHE_SIZE-1);
2645 	if (page->index >= end_index+1 || !offset) {
2646 		/*
2647 		 * The page may have dirty, unmapped buffers.  For example,
2648 		 * they may have been added in ext3_writepage().  Make them
2649 		 * freeable here, so the page does not leak.
2650 		 */
2651 #if 0
2652 		/* Not really sure about this  - do we need this ? */
2653 		if (page->mapping->a_ops->invalidatepage)
2654 			page->mapping->a_ops->invalidatepage(page, offset);
2655 #endif
2656 		unlock_page(page);
2657 		return 0; /* don't care */
2658 	}
2659 
2660 	/*
2661 	 * The page straddles i_size.  It must be zeroed out on each and every
2662 	 * writepage invocation because it may be mmapped.  "A file is mapped
2663 	 * in multiples of the page size.  For a file that is not a multiple of
2664 	 * the  page size, the remaining memory is zeroed when mapped, and
2665 	 * writes to that region are not written out to the file."
2666 	 */
2667 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2668 out:
2669 	ret = mpage_writepage(page, get_block, wbc);
2670 	if (ret == -EAGAIN)
2671 		ret = __block_write_full_page(inode, page, get_block, wbc,
2672 					      end_buffer_async_write);
2673 	return ret;
2674 }
2675 EXPORT_SYMBOL(nobh_writepage);
2676 
2677 int nobh_truncate_page(struct address_space *mapping,
2678 			loff_t from, get_block_t *get_block)
2679 {
2680 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2681 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2682 	unsigned blocksize;
2683 	sector_t iblock;
2684 	unsigned length, pos;
2685 	struct inode *inode = mapping->host;
2686 	struct page *page;
2687 	struct buffer_head map_bh;
2688 	int err;
2689 
2690 	blocksize = 1 << inode->i_blkbits;
2691 	length = offset & (blocksize - 1);
2692 
2693 	/* Block boundary? Nothing to do */
2694 	if (!length)
2695 		return 0;
2696 
2697 	length = blocksize - length;
2698 	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2699 
2700 	page = grab_cache_page(mapping, index);
2701 	err = -ENOMEM;
2702 	if (!page)
2703 		goto out;
2704 
2705 	if (page_has_buffers(page)) {
2706 has_buffers:
2707 		unlock_page(page);
2708 		page_cache_release(page);
2709 		return block_truncate_page(mapping, from, get_block);
2710 	}
2711 
2712 	/* Find the buffer that contains "offset" */
2713 	pos = blocksize;
2714 	while (offset >= pos) {
2715 		iblock++;
2716 		pos += blocksize;
2717 	}
2718 
2719 	map_bh.b_size = blocksize;
2720 	map_bh.b_state = 0;
2721 	err = get_block(inode, iblock, &map_bh, 0);
2722 	if (err)
2723 		goto unlock;
2724 	/* unmapped? It's a hole - nothing to do */
2725 	if (!buffer_mapped(&map_bh))
2726 		goto unlock;
2727 
2728 	/* Ok, it's mapped. Make sure it's up-to-date */
2729 	if (!PageUptodate(page)) {
2730 		err = mapping->a_ops->readpage(NULL, page);
2731 		if (err) {
2732 			page_cache_release(page);
2733 			goto out;
2734 		}
2735 		lock_page(page);
2736 		if (!PageUptodate(page)) {
2737 			err = -EIO;
2738 			goto unlock;
2739 		}
2740 		if (page_has_buffers(page))
2741 			goto has_buffers;
2742 	}
2743 	zero_user(page, offset, length);
2744 	set_page_dirty(page);
2745 	err = 0;
2746 
2747 unlock:
2748 	unlock_page(page);
2749 	page_cache_release(page);
2750 out:
2751 	return err;
2752 }
2753 EXPORT_SYMBOL(nobh_truncate_page);
2754 
2755 int block_truncate_page(struct address_space *mapping,
2756 			loff_t from, get_block_t *get_block)
2757 {
2758 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2759 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2760 	unsigned blocksize;
2761 	sector_t iblock;
2762 	unsigned length, pos;
2763 	struct inode *inode = mapping->host;
2764 	struct page *page;
2765 	struct buffer_head *bh;
2766 	int err;
2767 
2768 	blocksize = 1 << inode->i_blkbits;
2769 	length = offset & (blocksize - 1);
2770 
2771 	/* Block boundary? Nothing to do */
2772 	if (!length)
2773 		return 0;
2774 
2775 	length = blocksize - length;
2776 	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2777 
2778 	page = grab_cache_page(mapping, index);
2779 	err = -ENOMEM;
2780 	if (!page)
2781 		goto out;
2782 
2783 	if (!page_has_buffers(page))
2784 		create_empty_buffers(page, blocksize, 0);
2785 
2786 	/* Find the buffer that contains "offset" */
2787 	bh = page_buffers(page);
2788 	pos = blocksize;
2789 	while (offset >= pos) {
2790 		bh = bh->b_this_page;
2791 		iblock++;
2792 		pos += blocksize;
2793 	}
2794 
2795 	err = 0;
2796 	if (!buffer_mapped(bh)) {
2797 		WARN_ON(bh->b_size != blocksize);
2798 		err = get_block(inode, iblock, bh, 0);
2799 		if (err)
2800 			goto unlock;
2801 		/* unmapped? It's a hole - nothing to do */
2802 		if (!buffer_mapped(bh))
2803 			goto unlock;
2804 	}
2805 
2806 	/* Ok, it's mapped. Make sure it's up-to-date */
2807 	if (PageUptodate(page))
2808 		set_buffer_uptodate(bh);
2809 
2810 	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2811 		err = -EIO;
2812 		ll_rw_block(READ, 1, &bh);
2813 		wait_on_buffer(bh);
2814 		/* Uhhuh. Read error. Complain and punt. */
2815 		if (!buffer_uptodate(bh))
2816 			goto unlock;
2817 	}
2818 
2819 	zero_user(page, offset, length);
2820 	mark_buffer_dirty(bh);
2821 	err = 0;
2822 
2823 unlock:
2824 	unlock_page(page);
2825 	page_cache_release(page);
2826 out:
2827 	return err;
2828 }
2829 EXPORT_SYMBOL(block_truncate_page);
2830 
2831 /*
2832  * The generic ->writepage function for buffer-backed address_spaces
2833  * this form passes in the end_io handler used to finish the IO.
2834  */
2835 int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2836 			struct writeback_control *wbc, bh_end_io_t *handler)
2837 {
2838 	struct inode * const inode = page->mapping->host;
2839 	loff_t i_size = i_size_read(inode);
2840 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2841 	unsigned offset;
2842 
2843 	/* Is the page fully inside i_size? */
2844 	if (page->index < end_index)
2845 		return __block_write_full_page(inode, page, get_block, wbc,
2846 					       handler);
2847 
2848 	/* Is the page fully outside i_size? (truncate in progress) */
2849 	offset = i_size & (PAGE_CACHE_SIZE-1);
2850 	if (page->index >= end_index+1 || !offset) {
2851 		/*
2852 		 * The page may have dirty, unmapped buffers.  For example,
2853 		 * they may have been added in ext3_writepage().  Make them
2854 		 * freeable here, so the page does not leak.
2855 		 */
2856 		do_invalidatepage(page, 0);
2857 		unlock_page(page);
2858 		return 0; /* don't care */
2859 	}
2860 
2861 	/*
2862 	 * The page straddles i_size.  It must be zeroed out on each and every
2863 	 * writepage invocation because it may be mmapped.  "A file is mapped
2864 	 * in multiples of the page size.  For a file that is not a multiple of
2865 	 * the  page size, the remaining memory is zeroed when mapped, and
2866 	 * writes to that region are not written out to the file."
2867 	 */
2868 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2869 	return __block_write_full_page(inode, page, get_block, wbc, handler);
2870 }
2871 EXPORT_SYMBOL(block_write_full_page_endio);
2872 
2873 /*
2874  * The generic ->writepage function for buffer-backed address_spaces
2875  */
2876 int block_write_full_page(struct page *page, get_block_t *get_block,
2877 			struct writeback_control *wbc)
2878 {
2879 	return block_write_full_page_endio(page, get_block, wbc,
2880 					   end_buffer_async_write);
2881 }
2882 EXPORT_SYMBOL(block_write_full_page);
2883 
2884 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2885 			    get_block_t *get_block)
2886 {
2887 	struct buffer_head tmp;
2888 	struct inode *inode = mapping->host;
2889 	tmp.b_state = 0;
2890 	tmp.b_blocknr = 0;
2891 	tmp.b_size = 1 << inode->i_blkbits;
2892 	get_block(inode, block, &tmp, 0);
2893 	return tmp.b_blocknr;
2894 }
2895 EXPORT_SYMBOL(generic_block_bmap);
2896 
2897 static void end_bio_bh_io_sync(struct bio *bio, int err)
2898 {
2899 	struct buffer_head *bh = bio->bi_private;
2900 
2901 	if (err == -EOPNOTSUPP) {
2902 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2903 	}
2904 
2905 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2906 		set_bit(BH_Quiet, &bh->b_state);
2907 
2908 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2909 	bio_put(bio);
2910 }
2911 
2912 int submit_bh(int rw, struct buffer_head * bh)
2913 {
2914 	struct bio *bio;
2915 	int ret = 0;
2916 
2917 	BUG_ON(!buffer_locked(bh));
2918 	BUG_ON(!buffer_mapped(bh));
2919 	BUG_ON(!bh->b_end_io);
2920 	BUG_ON(buffer_delay(bh));
2921 	BUG_ON(buffer_unwritten(bh));
2922 
2923 	/*
2924 	 * Only clear out a write error when rewriting
2925 	 */
2926 	if (test_set_buffer_req(bh) && (rw & WRITE))
2927 		clear_buffer_write_io_error(bh);
2928 
2929 	/*
2930 	 * from here on down, it's all bio -- do the initial mapping,
2931 	 * submit_bio -> generic_make_request may further map this bio around
2932 	 */
2933 	bio = bio_alloc(GFP_NOIO, 1);
2934 
2935 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2936 	bio->bi_bdev = bh->b_bdev;
2937 	bio->bi_io_vec[0].bv_page = bh->b_page;
2938 	bio->bi_io_vec[0].bv_len = bh->b_size;
2939 	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2940 
2941 	bio->bi_vcnt = 1;
2942 	bio->bi_idx = 0;
2943 	bio->bi_size = bh->b_size;
2944 
2945 	bio->bi_end_io = end_bio_bh_io_sync;
2946 	bio->bi_private = bh;
2947 
2948 	bio_get(bio);
2949 	submit_bio(rw, bio);
2950 
2951 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2952 		ret = -EOPNOTSUPP;
2953 
2954 	bio_put(bio);
2955 	return ret;
2956 }
2957 EXPORT_SYMBOL(submit_bh);
2958 
2959 /**
2960  * ll_rw_block: low-level access to block devices (DEPRECATED)
2961  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2962  * @nr: number of &struct buffer_heads in the array
2963  * @bhs: array of pointers to &struct buffer_head
2964  *
2965  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2966  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2967  * %READA option is described in the documentation for generic_make_request()
2968  * which ll_rw_block() calls.
2969  *
2970  * This function drops any buffer that it cannot get a lock on (with the
2971  * BH_Lock state bit), any buffer that appears to be clean when doing a write
2972  * request, and any buffer that appears to be up-to-date when doing read
2973  * request.  Further it marks as clean buffers that are processed for
2974  * writing (the buffer cache won't assume that they are actually clean
2975  * until the buffer gets unlocked).
2976  *
2977  * ll_rw_block sets b_end_io to simple completion handler that marks
2978  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2979  * any waiters.
2980  *
2981  * All of the buffers must be for the same device, and must also be a
2982  * multiple of the current approved size for the device.
2983  */
2984 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2985 {
2986 	int i;
2987 
2988 	for (i = 0; i < nr; i++) {
2989 		struct buffer_head *bh = bhs[i];
2990 
2991 		if (!trylock_buffer(bh))
2992 			continue;
2993 		if (rw == WRITE) {
2994 			if (test_clear_buffer_dirty(bh)) {
2995 				bh->b_end_io = end_buffer_write_sync;
2996 				get_bh(bh);
2997 				submit_bh(WRITE, bh);
2998 				continue;
2999 			}
3000 		} else {
3001 			if (!buffer_uptodate(bh)) {
3002 				bh->b_end_io = end_buffer_read_sync;
3003 				get_bh(bh);
3004 				submit_bh(rw, bh);
3005 				continue;
3006 			}
3007 		}
3008 		unlock_buffer(bh);
3009 	}
3010 }
3011 EXPORT_SYMBOL(ll_rw_block);
3012 
3013 void write_dirty_buffer(struct buffer_head *bh, int rw)
3014 {
3015 	lock_buffer(bh);
3016 	if (!test_clear_buffer_dirty(bh)) {
3017 		unlock_buffer(bh);
3018 		return;
3019 	}
3020 	bh->b_end_io = end_buffer_write_sync;
3021 	get_bh(bh);
3022 	submit_bh(rw, bh);
3023 }
3024 EXPORT_SYMBOL(write_dirty_buffer);
3025 
3026 /*
3027  * For a data-integrity writeout, we need to wait upon any in-progress I/O
3028  * and then start new I/O and then wait upon it.  The caller must have a ref on
3029  * the buffer_head.
3030  */
3031 int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3032 {
3033 	int ret = 0;
3034 
3035 	WARN_ON(atomic_read(&bh->b_count) < 1);
3036 	lock_buffer(bh);
3037 	if (test_clear_buffer_dirty(bh)) {
3038 		get_bh(bh);
3039 		bh->b_end_io = end_buffer_write_sync;
3040 		ret = submit_bh(rw, bh);
3041 		wait_on_buffer(bh);
3042 		if (!ret && !buffer_uptodate(bh))
3043 			ret = -EIO;
3044 	} else {
3045 		unlock_buffer(bh);
3046 	}
3047 	return ret;
3048 }
3049 EXPORT_SYMBOL(__sync_dirty_buffer);
3050 
3051 int sync_dirty_buffer(struct buffer_head *bh)
3052 {
3053 	return __sync_dirty_buffer(bh, WRITE_SYNC);
3054 }
3055 EXPORT_SYMBOL(sync_dirty_buffer);
3056 
3057 /*
3058  * try_to_free_buffers() checks if all the buffers on this particular page
3059  * are unused, and releases them if so.
3060  *
3061  * Exclusion against try_to_free_buffers may be obtained by either
3062  * locking the page or by holding its mapping's private_lock.
3063  *
3064  * If the page is dirty but all the buffers are clean then we need to
3065  * be sure to mark the page clean as well.  This is because the page
3066  * may be against a block device, and a later reattachment of buffers
3067  * to a dirty page will set *all* buffers dirty.  Which would corrupt
3068  * filesystem data on the same device.
3069  *
3070  * The same applies to regular filesystem pages: if all the buffers are
3071  * clean then we set the page clean and proceed.  To do that, we require
3072  * total exclusion from __set_page_dirty_buffers().  That is obtained with
3073  * private_lock.
3074  *
3075  * try_to_free_buffers() is non-blocking.
3076  */
3077 static inline int buffer_busy(struct buffer_head *bh)
3078 {
3079 	return atomic_read(&bh->b_count) |
3080 		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3081 }
3082 
3083 static int
3084 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3085 {
3086 	struct buffer_head *head = page_buffers(page);
3087 	struct buffer_head *bh;
3088 
3089 	bh = head;
3090 	do {
3091 		if (buffer_write_io_error(bh) && page->mapping)
3092 			set_bit(AS_EIO, &page->mapping->flags);
3093 		if (buffer_busy(bh))
3094 			goto failed;
3095 		bh = bh->b_this_page;
3096 	} while (bh != head);
3097 
3098 	do {
3099 		struct buffer_head *next = bh->b_this_page;
3100 
3101 		if (bh->b_assoc_map)
3102 			__remove_assoc_queue(bh);
3103 		bh = next;
3104 	} while (bh != head);
3105 	*buffers_to_free = head;
3106 	__clear_page_buffers(page);
3107 	return 1;
3108 failed:
3109 	return 0;
3110 }
3111 
3112 int try_to_free_buffers(struct page *page)
3113 {
3114 	struct address_space * const mapping = page->mapping;
3115 	struct buffer_head *buffers_to_free = NULL;
3116 	int ret = 0;
3117 
3118 	BUG_ON(!PageLocked(page));
3119 	if (PageWriteback(page))
3120 		return 0;
3121 
3122 	if (mapping == NULL) {		/* can this still happen? */
3123 		ret = drop_buffers(page, &buffers_to_free);
3124 		goto out;
3125 	}
3126 
3127 	spin_lock(&mapping->private_lock);
3128 	ret = drop_buffers(page, &buffers_to_free);
3129 
3130 	/*
3131 	 * If the filesystem writes its buffers by hand (eg ext3)
3132 	 * then we can have clean buffers against a dirty page.  We
3133 	 * clean the page here; otherwise the VM will never notice
3134 	 * that the filesystem did any IO at all.
3135 	 *
3136 	 * Also, during truncate, discard_buffer will have marked all
3137 	 * the page's buffers clean.  We discover that here and clean
3138 	 * the page also.
3139 	 *
3140 	 * private_lock must be held over this entire operation in order
3141 	 * to synchronise against __set_page_dirty_buffers and prevent the
3142 	 * dirty bit from being lost.
3143 	 */
3144 	if (ret)
3145 		cancel_dirty_page(page, PAGE_CACHE_SIZE);
3146 	spin_unlock(&mapping->private_lock);
3147 out:
3148 	if (buffers_to_free) {
3149 		struct buffer_head *bh = buffers_to_free;
3150 
3151 		do {
3152 			struct buffer_head *next = bh->b_this_page;
3153 			free_buffer_head(bh);
3154 			bh = next;
3155 		} while (bh != buffers_to_free);
3156 	}
3157 	return ret;
3158 }
3159 EXPORT_SYMBOL(try_to_free_buffers);
3160 
3161 /*
3162  * There are no bdflush tunables left.  But distributions are
3163  * still running obsolete flush daemons, so we terminate them here.
3164  *
3165  * Use of bdflush() is deprecated and will be removed in a future kernel.
3166  * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3167  */
3168 SYSCALL_DEFINE2(bdflush, int, func, long, data)
3169 {
3170 	static int msg_count;
3171 
3172 	if (!capable(CAP_SYS_ADMIN))
3173 		return -EPERM;
3174 
3175 	if (msg_count < 5) {
3176 		msg_count++;
3177 		printk(KERN_INFO
3178 			"warning: process `%s' used the obsolete bdflush"
3179 			" system call\n", current->comm);
3180 		printk(KERN_INFO "Fix your initscripts?\n");
3181 	}
3182 
3183 	if (func == 1)
3184 		do_exit(0);
3185 	return 0;
3186 }
3187 
3188 /*
3189  * Buffer-head allocation
3190  */
3191 static struct kmem_cache *bh_cachep;
3192 
3193 /*
3194  * Once the number of bh's in the machine exceeds this level, we start
3195  * stripping them in writeback.
3196  */
3197 static int max_buffer_heads;
3198 
3199 int buffer_heads_over_limit;
3200 
3201 struct bh_accounting {
3202 	int nr;			/* Number of live bh's */
3203 	int ratelimit;		/* Limit cacheline bouncing */
3204 };
3205 
3206 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3207 
3208 static void recalc_bh_state(void)
3209 {
3210 	int i;
3211 	int tot = 0;
3212 
3213 	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3214 		return;
3215 	__this_cpu_write(bh_accounting.ratelimit, 0);
3216 	for_each_online_cpu(i)
3217 		tot += per_cpu(bh_accounting, i).nr;
3218 	buffer_heads_over_limit = (tot > max_buffer_heads);
3219 }
3220 
3221 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3222 {
3223 	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3224 	if (ret) {
3225 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
3226 		preempt_disable();
3227 		__this_cpu_inc(bh_accounting.nr);
3228 		recalc_bh_state();
3229 		preempt_enable();
3230 	}
3231 	return ret;
3232 }
3233 EXPORT_SYMBOL(alloc_buffer_head);
3234 
3235 void free_buffer_head(struct buffer_head *bh)
3236 {
3237 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3238 	kmem_cache_free(bh_cachep, bh);
3239 	preempt_disable();
3240 	__this_cpu_dec(bh_accounting.nr);
3241 	recalc_bh_state();
3242 	preempt_enable();
3243 }
3244 EXPORT_SYMBOL(free_buffer_head);
3245 
3246 static void buffer_exit_cpu(int cpu)
3247 {
3248 	int i;
3249 	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3250 
3251 	for (i = 0; i < BH_LRU_SIZE; i++) {
3252 		brelse(b->bhs[i]);
3253 		b->bhs[i] = NULL;
3254 	}
3255 	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3256 	per_cpu(bh_accounting, cpu).nr = 0;
3257 }
3258 
3259 static int buffer_cpu_notify(struct notifier_block *self,
3260 			      unsigned long action, void *hcpu)
3261 {
3262 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3263 		buffer_exit_cpu((unsigned long)hcpu);
3264 	return NOTIFY_OK;
3265 }
3266 
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Returns 1, with the buffer unlocked, if the buffer is up-to-date.
 * Returns 0, with the buffer left locked, if it is not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: no need to take the lock at all. */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);

	/* Re-check under the lock; hand the locked buffer to the caller. */
	if (!buffer_uptodate(bh))
		return 0;

	/* It became up-to-date while we were acquiring the lock. */
	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3285 
3286 /**
3287  * bh_submit_read - Submit a locked buffer for reading
3288  * @bh: struct buffer_head
3289  *
3290  * Returns zero on success and -EIO on error.
3291  */
3292 int bh_submit_read(struct buffer_head *bh)
3293 {
3294 	BUG_ON(!buffer_locked(bh));
3295 
3296 	if (buffer_uptodate(bh)) {
3297 		unlock_buffer(bh);
3298 		return 0;
3299 	}
3300 
3301 	get_bh(bh);
3302 	bh->b_end_io = end_buffer_read_sync;
3303 	submit_bh(READ, bh);
3304 	wait_on_buffer(bh);
3305 	if (buffer_uptodate(bh))
3306 		return 0;
3307 	return -EIO;
3308 }
3309 EXPORT_SYMBOL(bh_submit_read);
3310 
3311 void __init buffer_init(void)
3312 {
3313 	int nrpages;
3314 
3315 	bh_cachep = kmem_cache_create("buffer_head",
3316 			sizeof(struct buffer_head), 0,
3317 				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3318 				SLAB_MEM_SPREAD),
3319 				NULL);
3320 
3321 	/*
3322 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3323 	 */
3324 	nrpages = (nr_free_buffer_pages() * 10) / 100;
3325 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3326 	hotcpu_notifier(buffer_cpu_notify, 0);
3327 }
3328