xref: /linux/fs/buffer.c (revision e8f4aa6087fa80732382881ef7c0c96733bb1984)
1  /*
2   *  linux/fs/buffer.c
3   *
4   *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5   */
6  
7  /*
8   * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9   *
10   * Removed a lot of unnecessary code and simplified things now that
11   * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12   *
13   * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14   * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15   *
16   * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17   *
18   * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19   */
20  
21  #include <linux/kernel.h>
22  #include <linux/syscalls.h>
23  #include <linux/fs.h>
24  #include <linux/iomap.h>
25  #include <linux/mm.h>
26  #include <linux/percpu.h>
27  #include <linux/slab.h>
28  #include <linux/capability.h>
29  #include <linux/blkdev.h>
30  #include <linux/file.h>
31  #include <linux/quotaops.h>
32  #include <linux/highmem.h>
33  #include <linux/export.h>
34  #include <linux/backing-dev.h>
35  #include <linux/writeback.h>
36  #include <linux/hash.h>
37  #include <linux/suspend.h>
38  #include <linux/buffer_head.h>
39  #include <linux/task_io_accounting_ops.h>
40  #include <linux/bio.h>
41  #include <linux/notifier.h>
42  #include <linux/cpu.h>
43  #include <linux/bitops.h>
44  #include <linux/mpage.h>
45  #include <linux/bit_spinlock.h>
46  #include <trace/events/block.h>
47  
48  static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
49  static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
50  			 unsigned long bio_flags,
51  			 struct writeback_control *wbc);
52  
53  #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
54  
55  void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
56  {
57  	bh->b_end_io = handler;
58  	bh->b_private = private;
59  }
60  EXPORT_SYMBOL(init_buffer);
61  
62  inline void touch_buffer(struct buffer_head *bh)
63  {
64  	trace_block_touch_buffer(bh);
65  	mark_page_accessed(bh->b_page);
66  }
67  EXPORT_SYMBOL(touch_buffer);
68  
69  void __lock_buffer(struct buffer_head *bh)
70  {
71  	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
72  }
73  EXPORT_SYMBOL(__lock_buffer);
74  
75  void unlock_buffer(struct buffer_head *bh)
76  {
77  	clear_bit_unlock(BH_Lock, &bh->b_state);
78  	smp_mb__after_atomic();
79  	wake_up_bit(&bh->b_state, BH_Lock);
80  }
81  EXPORT_SYMBOL(unlock_buffer);
82  
83  /*
84   * Returns whether the page has dirty or writeback buffers. If all the
85   * buffers are unlocked and clean then the PageDirty information is stale.
86   * If any of the buffers are locked, it is assumed they are locked for IO.
87   */
88  void buffer_check_dirty_writeback(struct page *page,
89  				     bool *dirty, bool *writeback)
90  {
91  	struct buffer_head *head, *bh;
92  	*dirty = false;
93  	*writeback = false;
94  
95  	BUG_ON(!PageLocked(page));
96  
97  	if (!page_has_buffers(page))
98  		return;
99  
100  	if (PageWriteback(page))
101  		*writeback = true;
102  
103  	head = page_buffers(page);
104  	bh = head;
105  	do {
106  		if (buffer_locked(bh))
107  			*writeback = true;
108  
109  		if (buffer_dirty(bh))
110  			*dirty = true;
111  
112  		bh = bh->b_this_page;
113  	} while (bh != head);
114  }
115  EXPORT_SYMBOL(buffer_check_dirty_writeback);
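
/*
 * Illustrative sketch (not part of this file): buffer_check_dirty_writeback()
 * is exported so that buffer-backed filesystems can plug it into their
 * address_space_operations, letting page reclaim query dirty/writeback state
 * at the buffer level.  The ops name below is hypothetical.
 */
static const struct address_space_operations example_aops = {
	.is_dirty_writeback	= buffer_check_dirty_writeback,
};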
116  
117  /*
118   * Block until a buffer comes unlocked.  This doesn't stop it
119   * from becoming locked again - you have to lock it yourself
120   * if you want to preserve its state.
121   */
122  void __wait_on_buffer(struct buffer_head * bh)
123  {
124  	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
125  }
126  EXPORT_SYMBOL(__wait_on_buffer);
127  
128  static void
129  __clear_page_buffers(struct page *page)
130  {
131  	ClearPagePrivate(page);
132  	set_page_private(page, 0);
133  	put_page(page);
134  }
135  
136  static void buffer_io_error(struct buffer_head *bh, char *msg)
137  {
138  	if (!test_bit(BH_Quiet, &bh->b_state))
139  		printk_ratelimited(KERN_ERR
140  			"Buffer I/O error on dev %pg, logical block %llu%s\n",
141  			bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
142  }
143  
144  /*
145   * End-of-IO handler helper function which does not touch the bh after
146   * unlocking it.
147   * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
148   * a race there is benign: unlock_buffer() only uses the bh's address for
149   * hashing after unlocking the buffer, so it doesn't actually touch the bh
150   * itself.
151   */
152  static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
153  {
154  	if (uptodate) {
155  		set_buffer_uptodate(bh);
156  	} else {
157  		/* This happens, due to failed read-ahead attempts. */
158  		clear_buffer_uptodate(bh);
159  	}
160  	unlock_buffer(bh);
161  }
162  
163  /*
164   * Default synchronous end-of-IO handler..  Just mark it up-to-date and
165   * unlock the buffer. This is what ll_rw_block uses too.
166   */
167  void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
168  {
169  	__end_buffer_read_notouch(bh, uptodate);
170  	put_bh(bh);
171  }
172  EXPORT_SYMBOL(end_buffer_read_sync);
173  
174  void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
175  {
176  	if (uptodate) {
177  		set_buffer_uptodate(bh);
178  	} else {
179  		buffer_io_error(bh, ", lost sync page write");
180  		set_buffer_write_io_error(bh);
181  		clear_buffer_uptodate(bh);
182  	}
183  	unlock_buffer(bh);
184  	put_bh(bh);
185  }
186  EXPORT_SYMBOL(end_buffer_write_sync);
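
/*
 * Illustrative sketch (not part of this file): writing one dirty buffer
 * synchronously with end_buffer_write_sync() as the completion handler.
 * This mirrors what sync_dirty_buffer() does; the function name below is
 * hypothetical.
 */
static int example_write_buffer_sync(struct buffer_head *bh)
{
	int ret = 0;

	lock_buffer(bh);
	if (test_clear_buffer_dirty(bh)) {
		get_bh(bh);	/* reference dropped by end_buffer_write_sync() */
		bh->b_end_io = end_buffer_write_sync;
		ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
		wait_on_buffer(bh);
		if (!ret && !buffer_uptodate(bh))
			ret = -EIO;
	} else {
		unlock_buffer(bh);
	}
	return ret;
}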
187  
188  /*
189   * Various filesystems appear to want __find_get_block to be non-blocking.
190   * But it's the page lock which protects the buffers.  To get around this,
191   * we get exclusion from try_to_free_buffers with the blockdev mapping's
192   * private_lock.
193   *
194   * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
195   * may be quite high.  This code could TryLock the page, and if that
196   * succeeds, there is no need to take private_lock. (But if
197   * private_lock is contended then so is mapping->tree_lock).
198   */
199  static struct buffer_head *
200  __find_get_block_slow(struct block_device *bdev, sector_t block)
201  {
202  	struct inode *bd_inode = bdev->bd_inode;
203  	struct address_space *bd_mapping = bd_inode->i_mapping;
204  	struct buffer_head *ret = NULL;
205  	pgoff_t index;
206  	struct buffer_head *bh;
207  	struct buffer_head *head;
208  	struct page *page;
209  	int all_mapped = 1;
210  
211  	index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
212  	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
213  	if (!page)
214  		goto out;
215  
216  	spin_lock(&bd_mapping->private_lock);
217  	if (!page_has_buffers(page))
218  		goto out_unlock;
219  	head = page_buffers(page);
220  	bh = head;
221  	do {
222  		if (!buffer_mapped(bh))
223  			all_mapped = 0;
224  		else if (bh->b_blocknr == block) {
225  			ret = bh;
226  			get_bh(bh);
227  			goto out_unlock;
228  		}
229  		bh = bh->b_this_page;
230  	} while (bh != head);
231  
232  	/* we might be here because some of the buffers on this page are
233  	 * not mapped.  This is due to various races between
234  	 * file io on the block device and getblk.  It gets dealt with
235  	 * elsewhere, don't buffer_error if we had some unmapped buffers
236  	 */
237  	if (all_mapped) {
238  		printk("__find_get_block_slow() failed. "
239  			"block=%llu, b_blocknr=%llu\n",
240  			(unsigned long long)block,
241  			(unsigned long long)bh->b_blocknr);
242  		printk("b_state=0x%08lx, b_size=%zu\n",
243  			bh->b_state, bh->b_size);
244  		printk("device %pg blocksize: %d\n", bdev,
245  			1 << bd_inode->i_blkbits);
246  	}
247  out_unlock:
248  	spin_unlock(&bd_mapping->private_lock);
249  	put_page(page);
250  out:
251  	return ret;
252  }
253  
254  /*
255   * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
256   */
257  static void free_more_memory(void)
258  {
259  	struct zoneref *z;
260  	int nid;
261  
262  	wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
263  	yield();
264  
265  	for_each_online_node(nid) {
266  
267  		z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
268  						gfp_zone(GFP_NOFS), NULL);
269  		if (z->zone)
270  			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
271  						GFP_NOFS, NULL);
272  	}
273  }
274  
275  /*
276   * I/O completion handler for block_read_full_page() - pages
277   * which come unlocked at the end of I/O.
278   */
279  static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
280  {
281  	unsigned long flags;
282  	struct buffer_head *first;
283  	struct buffer_head *tmp;
284  	struct page *page;
285  	int page_uptodate = 1;
286  
287  	BUG_ON(!buffer_async_read(bh));
288  
289  	page = bh->b_page;
290  	if (uptodate) {
291  		set_buffer_uptodate(bh);
292  	} else {
293  		clear_buffer_uptodate(bh);
294  		buffer_io_error(bh, ", async page read");
295  		SetPageError(page);
296  	}
297  
298  	/*
299  	 * Be _very_ careful from here on. Bad things can happen if
300  	 * two buffer heads end IO at almost the same time and both
301  	 * decide that the page is now completely done.
302  	 */
303  	first = page_buffers(page);
304  	local_irq_save(flags);
305  	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
306  	clear_buffer_async_read(bh);
307  	unlock_buffer(bh);
308  	tmp = bh;
309  	do {
310  		if (!buffer_uptodate(tmp))
311  			page_uptodate = 0;
312  		if (buffer_async_read(tmp)) {
313  			BUG_ON(!buffer_locked(tmp));
314  			goto still_busy;
315  		}
316  		tmp = tmp->b_this_page;
317  	} while (tmp != bh);
318  	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
319  	local_irq_restore(flags);
320  
321  	/*
322  	 * If none of the buffers had errors and they are all
323  	 * uptodate then we can set the page uptodate.
324  	 */
325  	if (page_uptodate && !PageError(page))
326  		SetPageUptodate(page);
327  	unlock_page(page);
328  	return;
329  
330  still_busy:
331  	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
332  	local_irq_restore(flags);
333  	return;
334  }
335  
336  /*
337   * Completion handler for block_write_full_page() - pages which are unlocked
338   * during I/O, and which have PageWriteback cleared upon I/O completion.
339   */
340  void end_buffer_async_write(struct buffer_head *bh, int uptodate)
341  {
342  	unsigned long flags;
343  	struct buffer_head *first;
344  	struct buffer_head *tmp;
345  	struct page *page;
346  
347  	BUG_ON(!buffer_async_write(bh));
348  
349  	page = bh->b_page;
350  	if (uptodate) {
351  		set_buffer_uptodate(bh);
352  	} else {
353  		buffer_io_error(bh, ", lost async page write");
354  		mapping_set_error(page->mapping, -EIO);
355  		set_buffer_write_io_error(bh);
356  		clear_buffer_uptodate(bh);
357  		SetPageError(page);
358  	}
359  
360  	first = page_buffers(page);
361  	local_irq_save(flags);
362  	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
363  
364  	clear_buffer_async_write(bh);
365  	unlock_buffer(bh);
366  	tmp = bh->b_this_page;
367  	while (tmp != bh) {
368  		if (buffer_async_write(tmp)) {
369  			BUG_ON(!buffer_locked(tmp));
370  			goto still_busy;
371  		}
372  		tmp = tmp->b_this_page;
373  	}
374  	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
375  	local_irq_restore(flags);
376  	end_page_writeback(page);
377  	return;
378  
379  still_busy:
380  	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
381  	local_irq_restore(flags);
382  	return;
383  }
384  EXPORT_SYMBOL(end_buffer_async_write);
385  
386  /*
387   * If a page's buffers are under async read-in (end_buffer_async_read
388   * completion) then there is a possibility that another thread of
389   * control could lock one of the buffers after it has completed
390   * but while some of the other buffers have not completed.  This
391   * locked buffer would confuse end_buffer_async_read() into not unlocking
392   * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
393   * that this buffer is not under async I/O.
394   *
395   * The page comes unlocked when it has no locked buffer_async buffers
396   * left.
397   *
398   * PageLocked prevents anyone from starting new async I/O against any of
399   * the buffers.
400   *
401   * PageWriteback is used to prevent simultaneous writeout of the same
402   * page.
403   *
404   * PageLocked prevents anyone from starting writeback of a page which is
405   * under read I/O (PageWriteback is only ever set against a locked page).
406   */
407  static void mark_buffer_async_read(struct buffer_head *bh)
408  {
409  	bh->b_end_io = end_buffer_async_read;
410  	set_buffer_async_read(bh);
411  }
412  
413  static void mark_buffer_async_write_endio(struct buffer_head *bh,
414  					  bh_end_io_t *handler)
415  {
416  	bh->b_end_io = handler;
417  	set_buffer_async_write(bh);
418  }
419  
420  void mark_buffer_async_write(struct buffer_head *bh)
421  {
422  	mark_buffer_async_write_endio(bh, end_buffer_async_write);
423  }
424  EXPORT_SYMBOL(mark_buffer_async_write);
425  
426  
427  /*
428   * fs/buffer.c contains helper functions for buffer-backed address space's
429   * fsync functions.  A common requirement for buffer-based filesystems is
430   * that certain data from the backing blockdev needs to be written out for
431   * a successful fsync().  For example, ext2 indirect blocks need to be
432   * written back and waited upon before fsync() returns.
433   *
434   * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
435   * inode_has_buffers() and invalidate_inode_buffers() are provided for the
436   * management of a list of dependent buffers at ->i_mapping->private_list.
437   *
438   * Locking is a little subtle: try_to_free_buffers() will remove buffers
439   * from their controlling inode's queue when they are being freed.  But
440   * try_to_free_buffers() will be operating against the *blockdev* mapping
441   * at the time, not against the S_ISREG file which depends on those buffers.
442   * So the locking for private_list is via the private_lock in the address_space
443   * which backs the buffers.  Which is different from the address_space
444   * against which the buffers are listed.  So for a particular address_space,
445   * mapping->private_lock does *not* protect mapping->private_list!  In fact,
446   * mapping->private_list will always be protected by the backing blockdev's
447   * ->private_lock.
448   *
449   * Which introduces a requirement: all buffers on an address_space's
450   * ->private_list must be from the same address_space: the blockdev's.
451   *
452   * address_spaces which do not place buffers at ->private_list via these
453   * utility functions are free to use private_lock and private_list for
454   * whatever they want.  The only requirement is that list_empty(private_list)
455   * be true at clear_inode() time.
456   *
457   * FIXME: clear_inode should not call invalidate_inode_buffers().  The
458   * filesystems should do that.  invalidate_inode_buffers() should just go
459   * BUG_ON(!list_empty).
460   *
461   * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
462   * take an address_space, not an inode.  And it should be called
463   * mark_buffer_dirty_fsync() to clearly define why those buffers are being
464   * queued up.
465   *
466   * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
467   * list if it is already on a list.  Because if the buffer is on a list,
468   * it *must* already be on the right one.  If not, the filesystem is being
469   * silly.  This will save a ton of locking.  But first we have to ensure
470   * that buffers are taken *off* the old inode's list when they are freed
471   * (presumably in truncate).  That requires careful auditing of all
472   * filesystems (do it inside bforget()).  It could also be done by bringing
473   * b_inode back.
474   */
475  
476  /*
477   * The buffer's backing address_space's private_lock must be held
478   */
479  static void __remove_assoc_queue(struct buffer_head *bh)
480  {
481  	list_del_init(&bh->b_assoc_buffers);
482  	WARN_ON(!bh->b_assoc_map);
483  	if (buffer_write_io_error(bh))
484  		set_bit(AS_EIO, &bh->b_assoc_map->flags);
485  	bh->b_assoc_map = NULL;
486  }
487  
488  int inode_has_buffers(struct inode *inode)
489  {
490  	return !list_empty(&inode->i_data.private_list);
491  }
492  
493  /*
494   * osync is designed to support O_SYNC io.  It waits synchronously for
495   * all already-submitted IO to complete, but does not queue any new
496   * writes to the disk.
497   *
498   * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
499   * you dirty the buffers, and then use osync_inode_buffers to wait for
500   * completion.  Any other dirty buffers which are not yet queued for
501   * write will not be flushed to disk by the osync.
502   */
503  static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
504  {
505  	struct buffer_head *bh;
506  	struct list_head *p;
507  	int err = 0;
508  
509  	spin_lock(lock);
510  repeat:
511  	list_for_each_prev(p, list) {
512  		bh = BH_ENTRY(p);
513  		if (buffer_locked(bh)) {
514  			get_bh(bh);
515  			spin_unlock(lock);
516  			wait_on_buffer(bh);
517  			if (!buffer_uptodate(bh))
518  				err = -EIO;
519  			brelse(bh);
520  			spin_lock(lock);
521  			goto repeat;
522  		}
523  	}
524  	spin_unlock(lock);
525  	return err;
526  }
527  
528  static void do_thaw_one(struct super_block *sb, void *unused)
529  {
530  	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
531  		printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
532  }
533  
534  static void do_thaw_all(struct work_struct *work)
535  {
536  	iterate_supers(do_thaw_one, NULL);
537  	kfree(work);
538  	printk(KERN_WARNING "Emergency Thaw complete\n");
539  }
540  
541  /**
542   * emergency_thaw_all -- forcibly thaw every frozen filesystem
543   *
544   * Used for emergency unfreeze of all filesystems via SysRq
545   */
546  void emergency_thaw_all(void)
547  {
548  	struct work_struct *work;
549  
550  	work = kmalloc(sizeof(*work), GFP_ATOMIC);
551  	if (work) {
552  		INIT_WORK(work, do_thaw_all);
553  		schedule_work(work);
554  	}
555  }
556  
557  /**
558   * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
559   * @mapping: the mapping which wants those buffers written
560   *
561   * Starts I/O against the buffers at mapping->private_list, and waits upon
562   * that I/O.
563   *
564   * Basically, this is a convenience function for fsync().
565   * @mapping is a file or directory which needs those buffers to be written for
566   * a successful fsync().
567   */
568  int sync_mapping_buffers(struct address_space *mapping)
569  {
570  	struct address_space *buffer_mapping = mapping->private_data;
571  
572  	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
573  		return 0;
574  
575  	return fsync_buffers_list(&buffer_mapping->private_lock,
576  					&mapping->private_list);
577  }
578  EXPORT_SYMBOL(sync_mapping_buffers);
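
/*
 * Illustrative sketch (not part of this file): a simple ->fsync()
 * implementation built on sync_mapping_buffers(), roughly following
 * generic_file_fsync().  The function name is hypothetical and the
 * datasync optimisation is omitted.
 */
static int example_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err, err2;

	/* write and wait upon the file's own dirty pages first */
	err = filemap_write_and_wait_range(file->f_mapping, start, end);
	if (err)
		return err;

	inode_lock(inode);
	/* then the associated metadata buffers on ->private_list */
	err = sync_mapping_buffers(inode->i_mapping);
	err2 = sync_inode_metadata(inode, 1);
	inode_unlock(inode);

	return err ? err : err2;
}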
579  
580  /*
581   * Called when we've recently written block `bblock', and it is known that
582   * `bblock' was for a buffer_boundary() buffer.  This means that the block at
583   * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
584   * dirty, schedule it for IO.  So that indirects merge nicely with their data.
585   */
586  void write_boundary_block(struct block_device *bdev,
587  			sector_t bblock, unsigned blocksize)
588  {
589  	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
590  	if (bh) {
591  		if (buffer_dirty(bh))
592  			ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
593  		put_bh(bh);
594  	}
595  }
596  
597  void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
598  {
599  	struct address_space *mapping = inode->i_mapping;
600  	struct address_space *buffer_mapping = bh->b_page->mapping;
601  
602  	mark_buffer_dirty(bh);
603  	if (!mapping->private_data) {
604  		mapping->private_data = buffer_mapping;
605  	} else {
606  		BUG_ON(mapping->private_data != buffer_mapping);
607  	}
608  	if (!bh->b_assoc_map) {
609  		spin_lock(&buffer_mapping->private_lock);
610  		list_move_tail(&bh->b_assoc_buffers,
611  				&mapping->private_list);
612  		bh->b_assoc_map = mapping;
613  		spin_unlock(&buffer_mapping->private_lock);
614  	}
615  }
616  EXPORT_SYMBOL(mark_buffer_dirty_inode);
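
/*
 * Illustrative sketch (not part of this file): a filesystem that modifies a
 * metadata block living in the blockdev mapping (an ext2-style indirect
 * block, say) queues it on the owning inode's ->private_list with
 * mark_buffer_dirty_inode(), so that a later fsync() reaches it through
 * sync_mapping_buffers().  The helper name and the single-word update are
 * hypothetical.
 */
static int example_update_indirect(struct inode *inode, sector_t block,
				   unsigned int slot, __le32 new_entry)
{
	struct buffer_head *bh = sb_bread(inode->i_sb, block);

	if (!bh)
		return -EIO;
	((__le32 *)bh->b_data)[slot] = new_entry;
	mark_buffer_dirty_inode(bh, inode);
	brelse(bh);
	return 0;
}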
617  
618  /*
619   * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
620   * dirty.
621   *
622   * If warn is true, then emit a warning if the page is not uptodate and has
623   * not been truncated.
624   *
625   * The caller must hold lock_page_memcg().
626   */
627  static void __set_page_dirty(struct page *page, struct address_space *mapping,
628  			     int warn)
629  {
630  	unsigned long flags;
631  
632  	spin_lock_irqsave(&mapping->tree_lock, flags);
633  	if (page->mapping) {	/* Race with truncate? */
634  		WARN_ON_ONCE(warn && !PageUptodate(page));
635  		account_page_dirtied(page, mapping);
636  		radix_tree_tag_set(&mapping->page_tree,
637  				page_index(page), PAGECACHE_TAG_DIRTY);
638  	}
639  	spin_unlock_irqrestore(&mapping->tree_lock, flags);
640  }
641  
642  /*
643   * Add a page to the dirty page list.
644   *
645   * It is a sad fact of life that this function is called from several places
646   * deeply under spinlocking.  It may not sleep.
647   *
648   * If the page has buffers, the uptodate buffers are set dirty, to preserve
649   * dirty-state coherency between the page and the buffers.  If the page does
650   * not have buffers then when they are later attached they will all be set
651   * dirty.
652   *
653   * The buffers are dirtied before the page is dirtied.  There's a small race
654   * window in which a writepage caller may see the page cleanness but not the
655   * buffer dirtiness.  That's fine.  If this code were to set the page dirty
656   * before the buffers, a concurrent writepage caller could clear the page dirty
657   * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
658   * page on the dirty page list.
659   *
660   * We use private_lock to lock against try_to_free_buffers while using the
661   * page's buffer list.  Also use this to protect against clean buffers being
662   * added to the page after it was set dirty.
663   *
664   * FIXME: may need to call ->reservepage here as well.  That's rather up to the
665   * address_space though.
666   */
667  int __set_page_dirty_buffers(struct page *page)
668  {
669  	int newly_dirty;
670  	struct address_space *mapping = page_mapping(page);
671  
672  	if (unlikely(!mapping))
673  		return !TestSetPageDirty(page);
674  
675  	spin_lock(&mapping->private_lock);
676  	if (page_has_buffers(page)) {
677  		struct buffer_head *head = page_buffers(page);
678  		struct buffer_head *bh = head;
679  
680  		do {
681  			set_buffer_dirty(bh);
682  			bh = bh->b_this_page;
683  		} while (bh != head);
684  	}
685  	/*
686  	 * Lock out page->mem_cgroup migration to keep PageDirty
687  	 * synchronized with per-memcg dirty page counters.
688  	 */
689  	lock_page_memcg(page);
690  	newly_dirty = !TestSetPageDirty(page);
691  	spin_unlock(&mapping->private_lock);
692  
693  	if (newly_dirty)
694  		__set_page_dirty(page, mapping, 1);
695  
696  	unlock_page_memcg(page);
697  
698  	if (newly_dirty)
699  		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
700  
701  	return newly_dirty;
702  }
703  EXPORT_SYMBOL(__set_page_dirty_buffers);
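
/*
 * Illustrative sketch (not part of this file): __set_page_dirty_buffers() is
 * what set_page_dirty() falls back to when an address_space provides no
 * ->set_page_dirty method, and buffer-backed filesystems may also install
 * it explicitly.  The ops name below is hypothetical.
 */
static const struct address_space_operations example_dirty_aops = {
	.set_page_dirty	= __set_page_dirty_buffers,
};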
704  
705  /*
706   * Write out and wait upon a list of buffers.
707   *
708   * We have conflicting pressures: we want to make sure that all
709   * initially dirty buffers get waited on, but that any subsequently
710   * dirtied buffers don't.  After all, we don't want fsync to last
711   * forever if somebody is actively writing to the file.
712   *
713   * Do this in two main stages: first we copy dirty buffers to a
714   * temporary inode list, queueing the writes as we go.  Then we clean
715   * up, waiting for those writes to complete.
716   *
717   * During this second stage, any subsequent updates to the file may end
718   * up refiling the buffer on the original inode's dirty list again, so
719   * there is a chance we will end up with a buffer queued for write but
720   * not yet completed on that list.  So, as a final cleanup we go through
721   * the osync code to catch these locked, dirty buffers without requeuing
722   * any newly dirty buffers for write.
723   */
724  static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
725  {
726  	struct buffer_head *bh;
727  	struct list_head tmp;
728  	struct address_space *mapping;
729  	int err = 0, err2;
730  	struct blk_plug plug;
731  
732  	INIT_LIST_HEAD(&tmp);
733  	blk_start_plug(&plug);
734  
735  	spin_lock(lock);
736  	while (!list_empty(list)) {
737  		bh = BH_ENTRY(list->next);
738  		mapping = bh->b_assoc_map;
739  		__remove_assoc_queue(bh);
740  		/* Avoid race with mark_buffer_dirty_inode() which does
741  		 * a lockless check and we rely on seeing the dirty bit */
742  		smp_mb();
743  		if (buffer_dirty(bh) || buffer_locked(bh)) {
744  			list_add(&bh->b_assoc_buffers, &tmp);
745  			bh->b_assoc_map = mapping;
746  			if (buffer_dirty(bh)) {
747  				get_bh(bh);
748  				spin_unlock(lock);
749  				/*
750  				 * Ensure any pending I/O completes so that
751  				 * write_dirty_buffer() actually writes the
752  				 * current contents - it is a noop if I/O is
753  				 * still in flight on potentially older
754  				 * contents.
755  				 */
756  				write_dirty_buffer(bh, WRITE_SYNC);
757  
758  				/*
759  				 * Kick off IO for the previous mapping. Note
760  				 * that we will not run the very last mapping,
761  				 * wait_on_buffer() will do that for us
762  				 * through sync_buffer().
763  				 */
764  				brelse(bh);
765  				spin_lock(lock);
766  			}
767  		}
768  	}
769  
770  	spin_unlock(lock);
771  	blk_finish_plug(&plug);
772  	spin_lock(lock);
773  
774  	while (!list_empty(&tmp)) {
775  		bh = BH_ENTRY(tmp.prev);
776  		get_bh(bh);
777  		mapping = bh->b_assoc_map;
778  		__remove_assoc_queue(bh);
779  		/* Avoid race with mark_buffer_dirty_inode() which does
780  		 * a lockless check and we rely on seeing the dirty bit */
781  		smp_mb();
782  		if (buffer_dirty(bh)) {
783  			list_add(&bh->b_assoc_buffers,
784  				 &mapping->private_list);
785  			bh->b_assoc_map = mapping;
786  		}
787  		spin_unlock(lock);
788  		wait_on_buffer(bh);
789  		if (!buffer_uptodate(bh))
790  			err = -EIO;
791  		brelse(bh);
792  		spin_lock(lock);
793  	}
794  
795  	spin_unlock(lock);
796  	err2 = osync_buffers_list(lock, list);
797  	if (err)
798  		return err;
799  	else
800  		return err2;
801  }
802  
803  /*
804   * Invalidate any and all dirty buffers on a given inode.  We are
805   * probably unmounting the fs, but that doesn't mean we have already
806   * done a sync().  Just drop the buffers from the inode list.
807   *
808   * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
809   * assumes that all the buffers are against the blockdev.  Not true
810   * for reiserfs.
811   */
812  void invalidate_inode_buffers(struct inode *inode)
813  {
814  	if (inode_has_buffers(inode)) {
815  		struct address_space *mapping = &inode->i_data;
816  		struct list_head *list = &mapping->private_list;
817  		struct address_space *buffer_mapping = mapping->private_data;
818  
819  		spin_lock(&buffer_mapping->private_lock);
820  		while (!list_empty(list))
821  			__remove_assoc_queue(BH_ENTRY(list->next));
822  		spin_unlock(&buffer_mapping->private_lock);
823  	}
824  }
825  EXPORT_SYMBOL(invalidate_inode_buffers);
826  
827  /*
828   * Remove any clean buffers from the inode's buffer list.  This is called
829   * when we're trying to free the inode itself.  Those buffers can pin it.
830   *
831   * Returns true if all buffers were removed.
832   */
833  int remove_inode_buffers(struct inode *inode)
834  {
835  	int ret = 1;
836  
837  	if (inode_has_buffers(inode)) {
838  		struct address_space *mapping = &inode->i_data;
839  		struct list_head *list = &mapping->private_list;
840  		struct address_space *buffer_mapping = mapping->private_data;
841  
842  		spin_lock(&buffer_mapping->private_lock);
843  		while (!list_empty(list)) {
844  			struct buffer_head *bh = BH_ENTRY(list->next);
845  			if (buffer_dirty(bh)) {
846  				ret = 0;
847  				break;
848  			}
849  			__remove_assoc_queue(bh);
850  		}
851  		spin_unlock(&buffer_mapping->private_lock);
852  	}
853  	return ret;
854  }
855  
856  /*
857   * Create the appropriate buffers when given a page for the data area and
858   * the size of each buffer.  Use the bh->b_this_page linked list to
859   * follow the buffers created.  Return NULL if unable to create more
860   * buffers.
861   *
862   * The retry flag is used to differentiate async IO (paging, swapping)
863   * which may not fail from ordinary buffer allocations.
864   */
865  struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
866  		int retry)
867  {
868  	struct buffer_head *bh, *head;
869  	long offset;
870  
871  try_again:
872  	head = NULL;
873  	offset = PAGE_SIZE;
874  	while ((offset -= size) >= 0) {
875  		bh = alloc_buffer_head(GFP_NOFS);
876  		if (!bh)
877  			goto no_grow;
878  
879  		bh->b_this_page = head;
880  		bh->b_blocknr = -1;
881  		head = bh;
882  
883  		bh->b_size = size;
884  
885  		/* Link the buffer to its page */
886  		set_bh_page(bh, page, offset);
887  	}
888  	return head;
889  /*
890   * In case anything failed, we just free everything we got.
891   */
892  no_grow:
893  	if (head) {
894  		do {
895  			bh = head;
896  			head = head->b_this_page;
897  			free_buffer_head(bh);
898  		} while (head);
899  	}
900  
901  	/*
902  	 * Return failure for non-async IO requests.  Async IO requests
903  	 * are not allowed to fail, so we have to wait until buffer heads
904  	 * become available.  But we don't want tasks sleeping with
905  	 * partially complete buffers, so all were released above.
906  	 */
907  	if (!retry)
908  		return NULL;
909  
910  	/* We're _really_ low on memory. Now we just
911  	 * wait for old buffer heads to become free due to
912  	 * finishing IO.  Since this is an async request and
913  	 * the reserve list is empty, we're sure there are
914  	 * async buffer heads in use.
915  	 */
916  	free_more_memory();
917  	goto try_again;
918  }
919  EXPORT_SYMBOL_GPL(alloc_page_buffers);
920  
921  static inline void
922  link_dev_buffers(struct page *page, struct buffer_head *head)
923  {
924  	struct buffer_head *bh, *tail;
925  
926  	bh = head;
927  	do {
928  		tail = bh;
929  		bh = bh->b_this_page;
930  	} while (bh);
931  	tail->b_this_page = head;
932  	attach_page_buffers(page, head);
933  }
934  
935  static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
936  {
937  	sector_t retval = ~((sector_t)0);
938  	loff_t sz = i_size_read(bdev->bd_inode);
939  
940  	if (sz) {
941  		unsigned int sizebits = blksize_bits(size);
942  		retval = (sz >> sizebits);
943  	}
944  	return retval;
945  }
946  
947  /*
948   * Initialise the state of a blockdev page's buffers.
949   */
950  static sector_t
951  init_page_buffers(struct page *page, struct block_device *bdev,
952  			sector_t block, int size)
953  {
954  	struct buffer_head *head = page_buffers(page);
955  	struct buffer_head *bh = head;
956  	int uptodate = PageUptodate(page);
957  	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
958  
959  	do {
960  		if (!buffer_mapped(bh)) {
961  			init_buffer(bh, NULL, NULL);
962  			bh->b_bdev = bdev;
963  			bh->b_blocknr = block;
964  			if (uptodate)
965  				set_buffer_uptodate(bh);
966  			if (block < end_block)
967  				set_buffer_mapped(bh);
968  		}
969  		block++;
970  		bh = bh->b_this_page;
971  	} while (bh != head);
972  
973  	/*
974  	 * Caller needs to validate requested block against end of device.
975  	 */
976  	return end_block;
977  }
978  
979  /*
980   * Create the page-cache page that contains the requested block.
981   *
982   * This is used purely for blockdev mappings.
983   */
984  static int
985  grow_dev_page(struct block_device *bdev, sector_t block,
986  	      pgoff_t index, int size, int sizebits, gfp_t gfp)
987  {
988  	struct inode *inode = bdev->bd_inode;
989  	struct page *page;
990  	struct buffer_head *bh;
991  	sector_t end_block;
992  	int ret = 0;		/* Will call free_more_memory() */
993  	gfp_t gfp_mask;
994  
995  	gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
996  
997  	/*
998  	 * XXX: __getblk_slow() can not really deal with failure and
999  	 * will endlessly loop on improvised global reclaim.  Prefer
1000  	 * looping in the allocator rather than here, at least that
1001  	 * code knows what it's doing.
1002  	 */
1003  	gfp_mask |= __GFP_NOFAIL;
1004  
1005  	page = find_or_create_page(inode->i_mapping, index, gfp_mask);
1006  	if (!page)
1007  		return ret;
1008  
1009  	BUG_ON(!PageLocked(page));
1010  
1011  	if (page_has_buffers(page)) {
1012  		bh = page_buffers(page);
1013  		if (bh->b_size == size) {
1014  			end_block = init_page_buffers(page, bdev,
1015  						(sector_t)index << sizebits,
1016  						size);
1017  			goto done;
1018  		}
1019  		if (!try_to_free_buffers(page))
1020  			goto failed;
1021  	}
1022  
1023  	/*
1024  	 * Allocate some buffers for this page
1025  	 */
1026  	bh = alloc_page_buffers(page, size, 0);
1027  	if (!bh)
1028  		goto failed;
1029  
1030  	/*
1031  	 * Link the page to the buffers and initialise them.  Take the
1032  	 * lock to be atomic wrt __find_get_block(), which does not
1033  	 * run under the page lock.
1034  	 */
1035  	spin_lock(&inode->i_mapping->private_lock);
1036  	link_dev_buffers(page, bh);
1037  	end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1038  			size);
1039  	spin_unlock(&inode->i_mapping->private_lock);
1040  done:
1041  	ret = (block < end_block) ? 1 : -ENXIO;
1042  failed:
1043  	unlock_page(page);
1044  	put_page(page);
1045  	return ret;
1046  }
1047  
1048  /*
1049   * Create buffers for the specified block device block's page.  If
1050   * that page was dirty, the buffers are set dirty also.
1051   */
1052  static int
1053  grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1054  {
1055  	pgoff_t index;
1056  	int sizebits;
1057  
1058  	sizebits = -1;
1059  	do {
1060  		sizebits++;
1061  	} while ((size << sizebits) < PAGE_SIZE);
1062  
1063  	index = block >> sizebits;
1064  
1065  	/*
1066  	 * Check for a block which wants to lie outside our maximum possible
1067  	 * pagecache index.  (this comparison is done using sector_t types).
1068  	 */
1069  	if (unlikely(index != block >> sizebits)) {
1070  		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1071  			"device %pg\n",
1072  			__func__, (unsigned long long)block,
1073  			bdev);
1074  		return -EIO;
1075  	}
1076  
1077  	/* Create a page with the proper size buffers.. */
1078  	return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1079  }
1080  
1081  static struct buffer_head *
1082  __getblk_slow(struct block_device *bdev, sector_t block,
1083  	     unsigned size, gfp_t gfp)
1084  {
1085  	/* Size must be multiple of hard sectorsize */
1086  	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1087  			(size < 512 || size > PAGE_SIZE))) {
1088  		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1089  					size);
1090  		printk(KERN_ERR "logical block size: %d\n",
1091  					bdev_logical_block_size(bdev));
1092  
1093  		dump_stack();
1094  		return NULL;
1095  	}
1096  
1097  	for (;;) {
1098  		struct buffer_head *bh;
1099  		int ret;
1100  
1101  		bh = __find_get_block(bdev, block, size);
1102  		if (bh)
1103  			return bh;
1104  
1105  		ret = grow_buffers(bdev, block, size, gfp);
1106  		if (ret < 0)
1107  			return NULL;
1108  		if (ret == 0)
1109  			free_more_memory();
1110  	}
1111  }
1112  
1113  /*
1114   * The relationship between dirty buffers and dirty pages:
1115   *
1116   * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1117   * the page is tagged dirty in its radix tree.
1118   *
1119   * At all times, the dirtiness of the buffers represents the dirtiness of
1120   * subsections of the page.  If the page has buffers, the page dirty bit is
1121   * merely a hint about the true dirty state.
1122   *
1123   * When a page is set dirty in its entirety, all its buffers are marked dirty
1124   * (if the page has buffers).
1125   *
1126   * When a buffer is marked dirty, its page is dirtied, but the page's other
1127   * buffers are not.
1128   *
1129   * Also.  When blockdev buffers are explicitly read with bread(), they
1130   * individually become uptodate.  But their backing page remains not
1131   * uptodate - even if all of its buffers are uptodate.  A subsequent
1132   * block_read_full_page() against that page will discover all the uptodate
1133   * buffers, will set the page uptodate and will perform no I/O.
1134   */
1135  
1136  /**
1137   * mark_buffer_dirty - mark a buffer_head as needing writeout
1138   * @bh: the buffer_head to mark dirty
1139   *
1140   * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1141   * backing page dirty, then tag the page as dirty in its address_space's radix
1142   * tree and then attach the address_space's inode to its superblock's dirty
1143   * inode list.
1144   *
1145   * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1146   * mapping->tree_lock and mapping->host->i_lock.
1147   */
1148  void mark_buffer_dirty(struct buffer_head *bh)
1149  {
1150  	WARN_ON_ONCE(!buffer_uptodate(bh));
1151  
1152  	trace_block_dirty_buffer(bh);
1153  
1154  	/*
1155  	 * Very *carefully* optimize the it-is-already-dirty case.
1156  	 *
1157  	 * Don't let the final "is it dirty" escape to before we
1158  	 * perhaps modified the buffer.
1159  	 */
1160  	if (buffer_dirty(bh)) {
1161  		smp_mb();
1162  		if (buffer_dirty(bh))
1163  			return;
1164  	}
1165  
1166  	if (!test_set_buffer_dirty(bh)) {
1167  		struct page *page = bh->b_page;
1168  		struct address_space *mapping = NULL;
1169  
1170  		lock_page_memcg(page);
1171  		if (!TestSetPageDirty(page)) {
1172  			mapping = page_mapping(page);
1173  			if (mapping)
1174  				__set_page_dirty(page, mapping, 0);
1175  		}
1176  		unlock_page_memcg(page);
1177  		if (mapping)
1178  			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1179  	}
1180  }
1181  EXPORT_SYMBOL(mark_buffer_dirty);
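
/*
 * Illustrative sketch (not part of this file): the common pattern for
 * overwriting a whole block through the buffer cache - get the buffer,
 * fill it under the buffer lock, mark it uptodate and dirty, then drop the
 * reference.  The function name is hypothetical and @data is assumed to be
 * at least one block long.
 */
static int example_overwrite_block(struct super_block *sb, sector_t block,
				   const void *data)
{
	struct buffer_head *bh = sb_getblk(sb, block);

	if (unlikely(!bh))
		return -ENOMEM;

	lock_buffer(bh);
	memcpy(bh->b_data, data, bh->b_size);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	mark_buffer_dirty(bh);	/* dirties the buffer, its page and the inode */
	brelse(bh);
	return 0;
}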
1182  
1183  /*
1184   * Decrement a buffer_head's reference count.  If all buffers against a page
1185   * have zero reference count, are clean and unlocked, and if the page is clean
1186   * and unlocked then try_to_free_buffers() may strip the buffers from the page
1187   * in preparation for freeing it (sometimes, rarely, buffers are removed from
1188   * a page but it ends up not being freed, and buffers may later be reattached).
1189   */
1190  void __brelse(struct buffer_head * buf)
1191  {
1192  	if (atomic_read(&buf->b_count)) {
1193  		put_bh(buf);
1194  		return;
1195  	}
1196  	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1197  }
1198  EXPORT_SYMBOL(__brelse);
1199  
1200  /*
1201   * bforget() is like brelse(), except it discards any
1202   * potentially dirty data.
1203   */
1204  void __bforget(struct buffer_head *bh)
1205  {
1206  	clear_buffer_dirty(bh);
1207  	if (bh->b_assoc_map) {
1208  		struct address_space *buffer_mapping = bh->b_page->mapping;
1209  
1210  		spin_lock(&buffer_mapping->private_lock);
1211  		list_del_init(&bh->b_assoc_buffers);
1212  		bh->b_assoc_map = NULL;
1213  		spin_unlock(&buffer_mapping->private_lock);
1214  	}
1215  	__brelse(bh);
1216  }
1217  EXPORT_SYMBOL(__bforget);
1218  
1219  static struct buffer_head *__bread_slow(struct buffer_head *bh)
1220  {
1221  	lock_buffer(bh);
1222  	if (buffer_uptodate(bh)) {
1223  		unlock_buffer(bh);
1224  		return bh;
1225  	} else {
1226  		get_bh(bh);
1227  		bh->b_end_io = end_buffer_read_sync;
1228  		submit_bh(REQ_OP_READ, 0, bh);
1229  		wait_on_buffer(bh);
1230  		if (buffer_uptodate(bh))
1231  			return bh;
1232  	}
1233  	brelse(bh);
1234  	return NULL;
1235  }
1236  
1237  /*
1238   * Per-cpu buffer LRU implementation, used to reduce the cost of __find_get_block().
1239   * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1240   * refcount elevated by one when they're in an LRU.  A buffer can only appear
1241   * once in a particular CPU's LRU.  A single buffer can be present in multiple
1242   * CPU's LRUs at the same time.
1243   *
1244   * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1245   * sb_find_get_block().
1246   *
1247   * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1248   * a local interrupt disable for that.
1249   */
1250  
1251  #define BH_LRU_SIZE	16
1252  
1253  struct bh_lru {
1254  	struct buffer_head *bhs[BH_LRU_SIZE];
1255  };
1256  
1257  static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1258  
1259  #ifdef CONFIG_SMP
1260  #define bh_lru_lock()	local_irq_disable()
1261  #define bh_lru_unlock()	local_irq_enable()
1262  #else
1263  #define bh_lru_lock()	preempt_disable()
1264  #define bh_lru_unlock()	preempt_enable()
1265  #endif
1266  
1267  static inline void check_irqs_on(void)
1268  {
1269  #ifdef irqs_disabled
1270  	BUG_ON(irqs_disabled());
1271  #endif
1272  }
1273  
1274  /*
1275   * The LRU management algorithm is dopey-but-simple.  Sorry.
1276   */
1277  static void bh_lru_install(struct buffer_head *bh)
1278  {
1279  	struct buffer_head *evictee = NULL;
1280  
1281  	check_irqs_on();
1282  	bh_lru_lock();
1283  	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1284  		struct buffer_head *bhs[BH_LRU_SIZE];
1285  		int in;
1286  		int out = 0;
1287  
1288  		get_bh(bh);
1289  		bhs[out++] = bh;
1290  		for (in = 0; in < BH_LRU_SIZE; in++) {
1291  			struct buffer_head *bh2 =
1292  				__this_cpu_read(bh_lrus.bhs[in]);
1293  
1294  			if (bh2 == bh) {
1295  				__brelse(bh2);
1296  			} else {
1297  				if (out >= BH_LRU_SIZE) {
1298  					BUG_ON(evictee != NULL);
1299  					evictee = bh2;
1300  				} else {
1301  					bhs[out++] = bh2;
1302  				}
1303  			}
1304  		}
1305  		while (out < BH_LRU_SIZE)
1306  			bhs[out++] = NULL;
1307  		memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1308  	}
1309  	bh_lru_unlock();
1310  
1311  	if (evictee)
1312  		__brelse(evictee);
1313  }
1314  
1315  /*
1316   * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1317   */
1318  static struct buffer_head *
1319  lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1320  {
1321  	struct buffer_head *ret = NULL;
1322  	unsigned int i;
1323  
1324  	check_irqs_on();
1325  	bh_lru_lock();
1326  	for (i = 0; i < BH_LRU_SIZE; i++) {
1327  		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1328  
1329  		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1330  		    bh->b_size == size) {
1331  			if (i) {
1332  				while (i) {
1333  					__this_cpu_write(bh_lrus.bhs[i],
1334  						__this_cpu_read(bh_lrus.bhs[i - 1]));
1335  					i--;
1336  				}
1337  				__this_cpu_write(bh_lrus.bhs[0], bh);
1338  			}
1339  			get_bh(bh);
1340  			ret = bh;
1341  			break;
1342  		}
1343  	}
1344  	bh_lru_unlock();
1345  	return ret;
1346  }
1347  
1348  /*
1349   * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1350   * it in the LRU and mark it as accessed.  If it is not present then return
1351   * NULL
1352   */
1353  struct buffer_head *
1354  __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1355  {
1356  	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1357  
1358  	if (bh == NULL) {
1359  		/* __find_get_block_slow will mark the page accessed */
1360  		bh = __find_get_block_slow(bdev, block);
1361  		if (bh)
1362  			bh_lru_install(bh);
1363  	} else
1364  		touch_buffer(bh);
1365  
1366  	return bh;
1367  }
1368  EXPORT_SYMBOL(__find_get_block);
1369  
1370  /*
1371   * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1372   * which corresponds to the passed block_device, block and size. The
1373   * returned buffer has its reference count incremented.
1374   *
1375   * __getblk_gfp() will lock up the machine if grow_dev_page's
1376   * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
1377   */
1378  struct buffer_head *
1379  __getblk_gfp(struct block_device *bdev, sector_t block,
1380  	     unsigned size, gfp_t gfp)
1381  {
1382  	struct buffer_head *bh = __find_get_block(bdev, block, size);
1383  
1384  	might_sleep();
1385  	if (bh == NULL)
1386  		bh = __getblk_slow(bdev, block, size, gfp);
1387  	return bh;
1388  }
1389  EXPORT_SYMBOL(__getblk_gfp);
1390  
1391  /*
1392   * Do async read-ahead on a buffer..
1393   */
1394  void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1395  {
1396  	struct buffer_head *bh = __getblk(bdev, block, size);
1397  	if (likely(bh)) {
1398  		ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
1399  		brelse(bh);
1400  	}
1401  }
1402  EXPORT_SYMBOL(__breadahead);
1403  
1404  /**
1405   *  __bread_gfp() - reads a specified block and returns the bh
1406   *  @bdev: the block_device to read from
1407   *  @block: number of block
1408   *  @size: size (in bytes) to read
1409   *  @gfp: page allocation flag
1410   *
1411   *  Reads a specified block, and returns the buffer head that contains it.
1412   *  If @gfp is zero, the page cache page can be allocated from a non-movable
1413   *  area so that it does not get in the way of page migration.
1414   *  It returns NULL if the block was unreadable.
1415   */
1416  struct buffer_head *
1417  __bread_gfp(struct block_device *bdev, sector_t block,
1418  		   unsigned size, gfp_t gfp)
1419  {
1420  	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1421  
1422  	if (likely(bh) && !buffer_uptodate(bh))
1423  		bh = __bread_slow(bh);
1424  	return bh;
1425  }
1426  EXPORT_SYMBOL(__bread_gfp);
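
/*
 * Illustrative sketch (not part of this file): reading a metadata block via
 * sb_bread(), which resolves to __bread_gfp() with __GFP_MOVABLE.  The
 * function name and the copy-out destination are hypothetical.
 */
static int example_read_block(struct super_block *sb, sector_t block,
			      void *out, size_t len)
{
	struct buffer_head *bh = sb_bread(sb, block);

	if (!bh)
		return -EIO;	/* the block was unreadable */
	memcpy(out, bh->b_data, min_t(size_t, len, bh->b_size));
	brelse(bh);
	return 0;
}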
1427  
1428  /*
1429   * invalidate_bh_lrus() is called rarely - but not only at unmount.
1430   * This doesn't race because it runs in each cpu either in irq
1431   * or with preempt disabled.
1432   */
1433  static void invalidate_bh_lru(void *arg)
1434  {
1435  	struct bh_lru *b = &get_cpu_var(bh_lrus);
1436  	int i;
1437  
1438  	for (i = 0; i < BH_LRU_SIZE; i++) {
1439  		brelse(b->bhs[i]);
1440  		b->bhs[i] = NULL;
1441  	}
1442  	put_cpu_var(bh_lrus);
1443  }
1444  
1445  static bool has_bh_in_lru(int cpu, void *dummy)
1446  {
1447  	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1448  	int i;
1449  
1450  	for (i = 0; i < BH_LRU_SIZE; i++) {
1451  		if (b->bhs[i])
1452  			return 1;
1453  	}
1454  
1455  	return 0;
1456  }
1457  
1458  void invalidate_bh_lrus(void)
1459  {
1460  	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
1461  }
1462  EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1463  
1464  void set_bh_page(struct buffer_head *bh,
1465  		struct page *page, unsigned long offset)
1466  {
1467  	bh->b_page = page;
1468  	BUG_ON(offset >= PAGE_SIZE);
1469  	if (PageHighMem(page))
1470  		/*
1471  		 * This catches illegal uses and preserves the offset:
1472  		 */
1473  		bh->b_data = (char *)(0 + offset);
1474  	else
1475  		bh->b_data = page_address(page) + offset;
1476  }
1477  EXPORT_SYMBOL(set_bh_page);
1478  
1479  /*
1480   * Called when truncating a buffer on a page completely.
1481   */
1482  
1483  /* Bits that are cleared during an invalidate */
1484  #define BUFFER_FLAGS_DISCARD \
1485  	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1486  	 1 << BH_Delay | 1 << BH_Unwritten)
1487  
1488  static void discard_buffer(struct buffer_head * bh)
1489  {
1490  	unsigned long b_state, b_state_old;
1491  
1492  	lock_buffer(bh);
1493  	clear_buffer_dirty(bh);
1494  	bh->b_bdev = NULL;
1495  	b_state = bh->b_state;
1496  	for (;;) {
1497  		b_state_old = cmpxchg(&bh->b_state, b_state,
1498  				      (b_state & ~BUFFER_FLAGS_DISCARD));
1499  		if (b_state_old == b_state)
1500  			break;
1501  		b_state = b_state_old;
1502  	}
1503  	unlock_buffer(bh);
1504  }
1505  
1506  /**
1507   * block_invalidatepage - invalidate part or all of a buffer-backed page
1508   *
1509   * @page: the page which is affected
1510   * @offset: start of the range to invalidate
1511   * @length: length of the range to invalidate
1512   *
1513   * block_invalidatepage() is called when all or part of the page has become
1514   * invalidated by a truncate operation.
1515   *
1516   * block_invalidatepage() does not have to release all buffers, but it must
1517   * ensure that no dirty buffer is left outside @offset and that no I/O
1518   * is underway against any of the blocks which are outside the truncation
1519   * point.  Because the caller is about to free (and possibly reuse) those
1520   * blocks on-disk.
1521   */
1522  void block_invalidatepage(struct page *page, unsigned int offset,
1523  			  unsigned int length)
1524  {
1525  	struct buffer_head *head, *bh, *next;
1526  	unsigned int curr_off = 0;
1527  	unsigned int stop = length + offset;
1528  
1529  	BUG_ON(!PageLocked(page));
1530  	if (!page_has_buffers(page))
1531  		goto out;
1532  
1533  	/*
1534  	 * Check for overflow
1535  	 */
1536  	BUG_ON(stop > PAGE_SIZE || stop < length);
1537  
1538  	head = page_buffers(page);
1539  	bh = head;
1540  	do {
1541  		unsigned int next_off = curr_off + bh->b_size;
1542  		next = bh->b_this_page;
1543  
1544  		/*
1545  		 * Are we still fully in range ?
1546  		 */
1547  		if (next_off > stop)
1548  			goto out;
1549  
1550  		/*
1551  		 * is this block fully invalidated?
1552  		 */
1553  		if (offset <= curr_off)
1554  			discard_buffer(bh);
1555  		curr_off = next_off;
1556  		bh = next;
1557  	} while (bh != head);
1558  
1559  	/*
1560  	 * We release buffers only if the entire page is being invalidated.
1561  	 * The get_block cached value has been unconditionally invalidated,
1562  	 * so real IO is not possible anymore.
1563  	 */
1564  	if (offset == 0)
1565  		try_to_release_page(page, 0);
1566  out:
1567  	return;
1568  }
1569  EXPORT_SYMBOL(block_invalidatepage);
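
/*
 * Illustrative sketch (not part of this file): block_invalidatepage() is the
 * default that do_invalidatepage() falls back to when no ->invalidatepage
 * method is supplied, and buffer-backed filesystems without special
 * requirements can also wire it up directly.  The ops name is hypothetical.
 */
static const struct address_space_operations example_truncate_aops = {
	.invalidatepage	= block_invalidatepage,
};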
1570  
1571  
1572  /*
1573   * We attach and possibly dirty the buffers atomically wrt
1574   * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1575   * is already excluded via the page lock.
1576   */
1577  void create_empty_buffers(struct page *page,
1578  			unsigned long blocksize, unsigned long b_state)
1579  {
1580  	struct buffer_head *bh, *head, *tail;
1581  
1582  	head = alloc_page_buffers(page, blocksize, 1);
1583  	bh = head;
1584  	do {
1585  		bh->b_state |= b_state;
1586  		tail = bh;
1587  		bh = bh->b_this_page;
1588  	} while (bh);
1589  	tail->b_this_page = head;
1590  
1591  	spin_lock(&page->mapping->private_lock);
1592  	if (PageUptodate(page) || PageDirty(page)) {
1593  		bh = head;
1594  		do {
1595  			if (PageDirty(page))
1596  				set_buffer_dirty(bh);
1597  			if (PageUptodate(page))
1598  				set_buffer_uptodate(bh);
1599  			bh = bh->b_this_page;
1600  		} while (bh != head);
1601  	}
1602  	attach_page_buffers(page, head);
1603  	spin_unlock(&page->mapping->private_lock);
1604  }
1605  EXPORT_SYMBOL(create_empty_buffers);
1606  
1607  /*
1608   * We are taking a block for data and we don't want any output from any
1609   * buffer-cache aliases starting from the moment that function returns
1610   * and until the moment when something explicitly marks the buffer
1611   * dirty (hopefully that will not happen until we free that block ;-)
1612   * We don't even need to mark it not-uptodate - nobody can expect
1613   * anything from a newly allocated buffer anyway. We used to use
1614   * unmap_buffer() for such invalidation, but that was wrong. We definitely
1615   * don't want to mark the alias unmapped, for example - it would confuse
1616   * anyone who might pick it with bread() afterwards...
1617   *
1618   * Also..  Note that bforget() doesn't lock the buffer.  So there can
1619   * be writeout I/O going on against recently-freed buffers.  We don't
1620   * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1621   * only if we really need to.  That happens here.
1622   */
1623  void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1624  {
1625  	struct buffer_head *old_bh;
1626  
1627  	might_sleep();
1628  
1629  	old_bh = __find_get_block_slow(bdev, block);
1630  	if (old_bh) {
1631  		clear_buffer_dirty(old_bh);
1632  		wait_on_buffer(old_bh);
1633  		clear_buffer_req(old_bh);
1634  		__brelse(old_bh);
1635  	}
1636  }
1637  EXPORT_SYMBOL(unmap_underlying_metadata);
1638  
1639  /*
1640   * Size is a power-of-two in the range 512..PAGE_SIZE,
1641   * and the case we care about most is PAGE_SIZE.
1642   *
1643   * So this *could* possibly be written with those
1644   * constraints in mind (relevant mostly if some
1645   * architecture has a slow bit-scan instruction)
1646   */
1647  static inline int block_size_bits(unsigned int blocksize)
1648  {
1649  	return ilog2(blocksize);
1650  }
1651  
1652  static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1653  {
1654  	BUG_ON(!PageLocked(page));
1655  
1656  	if (!page_has_buffers(page))
1657  		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1658  	return page_buffers(page);
1659  }
1660  
1661  /*
1662   * NOTE! All mapped/uptodate combinations are valid:
1663   *
1664   *	Mapped	Uptodate	Meaning
1665   *
1666   *	No	No		"unknown" - must do get_block()
1667   *	No	Yes		"hole" - zero-filled
1668   *	Yes	No		"allocated" - allocated on disk, not read in
1669   *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1670   *
1671   * "Dirty" is valid only with the last case (mapped+uptodate).
1672   */
1673  
1674  /*
1675   * While block_write_full_page is writing back the dirty buffers under
1676   * the page lock, whoever dirtied the buffers may decide to clean them
1677   * again at any time.  We handle that by only looking at the buffer
1678   * state inside lock_buffer().
1679   *
1680   * If block_write_full_page() is called for regular writeback
1681   * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1682   * locked buffer.   This only can happen if someone has written the buffer
1683   * directly, with submit_bh().  At the address_space level PageWriteback
1684   * prevents this contention from occurring.
1685   *
1686   * If block_write_full_page() is called with wbc->sync_mode ==
1687   * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1688   * causes the writes to be flagged as synchronous writes.
1689   */
1690  int __block_write_full_page(struct inode *inode, struct page *page,
1691  			get_block_t *get_block, struct writeback_control *wbc,
1692  			bh_end_io_t *handler)
1693  {
1694  	int err;
1695  	sector_t block;
1696  	sector_t last_block;
1697  	struct buffer_head *bh, *head;
1698  	unsigned int blocksize, bbits;
1699  	int nr_underway = 0;
1700  	int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
1701  
1702  	head = create_page_buffers(page, inode,
1703  					(1 << BH_Dirty)|(1 << BH_Uptodate));
1704  
1705  	/*
1706  	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1707  	 * here, and the (potentially unmapped) buffers may become dirty at
1708  	 * any time.  If a buffer becomes dirty here after we've inspected it
1709  	 * then we just miss that fact, and the page stays dirty.
1710  	 *
1711  	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1712  	 * handle that here by just cleaning them.
1713  	 */
1714  
1715  	bh = head;
1716  	blocksize = bh->b_size;
1717  	bbits = block_size_bits(blocksize);
1718  
1719  	block = (sector_t)page->index << (PAGE_SHIFT - bbits);
1720  	last_block = (i_size_read(inode) - 1) >> bbits;
1721  
1722  	/*
1723  	 * Get all the dirty buffers mapped to disk addresses and
1724  	 * handle any aliases from the underlying blockdev's mapping.
1725  	 */
1726  	do {
1727  		if (block > last_block) {
1728  			/*
1729  			 * mapped buffers outside i_size will occur, because
1730  			 * this page can be outside i_size when there is a
1731  			 * truncate in progress.
1732  			 */
1733  			/*
1734  			 * The buffer was zeroed by block_write_full_page()
1735  			 */
1736  			clear_buffer_dirty(bh);
1737  			set_buffer_uptodate(bh);
1738  		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1739  			   buffer_dirty(bh)) {
1740  			WARN_ON(bh->b_size != blocksize);
1741  			err = get_block(inode, block, bh, 1);
1742  			if (err)
1743  				goto recover;
1744  			clear_buffer_delay(bh);
1745  			if (buffer_new(bh)) {
1746  				/* blockdev mappings never come here */
1747  				clear_buffer_new(bh);
1748  				unmap_underlying_metadata(bh->b_bdev,
1749  							bh->b_blocknr);
1750  			}
1751  		}
1752  		bh = bh->b_this_page;
1753  		block++;
1754  	} while (bh != head);
1755  
1756  	do {
1757  		if (!buffer_mapped(bh))
1758  			continue;
1759  		/*
1760  		 * If it's a fully non-blocking write attempt and we cannot
1761  		 * lock the buffer then redirty the page.  Note that this can
1762  		 * potentially cause a busy-wait loop from writeback threads
1763  		 * and kswapd activity, but those code paths have their own
1764  		 * higher-level throttling.
1765  		 */
1766  		if (wbc->sync_mode != WB_SYNC_NONE) {
1767  			lock_buffer(bh);
1768  		} else if (!trylock_buffer(bh)) {
1769  			redirty_page_for_writepage(wbc, page);
1770  			continue;
1771  		}
1772  		if (test_clear_buffer_dirty(bh)) {
1773  			mark_buffer_async_write_endio(bh, handler);
1774  		} else {
1775  			unlock_buffer(bh);
1776  		}
1777  	} while ((bh = bh->b_this_page) != head);
1778  
1779  	/*
1780  	 * The page and its buffers are protected by PageWriteback(), so we can
1781  	 * drop the bh refcounts early.
1782  	 */
1783  	BUG_ON(PageWriteback(page));
1784  	set_page_writeback(page);
1785  
1786  	do {
1787  		struct buffer_head *next = bh->b_this_page;
1788  		if (buffer_async_write(bh)) {
1789  			submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
1790  			nr_underway++;
1791  		}
1792  		bh = next;
1793  	} while (bh != head);
1794  	unlock_page(page);
1795  
1796  	err = 0;
1797  done:
1798  	if (nr_underway == 0) {
1799  		/*
1800  		 * The page was marked dirty, but the buffers were
1801  		 * clean.  Someone wrote them back by hand with
1802  		 * ll_rw_block/submit_bh.  A rare case.
1803  		 */
1804  		end_page_writeback(page);
1805  
1806  		/*
1807  		 * The page and buffer_heads can be released at any time from
1808  		 * here on.
1809  		 */
1810  	}
1811  	return err;
1812  
1813  recover:
1814  	/*
1815  	 * ENOSPC, or some other error.  We may already have added some
1816  	 * blocks to the file, so we need to write these out to avoid
1817  	 * exposing stale data.
1818  	 * The page is currently locked and not marked for writeback
1819  	 */
1820  	bh = head;
1821  	/* Recovery: lock and submit the mapped buffers */
1822  	do {
1823  		if (buffer_mapped(bh) && buffer_dirty(bh) &&
1824  		    !buffer_delay(bh)) {
1825  			lock_buffer(bh);
1826  			mark_buffer_async_write_endio(bh, handler);
1827  		} else {
1828  			/*
1829  			 * The buffer may have been set dirty during
1830  			 * attachment to a dirty page.
1831  			 */
1832  			clear_buffer_dirty(bh);
1833  		}
1834  	} while ((bh = bh->b_this_page) != head);
1835  	SetPageError(page);
1836  	BUG_ON(PageWriteback(page));
1837  	mapping_set_error(page->mapping, err);
1838  	set_page_writeback(page);
1839  	do {
1840  		struct buffer_head *next = bh->b_this_page;
1841  		if (buffer_async_write(bh)) {
1842  			clear_buffer_dirty(bh);
1843  			submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc);
1844  			nr_underway++;
1845  		}
1846  		bh = next;
1847  	} while (bh != head);
1848  	unlock_page(page);
1849  	goto done;
1850  }
1851  EXPORT_SYMBOL(__block_write_full_page);
1852  
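#if 0
/*
 * Illustrative sketch only (never compiled): the usual way a simple
 * filesystem hooks the helper above into ->writepage, via
 * block_write_full_page() defined later in this file.  "myfs_get_block"
 * is a placeholder for the filesystem's own block-mapping routine.
 */
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}
#endif
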
1853  /*
1854   * If a page has any new buffers, zero them out here, and mark them uptodate
1855   * and dirty so they'll be written out (in order to prevent uninitialised
1856   * block data from leaking). And clear the new bit.
1857   */
1858  void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1859  {
1860  	unsigned int block_start, block_end;
1861  	struct buffer_head *head, *bh;
1862  
1863  	BUG_ON(!PageLocked(page));
1864  	if (!page_has_buffers(page))
1865  		return;
1866  
1867  	bh = head = page_buffers(page);
1868  	block_start = 0;
1869  	do {
1870  		block_end = block_start + bh->b_size;
1871  
1872  		if (buffer_new(bh)) {
1873  			if (block_end > from && block_start < to) {
1874  				if (!PageUptodate(page)) {
1875  					unsigned start, size;
1876  
1877  					start = max(from, block_start);
1878  					size = min(to, block_end) - start;
1879  
1880  					zero_user(page, start, size);
1881  					set_buffer_uptodate(bh);
1882  				}
1883  
1884  				clear_buffer_new(bh);
1885  				mark_buffer_dirty(bh);
1886  			}
1887  		}
1888  
1889  		block_start = block_end;
1890  		bh = bh->b_this_page;
1891  	} while (bh != head);
1892  }
1893  EXPORT_SYMBOL(page_zero_new_buffers);
1894  
1895  static void
1896  iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
1897  		struct iomap *iomap)
1898  {
1899  	loff_t offset = block << inode->i_blkbits;
1900  
1901  	bh->b_bdev = iomap->bdev;
1902  
1903  	/*
1904  	 * Block points to the offset in the file we need to map; iomap contains
1905  	 * the offset at which the map starts. If the map ends before the
1906  	 * current block, then do not map the buffer and let the caller
1907  	 * handle it.
1908  	 */
1909  	BUG_ON(offset >= iomap->offset + iomap->length);
1910  
1911  	switch (iomap->type) {
1912  	case IOMAP_HOLE:
1913  		/*
1914  		 * If the buffer is not up to date or beyond the current EOF,
1915  		 * we need to mark it as new to ensure sub-block zeroing is
1916  		 * executed if necessary.
1917  		 */
1918  		if (!buffer_uptodate(bh) ||
1919  		    (offset >= i_size_read(inode)))
1920  			set_buffer_new(bh);
1921  		break;
1922  	case IOMAP_DELALLOC:
1923  		if (!buffer_uptodate(bh) ||
1924  		    (offset >= i_size_read(inode)))
1925  			set_buffer_new(bh);
1926  		set_buffer_uptodate(bh);
1927  		set_buffer_mapped(bh);
1928  		set_buffer_delay(bh);
1929  		break;
1930  	case IOMAP_UNWRITTEN:
1931  		/*
1932  		 * For unwritten regions, we always need to ensure that the
1933  		 * regions of the block we are not writing to are zeroed by
1934  		 * sub-block writes. Set the buffer as new to ensure this.
1935  		 */
1936  		set_buffer_new(bh);
1937  		set_buffer_unwritten(bh);
1938  		/* FALLTHRU */
1939  	case IOMAP_MAPPED:
1940  		if (offset >= i_size_read(inode))
1941  			set_buffer_new(bh);
1942  		bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
1943  				((offset - iomap->offset) >> inode->i_blkbits);
1944  		set_buffer_mapped(bh);
1945  		break;
1946  	}
1947  }
1948  
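/*
 * Worked example for the b_blocknr arithmetic in iomap_to_bh() above,
 * assuming 4096-byte blocks (i_blkbits == 12) and iomap->blkno expressed
 * in 512-byte units:
 *
 *	iomap->offset = 0x10000, iomap->blkno = 8192, block = 20
 *	offset    = 20 << 12 = 0x14000
 *	b_blocknr = (8192 >> (12 - 9)) + ((0x14000 - 0x10000) >> 12)
 *	          = 1024 + 4 = 1028
 *
 * i.e. the buffer maps to filesystem block 1028, which submit_bh_wbc()
 * later converts back into a 512-byte sector number for the bio.
 */
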
1949  int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
1950  		get_block_t *get_block, struct iomap *iomap)
1951  {
1952  	unsigned from = pos & (PAGE_SIZE - 1);
1953  	unsigned to = from + len;
1954  	struct inode *inode = page->mapping->host;
1955  	unsigned block_start, block_end;
1956  	sector_t block;
1957  	int err = 0;
1958  	unsigned blocksize, bbits;
1959  	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1960  
1961  	BUG_ON(!PageLocked(page));
1962  	BUG_ON(from > PAGE_SIZE);
1963  	BUG_ON(to > PAGE_SIZE);
1964  	BUG_ON(from > to);
1965  
1966  	head = create_page_buffers(page, inode, 0);
1967  	blocksize = head->b_size;
1968  	bbits = block_size_bits(blocksize);
1969  
1970  	block = (sector_t)page->index << (PAGE_SHIFT - bbits);
1971  
1972  	for(bh = head, block_start = 0; bh != head || !block_start;
1973  	    block++, block_start=block_end, bh = bh->b_this_page) {
1974  		block_end = block_start + blocksize;
1975  		if (block_end <= from || block_start >= to) {
1976  			if (PageUptodate(page)) {
1977  				if (!buffer_uptodate(bh))
1978  					set_buffer_uptodate(bh);
1979  			}
1980  			continue;
1981  		}
1982  		if (buffer_new(bh))
1983  			clear_buffer_new(bh);
1984  		if (!buffer_mapped(bh)) {
1985  			WARN_ON(bh->b_size != blocksize);
1986  			if (get_block) {
1987  				err = get_block(inode, block, bh, 1);
1988  				if (err)
1989  					break;
1990  			} else {
1991  				iomap_to_bh(inode, block, bh, iomap);
1992  			}
1993  
1994  			if (buffer_new(bh)) {
1995  				unmap_underlying_metadata(bh->b_bdev,
1996  							bh->b_blocknr);
1997  				if (PageUptodate(page)) {
1998  					clear_buffer_new(bh);
1999  					set_buffer_uptodate(bh);
2000  					mark_buffer_dirty(bh);
2001  					continue;
2002  				}
2003  				if (block_end > to || block_start < from)
2004  					zero_user_segments(page,
2005  						to, block_end,
2006  						block_start, from);
2007  				continue;
2008  			}
2009  		}
2010  		if (PageUptodate(page)) {
2011  			if (!buffer_uptodate(bh))
2012  				set_buffer_uptodate(bh);
2013  			continue;
2014  		}
2015  		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
2016  		    !buffer_unwritten(bh) &&
2017  		     (block_start < from || block_end > to)) {
2018  			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2019  			*wait_bh++=bh;
2020  		}
2021  	}
2022  	/*
2023  	 * If we issued read requests - let them complete.
2024  	 */
2025  	while(wait_bh > wait) {
2026  		wait_on_buffer(*--wait_bh);
2027  		if (!buffer_uptodate(*wait_bh))
2028  			err = -EIO;
2029  	}
2030  	if (unlikely(err))
2031  		page_zero_new_buffers(page, from, to);
2032  	return err;
2033  }
2034  
2035  int __block_write_begin(struct page *page, loff_t pos, unsigned len,
2036  		get_block_t *get_block)
2037  {
2038  	return __block_write_begin_int(page, pos, len, get_block, NULL);
2039  }
2040  EXPORT_SYMBOL(__block_write_begin);
2041  
2042  static int __block_commit_write(struct inode *inode, struct page *page,
2043  		unsigned from, unsigned to)
2044  {
2045  	unsigned block_start, block_end;
2046  	int partial = 0;
2047  	unsigned blocksize;
2048  	struct buffer_head *bh, *head;
2049  
2050  	bh = head = page_buffers(page);
2051  	blocksize = bh->b_size;
2052  
2053  	block_start = 0;
2054  	do {
2055  		block_end = block_start + blocksize;
2056  		if (block_end <= from || block_start >= to) {
2057  			if (!buffer_uptodate(bh))
2058  				partial = 1;
2059  		} else {
2060  			set_buffer_uptodate(bh);
2061  			mark_buffer_dirty(bh);
2062  		}
2063  		clear_buffer_new(bh);
2064  
2065  		block_start = block_end;
2066  		bh = bh->b_this_page;
2067  	} while (bh != head);
2068  
2069  	/*
2070  	 * If this is a partial write which happened to make all buffers
2071  	 * uptodate then we can optimize away a bogus readpage() for
2072  	 * the next read(). Here we 'discover' whether the page went
2073  	 * uptodate as a result of this (potentially partial) write.
2074  	 */
2075  	if (!partial)
2076  		SetPageUptodate(page);
2077  	return 0;
2078  }
2079  
2080  /*
2081   * block_write_begin takes care of the basic task of block allocation and
2082   * bringing partial write blocks uptodate first.
2083   *
2084   * The filesystem needs to handle block truncation upon failure.
2085   */
2086  int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2087  		unsigned flags, struct page **pagep, get_block_t *get_block)
2088  {
2089  	pgoff_t index = pos >> PAGE_SHIFT;
2090  	struct page *page;
2091  	int status;
2092  
2093  	page = grab_cache_page_write_begin(mapping, index, flags);
2094  	if (!page)
2095  		return -ENOMEM;
2096  
2097  	status = __block_write_begin(page, pos, len, get_block);
2098  	if (unlikely(status)) {
2099  		unlock_page(page);
2100  		put_page(page);
2101  		page = NULL;
2102  	}
2103  
2104  	*pagep = page;
2105  	return status;
2106  }
2107  EXPORT_SYMBOL(block_write_begin);
2108  
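#if 0
/*
 * Illustrative sketch only (never compiled): a minimal ->write_begin
 * built on block_write_begin().  "myfs_get_block" and "myfs_write_failed"
 * are placeholders; the latter stands in for the block truncation that
 * the comment above says the filesystem must do when block_write_begin()
 * fails after instantiating blocks beyond i_size.
 */
static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	int ret;

	ret = block_write_begin(mapping, pos, len, flags, pagep,
				myfs_get_block);
	if (unlikely(ret))
		/* placeholder: trim any blocks instantiated beyond i_size */
		myfs_write_failed(mapping, pos + len);
	return ret;
}
#endif
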
2109  int block_write_end(struct file *file, struct address_space *mapping,
2110  			loff_t pos, unsigned len, unsigned copied,
2111  			struct page *page, void *fsdata)
2112  {
2113  	struct inode *inode = mapping->host;
2114  	unsigned start;
2115  
2116  	start = pos & (PAGE_SIZE - 1);
2117  
2118  	if (unlikely(copied < len)) {
2119  		/*
2120  		 * The buffers that were written will now be uptodate, so we
2121  		 * don't have to worry about a readpage reading them and
2122  		 * overwriting a partial write. However if we have encountered
2123  		 * a short write and only partially written into a buffer, it
2124  		 * will not be marked uptodate, so a readpage might come in and
2125  		 * destroy our partial write.
2126  		 *
2127  		 * Do the simplest thing, and just treat any short write to a
2128   * non-uptodate page as a zero-length write, and force the
2129  		 * caller to redo the whole thing.
2130  		 */
2131  		if (!PageUptodate(page))
2132  			copied = 0;
2133  
2134  		page_zero_new_buffers(page, start+copied, start+len);
2135  	}
2136  	flush_dcache_page(page);
2137  
2138  	/* This could be a short (even 0-length) commit */
2139  	__block_commit_write(inode, page, start, start+copied);
2140  
2141  	return copied;
2142  }
2143  EXPORT_SYMBOL(block_write_end);
2144  
2145  int generic_write_end(struct file *file, struct address_space *mapping,
2146  			loff_t pos, unsigned len, unsigned copied,
2147  			struct page *page, void *fsdata)
2148  {
2149  	struct inode *inode = mapping->host;
2150  	loff_t old_size = inode->i_size;
2151  	int i_size_changed = 0;
2152  
2153  	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2154  
2155  	/*
2156  	 * No need to use i_size_read() here, the i_size
2157  	 * cannot change under us because we hold i_mutex.
2158  	 *
2159  	 * But it's important to update i_size while still holding page lock:
2160  	 * page writeout could otherwise come in and zero beyond i_size.
2161  	 */
2162  	if (pos+copied > inode->i_size) {
2163  		i_size_write(inode, pos+copied);
2164  		i_size_changed = 1;
2165  	}
2166  
2167  	unlock_page(page);
2168  	put_page(page);
2169  
2170  	if (old_size < pos)
2171  		pagecache_isize_extended(inode, old_size, pos);
2172  	/*
2173  	 * Don't mark the inode dirty under page lock. First, it unnecessarily
2174  	 * makes the holding time of page lock longer. Second, it forces lock
2175  	 * ordering of page lock and transaction start for journaling
2176  	 * filesystems.
2177  	 */
2178  	if (i_size_changed)
2179  		mark_inode_dirty(inode);
2180  
2181  	return copied;
2182  }
2183  EXPORT_SYMBOL(generic_write_end);
2184  
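#if 0
/*
 * Illustrative sketch only (never compiled): generic_write_end() is
 * normally wired straight into the filesystem's address_space_operations,
 * paired with a block_write_begin()-based ->write_begin like the sketch
 * above.  All "myfs_*" entries are placeholders (some are sketched
 * nearby in this file).
 */
static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,
	.writepage	= myfs_writepage,
	.write_begin	= myfs_write_begin,
	.write_end	= generic_write_end,
};
#endif
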
2185  /*
2186   * block_is_partially_uptodate checks whether buffers within a page are
2187   * uptodate or not.
2188   *
2189   * Returns true if all buffers which correspond to a file portion
2190   * we want to read are uptodate.
2191   */
2192  int block_is_partially_uptodate(struct page *page, unsigned long from,
2193  					unsigned long count)
2194  {
2195  	unsigned block_start, block_end, blocksize;
2196  	unsigned to;
2197  	struct buffer_head *bh, *head;
2198  	int ret = 1;
2199  
2200  	if (!page_has_buffers(page))
2201  		return 0;
2202  
2203  	head = page_buffers(page);
2204  	blocksize = head->b_size;
2205  	to = min_t(unsigned, PAGE_SIZE - from, count);
2206  	to = from + to;
2207  	if (from < blocksize && to > PAGE_SIZE - blocksize)
2208  		return 0;
2209  
2210  	bh = head;
2211  	block_start = 0;
2212  	do {
2213  		block_end = block_start + blocksize;
2214  		if (block_end > from && block_start < to) {
2215  			if (!buffer_uptodate(bh)) {
2216  				ret = 0;
2217  				break;
2218  			}
2219  			if (block_end >= to)
2220  				break;
2221  		}
2222  		block_start = block_end;
2223  		bh = bh->b_this_page;
2224  	} while (bh != head);
2225  
2226  	return ret;
2227  }
2228  EXPORT_SYMBOL(block_is_partially_uptodate);
2229  
2230  /*
2231   * Generic "read page" function for block devices that have the normal
2232   * get_block functionality. This covers most block device filesystems.
2233   * Reads the page asynchronously --- the unlock_buffer() and
2234   * set/clear_buffer_uptodate() functions propagate buffer state into the
2235   * page struct once IO has completed.
2236   */
2237  int block_read_full_page(struct page *page, get_block_t *get_block)
2238  {
2239  	struct inode *inode = page->mapping->host;
2240  	sector_t iblock, lblock;
2241  	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2242  	unsigned int blocksize, bbits;
2243  	int nr, i;
2244  	int fully_mapped = 1;
2245  
2246  	head = create_page_buffers(page, inode, 0);
2247  	blocksize = head->b_size;
2248  	bbits = block_size_bits(blocksize);
2249  
2250  	iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
2251  	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2252  	bh = head;
2253  	nr = 0;
2254  	i = 0;
2255  
2256  	do {
2257  		if (buffer_uptodate(bh))
2258  			continue;
2259  
2260  		if (!buffer_mapped(bh)) {
2261  			int err = 0;
2262  
2263  			fully_mapped = 0;
2264  			if (iblock < lblock) {
2265  				WARN_ON(bh->b_size != blocksize);
2266  				err = get_block(inode, iblock, bh, 0);
2267  				if (err)
2268  					SetPageError(page);
2269  			}
2270  			if (!buffer_mapped(bh)) {
2271  				zero_user(page, i * blocksize, blocksize);
2272  				if (!err)
2273  					set_buffer_uptodate(bh);
2274  				continue;
2275  			}
2276  			/*
2277  			 * get_block() might have updated the buffer
2278  			 * synchronously
2279  			 */
2280  			if (buffer_uptodate(bh))
2281  				continue;
2282  		}
2283  		arr[nr++] = bh;
2284  	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2285  
2286  	if (fully_mapped)
2287  		SetPageMappedToDisk(page);
2288  
2289  	if (!nr) {
2290  		/*
2291  		 * All buffers are uptodate - we can set the page uptodate
2292  		 * as well. But not if get_block() returned an error.
2293  		 */
2294  		if (!PageError(page))
2295  			SetPageUptodate(page);
2296  		unlock_page(page);
2297  		return 0;
2298  	}
2299  
2300  	/* Stage two: lock the buffers */
2301  	for (i = 0; i < nr; i++) {
2302  		bh = arr[i];
2303  		lock_buffer(bh);
2304  		mark_buffer_async_read(bh);
2305  	}
2306  
2307  	/*
2308  	 * Stage 3: start the IO.  Check for uptodateness
2309  	 * inside the buffer lock in case another process reading
2310  	 * the underlying blockdev brought it uptodate (the sct fix).
2311  	 */
2312  	for (i = 0; i < nr; i++) {
2313  		bh = arr[i];
2314  		if (buffer_uptodate(bh))
2315  			end_buffer_async_read(bh, 1);
2316  		else
2317  			submit_bh(REQ_OP_READ, 0, bh);
2318  	}
2319  	return 0;
2320  }
2321  EXPORT_SYMBOL(block_read_full_page);
2322  
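#if 0
/*
 * Illustrative sketch only (never compiled): the matching ->readpage for
 * a filesystem with an ordinary get_block simply forwards to
 * block_read_full_page().  "myfs_get_block" is a placeholder.
 */
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}
#endif
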
2323  /* utility function for filesystems that need to do work on expanding
2324   * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2325   * deal with the hole.
2326   */
2327  int generic_cont_expand_simple(struct inode *inode, loff_t size)
2328  {
2329  	struct address_space *mapping = inode->i_mapping;
2330  	struct page *page;
2331  	void *fsdata;
2332  	int err;
2333  
2334  	err = inode_newsize_ok(inode, size);
2335  	if (err)
2336  		goto out;
2337  
2338  	err = pagecache_write_begin(NULL, mapping, size, 0,
2339  				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2340  				&page, &fsdata);
2341  	if (err)
2342  		goto out;
2343  
2344  	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2345  	BUG_ON(err > 0);
2346  
2347  out:
2348  	return err;
2349  }
2350  EXPORT_SYMBOL(generic_cont_expand_simple);
2351  
2352  static int cont_expand_zero(struct file *file, struct address_space *mapping,
2353  			    loff_t pos, loff_t *bytes)
2354  {
2355  	struct inode *inode = mapping->host;
2356  	unsigned blocksize = 1 << inode->i_blkbits;
2357  	struct page *page;
2358  	void *fsdata;
2359  	pgoff_t index, curidx;
2360  	loff_t curpos;
2361  	unsigned zerofrom, offset, len;
2362  	int err = 0;
2363  
2364  	index = pos >> PAGE_SHIFT;
2365  	offset = pos & ~PAGE_MASK;
2366  
2367  	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
2368  		zerofrom = curpos & ~PAGE_MASK;
2369  		if (zerofrom & (blocksize-1)) {
2370  			*bytes |= (blocksize-1);
2371  			(*bytes)++;
2372  		}
2373  		len = PAGE_SIZE - zerofrom;
2374  
2375  		err = pagecache_write_begin(file, mapping, curpos, len,
2376  						AOP_FLAG_UNINTERRUPTIBLE,
2377  						&page, &fsdata);
2378  		if (err)
2379  			goto out;
2380  		zero_user(page, zerofrom, len);
2381  		err = pagecache_write_end(file, mapping, curpos, len, len,
2382  						page, fsdata);
2383  		if (err < 0)
2384  			goto out;
2385  		BUG_ON(err != len);
2386  		err = 0;
2387  
2388  		balance_dirty_pages_ratelimited(mapping);
2389  
2390  		if (unlikely(fatal_signal_pending(current))) {
2391  			err = -EINTR;
2392  			goto out;
2393  		}
2394  	}
2395  
2396  	/* page covers the boundary, find the boundary offset */
2397  	if (index == curidx) {
2398  		zerofrom = curpos & ~PAGE_MASK;
2399  		/* if we are expanding the file, the last block will be filled */
2400  		if (offset <= zerofrom) {
2401  			goto out;
2402  		}
2403  		if (zerofrom & (blocksize-1)) {
2404  			*bytes |= (blocksize-1);
2405  			(*bytes)++;
2406  		}
2407  		len = offset - zerofrom;
2408  
2409  		err = pagecache_write_begin(file, mapping, curpos, len,
2410  						AOP_FLAG_UNINTERRUPTIBLE,
2411  						&page, &fsdata);
2412  		if (err)
2413  			goto out;
2414  		zero_user(page, zerofrom, len);
2415  		err = pagecache_write_end(file, mapping, curpos, len, len,
2416  						page, fsdata);
2417  		if (err < 0)
2418  			goto out;
2419  		BUG_ON(err != len);
2420  		err = 0;
2421  	}
2422  out:
2423  	return err;
2424  }
2425  
2426  /*
2427   * For moronic filesystems that do not allow holes in a file.
2428   * We may have to extend the file.
2429   */
2430  int cont_write_begin(struct file *file, struct address_space *mapping,
2431  			loff_t pos, unsigned len, unsigned flags,
2432  			struct page **pagep, void **fsdata,
2433  			get_block_t *get_block, loff_t *bytes)
2434  {
2435  	struct inode *inode = mapping->host;
2436  	unsigned blocksize = 1 << inode->i_blkbits;
2437  	unsigned zerofrom;
2438  	int err;
2439  
2440  	err = cont_expand_zero(file, mapping, pos, bytes);
2441  	if (err)
2442  		return err;
2443  
2444  	zerofrom = *bytes & ~PAGE_MASK;
2445  	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2446  		*bytes |= (blocksize-1);
2447  		(*bytes)++;
2448  	}
2449  
2450  	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2451  }
2452  EXPORT_SYMBOL(cont_write_begin);
2453  
2454  int block_commit_write(struct page *page, unsigned from, unsigned to)
2455  {
2456  	struct inode *inode = page->mapping->host;
2457  	__block_commit_write(inode,page,from,to);
2458  	return 0;
2459  }
2460  EXPORT_SYMBOL(block_commit_write);
2461  
2462  /*
2463   * block_page_mkwrite() is not allowed to change the file size as it gets
2464   * called from a page fault handler when a page is first dirtied. Hence we must
2465   * be careful to check for EOF conditions here. We set the page up correctly
2466   * for a written page which means we get ENOSPC checking when writing into
2467   * holes and correct delalloc and unwritten extent mapping on filesystems that
2468   * support these features.
2469   *
2470   * We are not allowed to take the i_mutex here so we have to play games to
2471   * protect against truncate races as the page could now be beyond EOF.  Because
2472   * truncate writes the inode size before removing pages, once we have the
2473   * page lock we can determine safely if the page is beyond EOF. If it is not
2474   * beyond EOF, then the page is guaranteed safe against truncation until we
2475   * unlock the page.
2476   *
2477   * Direct callers of this function should protect against filesystem freezing
2478   * using sb_start_pagefault() - sb_end_pagefault() functions.
2479   */
2480  int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2481  			 get_block_t get_block)
2482  {
2483  	struct page *page = vmf->page;
2484  	struct inode *inode = file_inode(vma->vm_file);
2485  	unsigned long end;
2486  	loff_t size;
2487  	int ret;
2488  
2489  	lock_page(page);
2490  	size = i_size_read(inode);
2491  	if ((page->mapping != inode->i_mapping) ||
2492  	    (page_offset(page) > size)) {
2493  		/* We overload EFAULT to mean page got truncated */
2494  		ret = -EFAULT;
2495  		goto out_unlock;
2496  	}
2497  
2498  	/* page is wholly or partially inside EOF */
2499  	if (((page->index + 1) << PAGE_SHIFT) > size)
2500  		end = size & ~PAGE_MASK;
2501  	else
2502  		end = PAGE_SIZE;
2503  
2504  	ret = __block_write_begin(page, 0, end, get_block);
2505  	if (!ret)
2506  		ret = block_commit_write(page, 0, end);
2507  
2508  	if (unlikely(ret < 0))
2509  		goto out_unlock;
2510  	set_page_dirty(page);
2511  	wait_for_stable_page(page);
2512  	return 0;
2513  out_unlock:
2514  	unlock_page(page);
2515  	return ret;
2516  }
2517  EXPORT_SYMBOL(block_page_mkwrite);
2518  
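#if 0
/*
 * Illustrative sketch only (never compiled): a ->page_mkwrite handler
 * built on block_page_mkwrite(), taking the freeze protection that the
 * comment above requires of direct callers.  block_page_mkwrite_return()
 * (from buffer_head.h) translates the errno into a VM_FAULT_* code.
 * "myfs_get_block" is a placeholder.
 */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
	int ret;

	sb_start_pagefault(sb);
	ret = block_page_mkwrite(vma, vmf, myfs_get_block);
	sb_end_pagefault(sb);
	return block_page_mkwrite_return(ret);
}
#endif
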
2519  /*
2520   * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2521   * immediately, while under the page lock.  So it needs a special end_io
2522   * handler which does not touch the bh after unlocking it.
2523   */
2524  static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2525  {
2526  	__end_buffer_read_notouch(bh, uptodate);
2527  }
2528  
2529  /*
2530   * Attach the singly-linked list of buffers created by nobh_write_begin to
2531   * the page (converting it to a circular linked list and taking care of page
2532   * dirty races).
2533   */
2534  static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2535  {
2536  	struct buffer_head *bh;
2537  
2538  	BUG_ON(!PageLocked(page));
2539  
2540  	spin_lock(&page->mapping->private_lock);
2541  	bh = head;
2542  	do {
2543  		if (PageDirty(page))
2544  			set_buffer_dirty(bh);
2545  		if (!bh->b_this_page)
2546  			bh->b_this_page = head;
2547  		bh = bh->b_this_page;
2548  	} while (bh != head);
2549  	attach_page_buffers(page, head);
2550  	spin_unlock(&page->mapping->private_lock);
2551  }
2552  
2553  /*
2554   * On entry, the page is not uptodate at all.
2555   * On exit, the page is fully uptodate in the areas outside (from,to).
2556   * The filesystem needs to handle block truncation upon failure.
2557   */
2558  int nobh_write_begin(struct address_space *mapping,
2559  			loff_t pos, unsigned len, unsigned flags,
2560  			struct page **pagep, void **fsdata,
2561  			get_block_t *get_block)
2562  {
2563  	struct inode *inode = mapping->host;
2564  	const unsigned blkbits = inode->i_blkbits;
2565  	const unsigned blocksize = 1 << blkbits;
2566  	struct buffer_head *head, *bh;
2567  	struct page *page;
2568  	pgoff_t index;
2569  	unsigned from, to;
2570  	unsigned block_in_page;
2571  	unsigned block_start, block_end;
2572  	sector_t block_in_file;
2573  	int nr_reads = 0;
2574  	int ret = 0;
2575  	int is_mapped_to_disk = 1;
2576  
2577  	index = pos >> PAGE_SHIFT;
2578  	from = pos & (PAGE_SIZE - 1);
2579  	to = from + len;
2580  
2581  	page = grab_cache_page_write_begin(mapping, index, flags);
2582  	if (!page)
2583  		return -ENOMEM;
2584  	*pagep = page;
2585  	*fsdata = NULL;
2586  
2587  	if (page_has_buffers(page)) {
2588  		ret = __block_write_begin(page, pos, len, get_block);
2589  		if (unlikely(ret))
2590  			goto out_release;
2591  		return ret;
2592  	}
2593  
2594  	if (PageMappedToDisk(page))
2595  		return 0;
2596  
2597  	/*
2598  	 * Allocate buffers so that we can keep track of state, and potentially
2599  	 * attach them to the page if an error occurs. In the common case of
2600  	 * no error, they will just be freed again without ever being attached
2601  	 * to the page (which is all OK, because we're under the page lock).
2602  	 *
2603  	 * Be careful: the buffer linked list is a NULL terminated one, rather
2604  	 * than the circular one we're used to.
2605  	 */
2606  	head = alloc_page_buffers(page, blocksize, 0);
2607  	if (!head) {
2608  		ret = -ENOMEM;
2609  		goto out_release;
2610  	}
2611  
2612  	block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
2613  
2614  	/*
2615  	 * We loop across all blocks in the page, whether or not they are
2616  	 * part of the affected region.  This is so we can discover if the
2617  	 * page is fully mapped-to-disk.
2618  	 */
2619  	for (block_start = 0, block_in_page = 0, bh = head;
2620  		  block_start < PAGE_SIZE;
2621  		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2622  		int create;
2623  
2624  		block_end = block_start + blocksize;
2625  		bh->b_state = 0;
2626  		create = 1;
2627  		if (block_start >= to)
2628  			create = 0;
2629  		ret = get_block(inode, block_in_file + block_in_page,
2630  					bh, create);
2631  		if (ret)
2632  			goto failed;
2633  		if (!buffer_mapped(bh))
2634  			is_mapped_to_disk = 0;
2635  		if (buffer_new(bh))
2636  			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2637  		if (PageUptodate(page)) {
2638  			set_buffer_uptodate(bh);
2639  			continue;
2640  		}
2641  		if (buffer_new(bh) || !buffer_mapped(bh)) {
2642  			zero_user_segments(page, block_start, from,
2643  							to, block_end);
2644  			continue;
2645  		}
2646  		if (buffer_uptodate(bh))
2647  			continue;	/* reiserfs does this */
2648  		if (block_start < from || block_end > to) {
2649  			lock_buffer(bh);
2650  			bh->b_end_io = end_buffer_read_nobh;
2651  			submit_bh(REQ_OP_READ, 0, bh);
2652  			nr_reads++;
2653  		}
2654  	}
2655  
2656  	if (nr_reads) {
2657  		/*
2658  		 * The page is locked, so these buffers are protected from
2659  		 * any VM or truncate activity.  Hence we don't need to care
2660  		 * for the buffer_head refcounts.
2661  		 */
2662  		for (bh = head; bh; bh = bh->b_this_page) {
2663  			wait_on_buffer(bh);
2664  			if (!buffer_uptodate(bh))
2665  				ret = -EIO;
2666  		}
2667  		if (ret)
2668  			goto failed;
2669  	}
2670  
2671  	if (is_mapped_to_disk)
2672  		SetPageMappedToDisk(page);
2673  
2674  	*fsdata = head; /* to be released by nobh_write_end */
2675  
2676  	return 0;
2677  
2678  failed:
2679  	BUG_ON(!ret);
2680  	/*
2681  	 * Error recovery is a bit difficult. We need to zero out blocks that
2682  	 * were newly allocated, and dirty them to ensure they get written out.
2683  	 * Buffers need to be attached to the page at this point, otherwise
2684  	 * the handling of potential IO errors during writeout would be hard
2685  	 * (could try doing synchronous writeout, but what if that fails too?)
2686  	 */
2687  	attach_nobh_buffers(page, head);
2688  	page_zero_new_buffers(page, from, to);
2689  
2690  out_release:
2691  	unlock_page(page);
2692  	put_page(page);
2693  	*pagep = NULL;
2694  
2695  	return ret;
2696  }
2697  EXPORT_SYMBOL(nobh_write_begin);
2698  
2699  int nobh_write_end(struct file *file, struct address_space *mapping,
2700  			loff_t pos, unsigned len, unsigned copied,
2701  			struct page *page, void *fsdata)
2702  {
2703  	struct inode *inode = page->mapping->host;
2704  	struct buffer_head *head = fsdata;
2705  	struct buffer_head *bh;
2706  	BUG_ON(fsdata != NULL && page_has_buffers(page));
2707  
2708  	if (unlikely(copied < len) && head)
2709  		attach_nobh_buffers(page, head);
2710  	if (page_has_buffers(page))
2711  		return generic_write_end(file, mapping, pos, len,
2712  					copied, page, fsdata);
2713  
2714  	SetPageUptodate(page);
2715  	set_page_dirty(page);
2716  	if (pos+copied > inode->i_size) {
2717  		i_size_write(inode, pos+copied);
2718  		mark_inode_dirty(inode);
2719  	}
2720  
2721  	unlock_page(page);
2722  	put_page(page);
2723  
2724  	while (head) {
2725  		bh = head;
2726  		head = head->b_this_page;
2727  		free_buffer_head(bh);
2728  	}
2729  
2730  	return copied;
2731  }
2732  EXPORT_SYMBOL(nobh_write_end);
2733  
2734  /*
2735   * nobh_writepage() - based on block_write_full_page() except
2736   * that it tries to operate without attaching bufferheads to
2737   * the page.
2738   */
2739  int nobh_writepage(struct page *page, get_block_t *get_block,
2740  			struct writeback_control *wbc)
2741  {
2742  	struct inode * const inode = page->mapping->host;
2743  	loff_t i_size = i_size_read(inode);
2744  	const pgoff_t end_index = i_size >> PAGE_SHIFT;
2745  	unsigned offset;
2746  	int ret;
2747  
2748  	/* Is the page fully inside i_size? */
2749  	if (page->index < end_index)
2750  		goto out;
2751  
2752  	/* Is the page fully outside i_size? (truncate in progress) */
2753  	offset = i_size & (PAGE_SIZE-1);
2754  	if (page->index >= end_index+1 || !offset) {
2755  		/*
2756  		 * The page may have dirty, unmapped buffers.  For example,
2757  		 * they may have been added in ext3_writepage().  Make them
2758  		 * freeable here, so the page does not leak.
2759  		 */
2760  #if 0
2761  		/* Not really sure about this - do we need this? */
2762  		if (page->mapping->a_ops->invalidatepage)
2763  			page->mapping->a_ops->invalidatepage(page, offset);
2764  #endif
2765  		unlock_page(page);
2766  		return 0; /* don't care */
2767  	}
2768  
2769  	/*
2770  	 * The page straddles i_size.  It must be zeroed out on each and every
2771  	 * writepage invocation because it may be mmapped.  "A file is mapped
2772  	 * in multiples of the page size.  For a file that is not a multiple of
2773  	 * the page size, the remaining memory is zeroed when mapped, and
2774  	 * writes to that region are not written out to the file."
2775  	 */
2776  	zero_user_segment(page, offset, PAGE_SIZE);
2777  out:
2778  	ret = mpage_writepage(page, get_block, wbc);
2779  	if (ret == -EAGAIN)
2780  		ret = __block_write_full_page(inode, page, get_block, wbc,
2781  					      end_buffer_async_write);
2782  	return ret;
2783  }
2784  EXPORT_SYMBOL(nobh_writepage);
2785  
2786  int nobh_truncate_page(struct address_space *mapping,
2787  			loff_t from, get_block_t *get_block)
2788  {
2789  	pgoff_t index = from >> PAGE_SHIFT;
2790  	unsigned offset = from & (PAGE_SIZE-1);
2791  	unsigned blocksize;
2792  	sector_t iblock;
2793  	unsigned length, pos;
2794  	struct inode *inode = mapping->host;
2795  	struct page *page;
2796  	struct buffer_head map_bh;
2797  	int err;
2798  
2799  	blocksize = 1 << inode->i_blkbits;
2800  	length = offset & (blocksize - 1);
2801  
2802  	/* Block boundary? Nothing to do */
2803  	if (!length)
2804  		return 0;
2805  
2806  	length = blocksize - length;
2807  	iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2808  
2809  	page = grab_cache_page(mapping, index);
2810  	err = -ENOMEM;
2811  	if (!page)
2812  		goto out;
2813  
2814  	if (page_has_buffers(page)) {
2815  has_buffers:
2816  		unlock_page(page);
2817  		put_page(page);
2818  		return block_truncate_page(mapping, from, get_block);
2819  	}
2820  
2821  	/* Find the buffer that contains "offset" */
2822  	pos = blocksize;
2823  	while (offset >= pos) {
2824  		iblock++;
2825  		pos += blocksize;
2826  	}
2827  
2828  	map_bh.b_size = blocksize;
2829  	map_bh.b_state = 0;
2830  	err = get_block(inode, iblock, &map_bh, 0);
2831  	if (err)
2832  		goto unlock;
2833  	/* unmapped? It's a hole - nothing to do */
2834  	if (!buffer_mapped(&map_bh))
2835  		goto unlock;
2836  
2837  	/* Ok, it's mapped. Make sure it's up-to-date */
2838  	if (!PageUptodate(page)) {
2839  		err = mapping->a_ops->readpage(NULL, page);
2840  		if (err) {
2841  			put_page(page);
2842  			goto out;
2843  		}
2844  		lock_page(page);
2845  		if (!PageUptodate(page)) {
2846  			err = -EIO;
2847  			goto unlock;
2848  		}
2849  		if (page_has_buffers(page))
2850  			goto has_buffers;
2851  	}
2852  	zero_user(page, offset, length);
2853  	set_page_dirty(page);
2854  	err = 0;
2855  
2856  unlock:
2857  	unlock_page(page);
2858  	put_page(page);
2859  out:
2860  	return err;
2861  }
2862  EXPORT_SYMBOL(nobh_truncate_page);
2863  
2864  int block_truncate_page(struct address_space *mapping,
2865  			loff_t from, get_block_t *get_block)
2866  {
2867  	pgoff_t index = from >> PAGE_SHIFT;
2868  	unsigned offset = from & (PAGE_SIZE-1);
2869  	unsigned blocksize;
2870  	sector_t iblock;
2871  	unsigned length, pos;
2872  	struct inode *inode = mapping->host;
2873  	struct page *page;
2874  	struct buffer_head *bh;
2875  	int err;
2876  
2877  	blocksize = 1 << inode->i_blkbits;
2878  	length = offset & (blocksize - 1);
2879  
2880  	/* Block boundary? Nothing to do */
2881  	if (!length)
2882  		return 0;
2883  
2884  	length = blocksize - length;
2885  	iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
2886  
2887  	page = grab_cache_page(mapping, index);
2888  	err = -ENOMEM;
2889  	if (!page)
2890  		goto out;
2891  
2892  	if (!page_has_buffers(page))
2893  		create_empty_buffers(page, blocksize, 0);
2894  
2895  	/* Find the buffer that contains "offset" */
2896  	bh = page_buffers(page);
2897  	pos = blocksize;
2898  	while (offset >= pos) {
2899  		bh = bh->b_this_page;
2900  		iblock++;
2901  		pos += blocksize;
2902  	}
2903  
2904  	err = 0;
2905  	if (!buffer_mapped(bh)) {
2906  		WARN_ON(bh->b_size != blocksize);
2907  		err = get_block(inode, iblock, bh, 0);
2908  		if (err)
2909  			goto unlock;
2910  		/* unmapped? It's a hole - nothing to do */
2911  		if (!buffer_mapped(bh))
2912  			goto unlock;
2913  	}
2914  
2915  	/* Ok, it's mapped. Make sure it's up-to-date */
2916  	if (PageUptodate(page))
2917  		set_buffer_uptodate(bh);
2918  
2919  	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2920  		err = -EIO;
2921  		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
2922  		wait_on_buffer(bh);
2923  		/* Uhhuh. Read error. Complain and punt. */
2924  		if (!buffer_uptodate(bh))
2925  			goto unlock;
2926  	}
2927  
2928  	zero_user(page, offset, length);
2929  	mark_buffer_dirty(bh);
2930  	err = 0;
2931  
2932  unlock:
2933  	unlock_page(page);
2934  	put_page(page);
2935  out:
2936  	return err;
2937  }
2938  EXPORT_SYMBOL(block_truncate_page);
2939  
2940  /*
2941   * The generic ->writepage function for buffer-backed address_spaces
2942   */
2943  int block_write_full_page(struct page *page, get_block_t *get_block,
2944  			struct writeback_control *wbc)
2945  {
2946  	struct inode * const inode = page->mapping->host;
2947  	loff_t i_size = i_size_read(inode);
2948  	const pgoff_t end_index = i_size >> PAGE_SHIFT;
2949  	unsigned offset;
2950  
2951  	/* Is the page fully inside i_size? */
2952  	if (page->index < end_index)
2953  		return __block_write_full_page(inode, page, get_block, wbc,
2954  					       end_buffer_async_write);
2955  
2956  	/* Is the page fully outside i_size? (truncate in progress) */
2957  	offset = i_size & (PAGE_SIZE-1);
2958  	if (page->index >= end_index+1 || !offset) {
2959  		/*
2960  		 * The page may have dirty, unmapped buffers.  For example,
2961  		 * they may have been added in ext3_writepage().  Make them
2962  		 * freeable here, so the page does not leak.
2963  		 */
2964  		do_invalidatepage(page, 0, PAGE_SIZE);
2965  		unlock_page(page);
2966  		return 0; /* don't care */
2967  	}
2968  
2969  	/*
2970  	 * The page straddles i_size.  It must be zeroed out on each and every
2971  	 * writepage invocation because it may be mmapped.  "A file is mapped
2972  	 * in multiples of the page size.  For a file that is not a multiple of
2973  	 * the page size, the remaining memory is zeroed when mapped, and
2974  	 * writes to that region are not written out to the file."
2975  	 */
2976  	zero_user_segment(page, offset, PAGE_SIZE);
2977  	return __block_write_full_page(inode, page, get_block, wbc,
2978  							end_buffer_async_write);
2979  }
2980  EXPORT_SYMBOL(block_write_full_page);
2981  
2982  sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2983  			    get_block_t *get_block)
2984  {
2985  	struct buffer_head tmp;
2986  	struct inode *inode = mapping->host;
2987  	tmp.b_state = 0;
2988  	tmp.b_blocknr = 0;
2989  	tmp.b_size = 1 << inode->i_blkbits;
2990  	get_block(inode, block, &tmp, 0);
2991  	return tmp.b_blocknr;
2992  }
2993  EXPORT_SYMBOL(generic_block_bmap);
2994  
2995  static void end_bio_bh_io_sync(struct bio *bio)
2996  {
2997  	struct buffer_head *bh = bio->bi_private;
2998  
2999  	if (unlikely(bio_flagged(bio, BIO_QUIET)))
3000  		set_bit(BH_Quiet, &bh->b_state);
3001  
3002  	bh->b_end_io(bh, !bio->bi_error);
3003  	bio_put(bio);
3004  }
3005  
3006  /*
3007   * This allows us to do IO even on the odd last sectors
3008   * of a device, even if the block size is some multiple
3009   * of the physical sector size.
3010   *
3011   * We'll just truncate the bio to the size of the device,
3012   * and clear the end of the buffer head manually.
3013   *
3014   * Truly out-of-range accesses will turn into actual IO
3015   * errors; this only handles the "we need to be able to
3016   * do IO at the final sector" case.
3017   */
3018  void guard_bio_eod(int op, struct bio *bio)
3019  {
3020  	sector_t maxsector;
3021  	struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
3022  	unsigned truncated_bytes;
3023  
3024  	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
3025  	if (!maxsector)
3026  		return;
3027  
3028  	/*
3029  	 * If the *whole* IO is past the end of the device,
3030  	 * let it through, and the IO layer will turn it into
3031  	 * an EIO.
3032  	 */
3033  	if (unlikely(bio->bi_iter.bi_sector >= maxsector))
3034  		return;
3035  
3036  	maxsector -= bio->bi_iter.bi_sector;
3037  	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
3038  		return;
3039  
3040  	/* Uhhuh. We've got a bio that straddles the device size! */
3041  	truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
3042  
3043  	/* Truncate the bio.. */
3044  	bio->bi_iter.bi_size -= truncated_bytes;
3045  	bvec->bv_len -= truncated_bytes;
3046  
3047  	/* ..and clear the end of the buffer for reads */
3048  	if (op == REQ_OP_READ) {
3049  		zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
3050  				truncated_bytes);
3051  	}
3052  }
3053  
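/*
 * Worked example for guard_bio_eod() above: a 4096-byte buffer at the
 * very end of a 10000-sector (512-byte units) device, i.e. a bio of
 * 8 sectors starting at sector 9996:
 *
 *	maxsector       = 10000 - 9996 = 4
 *	bi_size >> 9    = 8  (> 4, so the bio straddles the end)
 *	truncated_bytes = 4096 - (4 << 9) = 2048
 *
 * The bio and its final bio_vec are shrunk by 2048 bytes; for a read the
 * last 2048 bytes of the buffer are zeroed instead of being read.
 */
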
3054  static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
3055  			 unsigned long bio_flags, struct writeback_control *wbc)
3056  {
3057  	struct bio *bio;
3058  
3059  	BUG_ON(!buffer_locked(bh));
3060  	BUG_ON(!buffer_mapped(bh));
3061  	BUG_ON(!bh->b_end_io);
3062  	BUG_ON(buffer_delay(bh));
3063  	BUG_ON(buffer_unwritten(bh));
3064  
3065  	/*
3066  	 * Only clear out a write error when rewriting
3067  	 */
3068  	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
3069  		clear_buffer_write_io_error(bh);
3070  
3071  	/*
3072  	 * from here on down, it's all bio -- do the initial mapping,
3073  	 * submit_bio -> generic_make_request may further map this bio around
3074  	 */
3075  	bio = bio_alloc(GFP_NOIO, 1);
3076  
3077  	if (wbc) {
3078  		wbc_init_bio(wbc, bio);
3079  		wbc_account_io(wbc, bh->b_page, bh->b_size);
3080  	}
3081  
3082  	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3083  	bio->bi_bdev = bh->b_bdev;
3084  
3085  	bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
3086  	BUG_ON(bio->bi_iter.bi_size != bh->b_size);
3087  
3088  	bio->bi_end_io = end_bio_bh_io_sync;
3089  	bio->bi_private = bh;
3090  	bio->bi_flags |= bio_flags;
3091  
3092  	/* Take care of bh's that straddle the end of the device */
3093  	guard_bio_eod(op, bio);
3094  
3095  	if (buffer_meta(bh))
3096  		op_flags |= REQ_META;
3097  	if (buffer_prio(bh))
3098  		op_flags |= REQ_PRIO;
3099  	bio_set_op_attrs(bio, op, op_flags);
3100  
3101  	submit_bio(bio);
3102  	return 0;
3103  }
3104  
3105  int _submit_bh(int op, int op_flags, struct buffer_head *bh,
3106  	       unsigned long bio_flags)
3107  {
3108  	return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL);
3109  }
3110  EXPORT_SYMBOL_GPL(_submit_bh);
3111  
3112  int submit_bh(int op, int op_flags,  struct buffer_head *bh)
3113  {
3114  	return submit_bh_wbc(op, op_flags, bh, 0, NULL);
3115  }
3116  EXPORT_SYMBOL(submit_bh);
3117  
3118  /**
3119   * ll_rw_block: low-level access to block devices (DEPRECATED)
3120   * @op: whether to %REQ_OP_READ or %REQ_OP_WRITE
3121   * @op_flags: rq_flag_bits
3122   * @nr: number of &struct buffer_heads in the array
3123   * @bhs: array of pointers to &struct buffer_head
3124   *
3125   * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3126   * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
3127   * @op_flags contains flags modifying the detailed I/O behavior, most notably
3128   * %REQ_RAHEAD.
3129   *
3130   * This function drops any buffer that it cannot get a lock on (with the
3131   * BH_Lock state bit), any buffer that appears to be clean when doing a write
3132   * request, and any buffer that appears to be up-to-date when doing a read
3133   * request.  Further, it marks as clean the buffers that are processed for
3134   * writing (the buffer cache won't assume that they are actually clean
3135   * until the buffer gets unlocked).
3136   *
3137   * ll_rw_block sets b_end_io to a simple completion handler that marks
3138   * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3139   * any waiters.
3140   *
3141   * All of the buffers must be for the same device, and must also be a
3142   * multiple of the current approved size for the device.
3143   */
3144  void ll_rw_block(int op, int op_flags,  int nr, struct buffer_head *bhs[])
3145  {
3146  	int i;
3147  
3148  	for (i = 0; i < nr; i++) {
3149  		struct buffer_head *bh = bhs[i];
3150  
3151  		if (!trylock_buffer(bh))
3152  			continue;
3153  		if (op == WRITE) {
3154  			if (test_clear_buffer_dirty(bh)) {
3155  				bh->b_end_io = end_buffer_write_sync;
3156  				get_bh(bh);
3157  				submit_bh(op, op_flags, bh);
3158  				continue;
3159  			}
3160  		} else {
3161  			if (!buffer_uptodate(bh)) {
3162  				bh->b_end_io = end_buffer_read_sync;
3163  				get_bh(bh);
3164  				submit_bh(op, op_flags, bh);
3165  				continue;
3166  			}
3167  		}
3168  		unlock_buffer(bh);
3169  	}
3170  }
3171  EXPORT_SYMBOL(ll_rw_block);
3172  
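#if 0
/*
 * Illustrative sketch only (never compiled): the classic (deprecated)
 * ll_rw_block() pattern - kick off readahead on a batch of metadata
 * buffers, then read and wait on the single buffer actually needed.
 * The buffer_heads would have been obtained with sb_getblk()/__getblk().
 */
static int myfs_read_block_with_readahead(struct buffer_head *bh,
					  struct buffer_head *ra_bhs[], int nr)
{
	ll_rw_block(REQ_OP_READ, REQ_RAHEAD, nr, ra_bhs);
	ll_rw_block(REQ_OP_READ, 0, 1, &bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}
#endif
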
3173  void write_dirty_buffer(struct buffer_head *bh, int op_flags)
3174  {
3175  	lock_buffer(bh);
3176  	if (!test_clear_buffer_dirty(bh)) {
3177  		unlock_buffer(bh);
3178  		return;
3179  	}
3180  	bh->b_end_io = end_buffer_write_sync;
3181  	get_bh(bh);
3182  	submit_bh(REQ_OP_WRITE, op_flags, bh);
3183  }
3184  EXPORT_SYMBOL(write_dirty_buffer);
3185  
3186  /*
3187   * For a data-integrity writeout, we need to wait upon any in-progress I/O
3188   * and then start new I/O and then wait upon it.  The caller must have a ref on
3189   * the buffer_head.
3190   */
3191  int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
3192  {
3193  	int ret = 0;
3194  
3195  	WARN_ON(atomic_read(&bh->b_count) < 1);
3196  	lock_buffer(bh);
3197  	if (test_clear_buffer_dirty(bh)) {
3198  		get_bh(bh);
3199  		bh->b_end_io = end_buffer_write_sync;
3200  		ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
3201  		wait_on_buffer(bh);
3202  		if (!ret && !buffer_uptodate(bh))
3203  			ret = -EIO;
3204  	} else {
3205  		unlock_buffer(bh);
3206  	}
3207  	return ret;
3208  }
3209  EXPORT_SYMBOL(__sync_dirty_buffer);
3210  
3211  int sync_dirty_buffer(struct buffer_head *bh)
3212  {
3213  	return __sync_dirty_buffer(bh, WRITE_SYNC);
3214  }
3215  EXPORT_SYMBOL(sync_dirty_buffer);
3216  
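#if 0
/*
 * Illustrative sketch only (never compiled): the typical data-integrity
 * update of an on-disk metadata block - modify the buffer contents, mark
 * it dirty, then write it out synchronously.  "myfs_commit_metadata" is
 * a placeholder name.
 */
static int myfs_commit_metadata(struct buffer_head *bh)
{
	/* ... update the metadata image in bh->b_data ... */
	mark_buffer_dirty(bh);
	return sync_dirty_buffer(bh);	/* waits for the write, -EIO on error */
}
#endif
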
3217  /*
3218   * try_to_free_buffers() checks if all the buffers on this particular page
3219   * are unused, and releases them if so.
3220   *
3221   * Exclusion against try_to_free_buffers may be obtained by either
3222   * locking the page or by holding its mapping's private_lock.
3223   *
3224   * If the page is dirty but all the buffers are clean then we need to
3225   * be sure to mark the page clean as well.  This is because the page
3226   * may be against a block device, and a later reattachment of buffers
3227   * to a dirty page will set *all* buffers dirty.  Which would corrupt
3228   * filesystem data on the same device.
3229   *
3230   * The same applies to regular filesystem pages: if all the buffers are
3231   * clean then we set the page clean and proceed.  To do that, we require
3232   * total exclusion from __set_page_dirty_buffers().  That is obtained with
3233   * private_lock.
3234   *
3235   * try_to_free_buffers() is non-blocking.
3236   */
3237  static inline int buffer_busy(struct buffer_head *bh)
3238  {
3239  	return atomic_read(&bh->b_count) |
3240  		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3241  }
3242  
3243  static int
3244  drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3245  {
3246  	struct buffer_head *head = page_buffers(page);
3247  	struct buffer_head *bh;
3248  
3249  	bh = head;
3250  	do {
3251  		if (buffer_write_io_error(bh) && page->mapping)
3252  			mapping_set_error(page->mapping, -EIO);
3253  		if (buffer_busy(bh))
3254  			goto failed;
3255  		bh = bh->b_this_page;
3256  	} while (bh != head);
3257  
3258  	do {
3259  		struct buffer_head *next = bh->b_this_page;
3260  
3261  		if (bh->b_assoc_map)
3262  			__remove_assoc_queue(bh);
3263  		bh = next;
3264  	} while (bh != head);
3265  	*buffers_to_free = head;
3266  	__clear_page_buffers(page);
3267  	return 1;
3268  failed:
3269  	return 0;
3270  }
3271  
3272  int try_to_free_buffers(struct page *page)
3273  {
3274  	struct address_space * const mapping = page->mapping;
3275  	struct buffer_head *buffers_to_free = NULL;
3276  	int ret = 0;
3277  
3278  	BUG_ON(!PageLocked(page));
3279  	if (PageWriteback(page))
3280  		return 0;
3281  
3282  	if (mapping == NULL) {		/* can this still happen? */
3283  		ret = drop_buffers(page, &buffers_to_free);
3284  		goto out;
3285  	}
3286  
3287  	spin_lock(&mapping->private_lock);
3288  	ret = drop_buffers(page, &buffers_to_free);
3289  
3290  	/*
3291  	 * If the filesystem writes its buffers by hand (eg ext3)
3292  	 * then we can have clean buffers against a dirty page.  We
3293  	 * clean the page here; otherwise the VM will never notice
3294  	 * that the filesystem did any IO at all.
3295  	 *
3296  	 * Also, during truncate, discard_buffer will have marked all
3297  	 * the page's buffers clean.  We discover that here and clean
3298  	 * the page also.
3299  	 *
3300  	 * private_lock must be held over this entire operation in order
3301  	 * to synchronise against __set_page_dirty_buffers and prevent the
3302  	 * dirty bit from being lost.
3303  	 */
3304  	if (ret)
3305  		cancel_dirty_page(page);
3306  	spin_unlock(&mapping->private_lock);
3307  out:
3308  	if (buffers_to_free) {
3309  		struct buffer_head *bh = buffers_to_free;
3310  
3311  		do {
3312  			struct buffer_head *next = bh->b_this_page;
3313  			free_buffer_head(bh);
3314  			bh = next;
3315  		} while (bh != buffers_to_free);
3316  	}
3317  	return ret;
3318  }
3319  EXPORT_SYMBOL(try_to_free_buffers);
3320  
3321  /*
3322   * There are no bdflush tunables left.  But distributions are
3323   * still running obsolete flush daemons, so we terminate them here.
3324   *
3325   * Use of bdflush() is deprecated and will be removed in a future kernel.
3326   * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3327   */
3328  SYSCALL_DEFINE2(bdflush, int, func, long, data)
3329  {
3330  	static int msg_count;
3331  
3332  	if (!capable(CAP_SYS_ADMIN))
3333  		return -EPERM;
3334  
3335  	if (msg_count < 5) {
3336  		msg_count++;
3337  		printk(KERN_INFO
3338  			"warning: process `%s' used the obsolete bdflush"
3339  			" system call\n", current->comm);
3340  		printk(KERN_INFO "Fix your initscripts?\n");
3341  	}
3342  
3343  	if (func == 1)
3344  		do_exit(0);
3345  	return 0;
3346  }
3347  
3348  /*
3349   * Buffer-head allocation
3350   */
3351  static struct kmem_cache *bh_cachep __read_mostly;
3352  
3353  /*
3354   * Once the number of bh's in the machine exceeds this level, we start
3355   * stripping them in writeback.
3356   */
3357  static unsigned long max_buffer_heads;
3358  
3359  int buffer_heads_over_limit;
3360  
3361  struct bh_accounting {
3362  	int nr;			/* Number of live bh's */
3363  	int ratelimit;		/* Limit cacheline bouncing */
3364  };
3365  
3366  static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3367  
3368  static void recalc_bh_state(void)
3369  {
3370  	int i;
3371  	int tot = 0;
3372  
3373  	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3374  		return;
3375  	__this_cpu_write(bh_accounting.ratelimit, 0);
3376  	for_each_online_cpu(i)
3377  		tot += per_cpu(bh_accounting, i).nr;
3378  	buffer_heads_over_limit = (tot > max_buffer_heads);
3379  }
3380  
3381  struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3382  {
3383  	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3384  	if (ret) {
3385  		INIT_LIST_HEAD(&ret->b_assoc_buffers);
3386  		preempt_disable();
3387  		__this_cpu_inc(bh_accounting.nr);
3388  		recalc_bh_state();
3389  		preempt_enable();
3390  	}
3391  	return ret;
3392  }
3393  EXPORT_SYMBOL(alloc_buffer_head);
3394  
3395  void free_buffer_head(struct buffer_head *bh)
3396  {
3397  	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3398  	kmem_cache_free(bh_cachep, bh);
3399  	preempt_disable();
3400  	__this_cpu_dec(bh_accounting.nr);
3401  	recalc_bh_state();
3402  	preempt_enable();
3403  }
3404  EXPORT_SYMBOL(free_buffer_head);
3405  
3406  static void buffer_exit_cpu(int cpu)
3407  {
3408  	int i;
3409  	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3410  
3411  	for (i = 0; i < BH_LRU_SIZE; i++) {
3412  		brelse(b->bhs[i]);
3413  		b->bhs[i] = NULL;
3414  	}
3415  	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3416  	per_cpu(bh_accounting, cpu).nr = 0;
3417  }
3418  
3419  static int buffer_cpu_notify(struct notifier_block *self,
3420  			      unsigned long action, void *hcpu)
3421  {
3422  	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3423  		buffer_exit_cpu((unsigned long)hcpu);
3424  	return NOTIFY_OK;
3425  }
3426  
3427  /**
3428   * bh_uptodate_or_lock - Test whether the buffer is uptodate
3429   * @bh: struct buffer_head
3430   *
3431   * Returns 1 if the buffer is up-to-date, or 0 (with the
3432   * buffer locked) if it is not.
3433   */
3434  int bh_uptodate_or_lock(struct buffer_head *bh)
3435  {
3436  	if (!buffer_uptodate(bh)) {
3437  		lock_buffer(bh);
3438  		if (!buffer_uptodate(bh))
3439  			return 0;
3440  		unlock_buffer(bh);
3441  	}
3442  	return 1;
3443  }
3444  EXPORT_SYMBOL(bh_uptodate_or_lock);
3445  
3446  /**
3447   * bh_submit_read - Submit a locked buffer for reading
3448   * @bh: struct buffer_head
3449   *
3450   * Returns zero on success and -EIO on error.
3451   */
3452  int bh_submit_read(struct buffer_head *bh)
3453  {
3454  	BUG_ON(!buffer_locked(bh));
3455  
3456  	if (buffer_uptodate(bh)) {
3457  		unlock_buffer(bh);
3458  		return 0;
3459  	}
3460  
3461  	get_bh(bh);
3462  	bh->b_end_io = end_buffer_read_sync;
3463  	submit_bh(REQ_OP_READ, 0, bh);
3464  	wait_on_buffer(bh);
3465  	if (buffer_uptodate(bh))
3466  		return 0;
3467  	return -EIO;
3468  }
3469  EXPORT_SYMBOL(bh_submit_read);
3470  
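#if 0
/*
 * Illustrative sketch only (never compiled): the intended pairing of the
 * two helpers above - submit a read only when the buffer is not already
 * uptodate.  bh_uptodate_or_lock() returns with the buffer locked in the
 * "not uptodate" case, which is exactly what bh_submit_read() expects.
 */
static int myfs_read_metadata_block(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;
	return bh_submit_read(bh);
}
#endif
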
3471  void __init buffer_init(void)
3472  {
3473  	unsigned long nrpages;
3474  
3475  	bh_cachep = kmem_cache_create("buffer_head",
3476  			sizeof(struct buffer_head), 0,
3477  				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3478  				SLAB_MEM_SPREAD),
3479  				NULL);
3480  
3481  	/*
3482  	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3483  	 */
3484  	nrpages = (nr_free_buffer_pages() * 10) / 100;
3485  	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3486  	hotcpu_notifier(buffer_cpu_notify, 0);
3487  }
3488