1 /*
2  * fs/fs-writeback.c
3  *
4  * Copyright (C) 2002, Linus Torvalds.
5  *
6  * Contains all the functions related to writing back and waiting
7  * upon dirty inodes against superblocks, and writing back dirty
8  * pages against inodes.  i.e. data writeback.  Writeout of the
9  * inode itself is not handled here.
10  *
11  * 10Apr2002	Andrew Morton
12  *		Split out of fs/inode.c
13  *		Additions for address_space-based writeback
14  */
15 
16 #include <linux/kernel.h>
17 #include <linux/export.h>
18 #include <linux/spinlock.h>
19 #include <linux/slab.h>
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/mm.h>
23 #include <linux/pagemap.h>
24 #include <linux/kthread.h>
25 #include <linux/writeback.h>
26 #include <linux/blkdev.h>
27 #include <linux/backing-dev.h>
28 #include <linux/tracepoint.h>
29 #include <linux/device.h>
30 #include "internal.h"
31 
32 /*
33  * 4MB minimal write chunk size
34  */
35 #define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
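/*
 * Example: with the common 4KB page size PAGE_CACHE_SHIFT is 12, so the
 * macro evaluates to 4096 >> 2 == 1024 pages, i.e. 4MB worth of pages.
 */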
36 
37 /*
38  * Passed into wb_writeback(), essentially a subset of writeback_control
39  */
40 struct wb_writeback_work {
41 	long nr_pages;
42 	struct super_block *sb;
43 	unsigned long *older_than_this;
44 	enum writeback_sync_modes sync_mode;
45 	unsigned int tagged_writepages:1;
46 	unsigned int for_kupdate:1;
47 	unsigned int range_cyclic:1;
48 	unsigned int for_background:1;
49 	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
50 	enum wb_reason reason;		/* why was writeback initiated? */
51 
52 	struct list_head list;		/* pending work list */
53 	struct completion *done;	/* set if the caller waits */
54 };
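/*
 * Work item life cycle as implemented in this file: an item is either
 * allocated in __bdi_start_writeback() or declared on a caller's stack
 * (e.g. sync_inodes_sb()), queued on bdi->work_list by bdi_queue_work(),
 * dequeued by get_next_work_item() and executed by wb_do_writeback().
 * If ->done is set the queuer waits for completion, otherwise the
 * executing worker kfree()s the item.
 */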
55 
56 /**
57  * writeback_in_progress - determine whether there is writeback in progress
58  * @bdi: the device's backing_dev_info structure.
59  *
60  * Determine whether there is writeback waiting to be handled against a
61  * backing device.
62  */
63 int writeback_in_progress(struct backing_dev_info *bdi)
64 {
65 	return test_bit(BDI_writeback_running, &bdi->state);
66 }
67 EXPORT_SYMBOL(writeback_in_progress);
68 
69 static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
70 {
71 	struct super_block *sb = inode->i_sb;
72 
73 	if (sb_is_blkdev_sb(sb))
74 		return inode->i_mapping->backing_dev_info;
75 
76 	return sb->s_bdi;
77 }
78 
79 static inline struct inode *wb_inode(struct list_head *head)
80 {
81 	return list_entry(head, struct inode, i_wb_list);
82 }
83 
84 /*
85  * Include the creation of the trace points after defining the
86  * wb_writeback_work structure and inline functions so that the definition
87  * remains local to this file.
88  */
89 #define CREATE_TRACE_POINTS
90 #include <trace/events/writeback.h>
91 
92 EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
93 
94 static void bdi_wakeup_thread(struct backing_dev_info *bdi)
95 {
96 	spin_lock_bh(&bdi->wb_lock);
97 	if (test_bit(BDI_registered, &bdi->state))
98 		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
99 	spin_unlock_bh(&bdi->wb_lock);
100 }
101 
102 static void bdi_queue_work(struct backing_dev_info *bdi,
103 			   struct wb_writeback_work *work)
104 {
105 	trace_writeback_queue(bdi, work);
106 
107 	spin_lock_bh(&bdi->wb_lock);
108 	if (!test_bit(BDI_registered, &bdi->state)) {
109 		if (work->done)
110 			complete(work->done);
111 		goto out_unlock;
112 	}
113 	list_add_tail(&work->list, &bdi->work_list);
114 	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
115 out_unlock:
116 	spin_unlock_bh(&bdi->wb_lock);
117 }
118 
119 static void
120 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
121 		      bool range_cyclic, enum wb_reason reason)
122 {
123 	struct wb_writeback_work *work;
124 
125 	/*
126 	 * This is WB_SYNC_NONE writeback, so if allocation fails just
127 	 * wakeup the thread for old dirty data writeback
128 	 */
129 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
130 	if (!work) {
131 		trace_writeback_nowork(bdi);
132 		bdi_wakeup_thread(bdi);
133 		return;
134 	}
135 
136 	work->sync_mode	= WB_SYNC_NONE;
137 	work->nr_pages	= nr_pages;
138 	work->range_cyclic = range_cyclic;
139 	work->reason	= reason;
140 
141 	bdi_queue_work(bdi, work);
142 }
143 
144 /**
145  * bdi_start_writeback - start writeback
146  * @bdi: the backing device to write from
147  * @nr_pages: the number of pages to write
148  * @reason: reason why some writeback work was initiated
149  *
150  * Description:
151  *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
152  *   started when this function returns; we make no guarantees on
153  *   completion. The caller need not hold the sb s_umount semaphore.
154  *
155  */
156 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
157 			enum wb_reason reason)
158 {
159 	__bdi_start_writeback(bdi, nr_pages, true, reason);
160 }
161 
162 /**
163  * bdi_start_background_writeback - start background writeback
164  * @bdi: the backing device to write from
165  *
166  * Description:
167  *   This makes sure WB_SYNC_NONE background writeback happens. When
168  *   this function returns, it is only guaranteed that for the given BDI
169  *   some IO is happening if we are over the background dirty threshold.
170  *   The caller need not hold the sb s_umount semaphore.
171  */
172 void bdi_start_background_writeback(struct backing_dev_info *bdi)
173 {
174 	/*
175 	 * We just wake up the flusher thread. It will perform background
176 	 * writeback as soon as there is no other work to do.
177 	 */
178 	trace_writeback_wake_background(bdi);
179 	bdi_wakeup_thread(bdi);
180 }
181 
182 /*
183  * Remove the inode from the writeback list it is on.
184  */
185 void inode_wb_list_del(struct inode *inode)
186 {
187 	struct backing_dev_info *bdi = inode_to_bdi(inode);
188 
189 	spin_lock(&bdi->wb.list_lock);
190 	list_del_init(&inode->i_wb_list);
191 	spin_unlock(&bdi->wb.list_lock);
192 }
193 
194 /*
195  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
196  * furthest end of its superblock's dirty-inode list.
197  *
198  * Before stamping the inode's ->dirtied_when, we check to see whether it is
199  * already the most-recently-dirtied inode on the b_dirty list.  If that is
200  * the case then the inode must have been redirtied while it was being written
201  * out and we don't reset its dirtied_when.
202  */
203 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
204 {
205 	assert_spin_locked(&wb->list_lock);
206 	if (!list_empty(&wb->b_dirty)) {
207 		struct inode *tail;
208 
209 		tail = wb_inode(wb->b_dirty.next);
210 		if (time_before(inode->dirtied_when, tail->dirtied_when))
211 			inode->dirtied_when = jiffies;
212 	}
213 	list_move(&inode->i_wb_list, &wb->b_dirty);
214 }
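/*
 * Note on list order: __mark_inode_dirty() adds freshly dirtied inodes at
 * the head of b_dirty, so the oldest inodes sit at the tail, which is where
 * move_expired_inodes() pulls them off.  redirty_tail() therefore moves
 * the inode to the most-recently-dirtied end of the list.
 */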
215 
216 /*
217  * requeue inode for re-scanning after bdi->b_io list is exhausted.
218  */
219 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
220 {
221 	assert_spin_locked(&wb->list_lock);
222 	list_move(&inode->i_wb_list, &wb->b_more_io);
223 }
224 
225 static void inode_sync_complete(struct inode *inode)
226 {
227 	inode->i_state &= ~I_SYNC;
228 	/* If inode is clean and unused, put it into LRU now... */
229 	inode_add_lru(inode);
230 	/* Waiters must see I_SYNC cleared before being woken up */
231 	smp_mb();
232 	wake_up_bit(&inode->i_state, __I_SYNC);
233 }
234 
235 static bool inode_dirtied_after(struct inode *inode, unsigned long t)
236 {
237 	bool ret = time_after(inode->dirtied_when, t);
238 #ifndef CONFIG_64BIT
239 	/*
240 	 * For inodes being constantly redirtied, dirtied_when can get stuck.
241 	 * It _appears_ to be in the future, but is actually in distant past.
242 	 * This test is necessary to prevent such wrapped-around relative times
243 	 * from permanently stopping the whole bdi writeback.
244 	 */
245 	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
246 #endif
247 	return ret;
248 }
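/*
 * Example of the wraparound handled above: on a 32-bit box with HZ=1000,
 * jiffies wraps roughly every 2^32 ms, i.e. about 49.7 days, so a stale
 * dirtied_when can easily appear to lie in the future.
 */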
249 
250 /*
251  * Move expired (dirtied before work->older_than_this) dirty inodes from
252  * @delaying_queue to @dispatch_queue.
253  */
254 static int move_expired_inodes(struct list_head *delaying_queue,
255 			       struct list_head *dispatch_queue,
256 			       struct wb_writeback_work *work)
257 {
258 	LIST_HEAD(tmp);
259 	struct list_head *pos, *node;
260 	struct super_block *sb = NULL;
261 	struct inode *inode;
262 	int do_sb_sort = 0;
263 	int moved = 0;
264 
265 	while (!list_empty(delaying_queue)) {
266 		inode = wb_inode(delaying_queue->prev);
267 		if (work->older_than_this &&
268 		    inode_dirtied_after(inode, *work->older_than_this))
269 			break;
270 		list_move(&inode->i_wb_list, &tmp);
271 		moved++;
272 		if (sb_is_blkdev_sb(inode->i_sb))
273 			continue;
274 		if (sb && sb != inode->i_sb)
275 			do_sb_sort = 1;
276 		sb = inode->i_sb;
277 	}
278 
279 	/* just one sb in list, splice to dispatch_queue and we're done */
280 	if (!do_sb_sort) {
281 		list_splice(&tmp, dispatch_queue);
282 		goto out;
283 	}
284 
285 	/* Move inodes from one superblock together */
286 	while (!list_empty(&tmp)) {
287 		sb = wb_inode(tmp.prev)->i_sb;
288 		list_for_each_prev_safe(pos, node, &tmp) {
289 			inode = wb_inode(pos);
290 			if (inode->i_sb == sb)
291 				list_move(&inode->i_wb_list, dispatch_queue);
292 		}
293 	}
294 out:
295 	return moved;
296 }
297 
298 /*
299  * Queue all expired dirty inodes for io, eldest first.
300  * Before
301  *         newly dirtied     b_dirty    b_io    b_more_io
302  *         =============>    gf         edc     BA
303  * After
304  *         newly dirtied     b_dirty    b_io    b_more_io
305  *         =============>    g          fBAedc
306  *                                           |
307  *                                           +--> dequeue for IO
308  */
309 static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
310 {
311 	int moved;
312 	assert_spin_locked(&wb->list_lock);
313 	list_splice_init(&wb->b_more_io, &wb->b_io);
314 	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
315 	trace_writeback_queue_io(wb, work, moved);
316 }
317 
318 static int write_inode(struct inode *inode, struct writeback_control *wbc)
319 {
320 	int ret;
321 
322 	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) {
323 		trace_writeback_write_inode_start(inode, wbc);
324 		ret = inode->i_sb->s_op->write_inode(inode, wbc);
325 		trace_writeback_write_inode(inode, wbc);
326 		return ret;
327 	}
328 	return 0;
329 }
330 
331 /*
332  * Wait for writeback on an inode to complete. Called with i_lock held.
333  * Caller must make sure inode cannot go away when we drop i_lock.
334  */
335 static void __inode_wait_for_writeback(struct inode *inode)
336 	__releases(inode->i_lock)
337 	__acquires(inode->i_lock)
338 {
339 	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
340 	wait_queue_head_t *wqh;
341 
342 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
343 	while (inode->i_state & I_SYNC) {
344 		spin_unlock(&inode->i_lock);
345 		__wait_on_bit(wqh, &wq, bit_wait,
346 			      TASK_UNINTERRUPTIBLE);
347 		spin_lock(&inode->i_lock);
348 	}
349 }
350 
351 /*
352  * Wait for writeback on an inode to complete. Caller must have inode pinned.
353  */
354 void inode_wait_for_writeback(struct inode *inode)
355 {
356 	spin_lock(&inode->i_lock);
357 	__inode_wait_for_writeback(inode);
358 	spin_unlock(&inode->i_lock);
359 }
360 
361 /*
362  * Sleep until I_SYNC is cleared. This function must be called with i_lock
363  * held and drops it. It is aimed at callers not holding any inode reference,
364  * so once i_lock is dropped, the inode can go away.
365  */
366 static void inode_sleep_on_writeback(struct inode *inode)
367 	__releases(inode->i_lock)
368 {
369 	DEFINE_WAIT(wait);
370 	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
371 	int sleep;
372 
373 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
374 	sleep = inode->i_state & I_SYNC;
375 	spin_unlock(&inode->i_lock);
376 	if (sleep)
377 		schedule();
378 	finish_wait(wqh, &wait);
379 }
380 
381 /*
382  * Find the proper writeback list for the inode based on its current state and
383  * on any change of its state that happened while we were doing writeback.  Here
384  * we handle things such as livelock prevention and fairness of writeback among
385  * inodes. This function may only be called by the flusher thread - no one else
386  * processes all inodes on the writeback lists, and requeueing inodes behind the
387  * flusher thread's back can have unexpected consequences.
388  */
389 static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
390 			  struct writeback_control *wbc)
391 {
392 	if (inode->i_state & I_FREEING)
393 		return;
394 
395 	/*
396 	 * Sync livelock prevention. Each inode is tagged and synced in one
397 	 * shot. If still dirty, it will be redirty_tail()'ed below.  Update
398 	 * the dirty time to prevent enqueue and sync it again.
399 	 */
400 	if ((inode->i_state & I_DIRTY) &&
401 	    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
402 		inode->dirtied_when = jiffies;
403 
404 	if (wbc->pages_skipped) {
405 		/*
406 		 * writeback is not making progress due to locked
407 		 * buffers. Skip this inode for now.
408 		 */
409 		redirty_tail(inode, wb);
410 		return;
411 	}
412 
413 	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
414 		/*
415 		 * We didn't write back all the pages.  nfs_writepages()
416 		 * sometimes bails out without doing anything.
417 		 */
418 		if (wbc->nr_to_write <= 0) {
419 			/* Slice used up. Queue for next turn. */
420 			requeue_io(inode, wb);
421 		} else {
422 			/*
423 			 * Writeback blocked by something other than
424 			 * congestion. Delay the inode for some time to
425 			 * avoid spinning on the CPU (100% iowait)
426 			 * retrying writeback of the dirty page/inode
427 			 * that cannot be performed immediately.
428 			 */
429 			redirty_tail(inode, wb);
430 		}
431 	} else if (inode->i_state & I_DIRTY) {
432 		/*
433 		 * Filesystems can dirty the inode during writeback operations,
434 		 * such as delayed allocation during submission or metadata
435 		 * updates after data IO completion.
436 		 */
437 		redirty_tail(inode, wb);
438 	} else {
439 		/* The inode is clean. Remove from writeback lists. */
440 		list_del_init(&inode->i_wb_list);
441 	}
442 }
443 
444 /*
445  * Write out an inode and its dirty pages. Do not update the writeback list
446  * linkage. That is left to the caller. The caller is also responsible for
447  * setting the I_SYNC flag and calling inode_sync_complete() to clear it.
448  */
449 static int
450 __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
451 {
452 	struct address_space *mapping = inode->i_mapping;
453 	long nr_to_write = wbc->nr_to_write;
454 	unsigned dirty;
455 	int ret;
456 
457 	WARN_ON(!(inode->i_state & I_SYNC));
458 
459 	trace_writeback_single_inode_start(inode, wbc, nr_to_write);
460 
461 	ret = do_writepages(mapping, wbc);
462 
463 	/*
464 	 * Make sure to wait on the data before writing out the metadata.
465 	 * This is important for filesystems that modify metadata on data
466 	 * I/O completion. We don't do it for sync(2) writeback because it has a
467 	 * separate, external IO completion path and ->sync_fs for guaranteeing
468 	 * inode metadata is written back correctly.
469 	 */
470 	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
471 		int err = filemap_fdatawait(mapping);
472 		if (ret == 0)
473 			ret = err;
474 	}
475 
476 	/*
477 	 * Some filesystems may redirty the inode during the writeback
478 	 * due to delalloc, clear dirty metadata flags right before
479 	 * write_inode()
480 	 */
481 	spin_lock(&inode->i_lock);
482 
483 	dirty = inode->i_state & I_DIRTY;
484 	inode->i_state &= ~I_DIRTY;
485 
486 	/*
487 	 * Paired with smp_mb() in __mark_inode_dirty().  This allows
488 	 * __mark_inode_dirty() to test i_state without grabbing i_lock -
489 	 * either they see the I_DIRTY bits cleared or we see the dirtied
490 	 * inode.
491 	 *
492 	 * I_DIRTY_PAGES is always cleared together above even if @mapping
493 	 * still has dirty pages.  The flag is reinstated after smp_mb() if
494 	 * necessary.  This guarantees that either __mark_inode_dirty()
495 	 * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
496 	 */
497 	smp_mb();
498 
499 	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
500 		inode->i_state |= I_DIRTY_PAGES;
501 
502 	spin_unlock(&inode->i_lock);
503 
504 	/* Don't write the inode if only I_DIRTY_PAGES was set */
505 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
506 		int err = write_inode(inode, wbc);
507 		if (ret == 0)
508 			ret = err;
509 	}
510 	trace_writeback_single_inode(inode, wbc, nr_to_write);
511 	return ret;
512 }
513 
514 /*
515  * Write out an inode's dirty pages. Either the caller has an active reference
516  * on the inode or the inode has I_WILL_FREE set.
517  *
518  * This function is designed for writing back one inode at a time, as callers
519  * such as filesystems do. The flusher thread uses __writeback_single_inode()
520  * and does more elaborate writeback list handling in writeback_sb_inodes().
521  */
522 static int
523 writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
524 		       struct writeback_control *wbc)
525 {
526 	int ret = 0;
527 
528 	spin_lock(&inode->i_lock);
529 	if (!atomic_read(&inode->i_count))
530 		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
531 	else
532 		WARN_ON(inode->i_state & I_WILL_FREE);
533 
534 	if (inode->i_state & I_SYNC) {
535 		if (wbc->sync_mode != WB_SYNC_ALL)
536 			goto out;
537 		/*
538 		 * It's a data-integrity sync. We must wait. Since callers hold
539 		 * inode reference or inode has I_WILL_FREE set, it cannot go
540 		 * away under us.
541 		 */
542 		__inode_wait_for_writeback(inode);
543 	}
544 	WARN_ON(inode->i_state & I_SYNC);
545 	/*
546 	 * Skip inode if it is clean and we have no outstanding writeback in
547 	 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
548 	 * function since the flusher thread may, for example, be doing a sync in
549 	 * parallel and if we moved the inode, it could get skipped. So here we
550 	 * make sure the inode stays on some writeback list and leave it there unless
551 	 * we have completely cleaned the inode.
552 	 */
553 	if (!(inode->i_state & I_DIRTY) &&
554 	    (wbc->sync_mode != WB_SYNC_ALL ||
555 	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
556 		goto out;
557 	inode->i_state |= I_SYNC;
558 	spin_unlock(&inode->i_lock);
559 
560 	ret = __writeback_single_inode(inode, wbc);
561 
562 	spin_lock(&wb->list_lock);
563 	spin_lock(&inode->i_lock);
564 	/*
565 	 * If inode is clean, remove it from writeback lists. Otherwise don't
566 	 * touch it. See comment above for explanation.
567 	 */
568 	if (!(inode->i_state & I_DIRTY))
569 		list_del_init(&inode->i_wb_list);
570 	spin_unlock(&wb->list_lock);
571 	inode_sync_complete(inode);
572 out:
573 	spin_unlock(&inode->i_lock);
574 	return ret;
575 }
576 
577 static long writeback_chunk_size(struct backing_dev_info *bdi,
578 				 struct wb_writeback_work *work)
579 {
580 	long pages;
581 
582 	/*
583 	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
584 	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
585 	 * here avoids calling into writeback_inodes_wb() more than once.
586 	 *
587 	 * The intended call sequence for WB_SYNC_ALL writeback is:
588 	 *
589 	 *      wb_writeback()
590 	 *          writeback_sb_inodes()       <== called only once
591 	 *              write_cache_pages()     <== called once for each inode
592 	 *                   (quickly) tag currently dirty pages
593 	 *                   (maybe slowly) sync all tagged pages
594 	 */
595 	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
596 		pages = LONG_MAX;
597 	else {
598 		pages = min(bdi->avg_write_bandwidth / 2,
599 			    global_dirty_limit / DIRTY_SCOPE);
600 		pages = min(pages, work->nr_pages);
601 		pages = round_down(pages + MIN_WRITEBACK_PAGES,
602 				   MIN_WRITEBACK_PAGES);
603 	}
604 
605 	return pages;
606 }
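/*
 * Example of the rounding above (hypothetical numbers): with 4KB pages
 * MIN_WRITEBACK_PAGES is 1024, so pages == 2500 becomes
 * round_down(2500 + 1024, 1024) == 3072 - always a multiple of the 4MB
 * minimum chunk and never less than it.
 */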
607 
608 /*
609  * Write a portion of b_io inodes which belong to @sb.
610  *
611  * Return the number of pages and/or inodes written.
612  */
613 static long writeback_sb_inodes(struct super_block *sb,
614 				struct bdi_writeback *wb,
615 				struct wb_writeback_work *work)
616 {
617 	struct writeback_control wbc = {
618 		.sync_mode		= work->sync_mode,
619 		.tagged_writepages	= work->tagged_writepages,
620 		.for_kupdate		= work->for_kupdate,
621 		.for_background		= work->for_background,
622 		.for_sync		= work->for_sync,
623 		.range_cyclic		= work->range_cyclic,
624 		.range_start		= 0,
625 		.range_end		= LLONG_MAX,
626 	};
627 	unsigned long start_time = jiffies;
628 	long write_chunk;
629 	long wrote = 0;  /* count both pages and inodes */
630 
631 	while (!list_empty(&wb->b_io)) {
632 		struct inode *inode = wb_inode(wb->b_io.prev);
633 
634 		if (inode->i_sb != sb) {
635 			if (work->sb) {
636 				/*
637 				 * We only want to write back data for this
638 				 * superblock, move all inodes not belonging
639 				 * to it back onto the dirty list.
640 				 */
641 				redirty_tail(inode, wb);
642 				continue;
643 			}
644 
645 			/*
646 			 * The inode belongs to a different superblock.
647 			 * Bounce back to the caller to unpin this and
648 			 * pin the next superblock.
649 			 */
650 			break;
651 		}
652 
653 		/*
654 		 * Don't bother with new inodes or inodes being freed: the former
655 		 * do not need periodic writeout yet, and for the latter
656 		 * writeout is handled by the freer.
657 		 */
658 		spin_lock(&inode->i_lock);
659 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
660 			spin_unlock(&inode->i_lock);
661 			redirty_tail(inode, wb);
662 			continue;
663 		}
664 		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
665 			/*
666 			 * If this inode is locked for writeback and we are not
667 			 * doing writeback-for-data-integrity, move it to
668 			 * b_more_io so that writeback can proceed with the
669 			 * other inodes on s_io.
670 			 *
671 			 * We'll have another go at writing back this inode
672 			 * when we have completed a full scan of b_io.
673 			 */
674 			spin_unlock(&inode->i_lock);
675 			requeue_io(inode, wb);
676 			trace_writeback_sb_inodes_requeue(inode);
677 			continue;
678 		}
679 		spin_unlock(&wb->list_lock);
680 
681 		/*
682 		 * We already requeued the inode if it had I_SYNC set and we
683 		 * are doing WB_SYNC_NONE writeback. So this catches only the
684 		 * WB_SYNC_ALL case.
685 		 */
686 		if (inode->i_state & I_SYNC) {
687 			/* Wait for I_SYNC. This function drops i_lock... */
688 			inode_sleep_on_writeback(inode);
689 			/* Inode may be gone, start again */
690 			spin_lock(&wb->list_lock);
691 			continue;
692 		}
693 		inode->i_state |= I_SYNC;
694 		spin_unlock(&inode->i_lock);
695 
696 		write_chunk = writeback_chunk_size(wb->bdi, work);
697 		wbc.nr_to_write = write_chunk;
698 		wbc.pages_skipped = 0;
699 
700 		/*
701 		 * We use I_SYNC to pin the inode in memory. While it is set
702 		 * evict_inode() will wait so the inode cannot be freed.
703 		 */
704 		__writeback_single_inode(inode, &wbc);
705 
706 		work->nr_pages -= write_chunk - wbc.nr_to_write;
707 		wrote += write_chunk - wbc.nr_to_write;
708 		spin_lock(&wb->list_lock);
709 		spin_lock(&inode->i_lock);
710 		if (!(inode->i_state & I_DIRTY))
711 			wrote++;
712 		requeue_inode(inode, wb, &wbc);
713 		inode_sync_complete(inode);
714 		spin_unlock(&inode->i_lock);
715 		cond_resched_lock(&wb->list_lock);
716 		/*
717 		 * bail out to wb_writeback() often enough to check
718 		 * background threshold and other termination conditions.
719 		 */
720 		if (wrote) {
721 			if (time_is_before_jiffies(start_time + HZ / 10UL))
722 				break;
723 			if (work->nr_pages <= 0)
724 				break;
725 		}
726 	}
727 	return wrote;
728 }
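/*
 * The HZ / 10 check above makes the loop bail back to wb_writeback()
 * roughly every 100ms once some progress has been made (the check runs
 * once per inode chunk), so the background threshold and other
 * termination conditions get re-evaluated reasonably often.
 */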
729 
730 static long __writeback_inodes_wb(struct bdi_writeback *wb,
731 				  struct wb_writeback_work *work)
732 {
733 	unsigned long start_time = jiffies;
734 	long wrote = 0;
735 
736 	while (!list_empty(&wb->b_io)) {
737 		struct inode *inode = wb_inode(wb->b_io.prev);
738 		struct super_block *sb = inode->i_sb;
739 
740 		if (!grab_super_passive(sb)) {
741 			/*
742 			 * grab_super_passive() may fail consistently due to
743 			 * s_umount being grabbed by someone else. Don't use
744 			 * requeue_io() to avoid busy retrying the inode/sb.
745 			 */
746 			redirty_tail(inode, wb);
747 			continue;
748 		}
749 		wrote += writeback_sb_inodes(sb, wb, work);
750 		drop_super(sb);
751 
752 		/* refer to the same tests at the end of writeback_sb_inodes */
753 		if (wrote) {
754 			if (time_is_before_jiffies(start_time + HZ / 10UL))
755 				break;
756 			if (work->nr_pages <= 0)
757 				break;
758 		}
759 	}
760 	/* Leave any unwritten inodes on b_io */
761 	return wrote;
762 }
763 
764 static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
765 				enum wb_reason reason)
766 {
767 	struct wb_writeback_work work = {
768 		.nr_pages	= nr_pages,
769 		.sync_mode	= WB_SYNC_NONE,
770 		.range_cyclic	= 1,
771 		.reason		= reason,
772 	};
773 
774 	spin_lock(&wb->list_lock);
775 	if (list_empty(&wb->b_io))
776 		queue_io(wb, &work);
777 	__writeback_inodes_wb(wb, &work);
778 	spin_unlock(&wb->list_lock);
779 
780 	return nr_pages - work.nr_pages;
781 }
782 
783 static bool over_bground_thresh(struct backing_dev_info *bdi)
784 {
785 	unsigned long background_thresh, dirty_thresh;
786 
787 	global_dirty_limits(&background_thresh, &dirty_thresh);
788 
789 	if (global_page_state(NR_FILE_DIRTY) +
790 	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
791 		return true;
792 
793 	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
794 				bdi_dirty_limit(bdi, background_thresh))
795 		return true;
796 
797 	return false;
798 }
799 
800 /*
801  * Called under wb->list_lock. If there are multiple wb per bdi,
802  * only the flusher working on the first wb should do it.
803  */
804 static void wb_update_bandwidth(struct bdi_writeback *wb,
805 				unsigned long start_time)
806 {
807 	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
808 }
809 
810 /*
811  * Explicit flushing or periodic writeback of "old" data.
812  *
813  * Define "old": the first time one of an inode's pages is dirtied, we mark the
814  * dirtying-time in the inode's address_space.  So this periodic writeback code
815  * just walks the superblock inode list, writing back any inodes which are
816  * older than a specific point in time.
817  *
818  * Try to run once per dirty_writeback_interval.  But if a writeback event
819  * takes longer than one dirty_writeback_interval, then leave a
820  * one-second gap.
821  *
822  * older_than_this takes precedence over nr_to_write.  So we'll only write back
823  * all dirty pages if they are all attached to "old" mappings.
824  */
825 static long wb_writeback(struct bdi_writeback *wb,
826 			 struct wb_writeback_work *work)
827 {
828 	unsigned long wb_start = jiffies;
829 	long nr_pages = work->nr_pages;
830 	unsigned long oldest_jif;
831 	struct inode *inode;
832 	long progress;
833 
834 	oldest_jif = jiffies;
835 	work->older_than_this = &oldest_jif;
836 
837 	spin_lock(&wb->list_lock);
838 	for (;;) {
839 		/*
840 		 * Stop writeback when nr_pages has been consumed
841 		 */
842 		if (work->nr_pages <= 0)
843 			break;
844 
845 		/*
846 		 * Background writeout and kupdate-style writeback may
847 		 * run forever. Stop them if there is other work to do
848 		 * so that e.g. sync can proceed. They'll be restarted
849 		 * after the other works are all done.
850 		 */
851 		if ((work->for_background || work->for_kupdate) &&
852 		    !list_empty(&wb->bdi->work_list))
853 			break;
854 
855 		/*
856 		 * For background writeout, stop when we are below the
857 		 * background dirty threshold
858 		 */
859 		if (work->for_background && !over_bground_thresh(wb->bdi))
860 			break;
861 
862 		/*
863 		 * Kupdate and background works are special and we want to
864 		 * include all inodes that need writing. Livelock avoidance is
865 		 * handled by these works yielding to any other work so we are
866 		 * safe.
867 		 */
868 		if (work->for_kupdate) {
869 			oldest_jif = jiffies -
870 				msecs_to_jiffies(dirty_expire_interval * 10);
871 		} else if (work->for_background)
872 			oldest_jif = jiffies;
873 
874 		trace_writeback_start(wb->bdi, work);
875 		if (list_empty(&wb->b_io))
876 			queue_io(wb, work);
877 		if (work->sb)
878 			progress = writeback_sb_inodes(work->sb, wb, work);
879 		else
880 			progress = __writeback_inodes_wb(wb, work);
881 		trace_writeback_written(wb->bdi, work);
882 
883 		wb_update_bandwidth(wb, wb_start);
884 
885 		/*
886 		 * Did we write something? Try for more
887 		 *
888 		 * Dirty inodes are moved to b_io for writeback in batches.
889 		 * The completion of the current batch does not necessarily
890 		 * mean the overall work is done. So we keep looping as long
891 		 * as we made some progress on cleaning pages or inodes.
892 		 */
893 		if (progress)
894 			continue;
895 		/*
896 		 * No more inodes for IO, bail
897 		 */
898 		if (list_empty(&wb->b_more_io))
899 			break;
900 		/*
901 		 * Nothing written. Wait for some inode to
902 		 * become available for writeback. Otherwise
903 		 * we'll just busyloop.
904 		 */
905 		if (!list_empty(&wb->b_more_io))  {
906 			trace_writeback_wait(wb->bdi, work);
907 			inode = wb_inode(wb->b_more_io.prev);
908 			spin_lock(&inode->i_lock);
909 			spin_unlock(&wb->list_lock);
910 			/* This function drops i_lock... */
911 			inode_sleep_on_writeback(inode);
912 			spin_lock(&wb->list_lock);
913 		}
914 	}
915 	spin_unlock(&wb->list_lock);
916 
917 	return nr_pages - work->nr_pages;
918 }
919 
920 /*
921  * Return the next wb_writeback_work struct that hasn't been processed yet.
922  */
923 static struct wb_writeback_work *
924 get_next_work_item(struct backing_dev_info *bdi)
925 {
926 	struct wb_writeback_work *work = NULL;
927 
928 	spin_lock_bh(&bdi->wb_lock);
929 	if (!list_empty(&bdi->work_list)) {
930 		work = list_entry(bdi->work_list.next,
931 				  struct wb_writeback_work, list);
932 		list_del_init(&work->list);
933 	}
934 	spin_unlock_bh(&bdi->wb_lock);
935 	return work;
936 }
937 
938 /*
939  * Add in the number of potentially dirty inodes, because each inode
940  * write can dirty pagecache in the underlying blockdev.
941  */
942 static unsigned long get_nr_dirty_pages(void)
943 {
944 	return global_page_state(NR_FILE_DIRTY) +
945 		global_page_state(NR_UNSTABLE_NFS) +
946 		get_nr_dirty_inodes();
947 }
948 
949 static long wb_check_background_flush(struct bdi_writeback *wb)
950 {
951 	if (over_bground_thresh(wb->bdi)) {
952 
953 		struct wb_writeback_work work = {
954 			.nr_pages	= LONG_MAX,
955 			.sync_mode	= WB_SYNC_NONE,
956 			.for_background	= 1,
957 			.range_cyclic	= 1,
958 			.reason		= WB_REASON_BACKGROUND,
959 		};
960 
961 		return wb_writeback(wb, &work);
962 	}
963 
964 	return 0;
965 }
966 
967 static long wb_check_old_data_flush(struct bdi_writeback *wb)
968 {
969 	unsigned long expired;
970 	long nr_pages;
971 
972 	/*
973 	 * When set to zero, disable periodic writeback
974 	 */
975 	if (!dirty_writeback_interval)
976 		return 0;
977 
978 	expired = wb->last_old_flush +
979 			msecs_to_jiffies(dirty_writeback_interval * 10);
980 	if (time_before(jiffies, expired))
981 		return 0;
982 
983 	wb->last_old_flush = jiffies;
984 	nr_pages = get_nr_dirty_pages();
985 
986 	if (nr_pages) {
987 		struct wb_writeback_work work = {
988 			.nr_pages	= nr_pages,
989 			.sync_mode	= WB_SYNC_NONE,
990 			.for_kupdate	= 1,
991 			.range_cyclic	= 1,
992 			.reason		= WB_REASON_PERIODIC,
993 		};
994 
995 		return wb_writeback(wb, &work);
996 	}
997 
998 	return 0;
999 }
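/*
 * Note: dirty_writeback_interval and dirty_expire_interval are expressed
 * in hundredths of a second (centisecs), hence the "* 10" when converting
 * to milliseconds here and in wb_writeback().
 */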
1000 
1001 /*
1002  * Retrieve work items and do the writeback they describe
1003  */
1004 static long wb_do_writeback(struct bdi_writeback *wb)
1005 {
1006 	struct backing_dev_info *bdi = wb->bdi;
1007 	struct wb_writeback_work *work;
1008 	long wrote = 0;
1009 
1010 	set_bit(BDI_writeback_running, &wb->bdi->state);
1011 	while ((work = get_next_work_item(bdi)) != NULL) {
1012 
1013 		trace_writeback_exec(bdi, work);
1014 
1015 		wrote += wb_writeback(wb, work);
1016 
1017 		/*
1018 		 * Notify the caller of completion if this is a synchronous
1019 		 * work item, otherwise just free it.
1020 		 */
1021 		if (work->done)
1022 			complete(work->done);
1023 		else
1024 			kfree(work);
1025 	}
1026 
1027 	/*
1028 	 * Check for periodic writeback, kupdated() style
1029 	 */
1030 	wrote += wb_check_old_data_flush(wb);
1031 	wrote += wb_check_background_flush(wb);
1032 	clear_bit(BDI_writeback_running, &wb->bdi->state);
1033 
1034 	return wrote;
1035 }
1036 
1037 /*
1038  * Handle writeback of dirty data for the device backed by this bdi. Also
1039  * reschedules periodically and does kupdated style flushing.
1040  */
1041 void bdi_writeback_workfn(struct work_struct *work)
1042 {
1043 	struct bdi_writeback *wb = container_of(to_delayed_work(work),
1044 						struct bdi_writeback, dwork);
1045 	struct backing_dev_info *bdi = wb->bdi;
1046 	long pages_written;
1047 
1048 	set_worker_desc("flush-%s", dev_name(bdi->dev));
1049 	current->flags |= PF_SWAPWRITE;
1050 
1051 	if (likely(!current_is_workqueue_rescuer() ||
1052 		   !test_bit(BDI_registered, &bdi->state))) {
1053 		/*
1054 		 * The normal path.  Keep writing back @bdi until its
1055 		 * work_list is empty.  Note that this path is also taken
1056 		 * if @bdi is shutting down even when we're running off the
1057 		 * rescuer as work_list needs to be drained.
1058 		 */
1059 		do {
1060 			pages_written = wb_do_writeback(wb);
1061 			trace_writeback_pages_written(pages_written);
1062 		} while (!list_empty(&bdi->work_list));
1063 	} else {
1064 		/*
1065 		 * bdi_wq can't get enough workers and we're running off
1066 		 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
1067 		 * enough for efficient IO.
1068 		 */
1069 		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
1070 						    WB_REASON_FORKER_THREAD);
1071 		trace_writeback_pages_written(pages_written);
1072 	}
1073 
1074 	if (!list_empty(&bdi->work_list))
1075 		mod_delayed_work(bdi_wq, &wb->dwork, 0);
1076 	else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
1077 		bdi_wakeup_thread_delayed(bdi);
1078 
1079 	current->flags &= ~PF_SWAPWRITE;
1080 }
1081 
1082 /*
1083  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
1084  * the whole world.
1085  */
1086 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
1087 {
1088 	struct backing_dev_info *bdi;
1089 
1090 	if (!nr_pages)
1091 		nr_pages = get_nr_dirty_pages();
1092 
1093 	rcu_read_lock();
1094 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
1095 		if (!bdi_has_dirty_io(bdi))
1096 			continue;
1097 		__bdi_start_writeback(bdi, nr_pages, false, reason);
1098 	}
1099 	rcu_read_unlock();
1100 }
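/*
 * A typical caller (an example, not a requirement) is the sync(2) path,
 * which is expected to pass nr_pages == 0 so that every bdi with dirty IO
 * writes back everything it has.
 */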
1101 
1102 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1103 {
1104 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
1105 		struct dentry *dentry;
1106 		const char *name = "?";
1107 
1108 		dentry = d_find_alias(inode);
1109 		if (dentry) {
1110 			spin_lock(&dentry->d_lock);
1111 			name = (const char *) dentry->d_name.name;
1112 		}
1113 		printk(KERN_DEBUG
1114 		       "%s(%d): dirtied inode %lu (%s) on %s\n",
1115 		       current->comm, task_pid_nr(current), inode->i_ino,
1116 		       name, inode->i_sb->s_id);
1117 		if (dentry) {
1118 			spin_unlock(&dentry->d_lock);
1119 			dput(dentry);
1120 		}
1121 	}
1122 }
1123 
1124 /**
1125  *	__mark_inode_dirty -	internal function
1126  *	@inode: inode to mark
1127  *	@flags: what kind of dirty (i.e. I_DIRTY_SYNC)
1128  *	Mark an inode as dirty. Callers should use mark_inode_dirty or
1129  *  	mark_inode_dirty_sync.
1130  *
1131  * Put the inode on the super block's dirty list.
1132  *
1133  * CAREFUL! We mark it dirty unconditionally, but move it onto the
1134  * dirty list only if it is hashed or if it refers to a blockdev.
1135  * If it was not hashed, it will never be added to the dirty list
1136  * even if it is later hashed, as it will have been marked dirty already.
1137  *
1138  * In short, make sure you hash any inodes _before_ you start marking
1139  * them dirty.
1140  *
1141  * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
1142  * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
1143  * the kernel-internal blockdev inode represents the dirtying time of the
1144  * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
1145  * page->mapping->host, so the page-dirtying time is recorded in the internal
1146  * blockdev inode.
1147  */
1148 void __mark_inode_dirty(struct inode *inode, int flags)
1149 {
1150 	struct super_block *sb = inode->i_sb;
1151 	struct backing_dev_info *bdi = NULL;
1152 
1153 	/*
1154 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
1155 	 * dirty the inode itself
1156 	 */
1157 	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
1158 		trace_writeback_dirty_inode_start(inode, flags);
1159 
1160 		if (sb->s_op->dirty_inode)
1161 			sb->s_op->dirty_inode(inode, flags);
1162 
1163 		trace_writeback_dirty_inode(inode, flags);
1164 	}
1165 
1166 	/*
1167 	 * Paired with smp_mb() in __writeback_single_inode() for the
1168 	 * following lockless i_state test.  See there for details.
1169 	 */
1170 	smp_mb();
1171 
1172 	if ((inode->i_state & flags) == flags)
1173 		return;
1174 
1175 	if (unlikely(block_dump))
1176 		block_dump___mark_inode_dirty(inode);
1177 
1178 	spin_lock(&inode->i_lock);
1179 	if ((inode->i_state & flags) != flags) {
1180 		const int was_dirty = inode->i_state & I_DIRTY;
1181 
1182 		inode->i_state |= flags;
1183 
1184 		/*
1185 		 * If the inode is being synced, just update its dirty state.
1186 		 * The unlocker will place the inode on the appropriate
1187 		 * superblock list, based upon its state.
1188 		 */
1189 		if (inode->i_state & I_SYNC)
1190 			goto out_unlock_inode;
1191 
1192 		/*
1193 		 * Only add valid (hashed) inodes to the superblock's
1194 		 * dirty list.  Add blockdev inodes as well.
1195 		 */
1196 		if (!S_ISBLK(inode->i_mode)) {
1197 			if (inode_unhashed(inode))
1198 				goto out_unlock_inode;
1199 		}
1200 		if (inode->i_state & I_FREEING)
1201 			goto out_unlock_inode;
1202 
1203 		/*
1204 		 * If the inode was already on b_dirty/b_io/b_more_io, don't
1205 		 * reposition it (that would break b_dirty time-ordering).
1206 		 */
1207 		if (!was_dirty) {
1208 			bool wakeup_bdi = false;
1209 			bdi = inode_to_bdi(inode);
1210 
1211 			spin_unlock(&inode->i_lock);
1212 			spin_lock(&bdi->wb.list_lock);
1213 			if (bdi_cap_writeback_dirty(bdi)) {
1214 				WARN(!test_bit(BDI_registered, &bdi->state),
1215 				     "bdi-%s not registered\n", bdi->name);
1216 
1217 				/*
1218 				 * If this is the first dirty inode for this
1219 				 * bdi, we have to wake-up the corresponding
1220 				 * bdi thread to make sure background
1221 				 * write-back happens later.
1222 				 */
1223 				if (!wb_has_dirty_io(&bdi->wb))
1224 					wakeup_bdi = true;
1225 			}
1226 
1227 			inode->dirtied_when = jiffies;
1228 			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1229 			spin_unlock(&bdi->wb.list_lock);
1230 
1231 			if (wakeup_bdi)
1232 				bdi_wakeup_thread_delayed(bdi);
1233 			return;
1234 		}
1235 	}
1236 out_unlock_inode:
1237 	spin_unlock(&inode->i_lock);
1238 
1239 }
1240 EXPORT_SYMBOL(__mark_inode_dirty);
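/*
 * Most code does not call __mark_inode_dirty() directly but uses the
 * mark_inode_dirty()/mark_inode_dirty_sync() wrappers from <linux/fs.h>,
 * which pass I_DIRTY and I_DIRTY_SYNC respectively, e.g.:
 *
 *	inode->i_size = newsize;
 *	mark_inode_dirty(inode);
 */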
1241 
1242 static void wait_sb_inodes(struct super_block *sb)
1243 {
1244 	struct inode *inode, *old_inode = NULL;
1245 
1246 	/*
1247 	 * We need to be protected against the filesystem going from
1248 	 * r/o to r/w or vice versa.
1249 	 */
1250 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
1251 
1252 	spin_lock(&inode_sb_list_lock);
1253 
1254 	/*
1255 	 * Data integrity sync. Must wait for all pages under writeback,
1256 	 * because there may have been pages dirtied before our sync
1257 	 * call, but which had writeout started before we write it out.
1258 	 * In which case, the inode may not be on the dirty list, but
1259 	 * we still have to wait for that writeout.
1260 	 */
1261 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1262 		struct address_space *mapping = inode->i_mapping;
1263 
1264 		spin_lock(&inode->i_lock);
1265 		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
1266 		    (mapping->nrpages == 0)) {
1267 			spin_unlock(&inode->i_lock);
1268 			continue;
1269 		}
1270 		__iget(inode);
1271 		spin_unlock(&inode->i_lock);
1272 		spin_unlock(&inode_sb_list_lock);
1273 
1274 		/*
1275 		 * We hold a reference to 'inode' so it couldn't have been
1276 		 * removed from s_inodes list while we dropped the
1277 		 * inode_sb_list_lock.  We cannot iput the inode now as we can
1278 		 * be holding the last reference and we cannot iput it under
1279 		 * inode_sb_list_lock. So we keep the reference and iput it
1280 		 * later.
1281 		 */
1282 		iput(old_inode);
1283 		old_inode = inode;
1284 
1285 		filemap_fdatawait(mapping);
1286 
1287 		cond_resched();
1288 
1289 		spin_lock(&inode_sb_list_lock);
1290 	}
1291 	spin_unlock(&inode_sb_list_lock);
1292 	iput(old_inode);
1293 }
1294 
1295 /**
1296  * writeback_inodes_sb_nr -	writeback dirty inodes from given super_block
1297  * @sb: the superblock
1298  * @nr: the number of pages to write
1299  * @reason: reason why some writeback work was initiated
1300  *
1301  * Start writeback on some inodes on this super_block. No guarantees are made
1302  * on how many (if any) will be written, and this function does not wait
1303  * for IO completion of submitted IO.
1304  */
1305 void writeback_inodes_sb_nr(struct super_block *sb,
1306 			    unsigned long nr,
1307 			    enum wb_reason reason)
1308 {
1309 	DECLARE_COMPLETION_ONSTACK(done);
1310 	struct wb_writeback_work work = {
1311 		.sb			= sb,
1312 		.sync_mode		= WB_SYNC_NONE,
1313 		.tagged_writepages	= 1,
1314 		.done			= &done,
1315 		.nr_pages		= nr,
1316 		.reason			= reason,
1317 	};
1318 
1319 	if (sb->s_bdi == &noop_backing_dev_info)
1320 		return;
1321 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
1322 	bdi_queue_work(sb->s_bdi, &work);
1323 	wait_for_completion(&done);
1324 }
1325 EXPORT_SYMBOL(writeback_inodes_sb_nr);
1326 
1327 /**
1328  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
1329  * @sb: the superblock
1330  * @reason: reason why some writeback work was initiated
1331  *
1332  * Start writeback on some inodes on this super_block. No guarantees are made
1333  * on how many (if any) will be written, and this function does not wait
1334  * for IO completion of submitted IO.
1335  */
1336 void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
1337 {
1338 	return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
1339 }
1340 EXPORT_SYMBOL(writeback_inodes_sb);
1341 
1342 /**
1343  * try_to_writeback_inodes_sb_nr - try to start writeback if none underway
1344  * @sb: the superblock
1345  * @nr: the number of pages to write
1346  * @reason: the reason of writeback
1347  *
1348  * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
1349  * Returns 1 if writeback was started, 0 if not.
1350  */
1351 int try_to_writeback_inodes_sb_nr(struct super_block *sb,
1352 				  unsigned long nr,
1353 				  enum wb_reason reason)
1354 {
1355 	if (writeback_in_progress(sb->s_bdi))
1356 		return 1;
1357 
1358 	if (!down_read_trylock(&sb->s_umount))
1359 		return 0;
1360 
1361 	writeback_inodes_sb_nr(sb, nr, reason);
1362 	up_read(&sb->s_umount);
1363 	return 1;
1364 }
1365 EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
1366 
1367 /**
1368  * try_to_writeback_inodes_sb - try to start writeback if none underway
1369  * @sb: the superblock
1370  * @reason: reason why some writeback work was initiated
1371  *
1372  * Implemented by try_to_writeback_inodes_sb_nr().
1373  * Returns 1 if writeback was started, 0 if not.
1374  */
1375 int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
1376 {
1377 	return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
1378 }
1379 EXPORT_SYMBOL(try_to_writeback_inodes_sb);
1380 
1381 /**
1382  * sync_inodes_sb	-	sync sb inode pages
1383  * @sb: the superblock
1384  *
1385  * This function writes and waits on any dirty inode belonging to this
1386  * super_block.
1387  */
1388 void sync_inodes_sb(struct super_block *sb)
1389 {
1390 	DECLARE_COMPLETION_ONSTACK(done);
1391 	struct wb_writeback_work work = {
1392 		.sb		= sb,
1393 		.sync_mode	= WB_SYNC_ALL,
1394 		.nr_pages	= LONG_MAX,
1395 		.range_cyclic	= 0,
1396 		.done		= &done,
1397 		.reason		= WB_REASON_SYNC,
1398 		.for_sync	= 1,
1399 	};
1400 
1401 	/* Nothing to do? */
1402 	if (sb->s_bdi == &noop_backing_dev_info)
1403 		return;
1404 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
1405 
1406 	bdi_queue_work(sb->s_bdi, &work);
1407 	wait_for_completion(&done);
1408 
1409 	wait_sb_inodes(sb);
1410 }
1411 EXPORT_SYMBOL(sync_inodes_sb);
1412 
1413 /**
1414  * write_inode_now	-	write an inode to disk
1415  * @inode: inode to write to disk
1416  * @sync: whether the write should be synchronous or not
1417  *
1418  * This function commits an inode to disk immediately if it is dirty. This is
1419  * primarily needed by knfsd.
1420  *
1421  * The caller must either have a ref on the inode or must have set I_WILL_FREE.
1422  */
1423 int write_inode_now(struct inode *inode, int sync)
1424 {
1425 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1426 	struct writeback_control wbc = {
1427 		.nr_to_write = LONG_MAX,
1428 		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
1429 		.range_start = 0,
1430 		.range_end = LLONG_MAX,
1431 	};
1432 
1433 	if (!mapping_cap_writeback_dirty(inode->i_mapping))
1434 		wbc.nr_to_write = 0;
1435 
1436 	might_sleep();
1437 	return writeback_single_inode(inode, wb, &wbc);
1438 }
1439 EXPORT_SYMBOL(write_inode_now);
1440 
1441 /**
1442  * sync_inode - write an inode and its pages to disk.
1443  * @inode: the inode to sync
1444  * @wbc: controls the writeback mode
1445  *
1446  * sync_inode() will write an inode and its pages to disk.  It will also
1447  * correctly update the inode on its superblock's dirty inode lists and will
1448  * update inode->i_state.
1449  *
1450  * The caller must have a ref on the inode.
1451  */
1452 int sync_inode(struct inode *inode, struct writeback_control *wbc)
1453 {
1454 	return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
1455 }
1456 EXPORT_SYMBOL(sync_inode);
1457 
1458 /**
1459  * sync_inode_metadata - write an inode to disk
1460  * @inode: the inode to sync
1461  * @wait: wait for I/O to complete.
1462  *
1463  * Write an inode to disk and adjust its dirty state after completion.
1464  *
1465  * Note: only writes the actual inode, no associated data or other metadata.
1466  */
1467 int sync_inode_metadata(struct inode *inode, int wait)
1468 {
1469 	struct writeback_control wbc = {
1470 		.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
1471 		.nr_to_write = 0, /* metadata-only */
1472 	};
1473 
1474 	return sync_inode(inode, &wbc);
1475 }
1476 EXPORT_SYMBOL(sync_inode_metadata);
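/*
 * Illustrative use of sync_inode_metadata() (a sketch, not taken from any
 * particular filesystem): an ->fsync() implementation that has already
 * written and waited on the data pages can flush just the inode with:
 *
 *	if (inode->i_state & I_DIRTY)
 *		err = sync_inode_metadata(inode, 1);
 */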
1477