Lines Matching +full:charge +full:- +full:current +full:- +full:limit +full:- +full:mapping
1 // SPDX-License-Identifier: GPL-2.0-only
3 * mm/page-writeback.c
26 #include <linux/backing-dev.h>
54 #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
57 * Estimate write bandwidth or update dirty limit at 200ms intervals.
100 * The interval between `kupdate'-style writebacks
119 /* End of sysctl-exported parameters */
137 unsigned long wb_dirty; /* per-wb counterparts */
149 * reflect changes in current writeout rate.
157 .wb_completions = &(__wb)->completions
163 .wb_completions = &(__wb)->memcg_completions, \
168 return dtc->dom;
173 return dtc->dom;
178 return mdtc->gdtc;
183 return &wb->memcg_completions;
189 unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth);
190 unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
191 unsigned long long min = wb->bdi->min_ratio;
192 unsigned long long max = wb->bdi->max_ratio;
216 .wb_completions = &(__wb)->completions
243 *minp = wb->bdi->min_ratio;
244 *maxp = wb->bdi->max_ratio;
257 * user-configurable dirty ratio is the effective number of pages that
261 * Because the user is allowed to specify the dirty limit globally as
262 * absolute number of bytes, calculating the per-zone dirty limit can
263 * require translating the configured limit into a percentage of
268 * node_dirtyable_memory - number of dirtyable pages in a node
272 * page cache. This is the base value for the per-node dirty limits.
280 struct zone *zone = pgdat->node_zones + z;
293 nr_pages -= min(nr_pages, pgdat->totalreserve_pages);
316 z = &NODE_DATA(node)->node_zones[i];
322 nr_pages -= min(nr_pages, high_wmark_pages(z));
342 * global_dirtyable_memory - number of globally dirtyable pages
357 x -= min(x, totalreserve_pages);
363 x -= highmem_dirtyable_memory(x);
369 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
372 * Calculate @dtc->thresh and ->bg_thresh considering
374 * must ensure that @dtc->avail is set before calling this function. The
375 * dirty limits will be lifted by 1/4 for real-time tasks.
379 const unsigned long available_memory = dtc->avail;
383 /* convert ratios to per-PAGE_SIZE for higher precision */
392 unsigned long global_avail = gdtc->avail;
398 * per-PAGE_SIZE, they can be obtained by dividing bytes by
420 tsk = current;
427 * 32-bits. This gives 16TB dirty limits max which is hopefully enough.
431 /* This makes sure bg_thresh is within 32-bits as well */
434 dtc->thresh = thresh;
435 dtc->bg_thresh = bg_thresh;
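Editor's note: the domain_dirty_limits() fragments above derive thresh and bg_thresh either from the byte-based sysctls or from the ratio knobs scaled against dtc->avail, then lift the result by 1/4 for real-time tasks. A minimal stand-alone sketch of that arithmetic follows; dirty_thresh, avail_pages and the 4 KiB PAGE_SIZE are illustrative assumptions, not the kernel's helpers.

/* Toy model: threshold from an absolute byte limit (vm.dirty_bytes style)
 * or from a percentage of dirtyable memory (vm.dirty_ratio style),
 * lifted by 1/4 for real-time tasks. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for the example */

static unsigned long dirty_thresh(unsigned long avail_pages,
				  unsigned long dirty_bytes,	/* 0 = use ratio */
				  unsigned long ratio_percent,
				  bool rt_task)
{
	unsigned long thresh;

	if (dirty_bytes)
		thresh = dirty_bytes / PAGE_SIZE;
	else
		thresh = avail_pages * ratio_percent / 100;

	if (rt_task)
		thresh += thresh / 4;	/* lift the limit by 1/4 for RT tasks */
	return thresh;
}

int main(void)
{
	unsigned long avail = (4UL << 30) / PAGE_SIZE;	/* 4 GiB dirtyable */

	printf("thresh    = %lu pages\n", dirty_thresh(avail, 0, 20, false));
	printf("bg_thresh = %lu pages\n", dirty_thresh(avail, 0, 10, false));
	return 0;
}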
443 * global_dirty_limits - background-writeback and dirty-throttling thresholds
462 * node_dirty_limit - maximum number of dirty pages allowed in a node
471 struct task_struct *tsk = current;
485 * 32-bits. This gives 16TB dirty limits max which is hopefully enough.
491 * node_dirty_ok - tells whether a node is within its dirty limits
495 * dirty limit, %false if the limit is exceeded.
499 unsigned long limit = node_dirty_limit(pgdat);
505 return nr_pages <= limit;
531 return -ERANGE;
562 return -ERANGE;
584 __fprop_add_percpu_max(&dom->completions, completions,
587 if (unlikely(!dom->period_time)) {
594 dom->period_time = wp_next_time(jiffies);
595 mod_timer(&dom->period_timer, dom->period_time);
608 wb_domain_writeout_add(&global_wb_domain, &wb->completions,
609 wb->bdi->max_prop_frac, nr);
614 wb->bdi->max_prop_frac, nr);
634 int miss_periods = (jiffies - dom->period_time) /
637 if (fprop_new_period(&dom->completions, miss_periods + 1)) {
638 dom->period_time = wp_next_time(dom->period_time +
640 mod_timer(&dom->period_timer, dom->period_time);
646 dom->period_time = 0;
654 spin_lock_init(&dom->lock);
656 timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);
658 dom->dirty_limit_tstamp = jiffies;
660 return fprop_global_init(&dom->completions, gfp);
666 del_timer_sync(&dom->period_timer);
667 fprop_global_destroy(&dom->completions);
683 return -EINVAL;
718 return -EINVAL;
721 if (min_ratio > bdi->max_ratio) {
722 ret = -EINVAL;
724 if (min_ratio < bdi->min_ratio) {
725 delta = bdi->min_ratio - min_ratio;
726 bdi_min_ratio -= delta;
727 bdi->min_ratio = min_ratio;
729 delta = min_ratio - bdi->min_ratio;
732 bdi->min_ratio = min_ratio;
734 ret = -EINVAL;
748 return -EINVAL;
751 if (bdi->min_ratio > max_ratio) {
752 ret = -EINVAL;
754 bdi->max_ratio = max_ratio;
755 bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) /
786 return bdi_get_bytes(bdi->min_ratio);
805 return bdi_get_bytes(bdi->max_ratio);
825 return -EINVAL;
829 bdi->capabilities |= BDI_CAP_STRICTLIMIT;
831 bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
846 return max(thresh, dom->dirty_limit);
851 * system-wide clean memory excluding the amount being used in the domain.
857 unsigned long clean = filepages - min(filepages, mdtc->dirty);
858 unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
859 unsigned long other_clean = global_clean - min(global_clean, clean);
861 mdtc->avail = filepages + min(headroom, other_clean);
877 dtc->avail = global_dirtyable_memory();
878 dtc->dirty = global_node_page_state(NR_FILE_DIRTY);
880 dtc->dirty += global_node_page_state(NR_WRITEBACK);
884 mem_cgroup_wb_stats(dtc->wb, &filepages, &headroom, &dtc->dirty,
887 dtc->dirty += writeback;
893 * __wb_calc_thresh - @wb's share of dirty threshold
898 * threshold as a hard limit when sleeping max_pause per page is not enough
906 * - starving fast devices
907 * - piling up dirty pages (that will take long time to sync) on slow devices
909 * The wb's share of dirty limit will be adapting to its throughput and
910 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
912 * Return: @wb's dirty limit in pages. For dirty throttling limit, the term
920 struct bdi_writeback *wb = dtc->wb;
929 fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
932 wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE);
945 * a hard limit in balance_dirty_pages() and wb_position_ratio().
950 if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
951 unsigned long limit = hard_dirty_limit(dom, dtc->thresh);
954 if (limit > dtc->dirty)
955 wb_scale_thresh = (limit - dtc->dirty) / 100;
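Editor's note: __wb_calc_thresh() above hands each writeback device a share of the dirty threshold proportional to its recent writeout completions, bounded by the per-bdi min/max ratios. A rough stand-alone model of that split follows; wb_share_of_thresh is an invented name, and the kernel additionally works in BDI_RATIO_SCALE units and subtracts the sum of all configured min ratios.

/* Toy model: split the global dirty threshold between devices in
 * proportion to their share of recently completed writeouts, then apply
 * per-device min/max bounds (plain percentages here). */
#include <stdint.h>
#include <stdio.h>

static uint64_t wb_share_of_thresh(uint64_t thresh,
				   uint64_t wb_completions,
				   uint64_t total_completions,
				   unsigned int min_ratio,	/* percent */
				   unsigned int max_ratio)	/* percent */
{
	uint64_t wb_thresh;

	/* the part not reserved by min ratios, split by completion share */
	wb_thresh = thresh * (100 - min_ratio) / 100;
	wb_thresh = wb_thresh * wb_completions /
		    (total_completions ? total_completions : 1);

	wb_thresh += thresh * min_ratio / 100;		/* guaranteed floor */
	if (wb_thresh > thresh * max_ratio / 100)	/* configured ceiling */
		wb_thresh = thresh * max_ratio / 100;
	return wb_thresh;
}

int main(void)
{
	/* a fast disk with 900 of 1000 recent completions vs a slow USB key */
	printf("fast disk: %llu pages\n",
	       (unsigned long long)wb_share_of_thresh(100000, 900, 1000, 0, 100));
	printf("usb key:   %llu pages\n",
	       (unsigned long long)wb_share_of_thresh(100000, 100, 1000, 0, 100));
	return 0;
}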
982 *                            setpoint - dirty 3
983 *         f(dirty) := 1.0 + (----------------)
984 *                            limit - setpoint
990 * (3) f(limit) = 0 => the hard limit
997 unsigned long limit)
1002 x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
1003 (limit - setpoint) | 1);
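Editor's note: the cubic above is what pos_ratio_polynom() computes in fixed point. The sketch below is a stand-alone re-implementation for experimentation, using 1.0 == 1 << 10 to match the kernel's RATELIMIT_CALC_SHIFT; like the kernel it relies on arithmetic right shift of negative values.

/* Stand-alone version of the cubic control curve: f(setpoint) == 1.0,
 * falling to 0 at the hard limit, rising toward 2.0 below the setpoint. */
#include <stdio.h>

#define RATELIMIT_CALC_SHIFT	10	/* fixed point: 1.0 == 1 << 10 */

static long long pos_ratio_polynom(unsigned long setpoint,
				   unsigned long dirty,
				   unsigned long limit)
{
	long long pos_ratio;
	long long x;

	x = ((long long)setpoint - (long long)dirty) << RATELIMIT_CALC_SHIFT;
	x /= (long long)((limit - setpoint) | 1);	/* "| 1" avoids div by zero */

	pos_ratio = x;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;	/* x^3 */
	pos_ratio += 1 << RATELIMIT_CALC_SHIFT;			/* + 1.0 */

	if (pos_ratio < 0)
		pos_ratio = 0;
	if (pos_ratio > 2LL << RATELIMIT_CALC_SHIFT)
		pos_ratio = 2LL << RATELIMIT_CALC_SHIFT;
	return pos_ratio;
}

int main(void)
{
	unsigned long limit = 1000, setpoint = 600;

	for (unsigned long dirty = 200; dirty <= 1000; dirty += 200)
		printf("dirty=%4lu  f=%.3f\n", dirty,
		       (double)pos_ratio_polynom(setpoint, dirty, limit) /
		       (1 << RATELIMIT_CALC_SHIFT));
	return 0;
}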
1050 * 0 +------------.------------------.----------------------*------------->
1051 * freerun^ setpoint^ limit^ dirty pages
1078 * 0 +----------------------.-------------------------------.------------->
1083 * - start writing to a slow SD card and a fast disk at the same time. The SD
1085 * - the wb dirty thresh drops quickly due to change of JBOD workload
1089 struct bdi_writeback *wb = dtc->wb;
1090 unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
1091 unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
1092 unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
1093 unsigned long wb_thresh = dtc->wb_thresh;
1098 long long pos_ratio; /* for scaling up/down the rate limit */
1101 dtc->pos_ratio = 0;
1103 if (unlikely(dtc->dirty >= limit))
1111 setpoint = (freerun + limit) / 2;
1112 pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
1119 * This is especially important for fuse which sets bdi->max_ratio to
1126 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
1139 if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
1142 if (dtc->wb_dirty < 8) {
1143 dtc->pos_ratio = min_t(long long, pos_ratio * 2,
1148 if (dtc->wb_dirty >= wb_thresh)
1152 dtc->wb_bg_thresh);
1157 wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
1167 * wb's) while given strictlimit wb is below limit.
1170 * but it would look too non-natural for the case of all
1172 * with bdi->max_ratio == 100%.
1181 dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
1194 *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
1196 *                        x_intercept - wb_dirty
1197 *                     := --------------------------
1198 *                        x_intercept - wb_setpoint
1203 * (2) k = - 1 / (8 * write_bw) (in single wb case)
1208 * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
1216 if (unlikely(wb_thresh > dtc->thresh))
1217 wb_thresh = dtc->thresh;
1225 wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
1230 x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
1234 * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
1236 *          wb_thresh                    thresh - wb_thresh
1237 * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
1238 *           thresh                          thresh
1240 span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
1243 if (dtc->wb_dirty < x_intercept - span / 4) {
1244 pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
1245 (x_intercept - wb_setpoint) | 1);
1255 if (dtc->wb_dirty < x_intercept) {
1256 if (dtc->wb_dirty > x_intercept / 8)
1258 dtc->wb_dirty);
1263 dtc->pos_ratio = pos_ratio;
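Editor's note: the fragments above apply a second, per-device linear factor on top of the global cubic: pos_ratio shrinks as the device's own dirty count moves from wb_setpoint toward x_intercept = wb_setpoint + span. A rough stand-alone model follows; scale_by_wb_position is an invented name and span stands in for roughly 8 * write_bw in the single-device case.

/* Toy model of the second-stage factor; pos_ratio is fixed point with
 * 1.0 == 1 << 10, matching RATELIMIT_CALC_SHIFT. */
#include <stdint.h>
#include <stdio.h>

static uint64_t scale_by_wb_position(uint64_t pos_ratio,
				     uint64_t wb_dirty,
				     uint64_t wb_setpoint,
				     uint64_t span)
{
	uint64_t x_intercept = wb_setpoint + span;

	if (wb_dirty < x_intercept - span / 4)
		return pos_ratio * (x_intercept - wb_dirty) /
		       ((x_intercept - wb_setpoint) | 1);

	return pos_ratio / 4;	/* deep in the danger zone: cut hard */
}

int main(void)
{
	uint64_t one = 1 << 10;	/* a pos_ratio of exactly 1.0 */

	/* wb_setpoint 10000 pages, span 8000 pages (~8 * write_bw) */
	printf("at setpoint:     %.3f\n",
	       scale_by_wb_position(one, 10000, 10000, 8000) / 1024.0);
	printf("2000 pages over: %.3f\n",
	       scale_by_wb_position(one, 12000, 10000, 8000) / 1024.0);
	printf("near intercept:  %.3f\n",
	       scale_by_wb_position(one, 17000, 10000, 8000) / 1024.0);
	return 0;
}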
1271 unsigned long avg = wb->avg_write_bandwidth;
1272 unsigned long old = wb->write_bandwidth;
1278 *                    bw * elapsed + write_bandwidth * (period - elapsed)
1279 * write_bandwidth = ---------------------------------------------------
1280 *                                        period
1285 bw = written - min(written, wb->written_stamp);
1292 bw += (u64)wb->write_bandwidth * (period - elapsed);
1299 avg -= (avg - old) >> 3;
1302 avg += (old - avg) >> 3;
1308 long delta = avg - wb->avg_write_bandwidth;
1310 &wb->bdi->tot_write_bandwidth) <= 0);
1312 wb->write_bandwidth = bw;
1313 WRITE_ONCE(wb->avg_write_bandwidth, avg);
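Editor's note: a stand-alone model of the bandwidth estimator above: the rate observed over the last elapsed jiffies is blended into a window of period jiffies per the formula in the comment, and the long-term average then chases the blended value in 1/8 steps. The HZ value, the struct and the always-nudged average are assumptions of this sketch; the kernel only nudges avg when it lags behind the direction of change.

#include <stdio.h>

#define HZ	100	/* assumed tick rate for the example */

struct bw_state {
	unsigned long write_bandwidth;		/* blended estimate, pages/s */
	unsigned long avg_write_bandwidth;	/* further smoothed, pages/s */
};

static void update_write_bandwidth(struct bw_state *s,
				   unsigned long pages_written,
				   unsigned long elapsed,	/* jiffies */
				   unsigned long period)	/* jiffies, > 0 */
{
	unsigned long long inst, blended;

	if (elapsed > period)
		elapsed = period;
	inst = (unsigned long long)pages_written * HZ /
	       (elapsed ? elapsed : 1);		/* instantaneous rate */

	/* blended = (inst * elapsed + old * (period - elapsed)) / period */
	blended = (inst * elapsed +
		   (unsigned long long)s->write_bandwidth * (period - elapsed)) /
		  period;
	s->write_bandwidth = (unsigned long)blended;

	/* let the long-term average drift toward the blended value by 1/8 */
	if (s->avg_write_bandwidth > blended)
		s->avg_write_bandwidth -= (s->avg_write_bandwidth - blended) >> 3;
	else
		s->avg_write_bandwidth += (blended - s->avg_write_bandwidth) >> 3;
}

int main(void)
{
	struct bw_state s = { .write_bandwidth = 25000, .avg_write_bandwidth = 25000 };

	/* a burst wrote 10000 pages in 20 jiffies of a 200-jiffy period */
	update_write_bandwidth(&s, 10000, 20, 200);
	printf("blended=%lu avg=%lu pages/s\n",
	       s.write_bandwidth, s.avg_write_bandwidth);
	return 0;
}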
1319 unsigned long thresh = dtc->thresh;
1320 unsigned long limit = dom->dirty_limit;
1325 if (limit < thresh) {
1326 limit = thresh;
1333 * dom->dirty_limit which is guaranteed to lie above the dirty pages.
1335 thresh = max(thresh, dtc->dirty);
1336 if (limit > thresh) {
1337 limit -= (limit - thresh) >> 5;
1342 dom->dirty_limit = limit;
1353 if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
1356 spin_lock(&dom->lock);
1357 if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
1359 dom->dirty_limit_tstamp = now;
1361 spin_unlock(&dom->lock);
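Editor's note: the update_dirty_limit() fragments show dom->dirty_limit jumping up to the threshold immediately but decaying downward by only 1/32 per 200ms update, while never dropping below the current dirty page count. A small stand-alone sketch of that tracking (track_dirty_limit is an invented name):

#include <stdio.h>

static unsigned long track_dirty_limit(unsigned long limit,
				       unsigned long thresh,
				       unsigned long dirty)
{
	if (limit < thresh)
		return thresh;			/* follow increases instantly */

	if (thresh < dirty)
		thresh = dirty;			/* stay above current dirty pages */
	if (limit > thresh)
		limit -= (limit - thresh) >> 5;	/* decay down by 1/32 per update */
	return limit;
}

int main(void)
{
	unsigned long limit = 100000;

	/* the threshold dropped to 60000 while 50000 pages are still dirty */
	for (int i = 0; i < 5; i++) {
		limit = track_dirty_limit(limit, 60000, 50000);
		printf("update %d: limit=%lu\n", i + 1, limit);
	}
	return 0;
}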
1365 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
1374 struct bdi_writeback *wb = dtc->wb;
1375 unsigned long dirty = dtc->dirty;
1376 unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
1377 unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
1378 unsigned long setpoint = (freerun + limit) / 2;
1379 unsigned long write_bw = wb->avg_write_bandwidth;
1380 unsigned long dirty_ratelimit = wb->dirty_ratelimit;
1390 * when dirty pages are truncated by userspace or re-dirtied by FS.
1392 dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
1398 dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
1405 * formula will yield the balanced rate limit (write_bw / N).
1442 * wb->dirty_ratelimit = balanced_dirty_ratelimit;
1446 * limit the step size.
1450 * task_ratelimit - dirty_ratelimit
1451 * = (pos_ratio - 1) * dirty_ratelimit
1460 * - dirty_ratelimit > balanced_dirty_ratelimit
1461 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
1467 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
1486 if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
1487 dirty = dtc->wb_dirty;
1488 if (dtc->wb_dirty < 8)
1489 setpoint = dtc->wb_dirty + 1;
1491 setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
1495 x = min3(wb->balanced_dirty_ratelimit,
1498 step = x - dirty_ratelimit;
1500 x = max3(wb->balanced_dirty_ratelimit,
1503 step = dirty_ratelimit - x;
1520 dirty_ratelimit -= step;
1522 WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL));
1523 wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
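Editor's note: a rough stand-alone model of how dirty_ratelimit is steered in the fragments above. The balanced rate is the per-task rate at which the current dirtiers would exactly match the device's write bandwidth (write_bw / N for N equal dirtiers), and the stored limit is nudged toward it in damped steps. The 1/8-of-the-gap step rule here is a simplification of the kernel's min3/max3 step limiting.

#include <stdio.h>

static unsigned long update_dirty_ratelimit(unsigned long dirty_ratelimit,
					    unsigned long task_ratelimit,
					    unsigned long write_bw,	/* pages/s */
					    unsigned long dirty_rate)	/* pages/s */
{
	unsigned long balanced, step;

	/* rate at which the dirtiers would exactly match write_bw */
	balanced = (unsigned long)((unsigned long long)task_ratelimit * write_bw /
				   (dirty_rate ? dirty_rate : 1));

	/* close only part of the gap per update to avoid overshooting */
	if (dirty_ratelimit < balanced) {
		step = (balanced - dirty_ratelimit + 7) / 8;
		dirty_ratelimit += step;
	} else {
		step = (dirty_ratelimit - balanced + 7) / 8;
		dirty_ratelimit -= step;
	}
	return dirty_ratelimit ? dirty_ratelimit : 1;	/* keep it at least 1 */
}

int main(void)
{
	unsigned long rate = 1000;	/* pages/s each task is currently allowed */

	/* two dirtiers, disk sustains 8000 pages/s: balanced rate is 4000 */
	for (int i = 0; i < 8; i++) {
		rate = update_dirty_ratelimit(rate, rate, 8000, 2 * rate);
		printf("update %d: dirty_ratelimit=%lu pages/s\n", i + 1, rate);
	}
	return 0;
}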
1532 struct bdi_writeback *wb = gdtc->wb;
1538 spin_lock(&wb->list_lock);
1546 elapsed = max(now - wb->bw_time_stamp, 1UL);
1547 dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
1548 written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
1565 wb->dirtied_stamp = dirtied;
1566 wb->written_stamp = written;
1567 WRITE_ONCE(wb->bw_time_stamp, now);
1568 spin_unlock(&wb->list_lock);
1584 unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp);
1587 !atomic_read(&wb->writeback_inodes)) {
1588 spin_lock(&wb->list_lock);
1589 wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED);
1590 wb->written_stamp = wb_stat(wb, WB_WRITTEN);
1591 WRITE_ONCE(wb->bw_time_stamp, now);
1592 spin_unlock(&wb->list_lock);
1601 * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
1608 return 1UL << (ilog2(thresh - dirty) >> 1);
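Editor's note: the line above sizes the per-task polling interval roughly as the square root of the gap to the dirty limit, so counters are checked rarely when far from trouble and often when close. A stand-alone sketch with a local ilog2 helper:

#include <stdio.h>

static unsigned int ilog2_ul(unsigned long v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

static unsigned long dirty_poll_interval(unsigned long dirty, unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (ilog2_ul(thresh - dirty) >> 1);	/* ~sqrt of the gap */
	return 1;
}

int main(void)
{
	printf("gap of 1M pages -> recheck every %lu pages\n",
	       dirty_poll_interval(0, 1UL << 20));
	printf("gap of 1K pages -> recheck every %lu pages\n",
	       dirty_poll_interval(0, 1UL << 10));
	return 0;
}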
1616 unsigned long bw = READ_ONCE(wb->avg_write_bandwidth);
1620 * Limit pause time for small memory systems. If sleeping for too long
1638 long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth));
1639 long lo = ilog2(READ_ONCE(wb->dirty_ratelimit));
1644 /* target for 10ms pause on 1-dd case */
1654 t += (hi - lo) * (10 * HZ) / 1024;
1670 * 2) limit the target pause time to max_pause/2, so that the normal
1679 * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}.
1709 struct bdi_writeback *wb = dtc->wb;
1715 * - in JBOD setup, wb_thresh can fluctuate a lot
1716 * - in a system with HDD and USB key, the USB key may somehow
1725 dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh);
1726 dtc->wb_bg_thresh = dtc->thresh ?
1727 div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
1735 * reported dirty, even though there are thresh-m pages
1739 if (dtc->wb_thresh < 2 * wb_stat_error()) {
1741 dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
1744 dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
1754 dirty = dtc->wb_dirty;
1755 thresh = dtc->wb_thresh;
1757 dirty = dtc->dirty;
1758 thresh = dtc->thresh;
1765 * Throttle it only when the background writeback cannot catch-up. This avoids
1771 * for strictlimit-ing.
1780 dirty = dtc->wb_dirty;
1781 thresh = dtc->wb_thresh;
1782 bg_thresh = dtc->wb_bg_thresh;
1784 dirty = dtc->dirty;
1785 thresh = dtc->thresh;
1786 bg_thresh = dtc->bg_thresh;
1788 dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh);
1802 dtc->freerun = false;
1810 * LOCAL_THROTTLE tasks must not be throttled when below the per-wb
1813 if (!(current->flags & PF_LOCAL_THROTTLE))
1816 dtc->freerun = dtc->wb_dirty <
1817 dirty_freerun_ceiling(dtc->wb_thresh, dtc->wb_bg_thresh);
1823 dtc->dirty_exceeded = (dtc->wb_dirty > dtc->wb_thresh) &&
1824 ((dtc->dirty > dtc->thresh) || strictlimit);
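Editor's note: the freerun test above lets tasks dirty pages without any throttling while the dirty count stays below a ceiling between the background and foreground thresholds. A tiny model, assuming the mainline midpoint definition of dirty_freerun_ceiling():

#include <stdbool.h>
#include <stdio.h>

static unsigned long dirty_freerun_ceiling(unsigned long thresh,
					   unsigned long bg_thresh)
{
	return (thresh + bg_thresh) / 2;	/* assumed midpoint, as in mainline */
}

static bool in_freerun(unsigned long dirty, unsigned long thresh,
		       unsigned long bg_thresh)
{
	return dirty <= dirty_freerun_ceiling(thresh, bg_thresh);
}

int main(void)
{
	unsigned long thresh = 20000, bg_thresh = 10000;

	printf("12000 dirty: freerun=%d\n", in_freerun(12000, thresh, bg_thresh));
	printf("18000 dirty: freerun=%d\n", in_freerun(18000, thresh, bg_thresh));
	return 0;
}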
1835 if (dtc->freerun)
1866 struct backing_dev_info *bdi = wb->bdi;
1867 bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
1894 if (!laptop_mode && nr_dirty > gdtc->bg_thresh &&
1902 if (gdtc->freerun && (!mdtc || mdtc->freerun)) {
1910 current->dirty_paused_when = now;
1911 current->nr_dirtied = 0;
1914 current->nr_dirtied_pause = min(intv, m_intv);
1929 if (gdtc->freerun)
1941 if (mdtc->freerun)
1943 if (mdtc->pos_ratio < gdtc->pos_ratio)
1947 wb->dirty_exceeded = gdtc->dirty_exceeded ||
1948 (mdtc && mdtc->dirty_exceeded);
1949 if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
1954 dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
1955 task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
1957 max_pause = wb_max_pause(wb, sdtc->wb_dirty);
1969 if (current->dirty_paused_when)
1970 pause -= now - current->dirty_paused_when;
1973 * for up to 800ms from time to time on 1-HDD; so does xfs,
1980 sdtc->thresh,
1981 sdtc->bg_thresh,
1982 sdtc->dirty,
1983 sdtc->wb_thresh,
1984 sdtc->wb_dirty,
1991 if (pause < -HZ) {
1992 current->dirty_paused_when = now;
1993 current->nr_dirtied = 0;
1995 current->dirty_paused_when += period;
1996 current->nr_dirtied = 0;
1997 } else if (current->nr_dirtied_pause <= pages_dirtied)
1998 current->nr_dirtied_pause += pages_dirtied;
2003 now += min(pause - max_pause, max_pause);
2009 sdtc->thresh,
2010 sdtc->bg_thresh,
2011 sdtc->dirty,
2012 sdtc->wb_thresh,
2013 sdtc->wb_dirty,
2021 ret = -EAGAIN;
2025 bdi->last_bdp_sleep = jiffies;
2028 current->dirty_paused_when = now + pause;
2029 current->nr_dirtied = 0;
2030 current->nr_dirtied_pause = nr_dirtied_pause;
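Editor's note: the balance_dirty_pages() fragments above size each sleep as the time the task "owes" for the pages it just dirtied at its allowed rate, minus the time it already spent running since its last pause, clamped between min_pause and max_pause. A simplified stand-alone sketch; compute_pause and the HZ value are assumptions, and the kernel derives min/max pause from write bandwidth and dirty_ratelimit rather than taking them as parameters.

#include <stdio.h>

#define HZ	100	/* assumed tick rate for the example */

static long compute_pause(unsigned long pages_dirtied,
			  unsigned long task_ratelimit,	/* pages per second */
			  long jiffies_since_last_pause,
			  long min_pause, long max_pause)
{
	long period, pause;

	if (!task_ratelimit)
		return max_pause;

	period = HZ * pages_dirtied / task_ratelimit;	/* time those pages "cost" */
	pause = period - jiffies_since_last_pause;	/* credit time already spent running */

	if (pause < min_pause)
		return pause;	/* negative or tiny: the caller skips sleeping */
	if (pause > max_pause)
		pause = max_pause;
	return pause;
}

int main(void)
{
	/* dirtied 64 pages, allowed 1600 pages/s, ran 1 jiffy since last pause */
	printf("pause = %ld jiffies\n", compute_pause(64, 1600, 1, 1, 20));
	return 0;
}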
2044 * In theory 1 page is enough to keep the consumer-producer
2049 if (sdtc->wb_dirty <= wb_stat_error())
2052 if (fatal_signal_pending(current))
2063 * dirty tsk->nr_dirtied_pause pages;
2067 * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
2069 * throttled page dirties in dirty_throttle_leaks on task exit and charge them
2077 * balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
2078 * @mapping: address_space which was dirtied.
2087 * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
2093 int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
2096 struct inode *inode = mapping->host;
2103 if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
2109 wb = &bdi->wb;
2111 ratelimit = current->nr_dirtied_pause;
2112 if (wb->dirty_exceeded)
2113 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
2120 * time, hence all honoured too large initial task->nr_dirtied_pause.
2123 if (unlikely(current->nr_dirtied >= ratelimit))
2131 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
2132 * the dirty throttling and livelock other long-run dirtiers.
2135 if (*p > 0 && current->nr_dirtied < ratelimit) {
2137 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
2138 *p -= nr_pages_dirtied;
2139 current->nr_dirtied += nr_pages_dirtied;
2143 if (unlikely(current->nr_dirtied >= ratelimit))
2144 ret = balance_dirty_pages(wb, current->nr_dirtied, flags);
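Editor's note: the ratelimited entry point above keeps the common path cheap: a task only drops into the full balancing code after it has dirtied nr_dirtied_pause pages, the allowance is clamped hard once the device is over its limit (32 >> (PAGE_SHIFT - 10) == 8 for 4 KiB pages), and dirties "leaked" by exited tasks are charged to whoever dirties next. A toy model of that bookkeeping (struct task and ratelimit_hit are invented names):

#include <stdbool.h>
#include <stdio.h>

struct task {
	unsigned long nr_dirtied;
	unsigned long nr_dirtied_pause;
};

static bool ratelimit_hit(struct task *tsk, unsigned long pages,
			  unsigned long *dirty_throttle_leaks,
			  bool dirty_exceeded)
{
	unsigned long ratelimit = tsk->nr_dirtied_pause;

	if (dirty_exceeded && ratelimit > 8)
		ratelimit = 8;		/* clamp hard once over the limit (4 KiB pages) */

	tsk->nr_dirtied += pages;

	/* pick up dirties leaked by tasks that exited mid-interval */
	if (*dirty_throttle_leaks && tsk->nr_dirtied < ratelimit) {
		unsigned long leak = ratelimit - tsk->nr_dirtied;

		if (leak > *dirty_throttle_leaks)
			leak = *dirty_throttle_leaks;
		*dirty_throttle_leaks -= leak;
		tsk->nr_dirtied += leak;
	}

	if (tsk->nr_dirtied < ratelimit)
		return false;		/* cheap path: no balancing yet */
	tsk->nr_dirtied = 0;		/* balance_dirty_pages() would run here */
	return true;
}

int main(void)
{
	struct task tsk = { .nr_dirtied = 0, .nr_dirtied_pause = 32 };
	unsigned long leaks = 10;

	for (int i = 0; i < 4; i++)
		printf("write %d: throttle=%d\n", i,
		       ratelimit_hit(&tsk, 8, &leaks, false));
	return 0;
}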
2152 * balance_dirty_pages_ratelimited - balance dirty memory state.
2153 * @mapping: address_space which was dirtied.
2159 * Once we're over the dirty memory limit we decrease the ratelimiting
2160 * by a lot, to prevent individual processes from overshooting the limit
2163 void balance_dirty_pages_ratelimited(struct address_space *mapping)
2165 balance_dirty_pages_ratelimited_flags(mapping, 0);
2175 struct bdi_writeback *wb = dtc->wb;
2177 dtc->wb_bg_thresh = __wb_calc_thresh(dtc, dtc->bg_thresh);
2178 if (dtc->wb_bg_thresh < 2 * wb_stat_error())
2179 dtc->wb_dirty = wb_stat_sum(wb, WB_RECLAIMABLE);
2181 dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE);
2188 if (dtc->dirty > dtc->bg_thresh)
2192 if (dtc->wb_dirty > dtc->wb_bg_thresh)
2199 * wb_over_bg_thresh - does @wb need to be written back?
2235 * and a different non-zero value will wakeup the writeback threads.
2259 * then push it back - the user is still using the disk.
2263 mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
2278 del_timer(&bdi->laptop_mode_wb_timer);
2284 * If ratelimit_pages is too high then we can get into dirty-data overload
2299 dom->dirty_limit = dirty_thresh;
2395 * is now applied to total non-HIGHPAGE memory, and as such we can't
2398 * non-HIGHMEM memory.
2417 * tag_pages_for_writeback - tag pages to be written by writeback
2418 * @mapping: address space structure to write
2430 void tag_pages_for_writeback(struct address_space *mapping,
2433 XA_STATE(xas, &mapping->i_pages, start);
2452 static bool folio_prepare_writeback(struct address_space *mapping,
2462 if (unlikely(folio->mapping != mapping))
2472 if (wbc->sync_mode == WB_SYNC_NONE)
2486 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2493 if (wbc->range_cyclic)
2494 return -1;
2495 return wbc->range_end >> PAGE_SHIFT;
2498 static struct folio *writeback_get_folio(struct address_space *mapping,
2504 folio = folio_batch_next(&wbc->fbatch);
2506 folio_batch_release(&wbc->fbatch);
2508 filemap_get_folios_tag(mapping, &wbc->index, wbc_end(wbc),
2509 wbc_to_tag(wbc), &wbc->fbatch);
2510 folio = folio_batch_next(&wbc->fbatch);
2516 if (unlikely(!folio_prepare_writeback(mapping, wbc, folio))) {
2521 trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
2526 * writeback_iter - iterate folio of a mapping for writeback
2527 * @mapping: address space structure to write
2530 * @error: in-out pointer for writeback errors (see below)
2533 * @wbc on @mapping and should be called in a while loop in the ->writepages
2540 * If there was an error in the per-folio writeback inside the writeback_iter()
2551 struct folio *writeback_iter(struct address_space *mapping,
2555 folio_batch_init(&wbc->fbatch);
2556 wbc->saved_err = *error = 0;
2562 * For non-cyclic writeback we always start at the beginning of
2565 if (wbc->range_cyclic)
2566 wbc->index = mapping->writeback_index;
2568 wbc->index = wbc->range_start >> PAGE_SHIFT;
2575 * For data-integrity writeback we have to be careful so that we
2581 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2582 tag_pages_for_writeback(mapping, wbc->index,
2585 wbc->nr_to_write -= folio_nr_pages(folio);
2592 * we run past wbc->nr_to_write or encounter errors.
2593 * We stash away the first error we encounter in wbc->saved_err
2598 * wbc->nr_to_write or encounter the first error.
2600 if (wbc->sync_mode == WB_SYNC_ALL) {
2601 if (*error && !wbc->saved_err)
2602 wbc->saved_err = *error;
2604 if (*error || wbc->nr_to_write <= 0)
2609 folio = writeback_get_folio(mapping, wbc);
2616 * writeback access order inversion - we should only ever lock
2617 * multiple pages in ascending page->index order, and looping
2621 if (wbc->range_cyclic)
2622 mapping->writeback_index = 0;
2628 *error = wbc->saved_err;
2633 if (wbc->range_cyclic)
2634 mapping->writeback_index = folio_next_index(folio);
2635 folio_batch_release(&wbc->fbatch);
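Editor's note: the kerneldoc fragments above say writeback_iter() is meant to drive a while loop inside a ->writepages implementation; write_cache_pages() and writeback_use_writepage() in the following fragments are the in-tree users. For orientation, a kernel-context sketch of such a loop is given below; my_writepages and my_write_folio are hypothetical, and the helper is assumed to write back and unlock the folio the way a ->writepage would.

/* Hypothetical ->writepages built on writeback_iter(); kernel context
 * (linux/writeback.h, linux/pagemap.h) is assumed. */
static int my_writepages(struct address_space *mapping,
			 struct writeback_control *wbc)
{
	struct folio *folio = NULL;
	int error;

	/* writeback_iter() handles tagging, range and batching, and
	 * stashes the first error for WB_SYNC_ALL writeback. */
	while ((folio = writeback_iter(mapping, wbc, folio, &error)))
		error = my_write_folio(folio, wbc);	/* assumed helper */

	return error;
}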
2641 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2642 * @mapping: address space structure to write
2643 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2651 int write_cache_pages(struct address_space *mapping,
2658 while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
2670 static int writeback_use_writepage(struct address_space *mapping,
2678 while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
2679 err = mapping->a_ops->writepage(&folio->page, wbc);
2684 mapping_set_error(mapping, err);
2691 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
2696 if (wbc->nr_to_write <= 0)
2698 wb = inode_to_wb_wbc(mapping->host, wbc);
2701 if (mapping->a_ops->writepages) {
2702 ret = mapping->a_ops->writepages(mapping, wbc);
2703 } else if (mapping->a_ops->writepage) {
2704 ret = writeback_use_writepage(mapping, wbc);
2709 if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
2726 if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
2735 bool noop_dirty_folio(struct address_space *mapping, struct folio *folio)
2749 struct address_space *mapping)
2751 struct inode *inode = mapping->host;
2753 trace_writeback_dirty_folio(folio, mapping);
2755 if (mapping_can_writeback(mapping)) {
2768 current->nr_dirtied += nr;
2783 lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
2784 zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
2785 wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
2804 void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
2809 xa_lock_irqsave(&mapping->i_pages, flags);
2810 if (folio->mapping) { /* Race with truncate? */
2812 folio_account_dirtied(folio, mapping);
2813 __xa_set_mark(&mapping->i_pages, folio_index(folio),
2816 xa_unlock_irqrestore(&mapping->i_pages, flags);
2820 * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
2821 * @mapping: Address space this folio belongs to.
2831 * that case, but not all the buffers. This is a "bottom-up" dirtying,
2832 * whereas block_dirty_folio() is a "top-down" dirtying.
2838 bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
2843 __folio_mark_dirty(folio, mapping, !folio_test_private(folio));
2845 if (mapping->host) {
2847 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
2854 * folio_redirty_for_writepage - Decline to write a dirty folio.
2868 struct address_space *mapping = folio->mapping;
2872 wbc->pages_skipped += nr;
2873 ret = filemap_dirty_folio(mapping, folio);
2874 if (mapping && mapping_can_writeback(mapping)) {
2875 struct inode *inode = mapping->host;
2880 current->nr_dirtied -= nr;
2881 node_stat_mod_folio(folio, NR_DIRTIED, -nr);
2882 wb_stat_mod(wb, WB_DIRTIED, -nr);
2890 * folio_mark_dirty - Mark a folio as being modified.
2898 * unmaps pages before removing the folio from its mapping.
2904 struct address_space *mapping = folio_mapping(folio);
2906 if (likely(mapping)) {
2920 return mapping->a_ops->dirty_folio(mapping, folio);
2923 return noop_dirty_folio(mapping, folio);
2929 * folio->mapping->host, and if the folio is unlocked. This is because another
2930 * CPU could truncate the folio off the mapping and then free the mapping.
2932 * Usually, the folio _is_ locked, or the caller is a user-space process which
2963 struct address_space *mapping = folio_mapping(folio);
2965 if (mapping_can_writeback(mapping)) {
2966 struct inode *inode = mapping->host;
2988 * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
2989 * The ->writepage implementation will run either folio_start_writeback()
2998 struct address_space *mapping = folio_mapping(folio);
3003 if (mapping && mapping_can_writeback(mapping)) {
3004 struct inode *inode = mapping->host;
3013 * (b) we tell the low-level filesystem to
3024 * has no effect on the actual dirty bit - since
3046 lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
3047 zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
3048 wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
3060 atomic_inc(&wb->writeback_inodes);
3066 atomic_dec(&wb->writeback_inodes);
3074 spin_lock_irqsave(&wb->work_lock, flags);
3075 if (test_bit(WB_registered, &wb->state))
3076 queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
3077 spin_unlock_irqrestore(&wb->work_lock, flags);
3083 struct address_space *mapping = folio_mapping(folio);
3086 if (mapping && mapping_use_writeback_tags(mapping)) {
3087 struct inode *inode = mapping->host;
3091 xa_lock_irqsave(&mapping->i_pages, flags);
3093 __xa_clear_mark(&mapping->i_pages, folio_index(folio),
3095 if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
3098 wb_stat_mod(wb, WB_WRITEBACK, -nr);
3100 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
3104 if (mapping->host && !mapping_tagged(mapping,
3106 sb_clear_inode_writeback(mapping->host);
3108 xa_unlock_irqrestore(&mapping->i_pages, flags);
3113 lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
3114 zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
3123 struct address_space *mapping = folio_mapping(folio);
3128 if (mapping && mapping_use_writeback_tags(mapping)) {
3129 XA_STATE(xas, &mapping->i_pages, folio_index(folio));
3130 struct inode *inode = mapping->host;
3139 on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
3142 if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
3155 if (mapping->host && !on_wblist)
3156 sb_mark_inode_writeback(mapping->host);
3179 * folio_wait_writeback - Wait for a folio to finish writeback.
3200 * folio_wait_writeback_killable - Wait for a folio to finish writeback.
3210 * Return: 0 on success, -EINTR if we get a fatal signal while waiting.
3217 return -EINTR;
3225 * folio_wait_stable() - wait for writeback to finish, if necessary.