xref: /linux/mm/compaction.c (revision 100c85421b52e41269ada88f7d71a6b8a06c7a11)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
2748446bbSMel Gorman /*
3748446bbSMel Gorman  * linux/mm/compaction.c
4748446bbSMel Gorman  *
5748446bbSMel Gorman  * Memory compaction for the reduction of external fragmentation. Note that
6748446bbSMel Gorman  * this heavily depends upon page migration to do all the real heavy
7748446bbSMel Gorman  * lifting
8748446bbSMel Gorman  *
9748446bbSMel Gorman  * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
10748446bbSMel Gorman  */
11698b1b30SVlastimil Babka #include <linux/cpu.h>
12748446bbSMel Gorman #include <linux/swap.h>
13748446bbSMel Gorman #include <linux/migrate.h>
14748446bbSMel Gorman #include <linux/compaction.h>
15748446bbSMel Gorman #include <linux/mm_inline.h>
16174cd4b1SIngo Molnar #include <linux/sched/signal.h>
17748446bbSMel Gorman #include <linux/backing-dev.h>
1876ab0f53SMel Gorman #include <linux/sysctl.h>
19ed4a6d7fSMel Gorman #include <linux/sysfs.h>
20194159fbSMinchan Kim #include <linux/page-isolation.h>
21b8c73fc2SAndrey Ryabinin #include <linux/kasan.h>
22698b1b30SVlastimil Babka #include <linux/kthread.h>
23698b1b30SVlastimil Babka #include <linux/freezer.h>
2483358eceSJoonsoo Kim #include <linux/page_owner.h>
25eb414681SJohannes Weiner #include <linux/psi.h>
26748446bbSMel Gorman #include "internal.h"
27748446bbSMel Gorman 
28010fc29aSMinchan Kim #ifdef CONFIG_COMPACTION
2931ca72faSCharan Teja Kalla /*
3031ca72faSCharan Teja Kalla  * Fragmentation score check interval for proactive compaction purposes.
3131ca72faSCharan Teja Kalla  */
3231ca72faSCharan Teja Kalla #define HPAGE_FRAG_CHECK_INTERVAL_MSEC	(500)
3331ca72faSCharan Teja Kalla 
34010fc29aSMinchan Kim static inline void count_compact_event(enum vm_event_item item)
35010fc29aSMinchan Kim {
36010fc29aSMinchan Kim 	count_vm_event(item);
37010fc29aSMinchan Kim }
38010fc29aSMinchan Kim 
39010fc29aSMinchan Kim static inline void count_compact_events(enum vm_event_item item, long delta)
40010fc29aSMinchan Kim {
41010fc29aSMinchan Kim 	count_vm_events(item, delta);
42010fc29aSMinchan Kim }
43ee6f62fdSZi Yan 
44ee6f62fdSZi Yan /*
45ee6f62fdSZi Yan  * order == -1 is expected when compacting proactively via
46ee6f62fdSZi Yan  * 1. /proc/sys/vm/compact_memory
47ee6f62fdSZi Yan  * 2. /sys/devices/system/node/nodex/compact
48ee6f62fdSZi Yan  * 3. /proc/sys/vm/compaction_proactiveness
49ee6f62fdSZi Yan  */
50ee6f62fdSZi Yan static inline bool is_via_compact_memory(int order)
51ee6f62fdSZi Yan {
52ee6f62fdSZi Yan 	return order == -1;
53ee6f62fdSZi Yan }
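/*
 * For reference: the proactive interfaces listed above are exercised e.g.
 * with "echo 1 > /proc/sys/vm/compact_memory" (compact all zones) or by
 * writing to /sys/devices/system/node/node<N>/compact for a single node,
 * and all of them reach the compaction core with order == -1.
 */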
54ee6f62fdSZi Yan 
55010fc29aSMinchan Kim #else
56010fc29aSMinchan Kim #define count_compact_event(item) do { } while (0)
57010fc29aSMinchan Kim #define count_compact_events(item, delta) do { } while (0)
58ee6f62fdSZi Yan static inline bool is_via_compact_memory(int order) { return false; }
59010fc29aSMinchan Kim #endif
60010fc29aSMinchan Kim 
61ff9543fdSMichal Nazarewicz #if defined CONFIG_COMPACTION || defined CONFIG_CMA
62ff9543fdSMichal Nazarewicz 
63b7aba698SMel Gorman #define CREATE_TRACE_POINTS
64b7aba698SMel Gorman #include <trace/events/compaction.h>
65b7aba698SMel Gorman 
6606b6640aSVlastimil Babka #define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
6706b6640aSVlastimil Babka #define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))
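/*
 * Worked example (illustrative values): with order == pageblock_order == 9,
 * i.e. 512-page blocks, block_start_pfn(1000, 9) == round_down(1000, 512)
 * == 512 and block_end_pfn(1000, 9) == ALIGN(1001, 512) == 1024, so the two
 * macros give the inclusive start and exclusive end of the enclosing block.
 */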
6806b6640aSVlastimil Babka 
69facdaa91SNitin Gupta /*
70facdaa91SNitin Gupta  * Page order with respect to which proactive compaction
71facdaa91SNitin Gupta  * calculates external fragmentation, which is used as
72facdaa91SNitin Gupta  * the "fragmentation score" of a node/zone.
73facdaa91SNitin Gupta  */
74facdaa91SNitin Gupta #if defined CONFIG_TRANSPARENT_HUGEPAGE
75facdaa91SNitin Gupta #define COMPACTION_HPAGE_ORDER	HPAGE_PMD_ORDER
7625788738SNitin Gupta #elif defined CONFIG_HUGETLBFS
77facdaa91SNitin Gupta #define COMPACTION_HPAGE_ORDER	HUGETLB_PAGE_ORDER
78facdaa91SNitin Gupta #else
79facdaa91SNitin Gupta #define COMPACTION_HPAGE_ORDER	(PMD_SHIFT - PAGE_SHIFT)
80facdaa91SNitin Gupta #endif
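/*
 * Example (configuration dependent): on x86-64 with 4KiB base pages and
 * 2MiB PMD-sized huge pages, every branch above works out to order 9, so
 * the fragmentation score is computed at the granularity most relevant to
 * THP and hugetlbfs allocations.
 */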
81facdaa91SNitin Gupta 
82733aea0bSZi Yan static void split_map_pages(struct list_head *freepages)
83748446bbSMel Gorman {
84733aea0bSZi Yan 	unsigned int i, order;
8566c64223SJoonsoo Kim 	struct page *page, *next;
8666c64223SJoonsoo Kim 	LIST_HEAD(tmp_list);
87ff9543fdSMichal Nazarewicz 
88733aea0bSZi Yan 	for (order = 0; order < NR_PAGE_ORDERS; order++) {
89733aea0bSZi Yan 		list_for_each_entry_safe(page, next, &freepages[order], lru) {
90733aea0bSZi Yan 			unsigned int nr_pages;
91733aea0bSZi Yan 
9266c64223SJoonsoo Kim 			list_del(&page->lru);
9366c64223SJoonsoo Kim 
9466c64223SJoonsoo Kim 			nr_pages = 1 << order;
9566c64223SJoonsoo Kim 
9646f24fd8SJoonsoo Kim 			post_alloc_hook(page, order, __GFP_MOVABLE);
9766c64223SJoonsoo Kim 			if (order)
9866c64223SJoonsoo Kim 				split_page(page, order);
9966c64223SJoonsoo Kim 
10066c64223SJoonsoo Kim 			for (i = 0; i < nr_pages; i++) {
10166c64223SJoonsoo Kim 				list_add(&page->lru, &tmp_list);
10266c64223SJoonsoo Kim 				page++;
103ff9543fdSMichal Nazarewicz 			}
104ff9543fdSMichal Nazarewicz 		}
105733aea0bSZi Yan 		list_splice_init(&tmp_list, &freepages[0]);
106733aea0bSZi Yan 	}
107733aea0bSZi Yan }
108ff9543fdSMichal Nazarewicz 
109733aea0bSZi Yan static unsigned long release_free_list(struct list_head *freepages)
110733aea0bSZi Yan {
111733aea0bSZi Yan 	int order;
112733aea0bSZi Yan 	unsigned long high_pfn = 0;
113733aea0bSZi Yan 
114733aea0bSZi Yan 	for (order = 0; order < NR_PAGE_ORDERS; order++) {
115733aea0bSZi Yan 		struct page *page, *next;
116733aea0bSZi Yan 
117733aea0bSZi Yan 		list_for_each_entry_safe(page, next, &freepages[order], lru) {
118733aea0bSZi Yan 			unsigned long pfn = page_to_pfn(page);
119733aea0bSZi Yan 
120733aea0bSZi Yan 			list_del(&page->lru);
121733aea0bSZi Yan 			/*
122733aea0bSZi Yan 			 * Convert free pages into post allocation pages, so
123733aea0bSZi Yan 			 * that we can free them via __free_page.
124733aea0bSZi Yan 			 */
125733aea0bSZi Yan 			post_alloc_hook(page, order, __GFP_MOVABLE);
126733aea0bSZi Yan 			__free_pages(page, order);
127733aea0bSZi Yan 			if (pfn > high_pfn)
128733aea0bSZi Yan 				high_pfn = pfn;
129733aea0bSZi Yan 		}
130733aea0bSZi Yan 	}
131733aea0bSZi Yan 	return high_pfn;
13266c64223SJoonsoo Kim }
13366c64223SJoonsoo Kim 
134bb13ffebSMel Gorman #ifdef CONFIG_COMPACTION
13568f2736aSMatthew Wilcox (Oracle) bool PageMovable(struct page *page)
136bda807d4SMinchan Kim {
13768f2736aSMatthew Wilcox (Oracle) 	const struct movable_operations *mops;
138bda807d4SMinchan Kim 
139bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!PageLocked(page), page);
140bda807d4SMinchan Kim 	if (!__PageMovable(page))
14168f2736aSMatthew Wilcox (Oracle) 		return false;
142bda807d4SMinchan Kim 
14368f2736aSMatthew Wilcox (Oracle) 	mops = page_movable_ops(page);
14468f2736aSMatthew Wilcox (Oracle) 	if (mops)
14568f2736aSMatthew Wilcox (Oracle) 		return true;
146bda807d4SMinchan Kim 
14768f2736aSMatthew Wilcox (Oracle) 	return false;
148bda807d4SMinchan Kim }
149bda807d4SMinchan Kim 
15068f2736aSMatthew Wilcox (Oracle) void __SetPageMovable(struct page *page, const struct movable_operations *mops)
151bda807d4SMinchan Kim {
152bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!PageLocked(page), page);
15368f2736aSMatthew Wilcox (Oracle) 	VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page);
15468f2736aSMatthew Wilcox (Oracle) 	page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE);
155bda807d4SMinchan Kim }
156bda807d4SMinchan Kim EXPORT_SYMBOL(__SetPageMovable);
157bda807d4SMinchan Kim 
158bda807d4SMinchan Kim void __ClearPageMovable(struct page *page)
159bda807d4SMinchan Kim {
160bda807d4SMinchan Kim 	VM_BUG_ON_PAGE(!PageMovable(page), page);
161bda807d4SMinchan Kim 	/*
16268f2736aSMatthew Wilcox (Oracle) 	 * This page still has the type of a movable page, but it's
16368f2736aSMatthew Wilcox (Oracle) 	 * actually not movable any more.
164bda807d4SMinchan Kim 	 */
16568f2736aSMatthew Wilcox (Oracle) 	page->mapping = (void *)PAGE_MAPPING_MOVABLE;
166bda807d4SMinchan Kim }
167bda807d4SMinchan Kim EXPORT_SYMBOL(__ClearPageMovable);
168bda807d4SMinchan Kim 
16924e2716fSJoonsoo Kim /* Do not skip compaction more than 64 times */
17024e2716fSJoonsoo Kim #define COMPACT_MAX_DEFER_SHIFT 6
17124e2716fSJoonsoo Kim 
17224e2716fSJoonsoo Kim /*
17324e2716fSJoonsoo Kim  * Compaction is deferred when compaction fails to result in a page
174860b3272SAlex Shi  * allocation success. 1 << compact_defer_shift compactions are skipped up
17524e2716fSJoonsoo Kim  * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
17624e2716fSJoonsoo Kim  */
1772271b016SHui Su static void defer_compaction(struct zone *zone, int order)
17824e2716fSJoonsoo Kim {
17924e2716fSJoonsoo Kim 	zone->compact_considered = 0;
18024e2716fSJoonsoo Kim 	zone->compact_defer_shift++;
18124e2716fSJoonsoo Kim 
18224e2716fSJoonsoo Kim 	if (order < zone->compact_order_failed)
18324e2716fSJoonsoo Kim 		zone->compact_order_failed = order;
18424e2716fSJoonsoo Kim 
18524e2716fSJoonsoo Kim 	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
18624e2716fSJoonsoo Kim 		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
18724e2716fSJoonsoo Kim 
18824e2716fSJoonsoo Kim 	trace_mm_compaction_defer_compaction(zone, order);
18924e2716fSJoonsoo Kim }
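/*
 * Illustration of the resulting back-off (assuming repeated failures at the
 * same order): compact_defer_shift grows 1, 2, ..., COMPACT_MAX_DEFER_SHIFT,
 * so the defer_limit used by compaction_deferred() below doubles through
 * 2, 4, 8, ..., 64. After each failure, compaction is only retried once
 * compact_considered has been incremented back up to defer_limit, giving a
 * roughly exponential, bounded back-off.
 */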
19024e2716fSJoonsoo Kim 
19124e2716fSJoonsoo Kim /* Returns true if compaction should be skipped this time */
1922271b016SHui Su static bool compaction_deferred(struct zone *zone, int order)
19324e2716fSJoonsoo Kim {
19424e2716fSJoonsoo Kim 	unsigned long defer_limit = 1UL << zone->compact_defer_shift;
19524e2716fSJoonsoo Kim 
19624e2716fSJoonsoo Kim 	if (order < zone->compact_order_failed)
19724e2716fSJoonsoo Kim 		return false;
19824e2716fSJoonsoo Kim 
19924e2716fSJoonsoo Kim 	/* Avoid possible overflow */
20062b35fe0SMateusz Nosek 	if (++zone->compact_considered >= defer_limit) {
20124e2716fSJoonsoo Kim 		zone->compact_considered = defer_limit;
20224e2716fSJoonsoo Kim 		return false;
20362b35fe0SMateusz Nosek 	}
20424e2716fSJoonsoo Kim 
20524e2716fSJoonsoo Kim 	trace_mm_compaction_deferred(zone, order);
20624e2716fSJoonsoo Kim 
20724e2716fSJoonsoo Kim 	return true;
20824e2716fSJoonsoo Kim }
20924e2716fSJoonsoo Kim 
21024e2716fSJoonsoo Kim /*
21124e2716fSJoonsoo Kim  * Update defer tracking counters after successful compaction of given order,
21224e2716fSJoonsoo Kim  * which means an allocation either succeeded (alloc_success == true) or is
21324e2716fSJoonsoo Kim  * expected to succeed.
21424e2716fSJoonsoo Kim  */
21524e2716fSJoonsoo Kim void compaction_defer_reset(struct zone *zone, int order,
21624e2716fSJoonsoo Kim 		bool alloc_success)
21724e2716fSJoonsoo Kim {
21824e2716fSJoonsoo Kim 	if (alloc_success) {
21924e2716fSJoonsoo Kim 		zone->compact_considered = 0;
22024e2716fSJoonsoo Kim 		zone->compact_defer_shift = 0;
22124e2716fSJoonsoo Kim 	}
22224e2716fSJoonsoo Kim 	if (order >= zone->compact_order_failed)
22324e2716fSJoonsoo Kim 		zone->compact_order_failed = order + 1;
22424e2716fSJoonsoo Kim 
22524e2716fSJoonsoo Kim 	trace_mm_compaction_defer_reset(zone, order);
22624e2716fSJoonsoo Kim }
22724e2716fSJoonsoo Kim 
22824e2716fSJoonsoo Kim /* Returns true if restarting compaction after many failures */
2292271b016SHui Su static bool compaction_restarting(struct zone *zone, int order)
23024e2716fSJoonsoo Kim {
23124e2716fSJoonsoo Kim 	if (order < zone->compact_order_failed)
23224e2716fSJoonsoo Kim 		return false;
23324e2716fSJoonsoo Kim 
23424e2716fSJoonsoo Kim 	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
23524e2716fSJoonsoo Kim 		zone->compact_considered >= 1UL << zone->compact_defer_shift;
23624e2716fSJoonsoo Kim }
23724e2716fSJoonsoo Kim 
238bb13ffebSMel Gorman /* Returns true if the pageblock should be scanned for pages to isolate. */
239bb13ffebSMel Gorman static inline bool isolation_suitable(struct compact_control *cc,
240bb13ffebSMel Gorman 					struct page *page)
241bb13ffebSMel Gorman {
242bb13ffebSMel Gorman 	if (cc->ignore_skip_hint)
243bb13ffebSMel Gorman 		return true;
244bb13ffebSMel Gorman 
245bb13ffebSMel Gorman 	return !get_pageblock_skip(page);
246bb13ffebSMel Gorman }
247bb13ffebSMel Gorman 
24802333641SVlastimil Babka static void reset_cached_positions(struct zone *zone)
24902333641SVlastimil Babka {
25002333641SVlastimil Babka 	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
25102333641SVlastimil Babka 	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
252623446e4SJoonsoo Kim 	zone->compact_cached_free_pfn =
25306b6640aSVlastimil Babka 				pageblock_start_pfn(zone_end_pfn(zone) - 1);
25402333641SVlastimil Babka }
25502333641SVlastimil Babka 
2569721fd82SBaolin Wang #ifdef CONFIG_SPARSEMEM
2579721fd82SBaolin Wang /*
2589721fd82SBaolin Wang  * If the PFN falls into an offline section, return the start PFN of the
2599721fd82SBaolin Wang  * next online section. If the PFN falls into an online section or if
2609721fd82SBaolin Wang  * there is no next online section, return 0.
2619721fd82SBaolin Wang  */
2629721fd82SBaolin Wang static unsigned long skip_offline_sections(unsigned long start_pfn)
2639721fd82SBaolin Wang {
2649721fd82SBaolin Wang 	unsigned long start_nr = pfn_to_section_nr(start_pfn);
2659721fd82SBaolin Wang 
2669721fd82SBaolin Wang 	if (online_section_nr(start_nr))
2679721fd82SBaolin Wang 		return 0;
2689721fd82SBaolin Wang 
2699721fd82SBaolin Wang 	while (++start_nr <= __highest_present_section_nr) {
2709721fd82SBaolin Wang 		if (online_section_nr(start_nr))
2719721fd82SBaolin Wang 			return section_nr_to_pfn(start_nr);
2729721fd82SBaolin Wang 	}
2739721fd82SBaolin Wang 
2749721fd82SBaolin Wang 	return 0;
2759721fd82SBaolin Wang }
276e6e0c767SBaolin Wang 
277e6e0c767SBaolin Wang /*
278e6e0c767SBaolin Wang  * If the PFN falls into an offline section, return the end PFN of the
279e6e0c767SBaolin Wang  * next online section in reverse. If the PFN falls into an online section
280e6e0c767SBaolin Wang  * or if there is no next online section in reverse, return 0.
281e6e0c767SBaolin Wang  */
282e6e0c767SBaolin Wang static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
283e6e0c767SBaolin Wang {
284e6e0c767SBaolin Wang 	unsigned long start_nr = pfn_to_section_nr(start_pfn);
285e6e0c767SBaolin Wang 
286e6e0c767SBaolin Wang 	if (!start_nr || online_section_nr(start_nr))
287e6e0c767SBaolin Wang 		return 0;
288e6e0c767SBaolin Wang 
289e6e0c767SBaolin Wang 	while (start_nr-- > 0) {
290e6e0c767SBaolin Wang 		if (online_section_nr(start_nr))
291e6e0c767SBaolin Wang 			return section_nr_to_pfn(start_nr) + PAGES_PER_SECTION;
292e6e0c767SBaolin Wang 	}
293e6e0c767SBaolin Wang 
294e6e0c767SBaolin Wang 	return 0;
295e6e0c767SBaolin Wang }
2969721fd82SBaolin Wang #else
2979721fd82SBaolin Wang static unsigned long skip_offline_sections(unsigned long start_pfn)
2989721fd82SBaolin Wang {
2999721fd82SBaolin Wang 	return 0;
3009721fd82SBaolin Wang }
301e6e0c767SBaolin Wang 
302e6e0c767SBaolin Wang static unsigned long skip_offline_sections_reverse(unsigned long start_pfn)
303e6e0c767SBaolin Wang {
304e6e0c767SBaolin Wang 	return 0;
305e6e0c767SBaolin Wang }
3069721fd82SBaolin Wang #endif
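/*
 * Example (architecture dependent): with SPARSEMEM on x86-64 a section is
 * 128MiB, i.e. 32768 4KiB pages, so the helpers above let the scanners leap
 * over an offlined range in one step instead of probing it pageblock by
 * pageblock.
 */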
3079721fd82SBaolin Wang 
308bb13ffebSMel Gorman /*
3092271b016SHui Su  * Compound pages of >= pageblock_order should consistently be skipped until
310b527cfe5SVlastimil Babka  * released. It is always pointless to compact pages of such order (if they are
311b527cfe5SVlastimil Babka  * migratable), and the pageblocks they occupy cannot contain any free pages.
31221dc7e02SDavid Rientjes  */
313b527cfe5SVlastimil Babka static bool pageblock_skip_persistent(struct page *page)
31421dc7e02SDavid Rientjes {
315b527cfe5SVlastimil Babka 	if (!PageCompound(page))
31621dc7e02SDavid Rientjes 		return false;
317b527cfe5SVlastimil Babka 
318b527cfe5SVlastimil Babka 	page = compound_head(page);
319b527cfe5SVlastimil Babka 
320b527cfe5SVlastimil Babka 	if (compound_order(page) >= pageblock_order)
32121dc7e02SDavid Rientjes 		return true;
322b527cfe5SVlastimil Babka 
323b527cfe5SVlastimil Babka 	return false;
32421dc7e02SDavid Rientjes }
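/*
 * For instance (typical x86-64 values), a 2MiB THP is an order-9 compound
 * page covering an entire 512-page pageblock: migrating it cannot produce a
 * larger contiguous free area and the block contains no free pages, so its
 * skip bit is treated as persistent until the THP is split or freed.
 */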
32521dc7e02SDavid Rientjes 
326e332f741SMel Gorman static bool
327e332f741SMel Gorman __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
328e332f741SMel Gorman 							bool check_target)
329e332f741SMel Gorman {
330e332f741SMel Gorman 	struct page *page = pfn_to_online_page(pfn);
3316b0868c8SMel Gorman 	struct page *block_page;
332e332f741SMel Gorman 	struct page *end_page;
333e332f741SMel Gorman 	unsigned long block_pfn;
334e332f741SMel Gorman 
335e332f741SMel Gorman 	if (!page)
336e332f741SMel Gorman 		return false;
337e332f741SMel Gorman 	if (zone != page_zone(page))
338e332f741SMel Gorman 		return false;
339e332f741SMel Gorman 	if (pageblock_skip_persistent(page))
340e332f741SMel Gorman 		return false;
341e332f741SMel Gorman 
342e332f741SMel Gorman 	/*
343e332f741SMel Gorman 	 * If skip is already cleared do no further checking once the
344e332f741SMel Gorman 	 * restart points have been set.
345e332f741SMel Gorman 	 */
346e332f741SMel Gorman 	if (check_source && check_target && !get_pageblock_skip(page))
347e332f741SMel Gorman 		return true;
348e332f741SMel Gorman 
349e332f741SMel Gorman 	/*
350e332f741SMel Gorman 	 * If clearing skip for the target scanner, do not select a
351e332f741SMel Gorman 	 * non-movable pageblock as the starting point.
352e332f741SMel Gorman 	 */
353e332f741SMel Gorman 	if (!check_source && check_target &&
354e332f741SMel Gorman 	    get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
355e332f741SMel Gorman 		return false;
356e332f741SMel Gorman 
3576b0868c8SMel Gorman 	/* Ensure the start of the pageblock or zone is online and valid */
3586b0868c8SMel Gorman 	block_pfn = pageblock_start_pfn(pfn);
359a2e9a5afSVlastimil Babka 	block_pfn = max(block_pfn, zone->zone_start_pfn);
360a2e9a5afSVlastimil Babka 	block_page = pfn_to_online_page(block_pfn);
3616b0868c8SMel Gorman 	if (block_page) {
3626b0868c8SMel Gorman 		page = block_page;
3636b0868c8SMel Gorman 		pfn = block_pfn;
3646b0868c8SMel Gorman 	}
3656b0868c8SMel Gorman 
3666b0868c8SMel Gorman 	/* Ensure the end of the pageblock or zone is online and valid */
367a2e9a5afSVlastimil Babka 	block_pfn = pageblock_end_pfn(pfn) - 1;
3686b0868c8SMel Gorman 	block_pfn = min(block_pfn, zone_end_pfn(zone) - 1);
3696b0868c8SMel Gorman 	end_page = pfn_to_online_page(block_pfn);
3706b0868c8SMel Gorman 	if (!end_page)
3716b0868c8SMel Gorman 		return false;
3726b0868c8SMel Gorman 
373e332f741SMel Gorman 	/*
374e332f741SMel Gorman 	 * Only clear the hint if a sample indicates there is either a
375e332f741SMel Gorman 	 * free page or an LRU page in the block. One or other condition
376e332f741SMel Gorman 	 * is necessary for the block to be a migration source/target.
377e332f741SMel Gorman 	 */
378e332f741SMel Gorman 	do {
379e332f741SMel Gorman 		if (check_source && PageLRU(page)) {
380e332f741SMel Gorman 			clear_pageblock_skip(page);
381e332f741SMel Gorman 			return true;
382e332f741SMel Gorman 		}
383e332f741SMel Gorman 
384e332f741SMel Gorman 		if (check_target && PageBuddy(page)) {
385e332f741SMel Gorman 			clear_pageblock_skip(page);
386e332f741SMel Gorman 			return true;
387e332f741SMel Gorman 		}
388e332f741SMel Gorman 
389e332f741SMel Gorman 		page += (1 << PAGE_ALLOC_COSTLY_ORDER);
390a2e9a5afSVlastimil Babka 	} while (page <= end_page);
391e332f741SMel Gorman 
392e332f741SMel Gorman 	return false;
393e332f741SMel Gorman }
394e332f741SMel Gorman 
39521dc7e02SDavid Rientjes /*
396bb13ffebSMel Gorman  * This function is called to clear all cached information on pageblocks that
397bb13ffebSMel Gorman  * should be skipped for page isolation when the migrate and free page scanner
398bb13ffebSMel Gorman  * meet.
399bb13ffebSMel Gorman  */
40062997027SMel Gorman static void __reset_isolation_suitable(struct zone *zone)
401bb13ffebSMel Gorman {
402e332f741SMel Gorman 	unsigned long migrate_pfn = zone->zone_start_pfn;
4036b0868c8SMel Gorman 	unsigned long free_pfn = zone_end_pfn(zone) - 1;
404e332f741SMel Gorman 	unsigned long reset_migrate = free_pfn;
405e332f741SMel Gorman 	unsigned long reset_free = migrate_pfn;
406e332f741SMel Gorman 	bool source_set = false;
407e332f741SMel Gorman 	bool free_set = false;
408e332f741SMel Gorman 
4098df4e28cSKemeng Shi 	/* Only flush if a full compaction finished recently */
410e332f741SMel Gorman 	if (!zone->compact_blockskip_flush)
411e332f741SMel Gorman 		return;
412bb13ffebSMel Gorman 
41362997027SMel Gorman 	zone->compact_blockskip_flush = false;
414bb13ffebSMel Gorman 
415e332f741SMel Gorman 	/*
416e332f741SMel Gorman 	 * Walk the zone and update pageblock skip information. Source looks
417e332f741SMel Gorman 	 * for PageLRU while target looks for PageBuddy. When the scanner
418e332f741SMel Gorman 	 * is found, both PageBuddy and PageLRU are checked as the pageblock
419e332f741SMel Gorman 	 * is suitable as both source and target.
420e332f741SMel Gorman 	 */
421e332f741SMel Gorman 	for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
422e332f741SMel Gorman 					free_pfn -= pageblock_nr_pages) {
423bb13ffebSMel Gorman 		cond_resched();
424bb13ffebSMel Gorman 
425e332f741SMel Gorman 		/* Update the migrate PFN */
426e332f741SMel Gorman 		if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
427e332f741SMel Gorman 		    migrate_pfn < reset_migrate) {
428e332f741SMel Gorman 			source_set = true;
429e332f741SMel Gorman 			reset_migrate = migrate_pfn;
430e332f741SMel Gorman 			zone->compact_init_migrate_pfn = reset_migrate;
431e332f741SMel Gorman 			zone->compact_cached_migrate_pfn[0] = reset_migrate;
432e332f741SMel Gorman 			zone->compact_cached_migrate_pfn[1] = reset_migrate;
433bb13ffebSMel Gorman 		}
43402333641SVlastimil Babka 
435e332f741SMel Gorman 		/* Update the free PFN */
436e332f741SMel Gorman 		if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
437e332f741SMel Gorman 		    free_pfn > reset_free) {
438e332f741SMel Gorman 			free_set = true;
439e332f741SMel Gorman 			reset_free = free_pfn;
440e332f741SMel Gorman 			zone->compact_init_free_pfn = reset_free;
441e332f741SMel Gorman 			zone->compact_cached_free_pfn = reset_free;
442e332f741SMel Gorman 		}
443e332f741SMel Gorman 	}
444e332f741SMel Gorman 
445e332f741SMel Gorman 	/* Leave no distance if no suitable block was reset */
446e332f741SMel Gorman 	if (reset_migrate >= reset_free) {
447e332f741SMel Gorman 		zone->compact_cached_migrate_pfn[0] = migrate_pfn;
448e332f741SMel Gorman 		zone->compact_cached_migrate_pfn[1] = migrate_pfn;
449e332f741SMel Gorman 		zone->compact_cached_free_pfn = free_pfn;
450e332f741SMel Gorman 	}
451bb13ffebSMel Gorman }
452bb13ffebSMel Gorman 
45362997027SMel Gorman void reset_isolation_suitable(pg_data_t *pgdat)
45462997027SMel Gorman {
45562997027SMel Gorman 	int zoneid;
45662997027SMel Gorman 
45762997027SMel Gorman 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
45862997027SMel Gorman 		struct zone *zone = &pgdat->node_zones[zoneid];
45962997027SMel Gorman 		if (!populated_zone(zone))
46062997027SMel Gorman 			continue;
46162997027SMel Gorman 
46262997027SMel Gorman 		__reset_isolation_suitable(zone);
46362997027SMel Gorman 	}
46462997027SMel Gorman }
46562997027SMel Gorman 
466bb13ffebSMel Gorman /*
467e380bebeSMel Gorman  * Sets the pageblock skip bit if it was clear. Note that this is a hint as
468e380bebeSMel Gorman  * locks are not required for read/writers. Returns true if it was already set.
469e380bebeSMel Gorman  */
470590ccea8SMel Gorman static bool test_and_set_skip(struct compact_control *cc, struct page *page)
471e380bebeSMel Gorman {
472e380bebeSMel Gorman 	bool skip;
473e380bebeSMel Gorman 
474590ccea8SMel Gorman 	/* Do not update if skip hint is being ignored */
475e380bebeSMel Gorman 	if (cc->ignore_skip_hint)
476e380bebeSMel Gorman 		return false;
477e380bebeSMel Gorman 
478e380bebeSMel Gorman 	skip = get_pageblock_skip(page);
479e380bebeSMel Gorman 	if (!skip && !cc->no_set_skip_hint)
480e380bebeSMel Gorman 		set_pageblock_skip(page);
481e380bebeSMel Gorman 
482e380bebeSMel Gorman 	return skip;
483e380bebeSMel Gorman }
484e380bebeSMel Gorman 
485e380bebeSMel Gorman static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
486e380bebeSMel Gorman {
487e380bebeSMel Gorman 	struct zone *zone = cc->zone;
488e380bebeSMel Gorman 
489e380bebeSMel Gorman 	/* Set for isolation rather than compaction */
490e380bebeSMel Gorman 	if (cc->no_set_skip_hint)
491e380bebeSMel Gorman 		return;
492e380bebeSMel Gorman 
4933c099a2bSKemeng Shi 	pfn = pageblock_end_pfn(pfn);
4943c099a2bSKemeng Shi 
495cf043a00SKemeng Shi 	/* Update where async and sync compaction should restart */
496e380bebeSMel Gorman 	if (pfn > zone->compact_cached_migrate_pfn[0])
497e380bebeSMel Gorman 		zone->compact_cached_migrate_pfn[0] = pfn;
498e380bebeSMel Gorman 	if (cc->mode != MIGRATE_ASYNC &&
499e380bebeSMel Gorman 	    pfn > zone->compact_cached_migrate_pfn[1])
500e380bebeSMel Gorman 		zone->compact_cached_migrate_pfn[1] = pfn;
501e380bebeSMel Gorman }
502e380bebeSMel Gorman 
503e380bebeSMel Gorman /*
504bb13ffebSMel Gorman  * If no pages were isolated then mark this pageblock to be skipped in the
50562997027SMel Gorman  * future. The information is later cleared by __reset_isolation_suitable().
506bb13ffebSMel Gorman  */
507c89511abSMel Gorman static void update_pageblock_skip(struct compact_control *cc,
508d097a6f6SMel Gorman 			struct page *page, unsigned long pfn)
509bb13ffebSMel Gorman {
510c89511abSMel Gorman 	struct zone *zone = cc->zone;
5116815bf3fSJoonsoo Kim 
5122583d671SVlastimil Babka 	if (cc->no_set_skip_hint)
5136815bf3fSJoonsoo Kim 		return;
5146815bf3fSJoonsoo Kim 
515bb13ffebSMel Gorman 	set_pageblock_skip(page);
516c89511abSMel Gorman 
51735979ef3SDavid Rientjes 	if (pfn < zone->compact_cached_free_pfn)
518c89511abSMel Gorman 		zone->compact_cached_free_pfn = pfn;
519c89511abSMel Gorman }
520bb13ffebSMel Gorman #else
521bb13ffebSMel Gorman static inline bool isolation_suitable(struct compact_control *cc,
522bb13ffebSMel Gorman 					struct page *page)
523bb13ffebSMel Gorman {
524bb13ffebSMel Gorman 	return true;
525bb13ffebSMel Gorman }
526bb13ffebSMel Gorman 
527b527cfe5SVlastimil Babka static inline bool pageblock_skip_persistent(struct page *page)
52821dc7e02SDavid Rientjes {
52921dc7e02SDavid Rientjes 	return false;
53021dc7e02SDavid Rientjes }
53121dc7e02SDavid Rientjes 
53221dc7e02SDavid Rientjes static inline void update_pageblock_skip(struct compact_control *cc,
533d097a6f6SMel Gorman 			struct page *page, unsigned long pfn)
534bb13ffebSMel Gorman {
535bb13ffebSMel Gorman }
536e380bebeSMel Gorman 
537e380bebeSMel Gorman static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
538e380bebeSMel Gorman {
539e380bebeSMel Gorman }
540e380bebeSMel Gorman 
541590ccea8SMel Gorman static bool test_and_set_skip(struct compact_control *cc, struct page *page)
542e380bebeSMel Gorman {
543e380bebeSMel Gorman 	return false;
544e380bebeSMel Gorman }
545bb13ffebSMel Gorman #endif /* CONFIG_COMPACTION */
546bb13ffebSMel Gorman 
5471f9efdefSVlastimil Babka /*
5488b44d279SVlastimil Babka  * Compaction requires the taking of some coarse locks that are potentially
549cb2dcaf0SMel Gorman  * very heavily contended. For async compaction, trylock and record if the
550cb2dcaf0SMel Gorman  * lock is contended. The lock will still be acquired but compaction will
551cb2dcaf0SMel Gorman  * abort when the current block is finished regardless of success rate.
552cb2dcaf0SMel Gorman  * Sync compaction acquires the lock.
5538b44d279SVlastimil Babka  *
554cb2dcaf0SMel Gorman  * Always returns true which makes it easier to track lock state in callers.
5551f9efdefSVlastimil Babka  */
556cb2dcaf0SMel Gorman static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
5578b44d279SVlastimil Babka 						struct compact_control *cc)
55877337edeSJules Irenge 	__acquires(lock)
5598b44d279SVlastimil Babka {
560cb2dcaf0SMel Gorman 	/* Track if the lock is contended in async mode */
561cb2dcaf0SMel Gorman 	if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
562cb2dcaf0SMel Gorman 		if (spin_trylock_irqsave(lock, *flags))
563cb2dcaf0SMel Gorman 			return true;
564cb2dcaf0SMel Gorman 
565c3486f53SVlastimil Babka 		cc->contended = true;
5668b44d279SVlastimil Babka 	}
5671f9efdefSVlastimil Babka 
568cb2dcaf0SMel Gorman 	spin_lock_irqsave(lock, *flags);
5698b44d279SVlastimil Babka 	return true;
5702a1402aaSMel Gorman }
5712a1402aaSMel Gorman 
57285aa125fSMichal Nazarewicz /*
573c67fe375SMel Gorman  * Compaction requires the taking of some coarse locks that are potentially
5748b44d279SVlastimil Babka  * very heavily contended. The lock should be periodically unlocked to avoid
5758b44d279SVlastimil Babka  * having disabled IRQs for a long time, even when there is nobody waiting on
5768b44d279SVlastimil Babka  * the lock. It might also be that allowing the IRQs will result in
577d56c1584SMiaohe Lin  * need_resched() becoming true. If scheduling is needed, compaction schedules.
5788b44d279SVlastimil Babka  * Either compaction type will also abort if a fatal signal is pending.
5798b44d279SVlastimil Babka  * In either case if the lock was locked, it is dropped and not regained.
580c67fe375SMel Gorman  *
581d56c1584SMiaohe Lin  * Returns true if compaction should abort due to fatal signal pending.
582d56c1584SMiaohe Lin  * Returns false when compaction can continue.
583c67fe375SMel Gorman  */
5848b44d279SVlastimil Babka static bool compact_unlock_should_abort(spinlock_t *lock,
5858b44d279SVlastimil Babka 		unsigned long flags, bool *locked, struct compact_control *cc)
586c67fe375SMel Gorman {
5878b44d279SVlastimil Babka 	if (*locked) {
5888b44d279SVlastimil Babka 		spin_unlock_irqrestore(lock, flags);
5898b44d279SVlastimil Babka 		*locked = false;
590c67fe375SMel Gorman 	}
591c67fe375SMel Gorman 
5928b44d279SVlastimil Babka 	if (fatal_signal_pending(current)) {
593c3486f53SVlastimil Babka 		cc->contended = true;
5948b44d279SVlastimil Babka 		return true;
5958b44d279SVlastimil Babka 	}
5968b44d279SVlastimil Babka 
597cf66f070SMel Gorman 	cond_resched();
598be976572SVlastimil Babka 
599be976572SVlastimil Babka 	return false;
600be976572SVlastimil Babka }
601be976572SVlastimil Babka 
602c67fe375SMel Gorman /*
6039e4be470SJerome Marchand  * Isolate free pages onto a private freelist. If @strict is true, abort and
6049e4be470SJerome Marchand  * return 0 on any invalid PFNs or non-free pages inside the pageblock
6059e4be470SJerome Marchand  * (even though it may still end up isolating some pages).
60685aa125fSMichal Nazarewicz  */
607f40d1e42SMel Gorman static unsigned long isolate_freepages_block(struct compact_control *cc,
608e14c720eSVlastimil Babka 				unsigned long *start_pfn,
60985aa125fSMichal Nazarewicz 				unsigned long end_pfn,
61085aa125fSMichal Nazarewicz 				struct list_head *freelist,
6114fca9730SMel Gorman 				unsigned int stride,
61285aa125fSMichal Nazarewicz 				bool strict)
613748446bbSMel Gorman {
614b7aba698SMel Gorman 	int nr_scanned = 0, total_isolated = 0;
615dc13292cSKemeng Shi 	struct page *page;
616b8b2d825SXiubo Li 	unsigned long flags = 0;
617f40d1e42SMel Gorman 	bool locked = false;
618e14c720eSVlastimil Babka 	unsigned long blockpfn = *start_pfn;
61966c64223SJoonsoo Kim 	unsigned int order;
620748446bbSMel Gorman 
6214fca9730SMel Gorman 	/* Strict mode is for isolation, speed is secondary */
6224fca9730SMel Gorman 	if (strict)
6234fca9730SMel Gorman 		stride = 1;
6244fca9730SMel Gorman 
625dc13292cSKemeng Shi 	page = pfn_to_page(blockpfn);
626748446bbSMel Gorman 
627f40d1e42SMel Gorman 	/* Isolate free pages. */
628dc13292cSKemeng Shi 	for (; blockpfn < end_pfn; blockpfn += stride, page += stride) {
62966c64223SJoonsoo Kim 		int isolated;
630748446bbSMel Gorman 
6318b44d279SVlastimil Babka 		/*
6328b44d279SVlastimil Babka 		 * Periodically drop the lock (if held) regardless of its
6338b44d279SVlastimil Babka 		 * contention, to give chance to IRQs. Abort if fatal signal
634d56c1584SMiaohe Lin 		 * pending.
6358b44d279SVlastimil Babka 		 */
636c036ddffSMiaohe Lin 		if (!(blockpfn % COMPACT_CLUSTER_MAX)
6378b44d279SVlastimil Babka 		    && compact_unlock_should_abort(&cc->zone->lock, flags,
6388b44d279SVlastimil Babka 								&locked, cc))
6398b44d279SVlastimil Babka 			break;
6408b44d279SVlastimil Babka 
641b7aba698SMel Gorman 		nr_scanned++;
6422af120bcSLaura Abbott 
6439fcd6d2eSVlastimil Babka 		/*
6449fcd6d2eSVlastimil Babka 		 * For compound pages such as THP and hugetlbfs, we can save
6459fcd6d2eSVlastimil Babka 		 * potentially a lot of iterations if we skip them at once.
6469fcd6d2eSVlastimil Babka 		 * The check is racy, but we can consider only valid values
6479fcd6d2eSVlastimil Babka 		 * and the only danger is skipping too much.
6489fcd6d2eSVlastimil Babka 		 */
6499fcd6d2eSVlastimil Babka 		if (PageCompound(page)) {
65021dc7e02SDavid Rientjes 			const unsigned int order = compound_order(page);
6519fcd6d2eSVlastimil Babka 
6523da0272aSKemeng Shi 			if (blockpfn + (1UL << order) <= end_pfn) {
65321dc7e02SDavid Rientjes 				blockpfn += (1UL << order) - 1;
654dc13292cSKemeng Shi 				page += (1UL << order) - 1;
65556d48d8dSBaolin Wang 				nr_scanned += (1UL << order) - 1;
6569fcd6d2eSVlastimil Babka 			}
6573da0272aSKemeng Shi 
6589fcd6d2eSVlastimil Babka 			goto isolate_fail;
6599fcd6d2eSVlastimil Babka 		}
6609fcd6d2eSVlastimil Babka 
661f40d1e42SMel Gorman 		if (!PageBuddy(page))
6622af120bcSLaura Abbott 			goto isolate_fail;
663f40d1e42SMel Gorman 
66485f73e6dSMiaohe Lin 		/* If we already hold the lock, we can skip some rechecking. */
66569b7189fSVlastimil Babka 		if (!locked) {
666cb2dcaf0SMel Gorman 			locked = compact_lock_irqsave(&cc->zone->lock,
6678b44d279SVlastimil Babka 								&flags, cc);
668f40d1e42SMel Gorman 
669f40d1e42SMel Gorman 			/* Recheck this is a buddy page under lock */
670f40d1e42SMel Gorman 			if (!PageBuddy(page))
6712af120bcSLaura Abbott 				goto isolate_fail;
67269b7189fSVlastimil Babka 		}
673748446bbSMel Gorman 
67466c64223SJoonsoo Kim 		/* Found a free page, will break it into order-0 pages */
675ab130f91SMatthew Wilcox (Oracle) 		order = buddy_order(page);
67666c64223SJoonsoo Kim 		isolated = __isolate_free_page(page, order);
677a4f04f2cSDavid Rientjes 		if (!isolated)
678a4f04f2cSDavid Rientjes 			break;
67966c64223SJoonsoo Kim 		set_page_private(page, order);
680a4f04f2cSDavid Rientjes 
681b717d6b9SWilliam Lam 		nr_scanned += isolated - 1;
682748446bbSMel Gorman 		total_isolated += isolated;
683a4f04f2cSDavid Rientjes 		cc->nr_freepages += isolated;
684733aea0bSZi Yan 		list_add_tail(&page->lru, &freelist[order]);
68566c64223SJoonsoo Kim 
686a4f04f2cSDavid Rientjes 		if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
687932ff6bbSJoonsoo Kim 			blockpfn += isolated;
688932ff6bbSJoonsoo Kim 			break;
689932ff6bbSJoonsoo Kim 		}
690a4f04f2cSDavid Rientjes 		/* Advance to the end of split page */
691748446bbSMel Gorman 		blockpfn += isolated - 1;
692dc13292cSKemeng Shi 		page += isolated - 1;
6932af120bcSLaura Abbott 		continue;
6942af120bcSLaura Abbott 
6952af120bcSLaura Abbott isolate_fail:
6962af120bcSLaura Abbott 		if (strict)
6972af120bcSLaura Abbott 			break;
6982af120bcSLaura Abbott 
699748446bbSMel Gorman 	}
700748446bbSMel Gorman 
701a4f04f2cSDavid Rientjes 	if (locked)
702a4f04f2cSDavid Rientjes 		spin_unlock_irqrestore(&cc->zone->lock, flags);
703a4f04f2cSDavid Rientjes 
7049fcd6d2eSVlastimil Babka 	/*
7053da0272aSKemeng Shi 	 * Be careful to not go outside of the pageblock.
7069fcd6d2eSVlastimil Babka 	 */
7079fcd6d2eSVlastimil Babka 	if (unlikely(blockpfn > end_pfn))
7089fcd6d2eSVlastimil Babka 		blockpfn = end_pfn;
7099fcd6d2eSVlastimil Babka 
710e34d85f0SJoonsoo Kim 	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
711e34d85f0SJoonsoo Kim 					nr_scanned, total_isolated);
712e34d85f0SJoonsoo Kim 
713e14c720eSVlastimil Babka 	/* Record how far we have got within the block */
714e14c720eSVlastimil Babka 	*start_pfn = blockpfn;
715e14c720eSVlastimil Babka 
716f40d1e42SMel Gorman 	/*
717f40d1e42SMel Gorman 	 * If strict isolation is requested by CMA then check that all the
718f40d1e42SMel Gorman 	 * pages requested were isolated. If there were any failures, 0 is
719f40d1e42SMel Gorman 	 * returned and CMA will fail.
720f40d1e42SMel Gorman 	 */
7212af120bcSLaura Abbott 	if (strict && blockpfn < end_pfn)
722f40d1e42SMel Gorman 		total_isolated = 0;
723f40d1e42SMel Gorman 
7247f354a54SDavid Rientjes 	cc->total_free_scanned += nr_scanned;
725397487dbSMel Gorman 	if (total_isolated)
726010fc29aSMinchan Kim 		count_compact_events(COMPACTISOLATED, total_isolated);
727748446bbSMel Gorman 	return total_isolated;
728748446bbSMel Gorman }
729748446bbSMel Gorman 
73085aa125fSMichal Nazarewicz /**
73185aa125fSMichal Nazarewicz  * isolate_freepages_range() - isolate free pages.
732e8b098fcSMike Rapoport  * @cc:        Compaction control structure.
73385aa125fSMichal Nazarewicz  * @start_pfn: The first PFN to start isolating.
73485aa125fSMichal Nazarewicz  * @end_pfn:   The one-past-last PFN.
73585aa125fSMichal Nazarewicz  *
73685aa125fSMichal Nazarewicz  * Non-free pages, invalid PFNs, or zone boundaries within the
73785aa125fSMichal Nazarewicz  * [start_pfn, end_pfn) range are considered errors and cause the function to
73885aa125fSMichal Nazarewicz  * undo its actions and return zero.
73985aa125fSMichal Nazarewicz  *
74085aa125fSMichal Nazarewicz  * Otherwise, the function returns the one-past-the-last PFN of the isolated
74185aa125fSMichal Nazarewicz  * pages (which may be greater than end_pfn if the end fell in the middle of
74285aa125fSMichal Nazarewicz  * a free page).
74385aa125fSMichal Nazarewicz  */
744ff9543fdSMichal Nazarewicz unsigned long
745bb13ffebSMel Gorman isolate_freepages_range(struct compact_control *cc,
746bb13ffebSMel Gorman 			unsigned long start_pfn, unsigned long end_pfn)
74785aa125fSMichal Nazarewicz {
748e1409c32SJoonsoo Kim 	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
749733aea0bSZi Yan 	int order;
750733aea0bSZi Yan 	struct list_head tmp_freepages[NR_PAGE_ORDERS];
751733aea0bSZi Yan 
752733aea0bSZi Yan 	for (order = 0; order < NR_PAGE_ORDERS; order++)
753733aea0bSZi Yan 		INIT_LIST_HEAD(&tmp_freepages[order]);
75485aa125fSMichal Nazarewicz 
7557d49d886SVlastimil Babka 	pfn = start_pfn;
75606b6640aSVlastimil Babka 	block_start_pfn = pageblock_start_pfn(pfn);
757e1409c32SJoonsoo Kim 	if (block_start_pfn < cc->zone->zone_start_pfn)
758e1409c32SJoonsoo Kim 		block_start_pfn = cc->zone->zone_start_pfn;
75906b6640aSVlastimil Babka 	block_end_pfn = pageblock_end_pfn(pfn);
7607d49d886SVlastimil Babka 
7617d49d886SVlastimil Babka 	for (; pfn < end_pfn; pfn += isolated,
762e1409c32SJoonsoo Kim 				block_start_pfn = block_end_pfn,
7637d49d886SVlastimil Babka 				block_end_pfn += pageblock_nr_pages) {
764e14c720eSVlastimil Babka 		/* Protect pfn from changing by isolate_freepages_block */
765e14c720eSVlastimil Babka 		unsigned long isolate_start_pfn = pfn;
7667d49d886SVlastimil Babka 
76758420016SJoonsoo Kim 		/*
76858420016SJoonsoo Kim 		 * pfn could pass the block_end_pfn if isolated freepage
76958420016SJoonsoo Kim 		 * is more than pageblock order. In this case, we adjust
77058420016SJoonsoo Kim 		 * scanning range to right one.
77158420016SJoonsoo Kim 		 */
77258420016SJoonsoo Kim 		if (pfn >= block_end_pfn) {
77306b6640aSVlastimil Babka 			block_start_pfn = pageblock_start_pfn(pfn);
77406b6640aSVlastimil Babka 			block_end_pfn = pageblock_end_pfn(pfn);
77558420016SJoonsoo Kim 		}
77658420016SJoonsoo Kim 
777a2864a67SKemeng Shi 		block_end_pfn = min(block_end_pfn, end_pfn);
778a2864a67SKemeng Shi 
779e1409c32SJoonsoo Kim 		if (!pageblock_pfn_to_page(block_start_pfn,
780e1409c32SJoonsoo Kim 					block_end_pfn, cc->zone))
7817d49d886SVlastimil Babka 			break;
7827d49d886SVlastimil Babka 
783e14c720eSVlastimil Babka 		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
784733aea0bSZi Yan 					block_end_pfn, tmp_freepages, 0, true);
78585aa125fSMichal Nazarewicz 
78685aa125fSMichal Nazarewicz 		/*
78785aa125fSMichal Nazarewicz 		 * In strict mode, isolate_freepages_block() returns 0 if
78885aa125fSMichal Nazarewicz 		 * there are any holes in the block (ie. invalid PFNs or
78985aa125fSMichal Nazarewicz 		 * non-free pages).
79085aa125fSMichal Nazarewicz 		 */
79185aa125fSMichal Nazarewicz 		if (!isolated)
79285aa125fSMichal Nazarewicz 			break;
79385aa125fSMichal Nazarewicz 
79485aa125fSMichal Nazarewicz 		/*
79585aa125fSMichal Nazarewicz 		 * If we managed to isolate pages, it is always (1 << n) *
79685aa125fSMichal Nazarewicz 		 * pageblock_nr_pages for some non-negative n.  (Max order
79785aa125fSMichal Nazarewicz 		 * page may span two pageblocks).
79885aa125fSMichal Nazarewicz 		 */
79985aa125fSMichal Nazarewicz 	}
80085aa125fSMichal Nazarewicz 
80185aa125fSMichal Nazarewicz 	if (pfn < end_pfn) {
80285aa125fSMichal Nazarewicz 		/* Loop terminated early, cleanup. */
803733aea0bSZi Yan 		release_free_list(tmp_freepages);
80485aa125fSMichal Nazarewicz 		return 0;
80585aa125fSMichal Nazarewicz 	}
80685aa125fSMichal Nazarewicz 
807733aea0bSZi Yan 	/* __isolate_free_page() does not map the pages */
808733aea0bSZi Yan 	split_map_pages(tmp_freepages);
809733aea0bSZi Yan 
81085aa125fSMichal Nazarewicz 	/* We don't use freelists for anything. */
81185aa125fSMichal Nazarewicz 	return pfn;
81285aa125fSMichal Nazarewicz }
81385aa125fSMichal Nazarewicz 
814748446bbSMel Gorman /* Similar to reclaim, but different enough that they don't share logic */
8154fbbb3fdSJohannes Weiner static bool too_many_isolated(struct compact_control *cc)
816748446bbSMel Gorman {
8174fbbb3fdSJohannes Weiner 	pg_data_t *pgdat = cc->zone->zone_pgdat;
818d818fca1SMel Gorman 	bool too_many;
819d818fca1SMel Gorman 
820bc693045SMinchan Kim 	unsigned long active, inactive, isolated;
821748446bbSMel Gorman 
8225f438eeeSAndrey Ryabinin 	inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
8235f438eeeSAndrey Ryabinin 			node_page_state(pgdat, NR_INACTIVE_ANON);
8245f438eeeSAndrey Ryabinin 	active = node_page_state(pgdat, NR_ACTIVE_FILE) +
8255f438eeeSAndrey Ryabinin 			node_page_state(pgdat, NR_ACTIVE_ANON);
8265f438eeeSAndrey Ryabinin 	isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
8275f438eeeSAndrey Ryabinin 			node_page_state(pgdat, NR_ISOLATED_ANON);
828748446bbSMel Gorman 
8294fbbb3fdSJohannes Weiner 	/*
8304fbbb3fdSJohannes Weiner 	 * Allow GFP_NOFS to isolate past the limit set for regular
8314fbbb3fdSJohannes Weiner 	 * compaction runs. This prevents an ABBA deadlock when other
8324fbbb3fdSJohannes Weiner 	 * compactors have already isolated to the limit, but are
8334fbbb3fdSJohannes Weiner 	 * blocked on filesystem locks held by the GFP_NOFS thread.
8344fbbb3fdSJohannes Weiner 	 */
8354fbbb3fdSJohannes Weiner 	if (cc->gfp_mask & __GFP_FS) {
8364fbbb3fdSJohannes Weiner 		inactive >>= 3;
8374fbbb3fdSJohannes Weiner 		active >>= 3;
8384fbbb3fdSJohannes Weiner 	}
8394fbbb3fdSJohannes Weiner 
840d818fca1SMel Gorman 	too_many = isolated > (inactive + active) / 2;
841d818fca1SMel Gorman 	if (!too_many)
842d818fca1SMel Gorman 		wake_throttle_isolated(pgdat);
843d818fca1SMel Gorman 
844d818fca1SMel Gorman 	return too_many;
845748446bbSMel Gorman }
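/*
 * Illustrative numbers: with 6000 inactive and 2000 active file+anon pages
 * on the node, a __GFP_FS compactor is throttled once more than
 * ((6000 >> 3) + (2000 >> 3)) / 2 == 500 pages sit isolated, while a
 * GFP_NOFS compactor may isolate up to (6000 + 2000) / 2 == 4000 before
 * backing off, which is what lets it make progress past FS-holding peers.
 */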
846748446bbSMel Gorman 
8472fe86e00SMichal Nazarewicz /**
848ee6f62fdSZi Yan  * skip_isolation_on_order() - determine when to skip folio isolation based on
849ee6f62fdSZi Yan  *			       folio order and compaction target order
850ee6f62fdSZi Yan  * @order:		to-be-isolated folio order
851ee6f62fdSZi Yan  * @target_order:	compaction target order
852ee6f62fdSZi Yan  *
853ee6f62fdSZi Yan  * This avoids unnecessary folio isolations during compaction.
854ee6f62fdSZi Yan  */
855ee6f62fdSZi Yan static bool skip_isolation_on_order(int order, int target_order)
856ee6f62fdSZi Yan {
857ee6f62fdSZi Yan 	/*
858ee6f62fdSZi Yan 	 * Unless we are performing global compaction (i.e.,
859ee6f62fdSZi Yan 	 * is_via_compact_memory), skip any folio whose order is at least the
860ee6f62fdSZi Yan 	 * target order: we wouldn't be here if we had a free folio of the
861ee6f62fdSZi Yan 	 * desired target_order, so migrating this folio would likely fail
862ee6f62fdSZi Yan 	 * later.
863ee6f62fdSZi Yan 	 */
864ee6f62fdSZi Yan 	if (!is_via_compact_memory(target_order) && order >= target_order)
865ee6f62fdSZi Yan 		return true;
866ee6f62fdSZi Yan 	/*
867ee6f62fdSZi Yan 	 * We limit memory compaction to pageblocks and won't try
868ee6f62fdSZi Yan 	 * creating free blocks of memory that are larger than that.
869ee6f62fdSZi Yan 	 */
870ee6f62fdSZi Yan 	return order >= pageblock_order;
871ee6f62fdSZi Yan }
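/*
 * Example (assuming pageblock_order == 9): for a direct request with
 * target_order == 4, folios of order 4 or larger are skipped, since a free
 * block of that size would already satisfy the allocation; for global
 * compaction (target_order == -1) only folios of order >= 9 are skipped,
 * because compaction never builds anything larger than a pageblock.
 */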
872ee6f62fdSZi Yan 
873ee6f62fdSZi Yan /**
874edc2ca61SVlastimil Babka  * isolate_migratepages_block() - isolate all migrate-able pages within
875edc2ca61SVlastimil Babka  *				  a single pageblock
8762fe86e00SMichal Nazarewicz  * @cc:		Compaction control structure.
877edc2ca61SVlastimil Babka  * @low_pfn:	The first PFN to isolate
878edc2ca61SVlastimil Babka  * @end_pfn:	The one-past-the-last PFN to isolate, within same pageblock
87989f6c88aSHugh Dickins  * @mode:	Isolation mode to be used.
8802fe86e00SMichal Nazarewicz  *
8812fe86e00SMichal Nazarewicz  * Isolate all pages that can be migrated from the range specified by
882edc2ca61SVlastimil Babka  * [low_pfn, end_pfn). The range is expected to be within same pageblock.
883c2ad7a1fSOscar Salvador  * Returns an errno, like -EAGAIN or -EINTR, in case of e.g. a pending signal or congestion,
884369fa227SOscar Salvador  * -ENOMEM in case we could not allocate a page, or 0.
885c2ad7a1fSOscar Salvador  * cc->migrate_pfn will contain the next pfn to scan.
8862fe86e00SMichal Nazarewicz  *
887edc2ca61SVlastimil Babka  * The pages are isolated on cc->migratepages list (not required to be empty),
888c2ad7a1fSOscar Salvador  * and cc->nr_migratepages is updated accordingly.
889748446bbSMel Gorman  */
890c2ad7a1fSOscar Salvador static int
891edc2ca61SVlastimil Babka isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
89289f6c88aSHugh Dickins 			unsigned long end_pfn, isolate_mode_t mode)
893748446bbSMel Gorman {
8945f438eeeSAndrey Ryabinin 	pg_data_t *pgdat = cc->zone->zone_pgdat;
895b7aba698SMel Gorman 	unsigned long nr_scanned = 0, nr_isolated = 0;
896fa9add64SHugh Dickins 	struct lruvec *lruvec;
897b8b2d825SXiubo Li 	unsigned long flags = 0;
8986168d0daSAlex Shi 	struct lruvec *locked = NULL;
89956ae0bb3SKefeng Wang 	struct folio *folio = NULL;
900bb13ffebSMel Gorman 	struct page *page = NULL, *valid_page = NULL;
90189f6c88aSHugh Dickins 	struct address_space *mapping;
902e34d85f0SJoonsoo Kim 	unsigned long start_pfn = low_pfn;
903fdd048e1SVlastimil Babka 	bool skip_on_failure = false;
904fdd048e1SVlastimil Babka 	unsigned long next_skip_pfn = 0;
905e380bebeSMel Gorman 	bool skip_updated = false;
906c2ad7a1fSOscar Salvador 	int ret = 0;
907c2ad7a1fSOscar Salvador 
908c2ad7a1fSOscar Salvador 	cc->migrate_pfn = low_pfn;
909748446bbSMel Gorman 
910748446bbSMel Gorman 	/*
911748446bbSMel Gorman 	 * Ensure that there are not too many pages isolated from the LRU
912748446bbSMel Gorman 	 * list by either parallel reclaimers or compaction. If there are,
913748446bbSMel Gorman 	 * delay for some time until fewer pages are isolated
914748446bbSMel Gorman 	 */
9154fbbb3fdSJohannes Weiner 	while (unlikely(too_many_isolated(cc))) {
916d20bdd57SZi Yan 		/* stop isolation if there are still pages not migrated */
917d20bdd57SZi Yan 		if (cc->nr_migratepages)
918c2ad7a1fSOscar Salvador 			return -EAGAIN;
919d20bdd57SZi Yan 
920f9e35b3bSMel Gorman 		/* async migration should just abort */
921e0b9daebSDavid Rientjes 		if (cc->mode == MIGRATE_ASYNC)
922c2ad7a1fSOscar Salvador 			return -EAGAIN;
923f9e35b3bSMel Gorman 
924c3f4a9a2SMel Gorman 		reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
925748446bbSMel Gorman 
926748446bbSMel Gorman 		if (fatal_signal_pending(current))
927c2ad7a1fSOscar Salvador 			return -EINTR;
928748446bbSMel Gorman 	}
929748446bbSMel Gorman 
930cf66f070SMel Gorman 	cond_resched();
931aeef4b83SDavid Rientjes 
932fdd048e1SVlastimil Babka 	if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
933fdd048e1SVlastimil Babka 		skip_on_failure = true;
934fdd048e1SVlastimil Babka 		next_skip_pfn = block_end_pfn(low_pfn, cc->order);
935fdd048e1SVlastimil Babka 	}
936fdd048e1SVlastimil Babka 
937748446bbSMel Gorman 	/* Time to isolate some pages for migration */
938748446bbSMel Gorman 	for (; low_pfn < end_pfn; low_pfn++) {
9390003e2a4SSean Christopherson 		bool is_dirty, is_unevictable;
94029c0dde8SVlastimil Babka 
941fdd048e1SVlastimil Babka 		if (skip_on_failure && low_pfn >= next_skip_pfn) {
942fdd048e1SVlastimil Babka 			/*
943fdd048e1SVlastimil Babka 			 * We have isolated all migration candidates in the
944fdd048e1SVlastimil Babka 			 * previous order-aligned block, and did not skip it due
945fdd048e1SVlastimil Babka 			 * to failure. We should migrate the pages now and
946fdd048e1SVlastimil Babka 			 * hopefully succeed compaction.
947fdd048e1SVlastimil Babka 			 */
948fdd048e1SVlastimil Babka 			if (nr_isolated)
949fdd048e1SVlastimil Babka 				break;
950fdd048e1SVlastimil Babka 
951fdd048e1SVlastimil Babka 			/*
952fdd048e1SVlastimil Babka 			 * We failed to isolate in the previous order-aligned
953fdd048e1SVlastimil Babka 			 * block. Set the new boundary to the end of the
954fdd048e1SVlastimil Babka 			 * current block. Note we can't simply increase
955fdd048e1SVlastimil Babka 			 * next_skip_pfn by 1 << order, as low_pfn might have
956fdd048e1SVlastimil Babka 			 * been incremented by a higher number due to skipping
957fdd048e1SVlastimil Babka 			 * a compound or a high-order buddy page in the
958fdd048e1SVlastimil Babka 			 * previous loop iteration.
959fdd048e1SVlastimil Babka 			 */
960fdd048e1SVlastimil Babka 			next_skip_pfn = block_end_pfn(low_pfn, cc->order);
961fdd048e1SVlastimil Babka 		}
962fdd048e1SVlastimil Babka 
9638b44d279SVlastimil Babka 		/*
9648b44d279SVlastimil Babka 		 * Periodically drop the lock (if held) regardless of its
965670105a2SMel Gorman 		 * contention, to give chance to IRQs. Abort completely if
966670105a2SMel Gorman 		 * a fatal signal is pending.
9678b44d279SVlastimil Babka 		 */
968c036ddffSMiaohe Lin 		if (!(low_pfn % COMPACT_CLUSTER_MAX)) {
9696168d0daSAlex Shi 			if (locked) {
9706168d0daSAlex Shi 				unlock_page_lruvec_irqrestore(locked, flags);
9716168d0daSAlex Shi 				locked = NULL;
9726168d0daSAlex Shi 			}
9736168d0daSAlex Shi 
9746168d0daSAlex Shi 			if (fatal_signal_pending(current)) {
9756168d0daSAlex Shi 				cc->contended = true;
976c2ad7a1fSOscar Salvador 				ret = -EINTR;
9776168d0daSAlex Shi 
978670105a2SMel Gorman 				goto fatal_pending;
979670105a2SMel Gorman 			}
980b2eef8c0SAndrea Arcangeli 
9816168d0daSAlex Shi 			cond_resched();
9826168d0daSAlex Shi 		}
9836168d0daSAlex Shi 
984b7aba698SMel Gorman 		nr_scanned++;
985748446bbSMel Gorman 
986748446bbSMel Gorman 		page = pfn_to_page(low_pfn);
987dc908600SMel Gorman 
988e380bebeSMel Gorman 		/*
989e380bebeSMel Gorman 		 * Check if the pageblock has already been marked skipped.
990493614daSJohannes Weiner 		 * Only the first PFN is checked as the caller isolates
991e380bebeSMel Gorman 		 * COMPACT_CLUSTER_MAX at a time so the second call must
992e380bebeSMel Gorman 		 * not falsely conclude that the block should be skipped.
993e380bebeSMel Gorman 		 */
994493614daSJohannes Weiner 		if (!valid_page && (pageblock_aligned(low_pfn) ||
995493614daSJohannes Weiner 				    low_pfn == cc->zone->zone_start_pfn)) {
9964af12d04SMiaohe Lin 			if (!isolation_suitable(cc, page)) {
997e380bebeSMel Gorman 				low_pfn = end_pfn;
99856ae0bb3SKefeng Wang 				folio = NULL;
999e380bebeSMel Gorman 				goto isolate_abort;
1000e380bebeSMel Gorman 			}
1001bb13ffebSMel Gorman 			valid_page = page;
1002e380bebeSMel Gorman 		}
1003bb13ffebSMel Gorman 
1004ee6f62fdSZi Yan 		if (PageHuge(page)) {
1005ee6f62fdSZi Yan 			/*
1006ee6f62fdSZi Yan 			 * skip hugetlbfs if we are not compacting for pages
1007ee6f62fdSZi Yan 			 * bigger than its order. THPs and other compound pages
1008ee6f62fdSZi Yan 			 * are handled below.
1009ee6f62fdSZi Yan 			 */
1010ee6f62fdSZi Yan 			if (!cc->alloc_contig) {
1011ee6f62fdSZi Yan 				const unsigned int order = compound_order(page);
1012ee6f62fdSZi Yan 
1013ee6f62fdSZi Yan 				if (order <= MAX_PAGE_ORDER) {
1014ee6f62fdSZi Yan 					low_pfn += (1UL << order) - 1;
1015ee6f62fdSZi Yan 					nr_scanned += (1UL << order) - 1;
1016ee6f62fdSZi Yan 				}
1017ee6f62fdSZi Yan 				goto isolate_fail;
1018ee6f62fdSZi Yan 			}
1019ee6f62fdSZi Yan 			/* for alloc_contig case */
10201c06b6a5SBaolin Wang 			if (locked) {
10211c06b6a5SBaolin Wang 				unlock_page_lruvec_irqrestore(locked, flags);
10221c06b6a5SBaolin Wang 				locked = NULL;
10231c06b6a5SBaolin Wang 			}
10241c06b6a5SBaolin Wang 
1025ae37c7ffSOscar Salvador 			ret = isolate_or_dissolve_huge_page(page, &cc->migratepages);
1026369fa227SOscar Salvador 
1027369fa227SOscar Salvador 			/*
1028369fa227SOscar Salvador 			 * Fail isolation in case isolate_or_dissolve_huge_page()
1029369fa227SOscar Salvador 			 * reports an error. In case of -ENOMEM, abort right away.
1030369fa227SOscar Salvador 			 */
1031369fa227SOscar Salvador 			if (ret < 0) {
1032369fa227SOscar Salvador 				 /* Do not report -EBUSY down the chain */
1033369fa227SOscar Salvador 				if (ret == -EBUSY)
1034369fa227SOscar Salvador 					ret = 0;
103566fe1cf7SMiaohe Lin 				low_pfn += compound_nr(page) - 1;
103656d48d8dSBaolin Wang 				nr_scanned += compound_nr(page) - 1;
1037369fa227SOscar Salvador 				goto isolate_fail;
1038369fa227SOscar Salvador 			}
1039369fa227SOscar Salvador 
1040ae37c7ffSOscar Salvador 			if (PageHuge(page)) {
1041ae37c7ffSOscar Salvador 				/*
1042ae37c7ffSOscar Salvador 				 * Hugepage was successfully isolated and placed
1043ae37c7ffSOscar Salvador 				 * on the cc->migratepages list.
1044ae37c7ffSOscar Salvador 				 */
104556ae0bb3SKefeng Wang 				folio = page_folio(page);
104656ae0bb3SKefeng Wang 				low_pfn += folio_nr_pages(folio) - 1;
1047ae37c7ffSOscar Salvador 				goto isolate_success_no_list;
1048ae37c7ffSOscar Salvador 			}
1049ae37c7ffSOscar Salvador 
1050369fa227SOscar Salvador 			/*
1051369fa227SOscar Salvador 			 * Ok, the hugepage was dissolved. Now these pages are
1052369fa227SOscar Salvador 			 * Buddy and cannot be re-allocated because they are
1053369fa227SOscar Salvador 			 * isolated. Fall-through as the check below handles
1054369fa227SOscar Salvador 			 * Buddy pages.
1055369fa227SOscar Salvador 			 */
1056369fa227SOscar Salvador 		}
1057369fa227SOscar Salvador 
1058c122b208SJoonsoo Kim 		/*
105999c0fd5eSVlastimil Babka 		 * Skip if free. We read page order here without zone lock
106099c0fd5eSVlastimil Babka 		 * which is generally unsafe, but the race window is small and
106199c0fd5eSVlastimil Babka 		 * the worst thing that can happen is that we skip some
106299c0fd5eSVlastimil Babka 		 * potential isolation targets.
10636c14466cSMel Gorman 		 */
106499c0fd5eSVlastimil Babka 		if (PageBuddy(page)) {
1065ab130f91SMatthew Wilcox (Oracle) 			unsigned long freepage_order = buddy_order_unsafe(page);
106699c0fd5eSVlastimil Babka 
106799c0fd5eSVlastimil Babka 			/*
106899c0fd5eSVlastimil Babka 			 * Without lock, we cannot be sure that what we got is
106999c0fd5eSVlastimil Babka 			 * a valid page order. Consider only values in the
107099c0fd5eSVlastimil Babka 			 * valid order range to prevent low_pfn overflow.
107199c0fd5eSVlastimil Babka 			 */
10725e0a760bSKirill A. Shutemov 			if (freepage_order > 0 && freepage_order <= MAX_PAGE_ORDER) {
107399c0fd5eSVlastimil Babka 				low_pfn += (1UL << freepage_order) - 1;
107456d48d8dSBaolin Wang 				nr_scanned += (1UL << freepage_order) - 1;
107556d48d8dSBaolin Wang 			}
1076748446bbSMel Gorman 			continue;
107799c0fd5eSVlastimil Babka 		}
1078748446bbSMel Gorman 
10799927af74SMel Gorman 		/*
1080ee6f62fdSZi Yan 		 * Regardless of being on LRU, compound pages such as THP
1081ee6f62fdSZi Yan 		 * (hugetlbfs is handled above) are not to be compacted unless
1082ee6f62fdSZi Yan 		 * we are attempting an allocation larger than the compound
1083ee6f62fdSZi Yan 		 * page size. We can potentially save a lot of iterations if we
1084ee6f62fdSZi Yan 		 * skip them at once. The check is racy, but we can consider
1085ee6f62fdSZi Yan 		 * only valid values and the only danger is skipping too much.
1086bc835011SAndrea Arcangeli 		 */
10871da2f328SRik van Riel 		if (PageCompound(page) && !cc->alloc_contig) {
108821dc7e02SDavid Rientjes 			const unsigned int order = compound_order(page);
108929c0dde8SVlastimil Babka 
1090ee6f62fdSZi Yan 			/* Skip based on page order and compaction target order. */
1091ee6f62fdSZi Yan 			if (skip_isolation_on_order(order, cc->order)) {
1092ee6f62fdSZi Yan 				if (order <= MAX_PAGE_ORDER) {
109321dc7e02SDavid Rientjes 					low_pfn += (1UL << order) - 1;
109456d48d8dSBaolin Wang 					nr_scanned += (1UL << order) - 1;
109556d48d8dSBaolin Wang 				}
1096fdd048e1SVlastimil Babka 				goto isolate_fail;
10972a1402aaSMel Gorman 			}
1098ee6f62fdSZi Yan 		}
10992a1402aaSMel Gorman 
1100bda807d4SMinchan Kim 		/*
1101bda807d4SMinchan Kim 		 * Check may be lockless but that's ok as we recheck later.
1102bda807d4SMinchan Kim 		 * It's possible to migrate LRU and non-LRU movable pages.
1103bda807d4SMinchan Kim 		 * Skip any other type of page.
1104bda807d4SMinchan Kim 		 */
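		/*
		 * Non-LRU movable pages are those whose owning drivers
		 * registered movable_operations (typically via
		 * __SetPageMovable()), such as balloon and zsmalloc pages;
		 * they are migrated via isolate_movable_page() below rather
		 * than through the LRU path.
		 */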
1105bda807d4SMinchan Kim 		if (!PageLRU(page)) {
1106bda807d4SMinchan Kim 			/*
1107bda807d4SMinchan Kim 			 * __PageMovable can return false positive so we need
1108bda807d4SMinchan Kim 			 * to verify it under page_lock.
1109bda807d4SMinchan Kim 			 */
1110bda807d4SMinchan Kim 			if (unlikely(__PageMovable(page)) &&
1111bda807d4SMinchan Kim 					!PageIsolated(page)) {
1112bda807d4SMinchan Kim 				if (locked) {
11136168d0daSAlex Shi 					unlock_page_lruvec_irqrestore(locked, flags);
11146168d0daSAlex Shi 					locked = NULL;
1115bda807d4SMinchan Kim 				}
1116bda807d4SMinchan Kim 
111756ae0bb3SKefeng Wang 				if (isolate_movable_page(page, mode)) {
111856ae0bb3SKefeng Wang 					folio = page_folio(page);
1119bda807d4SMinchan Kim 					goto isolate_success;
1120bda807d4SMinchan Kim 				}
112156ae0bb3SKefeng Wang 			}
1122bda807d4SMinchan Kim 
1123fdd048e1SVlastimil Babka 			goto isolate_fail;
1124bda807d4SMinchan Kim 		}
112529c0dde8SVlastimil Babka 
1126119d6d59SDavid Rientjes 		/*
11279df41314SAlex Shi 		 * Be careful not to clear PageLRU until after we're
11289df41314SAlex Shi 		 * sure the page is not being freed elsewhere -- the
11299df41314SAlex Shi 		 * page release code relies on it.
11309df41314SAlex Shi 		 */
113156ae0bb3SKefeng Wang 		folio = folio_get_nontail_page(page);
113256ae0bb3SKefeng Wang 		if (unlikely(!folio))
11339df41314SAlex Shi 			goto isolate_fail;
11349df41314SAlex Shi 
1135829ae0f8SGavin Shan 		/*
1136829ae0f8SGavin Shan 		 * Migration will fail if an anonymous page is pinned in memory,
1137829ae0f8SGavin Shan 		 * so avoid taking lru_lock and isolating it unnecessarily in an
1138829ae0f8SGavin Shan 		 * admittedly racy check.
1139829ae0f8SGavin Shan 		 */
114056ae0bb3SKefeng Wang 		mapping = folio_mapping(folio);
114156ae0bb3SKefeng Wang 		if (!mapping && (folio_ref_count(folio) - 1) > folio_mapcount(folio))
1142829ae0f8SGavin Shan 			goto isolate_fail_put;
1143829ae0f8SGavin Shan 
1144829ae0f8SGavin Shan 		/*
1145829ae0f8SGavin Shan 		 * Only allow to migrate anonymous pages in GFP_NOFS context
1146829ae0f8SGavin Shan 		 * because those do not depend on fs locks.
1147829ae0f8SGavin Shan 		 */
1148829ae0f8SGavin Shan 		if (!(cc->gfp_mask & __GFP_FS) && mapping)
1149829ae0f8SGavin Shan 			goto isolate_fail_put;
1150829ae0f8SGavin Shan 
115189f6c88aSHugh Dickins 		/* Only take pages on LRU: a check now makes later tests safe */
115256ae0bb3SKefeng Wang 		if (!folio_test_lru(folio))
11539df41314SAlex Shi 			goto isolate_fail_put;
11549df41314SAlex Shi 
11550003e2a4SSean Christopherson 		is_unevictable = folio_test_unevictable(folio);
11560003e2a4SSean Christopherson 
115789f6c88aSHugh Dickins 		/* Compaction might skip unevictable pages but CMA takes them */
11580003e2a4SSean Christopherson 		if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable)
115989f6c88aSHugh Dickins 			goto isolate_fail_put;
116089f6c88aSHugh Dickins 
116189f6c88aSHugh Dickins 		/*
116289f6c88aSHugh Dickins 		 * To minimise LRU disruption, the caller can indicate with
116389f6c88aSHugh Dickins 		 * ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages
116489f6c88aSHugh Dickins 		 * it will be able to migrate without blocking - clean pages
116589f6c88aSHugh Dickins 		 * for the most part.  PageWriteback would require blocking.
116689f6c88aSHugh Dickins 		 */
116756ae0bb3SKefeng Wang 		if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio))
116889f6c88aSHugh Dickins 			goto isolate_fail_put;
116989f6c88aSHugh Dickins 
11700003e2a4SSean Christopherson 		is_dirty = folio_test_dirty(folio);
11710003e2a4SSean Christopherson 
11720003e2a4SSean Christopherson 		if (((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) ||
11730003e2a4SSean Christopherson 		    (mapping && is_unevictable)) {
11740003e2a4SSean Christopherson 			bool migrate_dirty = true;
11750003e2a4SSean Christopherson 			bool is_unmovable;
117689f6c88aSHugh Dickins 
117789f6c88aSHugh Dickins 			/*
1178866ff801SMatthew Wilcox 			 * Only folios without mappings or that have
11790003e2a4SSean Christopherson 			 * a ->migrate_folio callback are possible to migrate
11800003e2a4SSean Christopherson 			 * without blocking.
11810003e2a4SSean Christopherson 			 *
11820003e2a4SSean Christopherson 			 * Folios from unmovable mappings are not migratable.
11830003e2a4SSean Christopherson 			 *
11840003e2a4SSean Christopherson 			 * However, we can be racing with truncation, which can
11850003e2a4SSean Christopherson 			 * free the mapping that we need to check. Truncation
11860003e2a4SSean Christopherson 			 * holds the folio lock until after the folio is removed
11870003e2a4SSean Christopherson 			 * from the page cache, so holding it ourselves is sufficient.
11880003e2a4SSean Christopherson 			 *
11890003e2a4SSean Christopherson 			 * To avoid locking the folio just to check unmovable,
11900003e2a4SSean Christopherson 			 * assume every unmovable folio is also unevictable,
11910003e2a4SSean Christopherson 			 * which is a cheaper test.  If our assumption goes
11920003e2a4SSean Christopherson 			 * wrong, it's not a correctness bug, just potentially
11930003e2a4SSean Christopherson 			 * wasted cycles.
119489f6c88aSHugh Dickins 			 */
119556ae0bb3SKefeng Wang 			if (!folio_trylock(folio))
119689f6c88aSHugh Dickins 				goto isolate_fail_put;
119789f6c88aSHugh Dickins 
119856ae0bb3SKefeng Wang 			mapping = folio_mapping(folio);
11990003e2a4SSean Christopherson 			if ((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) {
12005490da4fSMatthew Wilcox (Oracle) 				migrate_dirty = !mapping ||
12019d0ddc0cSMatthew Wilcox (Oracle) 						mapping->a_ops->migrate_folio;
12020003e2a4SSean Christopherson 			}
12030003e2a4SSean Christopherson 			is_unmovable = mapping && mapping_unmovable(mapping);
120456ae0bb3SKefeng Wang 			folio_unlock(folio);
12050003e2a4SSean Christopherson 			if (!migrate_dirty || is_unmovable)
120689f6c88aSHugh Dickins 				goto isolate_fail_put;
120789f6c88aSHugh Dickins 		}
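		/*
		 * For example, under ISOLATE_ASYNC_MIGRATE a dirty anonymous
		 * folio (no mapping) passes the check above, and a dirty shmem
		 * folio passes because its mapping provides ->migrate_folio,
		 * while a dirty folio of a filesystem without ->migrate_folio,
		 * or any folio whose mapping is marked unmovable, is skipped.
		 */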
120889f6c88aSHugh Dickins 
120956ae0bb3SKefeng Wang 		/* Try to isolate the folio */
121056ae0bb3SKefeng Wang 		if (!folio_test_clear_lru(folio))
12119df41314SAlex Shi 			goto isolate_fail_put;
12129df41314SAlex Shi 
121356ae0bb3SKefeng Wang 		lruvec = folio_lruvec(folio);
12146168d0daSAlex Shi 
121569b7189fSVlastimil Babka 		/* If we already hold the lock, we can skip some rechecking */
12166168d0daSAlex Shi 		if (lruvec != locked) {
12176168d0daSAlex Shi 			if (locked)
12186168d0daSAlex Shi 				unlock_page_lruvec_irqrestore(locked, flags);
12196168d0daSAlex Shi 
12206168d0daSAlex Shi 			compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
12216168d0daSAlex Shi 			locked = lruvec;
12226168d0daSAlex Shi 
122356ae0bb3SKefeng Wang 			lruvec_memcg_debug(lruvec, folio);
1224e380bebeSMel Gorman 
1225590ccea8SMel Gorman 			/*
1226590ccea8SMel Gorman 			 * Try to get exclusive access under lock. If marked for
1227590ccea8SMel Gorman 			 * skip, the scan is aborted unless the current context
1228590ccea8SMel Gorman 			 * is a rescan to reach the end of the pageblock.
1229590ccea8SMel Gorman 			 */
1230590ccea8SMel Gorman 			if (!skip_updated && valid_page) {
1231e380bebeSMel Gorman 				skip_updated = true;
1232590ccea8SMel Gorman 				if (test_and_set_skip(cc, valid_page) &&
1233590ccea8SMel Gorman 				    !cc->finish_pageblock) {
12347545e2f2SKemeng Shi 					low_pfn = end_pfn;
1235e380bebeSMel Gorman 					goto isolate_abort;
1236e380bebeSMel Gorman 				}
1237590ccea8SMel Gorman 			}
12382a1402aaSMel Gorman 
123929c0dde8SVlastimil Babka 			/*
1240ee6f62fdSZi Yan 			 * Check LRU folio order under the lock
124129c0dde8SVlastimil Babka 			 */
1242ee6f62fdSZi Yan 			if (unlikely(skip_isolation_on_order(folio_order(folio),
1243ee6f62fdSZi Yan 							     cc->order) &&
1244ee6f62fdSZi Yan 				     !cc->alloc_contig)) {
124556ae0bb3SKefeng Wang 				low_pfn += folio_nr_pages(folio) - 1;
124656ae0bb3SKefeng Wang 				nr_scanned += folio_nr_pages(folio) - 1;
124756ae0bb3SKefeng Wang 				folio_set_lru(folio);
12489df41314SAlex Shi 				goto isolate_fail_put;
1249bc835011SAndrea Arcangeli 			}
1250d99fd5feSAlex Shi 		}
1251fa9add64SHugh Dickins 
125256ae0bb3SKefeng Wang 		/* The folio is taken off the LRU */
125356ae0bb3SKefeng Wang 		if (folio_test_large(folio))
125456ae0bb3SKefeng Wang 			low_pfn += folio_nr_pages(folio) - 1;
1255bc835011SAndrea Arcangeli 
1256748446bbSMel Gorman 		/* Successfully isolated */
125756ae0bb3SKefeng Wang 		lruvec_del_folio(lruvec, folio);
125856ae0bb3SKefeng Wang 		node_stat_mod_folio(folio,
125956ae0bb3SKefeng Wang 				NR_ISOLATED_ANON + folio_is_file_lru(folio),
126056ae0bb3SKefeng Wang 				folio_nr_pages(folio));
1261b6c75016SJoonsoo Kim 
1262b6c75016SJoonsoo Kim isolate_success:
126356ae0bb3SKefeng Wang 		list_add(&folio->lru, &cc->migratepages);
1264ae37c7ffSOscar Salvador isolate_success_no_list:
126556ae0bb3SKefeng Wang 		cc->nr_migratepages += folio_nr_pages(folio);
126656ae0bb3SKefeng Wang 		nr_isolated += folio_nr_pages(folio);
126756ae0bb3SKefeng Wang 		nr_scanned += folio_nr_pages(folio) - 1;
1268748446bbSMel Gorman 
1269804d3121SMel Gorman 		/*
1270804d3121SMel Gorman 		 * Avoid isolating too much unless this block is being
127148731c84SMel Gorman 		 * fully scanned (e.g. dirty/writeback pages, parallel allocation)
1272cb2dcaf0SMel Gorman 		 * or a lock is contended. For contention, isolate quickly to
1273cb2dcaf0SMel Gorman 		 * potentially remove one source of contention.
1274804d3121SMel Gorman 		 */
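		/*
		 * COMPACT_CLUSTER_MAX is typically 32 pages (SWAP_CLUSTER_MAX),
		 * so scanning normally stops here and lets the caller migrate
		 * the isolated batch before continuing.
		 */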
127538935861SZi Yan 		if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
127648731c84SMel Gorman 		    !cc->finish_pageblock && !cc->contended) {
127731b8384aSHillf Danton 			++low_pfn;
1278748446bbSMel Gorman 			break;
1279748446bbSMel Gorman 		}
1280fdd048e1SVlastimil Babka 
1281fdd048e1SVlastimil Babka 		continue;
12829df41314SAlex Shi 
12839df41314SAlex Shi isolate_fail_put:
12849df41314SAlex Shi 		/* Avoid potential deadlock in freeing page under lru_lock */
12859df41314SAlex Shi 		if (locked) {
12866168d0daSAlex Shi 			unlock_page_lruvec_irqrestore(locked, flags);
12876168d0daSAlex Shi 			locked = NULL;
12889df41314SAlex Shi 		}
128956ae0bb3SKefeng Wang 		folio_put(folio);
12909df41314SAlex Shi 
1291fdd048e1SVlastimil Babka isolate_fail:
1292369fa227SOscar Salvador 		if (!skip_on_failure && ret != -ENOMEM)
1293fdd048e1SVlastimil Babka 			continue;
1294fdd048e1SVlastimil Babka 
1295fdd048e1SVlastimil Babka 		/*
1296fdd048e1SVlastimil Babka 		 * We have isolated some pages, but then failed. Release them
1297fdd048e1SVlastimil Babka 		 * instead of migrating, as we cannot form the cc->order buddy
1298fdd048e1SVlastimil Babka 		 * page anyway.
1299fdd048e1SVlastimil Babka 		 */
1300fdd048e1SVlastimil Babka 		if (nr_isolated) {
1301fdd048e1SVlastimil Babka 			if (locked) {
13026168d0daSAlex Shi 				unlock_page_lruvec_irqrestore(locked, flags);
13036168d0daSAlex Shi 				locked = NULL;
1304fdd048e1SVlastimil Babka 			}
1305fdd048e1SVlastimil Babka 			putback_movable_pages(&cc->migratepages);
1306fdd048e1SVlastimil Babka 			cc->nr_migratepages = 0;
1307fdd048e1SVlastimil Babka 			nr_isolated = 0;
1308fdd048e1SVlastimil Babka 		}
1309fdd048e1SVlastimil Babka 
1310fdd048e1SVlastimil Babka 		if (low_pfn < next_skip_pfn) {
1311fdd048e1SVlastimil Babka 			low_pfn = next_skip_pfn - 1;
1312fdd048e1SVlastimil Babka 			/*
1313fdd048e1SVlastimil Babka 			 * The check near the loop beginning would have updated
1314fdd048e1SVlastimil Babka 			 * next_skip_pfn too, but this is a bit simpler.
1315fdd048e1SVlastimil Babka 			 */
1316fdd048e1SVlastimil Babka 			next_skip_pfn += 1UL << cc->order;
1317fdd048e1SVlastimil Babka 		}
1318369fa227SOscar Salvador 
1319369fa227SOscar Salvador 		if (ret == -ENOMEM)
1320369fa227SOscar Salvador 			break;
132131b8384aSHillf Danton 	}
1322748446bbSMel Gorman 
132399c0fd5eSVlastimil Babka 	/*
132499c0fd5eSVlastimil Babka 	 * The PageBuddy() check could have potentially brought us outside
132599c0fd5eSVlastimil Babka 	 * the range to be scanned.
132699c0fd5eSVlastimil Babka 	 */
132799c0fd5eSVlastimil Babka 	if (unlikely(low_pfn > end_pfn))
132899c0fd5eSVlastimil Babka 		low_pfn = end_pfn;
132999c0fd5eSVlastimil Babka 
133056ae0bb3SKefeng Wang 	folio = NULL;
13319df41314SAlex Shi 
1332e380bebeSMel Gorman isolate_abort:
1333c67fe375SMel Gorman 	if (locked)
13346168d0daSAlex Shi 		unlock_page_lruvec_irqrestore(locked, flags);
133556ae0bb3SKefeng Wang 	if (folio) {
133656ae0bb3SKefeng Wang 		folio_set_lru(folio);
133756ae0bb3SKefeng Wang 		folio_put(folio);
13389df41314SAlex Shi 	}
1339748446bbSMel Gorman 
134050b5b094SVlastimil Babka 	/*
134148731c84SMel Gorman 	 * Update the cached scanner pfn once the pageblock has been scanned.
1342804d3121SMel Gorman 	 * Pages will either be migrated, in which case there is no point
1343804d3121SMel Gorman 	 * scanning in the near future, or migration failed, in which case the
1344804d3121SMel Gorman 	 * failure reason may persist. The block is marked for skipping if
1345804d3121SMel Gorman 	 * there were no pages isolated in the block or if the block is
1346804d3121SMel Gorman 	 * rescanned twice in a row.
134750b5b094SVlastimil Babka 	 */
134848731c84SMel Gorman 	if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) {
13498b71b499SBaolin Wang 		if (!cc->no_set_skip_hint && valid_page && !skip_updated)
1350e380bebeSMel Gorman 			set_pageblock_skip(valid_page);
1351e380bebeSMel Gorman 		update_cached_migrate(cc, low_pfn);
1352e380bebeSMel Gorman 	}
1353bb13ffebSMel Gorman 
1354e34d85f0SJoonsoo Kim 	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
1355e34d85f0SJoonsoo Kim 						nr_scanned, nr_isolated);
1356b7aba698SMel Gorman 
1357670105a2SMel Gorman fatal_pending:
13587f354a54SDavid Rientjes 	cc->total_migrate_scanned += nr_scanned;
1359397487dbSMel Gorman 	if (nr_isolated)
1360010fc29aSMinchan Kim 		count_compact_events(COMPACTISOLATED, nr_isolated);
1361397487dbSMel Gorman 
1362c2ad7a1fSOscar Salvador 	cc->migrate_pfn = low_pfn;
1363c2ad7a1fSOscar Salvador 
1364c2ad7a1fSOscar Salvador 	return ret;
13652fe86e00SMichal Nazarewicz }
13662fe86e00SMichal Nazarewicz 
1367edc2ca61SVlastimil Babka /**
1368edc2ca61SVlastimil Babka  * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
1369edc2ca61SVlastimil Babka  * @cc:        Compaction control structure.
1370edc2ca61SVlastimil Babka  * @start_pfn: The first PFN to start isolating.
1371edc2ca61SVlastimil Babka  * @end_pfn:   The one-past-last PFN.
1372edc2ca61SVlastimil Babka  *
1373369fa227SOscar Salvador  * Returns -EAGAIN when contended, -EINTR in case of a signal pending, -ENOMEM
1374369fa227SOscar Salvador  * in case we could not allocate a page, or 0.
1375edc2ca61SVlastimil Babka  */
1376c2ad7a1fSOscar Salvador int
1377edc2ca61SVlastimil Babka isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
1378edc2ca61SVlastimil Babka 							unsigned long end_pfn)
1379edc2ca61SVlastimil Babka {
1380e1409c32SJoonsoo Kim 	unsigned long pfn, block_start_pfn, block_end_pfn;
1381c2ad7a1fSOscar Salvador 	int ret = 0;
1382edc2ca61SVlastimil Babka 
1383edc2ca61SVlastimil Babka 	/* Scan block by block. First and last block may be incomplete */
1384edc2ca61SVlastimil Babka 	pfn = start_pfn;
138506b6640aSVlastimil Babka 	block_start_pfn = pageblock_start_pfn(pfn);
1386e1409c32SJoonsoo Kim 	if (block_start_pfn < cc->zone->zone_start_pfn)
1387e1409c32SJoonsoo Kim 		block_start_pfn = cc->zone->zone_start_pfn;
138806b6640aSVlastimil Babka 	block_end_pfn = pageblock_end_pfn(pfn);
1389edc2ca61SVlastimil Babka 
1390edc2ca61SVlastimil Babka 	for (; pfn < end_pfn; pfn = block_end_pfn,
1391e1409c32SJoonsoo Kim 				block_start_pfn = block_end_pfn,
1392edc2ca61SVlastimil Babka 				block_end_pfn += pageblock_nr_pages) {
1393edc2ca61SVlastimil Babka 
1394edc2ca61SVlastimil Babka 		block_end_pfn = min(block_end_pfn, end_pfn);
1395edc2ca61SVlastimil Babka 
1396e1409c32SJoonsoo Kim 		if (!pageblock_pfn_to_page(block_start_pfn,
1397e1409c32SJoonsoo Kim 					block_end_pfn, cc->zone))
1398edc2ca61SVlastimil Babka 			continue;
1399edc2ca61SVlastimil Babka 
1400c2ad7a1fSOscar Salvador 		ret = isolate_migratepages_block(cc, pfn, block_end_pfn,
1401edc2ca61SVlastimil Babka 						 ISOLATE_UNEVICTABLE);
1402edc2ca61SVlastimil Babka 
1403c2ad7a1fSOscar Salvador 		if (ret)
1404edc2ca61SVlastimil Babka 			break;
14056ea41c0cSJoonsoo Kim 
140638935861SZi Yan 		if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
14076ea41c0cSJoonsoo Kim 			break;
1408edc2ca61SVlastimil Babka 	}
1409edc2ca61SVlastimil Babka 
1410c2ad7a1fSOscar Salvador 	return ret;
1411edc2ca61SVlastimil Babka }
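/*
 * This range variant serves alloc_contig_range()-style callers such as CMA,
 * which must empty the whole requested range; hence the unconditional
 * ISOLATE_UNEVICTABLE above.
 */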
1412edc2ca61SVlastimil Babka 
1413ff9543fdSMichal Nazarewicz #endif /* CONFIG_COMPACTION || CONFIG_CMA */
1414ff9543fdSMichal Nazarewicz #ifdef CONFIG_COMPACTION
1415018e9a49SAndrew Morton 
1416b682debdSVlastimil Babka static bool suitable_migration_source(struct compact_control *cc,
1417b682debdSVlastimil Babka 							struct page *page)
1418b682debdSVlastimil Babka {
1419282722b0SVlastimil Babka 	int block_mt;
1420282722b0SVlastimil Babka 
14219bebefd5SMel Gorman 	if (pageblock_skip_persistent(page))
14229bebefd5SMel Gorman 		return false;
14239bebefd5SMel Gorman 
1424282722b0SVlastimil Babka 	if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
1425b682debdSVlastimil Babka 		return true;
1426b682debdSVlastimil Babka 
1427282722b0SVlastimil Babka 	block_mt = get_pageblock_migratetype(page);
1428282722b0SVlastimil Babka 
1429282722b0SVlastimil Babka 	if (cc->migratetype == MIGRATE_MOVABLE)
1430282722b0SVlastimil Babka 		return is_migrate_movable(block_mt);
1431282722b0SVlastimil Babka 	else
1432282722b0SVlastimil Babka 		return block_mt == cc->migratetype;
1433b682debdSVlastimil Babka }
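/*
 * E.g. an async direct compaction request for a MOVABLE allocation only
 * accepts MOVABLE (or CMA) pageblocks as migration sources above, while sync
 * compaction and non-direct compaction (kcompactd, proactive compaction)
 * consider any pageblock that is not persistently marked for skipping.
 */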
1434b682debdSVlastimil Babka 
1435018e9a49SAndrew Morton /* Returns true if the page is within a block suitable for migration to */
14369f7e3387SVlastimil Babka static bool suitable_migration_target(struct compact_control *cc,
14379f7e3387SVlastimil Babka 							struct page *page)
1438018e9a49SAndrew Morton {
1439018e9a49SAndrew Morton 	/* If the page is a large free page, then disallow migration */
1440018e9a49SAndrew Morton 	if (PageBuddy(page)) {
14411883e8acSBaolin Wang 		int order = cc->order > 0 ? cc->order : pageblock_order;
14421883e8acSBaolin Wang 
1443018e9a49SAndrew Morton 		/*
1444018e9a49SAndrew Morton 		 * We are checking page_order without zone->lock taken. But
1445018e9a49SAndrew Morton 		 * the only small danger is that we skip a potentially suitable
1446018e9a49SAndrew Morton 		 * pageblock, so it's not worth checking the order for a valid range.
1447018e9a49SAndrew Morton 		 */
14481883e8acSBaolin Wang 		if (buddy_order_unsafe(page) >= order)
1449018e9a49SAndrew Morton 			return false;
1450018e9a49SAndrew Morton 	}
1451018e9a49SAndrew Morton 
14521ef36db2SYisheng Xie 	if (cc->ignore_block_suitable)
14531ef36db2SYisheng Xie 		return true;
14541ef36db2SYisheng Xie 
1455018e9a49SAndrew Morton 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
1456b682debdSVlastimil Babka 	if (is_migrate_movable(get_pageblock_migratetype(page)))
1457018e9a49SAndrew Morton 		return true;
1458018e9a49SAndrew Morton 
1459018e9a49SAndrew Morton 	/* Otherwise skip the block */
1460018e9a49SAndrew Morton 	return false;
1461018e9a49SAndrew Morton }
1462018e9a49SAndrew Morton 
146370b44595SMel Gorman static inline unsigned int
146470b44595SMel Gorman freelist_scan_limit(struct compact_control *cc)
146570b44595SMel Gorman {
1466dd7ef7bdSQian Cai 	unsigned short shift = BITS_PER_LONG - 1;
1467dd7ef7bdSQian Cai 
1468dd7ef7bdSQian Cai 	return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1;
146970b44595SMel Gorman }
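/*
 * With COMPACT_CLUSTER_MAX typically 32, the limit decays with consecutive
 * fast-search failures: 33, 17, 9, 5, 3, 2 and finally 1 freelist entries.
 */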
147070b44595SMel Gorman 
1471ff9543fdSMichal Nazarewicz /*
1472f2849aa0SVlastimil Babka  * Test whether the free scanner has reached the same or lower pageblock than
1473f2849aa0SVlastimil Babka  * the migration scanner, and compaction should thus terminate.
1474f2849aa0SVlastimil Babka  */
1475f2849aa0SVlastimil Babka static inline bool compact_scanners_met(struct compact_control *cc)
1476f2849aa0SVlastimil Babka {
1477f2849aa0SVlastimil Babka 	return (cc->free_pfn >> pageblock_order)
1478f2849aa0SVlastimil Babka 		<= (cc->migrate_pfn >> pageblock_order);
1479f2849aa0SVlastimil Babka }
1480f2849aa0SVlastimil Babka 
14815a811889SMel Gorman /*
14825a811889SMel Gorman  * Used when scanning for a suitable migration target which scans freelists
14835a811889SMel Gorman  * in reverse. Reorders the list so that the unscanned pages are scanned
14845a811889SMel Gorman  * first on the next iteration of the free scanner.
14855a811889SMel Gorman  */
14865a811889SMel Gorman static void
14875a811889SMel Gorman move_freelist_head(struct list_head *freelist, struct page *freepage)
14885a811889SMel Gorman {
14895a811889SMel Gorman 	LIST_HEAD(sublist);
14905a811889SMel Gorman 
14914c179891SKemeng Shi 	if (!list_is_first(&freepage->buddy_list, freelist)) {
1492bbefa0fcSKemeng Shi 		list_cut_before(&sublist, freelist, &freepage->buddy_list);
14935a811889SMel Gorman 		list_splice_tail(&sublist, freelist);
14945a811889SMel Gorman 	}
14955a811889SMel Gorman }
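/*
 * For example, a freelist A-B-C-D with freepage == C becomes C-D-A-B: the
 * entries ahead of C are cut off and re-appended at the tail, so the next
 * reverse walk visits the previously unscanned B and A first.
 */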
14965a811889SMel Gorman 
14975a811889SMel Gorman /*
14985a811889SMel Gorman  * Similar to move_freelist_head except used by the migration scanner
14995a811889SMel Gorman  * when scanning forward. It's possible for these list operations to
15005a811889SMel Gorman  * move against each other if they search the free list exactly in
15015a811889SMel Gorman  * lockstep.
15025a811889SMel Gorman  */
150370b44595SMel Gorman static void
150470b44595SMel Gorman move_freelist_tail(struct list_head *freelist, struct page *freepage)
150570b44595SMel Gorman {
150670b44595SMel Gorman 	LIST_HEAD(sublist);
150770b44595SMel Gorman 
15084c179891SKemeng Shi 	if (!list_is_last(&freepage->buddy_list, freelist)) {
1509bbefa0fcSKemeng Shi 		list_cut_position(&sublist, freelist, &freepage->buddy_list);
151070b44595SMel Gorman 		list_splice_tail(&sublist, freelist);
151170b44595SMel Gorman 	}
151270b44595SMel Gorman }
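/*
 * For example, a freelist A-B-C-D with freepage == C becomes D-A-B-C: the
 * already scanned A, B and C move behind D, so the next forward walk starts
 * at the unscanned D.
 */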
151370b44595SMel Gorman 
15145a811889SMel Gorman static void
1515be21b32aSNARIBAYASHI Akira fast_isolate_around(struct compact_control *cc, unsigned long pfn)
15165a811889SMel Gorman {
15175a811889SMel Gorman 	unsigned long start_pfn, end_pfn;
15186e2b7044SVlastimil Babka 	struct page *page;
15195a811889SMel Gorman 
15205a811889SMel Gorman 	/* Do not search around if there are enough pages already */
15215a811889SMel Gorman 	if (cc->nr_freepages >= cc->nr_migratepages)
15225a811889SMel Gorman 		return;
15235a811889SMel Gorman 
15245a811889SMel Gorman 	/* Minimise scanning during async compaction */
15255a811889SMel Gorman 	if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC)
15265a811889SMel Gorman 		return;
15275a811889SMel Gorman 
15285a811889SMel Gorman 	/* Pageblock boundaries */
15296e2b7044SVlastimil Babka 	start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn);
15306e2b7044SVlastimil Babka 	end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone));
15316e2b7044SVlastimil Babka 
15326e2b7044SVlastimil Babka 	page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone);
15336e2b7044SVlastimil Babka 	if (!page)
15346e2b7044SVlastimil Babka 		return;
15355a811889SMel Gorman 
1536733aea0bSZi Yan 	isolate_freepages_block(cc, &start_pfn, end_pfn, cc->freepages, 1, false);
15375a811889SMel Gorman 
15385a811889SMel Gorman 	/* Skip this pageblock in the future as it's full or nearly full */
153918c59d58SKemeng Shi 	if (start_pfn == end_pfn && !cc->no_set_skip_hint)
15405a811889SMel Gorman 		set_pageblock_skip(page);
15415a811889SMel Gorman }
15425a811889SMel Gorman 
1543dbe2d4e4SMel Gorman /* Search orders in round-robin fashion */
1544dbe2d4e4SMel Gorman static int next_search_order(struct compact_control *cc, int order)
1545dbe2d4e4SMel Gorman {
1546dbe2d4e4SMel Gorman 	order--;
1547dbe2d4e4SMel Gorman 	if (order < 0)
1548dbe2d4e4SMel Gorman 		order = cc->order - 1;
1549dbe2d4e4SMel Gorman 
1550dbe2d4e4SMel Gorman 	/* Search wrapped around? */
1551dbe2d4e4SMel Gorman 	if (order == cc->search_order) {
1552dbe2d4e4SMel Gorman 		cc->search_order--;
1553dbe2d4e4SMel Gorman 		if (cc->search_order < 0)
1554dbe2d4e4SMel Gorman 			cc->search_order = cc->order - 1;
1555dbe2d4e4SMel Gorman 		return -1;
1556dbe2d4e4SMel Gorman 	}
1557dbe2d4e4SMel Gorman 
1558dbe2d4e4SMel Gorman 	return order;
1559dbe2d4e4SMel Gorman }
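/*
 * For example, with cc->order == 5 and cc->search_order == 3, the caller's
 * loop tries orders 3, 2, 1, 0 and 4 in that sequence before giving up, and
 * cc->search_order is stepped down for the next attempt.
 */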
1560dbe2d4e4SMel Gorman 
15612dbd9005SBaolin Wang static void fast_isolate_freepages(struct compact_control *cc)
15625a811889SMel Gorman {
1563b55ca526SWonhyuk Yang 	unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
1564447ba886SBaolin Wang 	unsigned int nr_scanned = 0, total_isolated = 0;
156574e21484SRokudo Yan 	unsigned long low_pfn, min_pfn, highest = 0;
15665a811889SMel Gorman 	unsigned long nr_isolated = 0;
15675a811889SMel Gorman 	unsigned long distance;
15685a811889SMel Gorman 	struct page *page = NULL;
15695a811889SMel Gorman 	bool scan_start = false;
15705a811889SMel Gorman 	int order;
15715a811889SMel Gorman 
15725a811889SMel Gorman 	/* Full compaction passes in a negative order */
15735a811889SMel Gorman 	if (cc->order <= 0)
15742dbd9005SBaolin Wang 		return;
15755a811889SMel Gorman 
15765a811889SMel Gorman 	/*
15775a811889SMel Gorman 	 * If starting the scan, use a deeper search and use the highest
15785a811889SMel Gorman 	 * PFN found if a suitable one is not found.
15795a811889SMel Gorman 	 */
1580e332f741SMel Gorman 	if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
15815a811889SMel Gorman 		limit = pageblock_nr_pages >> 1;
15825a811889SMel Gorman 		scan_start = true;
15835a811889SMel Gorman 	}
15845a811889SMel Gorman 
15855a811889SMel Gorman 	/*
15865a811889SMel Gorman 	 * Preferred point is in the top quarter of the scan space but take
15875a811889SMel Gorman 	 * a pfn from the top half if the search is problematic.
15885a811889SMel Gorman 	 */
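	/*
	 * For example, with migrate_pfn == 0x10000 and free_pfn == 0x20000,
	 * distance is 0x10000: low_pfn is the start of the pageblock holding
	 * pfn 0x1c000 (top quarter) and min_pfn the one holding 0x18000
	 * (top half).
	 */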
15895a811889SMel Gorman 	distance = (cc->free_pfn - cc->migrate_pfn);
15905a811889SMel Gorman 	low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
15915a811889SMel Gorman 	min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));
15925a811889SMel Gorman 
15935a811889SMel Gorman 	if (WARN_ON_ONCE(min_pfn > low_pfn))
15945a811889SMel Gorman 		low_pfn = min_pfn;
15955a811889SMel Gorman 
1596dbe2d4e4SMel Gorman 	/*
1597dbe2d4e4SMel Gorman 	 * Search starts from the last successful isolation order or the next
1598dbe2d4e4SMel Gorman 	 * order to search after a previous failure
1599dbe2d4e4SMel Gorman 	 */
1600dbe2d4e4SMel Gorman 	cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);
1601dbe2d4e4SMel Gorman 
1602dbe2d4e4SMel Gorman 	for (order = cc->search_order;
1603dbe2d4e4SMel Gorman 	     !page && order >= 0;
1604dbe2d4e4SMel Gorman 	     order = next_search_order(cc, order)) {
16055a811889SMel Gorman 		struct free_area *area = &cc->zone->free_area[order];
16065a811889SMel Gorman 		struct list_head *freelist;
16075a811889SMel Gorman 		struct page *freepage;
16085a811889SMel Gorman 		unsigned long flags;
16095a811889SMel Gorman 		unsigned int order_scanned = 0;
161074e21484SRokudo Yan 		unsigned long high_pfn = 0;
16115a811889SMel Gorman 
16125a811889SMel Gorman 		if (!area->nr_free)
16135a811889SMel Gorman 			continue;
16145a811889SMel Gorman 
16155a811889SMel Gorman 		spin_lock_irqsave(&cc->zone->lock, flags);
16165a811889SMel Gorman 		freelist = &area->free_list[MIGRATE_MOVABLE];
161794ec2003SBaolin Wang 		list_for_each_entry_reverse(freepage, freelist, buddy_list) {
16185a811889SMel Gorman 			unsigned long pfn;
16195a811889SMel Gorman 
16205a811889SMel Gorman 			order_scanned++;
16215a811889SMel Gorman 			nr_scanned++;
16225a811889SMel Gorman 			pfn = page_to_pfn(freepage);
16235a811889SMel Gorman 
16245a811889SMel Gorman 			if (pfn >= highest)
16256e2b7044SVlastimil Babka 				highest = max(pageblock_start_pfn(pfn),
16266e2b7044SVlastimil Babka 					      cc->zone->zone_start_pfn);
16275a811889SMel Gorman 
16285a811889SMel Gorman 			if (pfn >= low_pfn) {
16295a811889SMel Gorman 				cc->fast_search_fail = 0;
1630dbe2d4e4SMel Gorman 				cc->search_order = order;
16315a811889SMel Gorman 				page = freepage;
16325a811889SMel Gorman 				break;
16335a811889SMel Gorman 			}
16345a811889SMel Gorman 
16355a811889SMel Gorman 			if (pfn >= min_pfn && pfn > high_pfn) {
16365a811889SMel Gorman 				high_pfn = pfn;
16375a811889SMel Gorman 
16385a811889SMel Gorman 				/* Shorten the scan if a candidate is found */
16395a811889SMel Gorman 				limit >>= 1;
16405a811889SMel Gorman 			}
16415a811889SMel Gorman 
16425a811889SMel Gorman 			if (order_scanned >= limit)
16435a811889SMel Gorman 				break;
16445a811889SMel Gorman 		}
16455a811889SMel Gorman 
1646e6bd14ecSKemeng Shi 		/* Use a maximum candidate pfn if a preferred one was not found */
16475a811889SMel Gorman 		if (!page && high_pfn) {
16485a811889SMel Gorman 			page = pfn_to_page(high_pfn);
16495a811889SMel Gorman 
16505a811889SMel Gorman 			/* Update freepage for the list reorder below */
16515a811889SMel Gorman 			freepage = page;
16525a811889SMel Gorman 		}
16535a811889SMel Gorman 
16555a811889SMel Gorman 		/* Reorder so a future search skips recent pages */
16555a811889SMel Gorman 		move_freelist_head(freelist, freepage);
16565a811889SMel Gorman 
16575a811889SMel Gorman 		/* Isolate the page if available */
16585a811889SMel Gorman 		if (page) {
16595a811889SMel Gorman 			if (__isolate_free_page(page, order)) {
16605a811889SMel Gorman 				set_page_private(page, order);
16615a811889SMel Gorman 				nr_isolated = 1 << order;
1662b717d6b9SWilliam Lam 				nr_scanned += nr_isolated - 1;
1663447ba886SBaolin Wang 				total_isolated += nr_isolated;
16645a811889SMel Gorman 				cc->nr_freepages += nr_isolated;
1665733aea0bSZi Yan 				list_add_tail(&page->lru, &cc->freepages[order]);
16665a811889SMel Gorman 				count_compact_events(COMPACTISOLATED, nr_isolated);
16675a811889SMel Gorman 			} else {
16685a811889SMel Gorman 				/* If isolation fails, abort the search */
16695b56d996SQian Cai 				order = cc->search_order + 1;
16705a811889SMel Gorman 				page = NULL;
16715a811889SMel Gorman 			}
16725a811889SMel Gorman 		}
16735a811889SMel Gorman 
16745a811889SMel Gorman 		spin_unlock_irqrestore(&cc->zone->lock, flags);
16755a811889SMel Gorman 
1676a8d13355SBaolin Wang 		/* Skip fast search if enough freepages isolated */
1677a8d13355SBaolin Wang 		if (cc->nr_freepages >= cc->nr_migratepages)
1678a8d13355SBaolin Wang 			break;
1679a8d13355SBaolin Wang 
16805a811889SMel Gorman 		/*
1681b55ca526SWonhyuk Yang 		 * Smaller scan on next order so the total scan is related
16825a811889SMel Gorman 		 * to freelist_scan_limit.
16835a811889SMel Gorman 		 */
16845a811889SMel Gorman 		if (order_scanned >= limit)
1685b55ca526SWonhyuk Yang 			limit = max(1U, limit >> 1);
16865a811889SMel Gorman 	}
16875a811889SMel Gorman 
1688447ba886SBaolin Wang 	trace_mm_compaction_fast_isolate_freepages(min_pfn, cc->free_pfn,
1689447ba886SBaolin Wang 						   nr_scanned, total_isolated);
1690447ba886SBaolin Wang 
16915a811889SMel Gorman 	if (!page) {
16925a811889SMel Gorman 		cc->fast_search_fail++;
16935a811889SMel Gorman 		if (scan_start) {
16945a811889SMel Gorman 			/*
16955a811889SMel Gorman 			 * Use the highest PFN found above min. If one was
1696f3867755SEthon Paul 			 * not found, be pessimistic for direct compaction
16975a811889SMel Gorman 			 * and use the min mark.
16985a811889SMel Gorman 			 */
1699ca2864e5SMiaohe Lin 			if (highest >= min_pfn) {
17005a811889SMel Gorman 				page = pfn_to_page(highest);
17015a811889SMel Gorman 				cc->free_pfn = highest;
17025a811889SMel Gorman 			} else {
1703e577c8b6SSuzuki K Poulose 				if (cc->direct_compaction && pfn_valid(min_pfn)) {
170473a6e474SBaoquan He 					page = pageblock_pfn_to_page(min_pfn,
17056e2b7044SVlastimil Babka 						min(pageblock_end_pfn(min_pfn),
17066e2b7044SVlastimil Babka 						    zone_end_pfn(cc->zone)),
170773a6e474SBaoquan He 						cc->zone);
1708d19b1a17SBarry Song 					if (page && !suitable_migration_target(cc, page))
1709d19b1a17SBarry Song 						page = NULL;
1710d19b1a17SBarry Song 
17115a811889SMel Gorman 					cc->free_pfn = min_pfn;
17125a811889SMel Gorman 				}
17135a811889SMel Gorman 			}
17145a811889SMel Gorman 		}
17155a811889SMel Gorman 	}
17165a811889SMel Gorman 
1717d097a6f6SMel Gorman 	if (highest && highest >= cc->zone->compact_cached_free_pfn) {
1718d097a6f6SMel Gorman 		highest -= pageblock_nr_pages;
17195a811889SMel Gorman 		cc->zone->compact_cached_free_pfn = highest;
1720d097a6f6SMel Gorman 	}
17215a811889SMel Gorman 
17225a811889SMel Gorman 	cc->total_free_scanned += nr_scanned;
17235a811889SMel Gorman 	if (!page)
17242dbd9005SBaolin Wang 		return;
17255a811889SMel Gorman 
17265a811889SMel Gorman 	low_pfn = page_to_pfn(page);
1727be21b32aSNARIBAYASHI Akira 	fast_isolate_around(cc, low_pfn);
17285a811889SMel Gorman }
17295a811889SMel Gorman 
1730f2849aa0SVlastimil Babka /*
1731ff9543fdSMichal Nazarewicz  * Based on information in the current compact_control, find blocks
1732ff9543fdSMichal Nazarewicz  * suitable for isolating free pages from and then isolate them.
1733ff9543fdSMichal Nazarewicz  */
1734edc2ca61SVlastimil Babka static void isolate_freepages(struct compact_control *cc)
1735ff9543fdSMichal Nazarewicz {
1736edc2ca61SVlastimil Babka 	struct zone *zone = cc->zone;
1737ff9543fdSMichal Nazarewicz 	struct page *page;
1738c96b9e50SVlastimil Babka 	unsigned long block_start_pfn;	/* start of current pageblock */
1739e14c720eSVlastimil Babka 	unsigned long isolate_start_pfn; /* exact pfn we start at */
1740c96b9e50SVlastimil Babka 	unsigned long block_end_pfn;	/* end of current pageblock */
1741c96b9e50SVlastimil Babka 	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
17424fca9730SMel Gorman 	unsigned int stride;
17432fe86e00SMichal Nazarewicz 
17445a811889SMel Gorman 	/* Try a small search of the free lists for a candidate */
174500bc102fSMiaohe Lin 	fast_isolate_freepages(cc);
17465a811889SMel Gorman 	if (cc->nr_freepages)
1747733aea0bSZi Yan 		return;
17485a811889SMel Gorman 
1749ff9543fdSMichal Nazarewicz 	/*
1750ff9543fdSMichal Nazarewicz 	 * Initialise the free scanner. The starting point is where we last
175149e068f0SVlastimil Babka 	 * successfully isolated from, zone-cached value, or the end of the
1752e14c720eSVlastimil Babka 	 * zone when isolating for the first time. For looping we also need
1753e14c720eSVlastimil Babka 	 * this pfn aligned down to the pageblock boundary, because we do
1754c96b9e50SVlastimil Babka 	 * block_start_pfn -= pageblock_nr_pages in the for loop.
1755c96b9e50SVlastimil Babka 	 * For the ending point, take care when isolating in the last pageblock of a
1756a1c1dbebSRandy Dunlap 	 * zone which ends in the middle of a pageblock.
175749e068f0SVlastimil Babka 	 * The low boundary is the end of the pageblock the migration scanner
175849e068f0SVlastimil Babka 	 * is using.
1759ff9543fdSMichal Nazarewicz 	 */
1760e14c720eSVlastimil Babka 	isolate_start_pfn = cc->free_pfn;
17615a811889SMel Gorman 	block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
1762c96b9e50SVlastimil Babka 	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
1763c96b9e50SVlastimil Babka 						zone_end_pfn(zone));
176406b6640aSVlastimil Babka 	low_pfn = pageblock_end_pfn(cc->migrate_pfn);
17654fca9730SMel Gorman 	stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;
17662fe86e00SMichal Nazarewicz 
1767ff9543fdSMichal Nazarewicz 	/*
1768ff9543fdSMichal Nazarewicz 	 * Isolate free pages until enough are available to migrate the
1769ff9543fdSMichal Nazarewicz 	 * pages on cc->migratepages. We stop searching if the migrate
1770ff9543fdSMichal Nazarewicz 	 * and free page scanners meet or enough free pages are isolated.
1771ff9543fdSMichal Nazarewicz 	 */
1772f5f61a32SVlastimil Babka 	for (; block_start_pfn >= low_pfn;
1773c96b9e50SVlastimil Babka 				block_end_pfn = block_start_pfn,
1774e14c720eSVlastimil Babka 				block_start_pfn -= pageblock_nr_pages,
1775e14c720eSVlastimil Babka 				isolate_start_pfn = block_start_pfn) {
17764fca9730SMel Gorman 		unsigned long nr_isolated;
17774fca9730SMel Gorman 
1778f6ea3adbSDavid Rientjes 		/*
1779f6ea3adbSDavid Rientjes 		 * This can iterate a massively long zone without finding any
1780cb810ad2SMel Gorman 		 * suitable migration targets, so periodically check resched.
1781f6ea3adbSDavid Rientjes 		 */
1782c036ddffSMiaohe Lin 		if (!(block_start_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
1783cf66f070SMel Gorman 			cond_resched();
1784f6ea3adbSDavid Rientjes 
17857d49d886SVlastimil Babka 		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
17867d49d886SVlastimil Babka 									zone);
1787e6e0c767SBaolin Wang 		if (!page) {
1788e6e0c767SBaolin Wang 			unsigned long next_pfn;
1789e6e0c767SBaolin Wang 
1790e6e0c767SBaolin Wang 			next_pfn = skip_offline_sections_reverse(block_start_pfn);
1791e6e0c767SBaolin Wang 			if (next_pfn)
1792e6e0c767SBaolin Wang 				block_start_pfn = max(next_pfn, low_pfn);
1793e6e0c767SBaolin Wang 
1794ff9543fdSMichal Nazarewicz 			continue;
1795e6e0c767SBaolin Wang 		}
1796ff9543fdSMichal Nazarewicz 
1797ff9543fdSMichal Nazarewicz 		/* Check the block is suitable for migration */
17989f7e3387SVlastimil Babka 		if (!suitable_migration_target(cc, page))
1799ff9543fdSMichal Nazarewicz 			continue;
180068e3e926SLinus Torvalds 
1801bb13ffebSMel Gorman 		/* If isolation recently failed, do not retry */
1802bb13ffebSMel Gorman 		if (!isolation_suitable(cc, page))
1803bb13ffebSMel Gorman 			continue;
1804bb13ffebSMel Gorman 
1805e14c720eSVlastimil Babka 		/* Found a block suitable for isolating free pages from. */
18064fca9730SMel Gorman 		nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
1807733aea0bSZi Yan 					block_end_pfn, cc->freepages, stride, false);
1808ff9543fdSMichal Nazarewicz 
1809d097a6f6SMel Gorman 		/* Update the skip hint if the full pageblock was scanned */
1810d097a6f6SMel Gorman 		if (isolate_start_pfn == block_end_pfn)
181116951789SKemeng Shi 			update_pageblock_skip(cc, page, block_start_pfn -
181216951789SKemeng Shi 					      pageblock_nr_pages);
1813d097a6f6SMel Gorman 
1814cb2dcaf0SMel Gorman 		/* Are enough freepages isolated? */
1815cb2dcaf0SMel Gorman 		if (cc->nr_freepages >= cc->nr_migratepages) {
1816a46cbf3bSDavid Rientjes 			if (isolate_start_pfn >= block_end_pfn) {
1817a46cbf3bSDavid Rientjes 				/*
1818a46cbf3bSDavid Rientjes 				 * Restart at previous pageblock if more
1819a46cbf3bSDavid Rientjes 				 * freepages can be isolated next time.
1820a46cbf3bSDavid Rientjes 				 */
1821f5f61a32SVlastimil Babka 				isolate_start_pfn =
1822e14c720eSVlastimil Babka 					block_start_pfn - pageblock_nr_pages;
1823a46cbf3bSDavid Rientjes 			}
1824be976572SVlastimil Babka 			break;
1825a46cbf3bSDavid Rientjes 		} else if (isolate_start_pfn < block_end_pfn) {
1826f5f61a32SVlastimil Babka 			/*
1827a46cbf3bSDavid Rientjes 			 * If isolation failed early, do not continue
1828a46cbf3bSDavid Rientjes 			 * needlessly.
1829f5f61a32SVlastimil Babka 			 */
1830a46cbf3bSDavid Rientjes 			break;
1831f5f61a32SVlastimil Babka 		}
18324fca9730SMel Gorman 
18334fca9730SMel Gorman 		/* Adjust stride depending on isolation */
18344fca9730SMel Gorman 		if (nr_isolated) {
18354fca9730SMel Gorman 			stride = 1;
18364fca9730SMel Gorman 			continue;
18374fca9730SMel Gorman 		}
18384fca9730SMel Gorman 		stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
1839c89511abSMel Gorman 	}
1840ff9543fdSMichal Nazarewicz 
18417ed695e0SVlastimil Babka 	/*
1842f5f61a32SVlastimil Babka 	 * Record where the free scanner will restart next time. Either we
1843f5f61a32SVlastimil Babka 	 * broke from the loop and set isolate_start_pfn based on the last
1844f5f61a32SVlastimil Babka 	 * call to isolate_freepages_block(), or we met the migration scanner
1845f5f61a32SVlastimil Babka 	 * and the loop terminated due to isolate_start_pfn < low_pfn
18467ed695e0SVlastimil Babka 	 */
1847f5f61a32SVlastimil Babka 	cc->free_pfn = isolate_start_pfn;
1848748446bbSMel Gorman }
1849748446bbSMel Gorman 
1850748446bbSMel Gorman /*
1851748446bbSMel Gorman  * This is a migrate-callback that "allocates" freepages by taking pages
1852748446bbSMel Gorman  * from the per-order lists of pages isolated by the free scanner.
1853748446bbSMel Gorman  */
18544e096ae1SMatthew Wilcox (Oracle) static struct folio *compaction_alloc(struct folio *src, unsigned long data)
1855748446bbSMel Gorman {
1856748446bbSMel Gorman 	struct compact_control *cc = (struct compact_control *)data;
18574e096ae1SMatthew Wilcox (Oracle) 	struct folio *dst;
1858733aea0bSZi Yan 	int order = folio_order(src);
1859*73318e2cSZi Yan 	bool has_isolated_pages = false;
1860*73318e2cSZi Yan 	int start_order;
1861*73318e2cSZi Yan 	struct page *freepage;
1862*73318e2cSZi Yan 	unsigned long size;
1863748446bbSMel Gorman 
1864*73318e2cSZi Yan again:
1865*73318e2cSZi Yan 	for (start_order = order; start_order < NR_PAGE_ORDERS; start_order++)
1866*73318e2cSZi Yan 		if (!list_empty(&cc->freepages[start_order]))
1867*73318e2cSZi Yan 			break;
1868748446bbSMel Gorman 
1869*73318e2cSZi Yan 	/* no free pages in the list */
1870*73318e2cSZi Yan 	if (start_order == NR_PAGE_ORDERS) {
1871*73318e2cSZi Yan 		if (has_isolated_pages)
1872748446bbSMel Gorman 			return NULL;
1873*73318e2cSZi Yan 		isolate_freepages(cc);
1874*73318e2cSZi Yan 		has_isolated_pages = true;
1875*73318e2cSZi Yan 		goto again;
1876748446bbSMel Gorman 	}
1877748446bbSMel Gorman 
1878*73318e2cSZi Yan 	freepage = list_first_entry(&cc->freepages[start_order], struct page,
1879*73318e2cSZi Yan 				lru);
1880*73318e2cSZi Yan 	size = 1 << start_order;
1881748446bbSMel Gorman 
1882*73318e2cSZi Yan 	list_del(&freepage->lru);
1883*73318e2cSZi Yan 
1884*73318e2cSZi Yan 	while (start_order > order) {
1885*73318e2cSZi Yan 		start_order--;
1886*73318e2cSZi Yan 		size >>= 1;
1887*73318e2cSZi Yan 
1888*73318e2cSZi Yan 		list_add(&freepage[size].lru, &cc->freepages[start_order]);
1889*73318e2cSZi Yan 		set_page_private(&freepage[size], start_order);
1890*73318e2cSZi Yan 	}
1891*73318e2cSZi Yan 	dst = (struct folio *)freepage;
1892*73318e2cSZi Yan 
1893733aea0bSZi Yan 	post_alloc_hook(&dst->page, order, __GFP_MOVABLE);
1894733aea0bSZi Yan 	if (order)
1895733aea0bSZi Yan 		prep_compound_page(&dst->page, order);
1896733aea0bSZi Yan 	cc->nr_freepages -= 1 << order;
1897733aea0bSZi Yan 	cc->nr_migratepages -= 1 << order;
1898733aea0bSZi Yan 	return page_rmappable_folio(&dst->page);
1899748446bbSMel Gorman }
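/*
 * For example, an order-2 request that finds its first non-empty list at
 * order 4 splits the 16-page block: the upper order-3 half goes back on
 * cc->freepages[3], the next order-2 quarter on cc->freepages[2], and the
 * remaining four pages are prepped and returned as the destination folio.
 */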
1900748446bbSMel Gorman 
1901748446bbSMel Gorman /*
1902d53aea3dSDavid Rientjes  * This is a migrate-callback that "frees" freepages back to the isolated
1903d53aea3dSDavid Rientjes  * freelists.  All pages on the freelists are from the same zone, so there is no
1904d53aea3dSDavid Rientjes  * special handling needed for NUMA.
1905d53aea3dSDavid Rientjes  */
19064e096ae1SMatthew Wilcox (Oracle) static void compaction_free(struct folio *dst, unsigned long data)
1907d53aea3dSDavid Rientjes {
1908d53aea3dSDavid Rientjes 	struct compact_control *cc = (struct compact_control *)data;
1909733aea0bSZi Yan 	int order = folio_order(dst);
1910733aea0bSZi Yan 	struct page *page = &dst->page;
1911d53aea3dSDavid Rientjes 
1912733aea0bSZi Yan 	if (folio_put_testzero(dst)) {
1913733aea0bSZi Yan 		free_pages_prepare(page, order);
1914733aea0bSZi Yan 		list_add(&dst->lru, &cc->freepages[order]);
1915733aea0bSZi Yan 		cc->nr_freepages += 1 << order;
1916733aea0bSZi Yan 	}
1917733aea0bSZi Yan 	cc->nr_migratepages += 1 << order;
1918733aea0bSZi Yan 	/*
1919733aea0bSZi Yan 	 * Otherwise someone else still holds a reference to the page and we
1920733aea0bSZi Yan 	 * cannot take it back onto our free list.
1921733aea0bSZi Yan 	 */
1922d53aea3dSDavid Rientjes }
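/*
 * Note that compaction_alloc() decremented both counters when it handed the
 * destination out: nr_migratepages is restored unconditionally here because
 * the source pages still need migrating, while nr_freepages only grows back
 * if the final reference was dropped and the pages rejoined cc->freepages[].
 */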
1923d53aea3dSDavid Rientjes 
1924ff9543fdSMichal Nazarewicz /* possible outcome of isolate_migratepages */
1925ff9543fdSMichal Nazarewicz typedef enum {
1926ff9543fdSMichal Nazarewicz 	ISOLATE_ABORT,		/* Abort compaction now */
1927ff9543fdSMichal Nazarewicz 	ISOLATE_NONE,		/* No pages isolated, continue scanning */
1928ff9543fdSMichal Nazarewicz 	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
1929ff9543fdSMichal Nazarewicz } isolate_migrate_t;
1930ff9543fdSMichal Nazarewicz 
1931ff9543fdSMichal Nazarewicz /*
19325bbe3547SEric B Munson  * Allow userspace to control policy on scanning the unevictable LRU for
19335bbe3547SEric B Munson  * compactable pages.
19345bbe3547SEric B Munson  */
193548fe8ab8SMinghao Chi static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
193648fe8ab8SMinghao Chi /*
193748fe8ab8SMinghao Chi  * Tunable for proactive compaction. It determines how
193848fe8ab8SMinghao Chi  * aggressively the kernel should compact memory in the
193948fe8ab8SMinghao Chi  * background. It takes values in the range [0, 100].
194048fe8ab8SMinghao Chi  */
194148fe8ab8SMinghao Chi static unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
194248fe8ab8SMinghao Chi static int sysctl_extfrag_threshold = 500;
19438b9167cdSWen Yang static int __read_mostly sysctl_compact_memory;
19445bbe3547SEric B Munson 
194570b44595SMel Gorman static inline void
194670b44595SMel Gorman update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
194770b44595SMel Gorman {
194870b44595SMel Gorman 	if (cc->fast_start_pfn == ULONG_MAX)
194970b44595SMel Gorman 		return;
195070b44595SMel Gorman 
195170b44595SMel Gorman 	if (!cc->fast_start_pfn)
195270b44595SMel Gorman 		cc->fast_start_pfn = pfn;
195370b44595SMel Gorman 
195470b44595SMel Gorman 	cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
195570b44595SMel Gorman }
195670b44595SMel Gorman 
195770b44595SMel Gorman static inline unsigned long
195870b44595SMel Gorman reinit_migrate_pfn(struct compact_control *cc)
195970b44595SMel Gorman {
196070b44595SMel Gorman 	if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
196170b44595SMel Gorman 		return cc->migrate_pfn;
196270b44595SMel Gorman 
196370b44595SMel Gorman 	cc->migrate_pfn = cc->fast_start_pfn;
196470b44595SMel Gorman 	cc->fast_start_pfn = ULONG_MAX;
196570b44595SMel Gorman 
196670b44595SMel Gorman 	return cc->migrate_pfn;
196770b44595SMel Gorman }
196870b44595SMel Gorman 
196970b44595SMel Gorman /*
197070b44595SMel Gorman  * Briefly search the free lists for a migration source that already has
197170b44595SMel Gorman  * some free pages to reduce the number of pages that need migration
197270b44595SMel Gorman  * before a pageblock is free.
197370b44595SMel Gorman  */
197470b44595SMel Gorman static unsigned long fast_find_migrateblock(struct compact_control *cc)
197570b44595SMel Gorman {
197670b44595SMel Gorman 	unsigned int limit = freelist_scan_limit(cc);
197770b44595SMel Gorman 	unsigned int nr_scanned = 0;
197870b44595SMel Gorman 	unsigned long distance;
197970b44595SMel Gorman 	unsigned long pfn = cc->migrate_pfn;
198070b44595SMel Gorman 	unsigned long high_pfn;
198170b44595SMel Gorman 	int order;
198215d28d0dSWonhyuk Yang 	bool found_block = false;
198370b44595SMel Gorman 
198470b44595SMel Gorman 	/* Skip hints are relied on to avoid repeats on the fast search */
198570b44595SMel Gorman 	if (cc->ignore_skip_hint)
198670b44595SMel Gorman 		return pfn;
198770b44595SMel Gorman 
198870b44595SMel Gorman 	/*
1989f9d7fc1aSMel Gorman 	 * If the pageblock should be finished then do not select a different
1990f9d7fc1aSMel Gorman 	 * pageblock.
1991f9d7fc1aSMel Gorman 	 */
1992f9d7fc1aSMel Gorman 	if (cc->finish_pageblock)
1993f9d7fc1aSMel Gorman 		return pfn;
1994f9d7fc1aSMel Gorman 
1995f9d7fc1aSMel Gorman 	/*
199670b44595SMel Gorman 	 * If the migrate_pfn is not at the start of a zone or the start
199770b44595SMel Gorman 	 * of a pageblock then assume this is a continuation of a previous
199870b44595SMel Gorman 	 * scan restarted due to COMPACT_CLUSTER_MAX.
199970b44595SMel Gorman 	 */
200070b44595SMel Gorman 	if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
200170b44595SMel Gorman 		return pfn;
200270b44595SMel Gorman 
200370b44595SMel Gorman 	/*
200470b44595SMel Gorman 	 * For smaller orders, just linearly scan as the number of pages
200570b44595SMel Gorman 	 * to migrate should be relatively small and does not necessarily
200670b44595SMel Gorman 	 * justify freeing up a large block for a small allocation.
200770b44595SMel Gorman 	 */
200870b44595SMel Gorman 	if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
200970b44595SMel Gorman 		return pfn;
201070b44595SMel Gorman 
201170b44595SMel Gorman 	/*
201270b44595SMel Gorman 	 * Only allow kcompactd and direct requests for movable pages to
201370b44595SMel Gorman 	 * quickly clear out a MOVABLE pageblock for allocation. This
201470b44595SMel Gorman 	 * reduces the risk that a large movable pageblock is freed for
201570b44595SMel Gorman 	 * an unmovable/reclaimable small allocation.
201670b44595SMel Gorman 	 */
201770b44595SMel Gorman 	if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
201870b44595SMel Gorman 		return pfn;
201970b44595SMel Gorman 
202070b44595SMel Gorman 	/*
202170b44595SMel Gorman 	 * When starting the migration scanner, pick any pageblock within the
202270b44595SMel Gorman 	 * first half of the search space. Otherwise try to pick a pageblock
202370b44595SMel Gorman 	 * within the first eighth to reduce the chances that a migration
202470b44595SMel Gorman 	 * target later becomes a source.
202570b44595SMel Gorman 	 */
202670b44595SMel Gorman 	distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
202770b44595SMel Gorman 	if (cc->migrate_pfn != cc->zone->zone_start_pfn)
202870b44595SMel Gorman 		distance >>= 2;
202970b44595SMel Gorman 	high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
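	/*
	 * E.g. with cc->migrate_pfn at the zone start and 0x8000 pfns left to
	 * the free scanner, candidate blocks below pfn 0x4000 qualify; on
	 * later passes the window shrinks to an eighth of the remaining span.
	 */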
203070b44595SMel Gorman 
203170b44595SMel Gorman 	for (order = cc->order - 1;
203215d28d0dSWonhyuk Yang 	     order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
203370b44595SMel Gorman 	     order--) {
203470b44595SMel Gorman 		struct free_area *area = &cc->zone->free_area[order];
203570b44595SMel Gorman 		struct list_head *freelist;
203670b44595SMel Gorman 		unsigned long flags;
203770b44595SMel Gorman 		struct page *freepage;
203870b44595SMel Gorman 
203970b44595SMel Gorman 		if (!area->nr_free)
204070b44595SMel Gorman 			continue;
204170b44595SMel Gorman 
204270b44595SMel Gorman 		spin_lock_irqsave(&cc->zone->lock, flags);
204370b44595SMel Gorman 		freelist = &area->free_list[MIGRATE_MOVABLE];
204494ec2003SBaolin Wang 		list_for_each_entry(freepage, freelist, buddy_list) {
204570b44595SMel Gorman 			unsigned long free_pfn;
204670b44595SMel Gorman 
204715d28d0dSWonhyuk Yang 			if (nr_scanned++ >= limit) {
204815d28d0dSWonhyuk Yang 				move_freelist_tail(freelist, freepage);
204915d28d0dSWonhyuk Yang 				break;
205015d28d0dSWonhyuk Yang 			}
205115d28d0dSWonhyuk Yang 
205270b44595SMel Gorman 			free_pfn = page_to_pfn(freepage);
205370b44595SMel Gorman 			if (free_pfn < high_pfn) {
205470b44595SMel Gorman 				/*
205570b44595SMel Gorman 				 * Avoid if skipped recently. Ideally it would
205670b44595SMel Gorman 				 * move to the tail but even safe iteration of
205770b44595SMel Gorman 				 * the list assumes an entry is deleted, not
205870b44595SMel Gorman 				 * reordered.
205970b44595SMel Gorman 				 */
206015d28d0dSWonhyuk Yang 				if (get_pageblock_skip(freepage))
206170b44595SMel Gorman 					continue;
206270b44595SMel Gorman 
206370b44595SMel Gorman 				/* Reorder so a future search skips recent pages */
206470b44595SMel Gorman 				move_freelist_tail(freelist, freepage);
206570b44595SMel Gorman 
2066e380bebeSMel Gorman 				update_fast_start_pfn(cc, free_pfn);
206770b44595SMel Gorman 				pfn = pageblock_start_pfn(free_pfn);
2068bbe832b9SRei Yamamoto 				if (pfn < cc->zone->zone_start_pfn)
2069bbe832b9SRei Yamamoto 					pfn = cc->zone->zone_start_pfn;
207070b44595SMel Gorman 				cc->fast_search_fail = 0;
207115d28d0dSWonhyuk Yang 				found_block = true;
207270b44595SMel Gorman 				break;
207370b44595SMel Gorman 			}
207470b44595SMel Gorman 		}
207570b44595SMel Gorman 		spin_unlock_irqrestore(&cc->zone->lock, flags);
207670b44595SMel Gorman 	}
207770b44595SMel Gorman 
207870b44595SMel Gorman 	cc->total_migrate_scanned += nr_scanned;
207970b44595SMel Gorman 
208070b44595SMel Gorman 	/*
208170b44595SMel Gorman 	 * If fast scanning failed then use a cached entry for a page block
208270b44595SMel Gorman 	 * that had free pages as the basis for starting a linear scan.
208370b44595SMel Gorman 	 */
208415d28d0dSWonhyuk Yang 	if (!found_block) {
208515d28d0dSWonhyuk Yang 		cc->fast_search_fail++;
208670b44595SMel Gorman 		pfn = reinit_migrate_pfn(cc);
208715d28d0dSWonhyuk Yang 	}
208870b44595SMel Gorman 	return pfn;
208970b44595SMel Gorman }
209070b44595SMel Gorman 
20915bbe3547SEric B Munson /*
2092edc2ca61SVlastimil Babka  * Isolate all pages that can be migrated from the first suitable block,
2093edc2ca61SVlastimil Babka  * starting at the block pointed to by the migrate scanner pfn within
2094edc2ca61SVlastimil Babka  * compact_control.
2095ff9543fdSMichal Nazarewicz  */
209632aaf055SPengfei Li static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
2097ff9543fdSMichal Nazarewicz {
2098e1409c32SJoonsoo Kim 	unsigned long block_start_pfn;
2099e1409c32SJoonsoo Kim 	unsigned long block_end_pfn;
2100e1409c32SJoonsoo Kim 	unsigned long low_pfn;
2101edc2ca61SVlastimil Babka 	struct page *page;
2102edc2ca61SVlastimil Babka 	const isolate_mode_t isolate_mode =
21035bbe3547SEric B Munson 		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
21041d2047feSHugh Dickins 		(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
210570b44595SMel Gorman 	bool fast_find_block;
2106ff9543fdSMichal Nazarewicz 
2107edc2ca61SVlastimil Babka 	/*
2108edc2ca61SVlastimil Babka 	 * Start at where we last stopped, or beginning of the zone as
210970b44595SMel Gorman 	 * initialized by compact_zone(). The first failure will use
211070b44595SMel Gorman 	 * the lowest PFN as the starting point for linear scanning.
2111edc2ca61SVlastimil Babka 	 */
211270b44595SMel Gorman 	low_pfn = fast_find_migrateblock(cc);
211306b6640aSVlastimil Babka 	block_start_pfn = pageblock_start_pfn(low_pfn);
211432aaf055SPengfei Li 	if (block_start_pfn < cc->zone->zone_start_pfn)
211532aaf055SPengfei Li 		block_start_pfn = cc->zone->zone_start_pfn;
2116ff9543fdSMichal Nazarewicz 
211770b44595SMel Gorman 	/*
21180aa8ea3cSKemeng Shi 	 * fast_find_migrateblock() has already ensured that the pageblock is
21190aa8ea3cSKemeng Shi 	 * not marked for skipping, so record whether the fast search succeeded
21200aa8ea3cSKemeng Shi 	 * to avoid repeating the isolation_suitable() check below.
212170b44595SMel Gorman 	 */
212270b44595SMel Gorman 	fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
212370b44595SMel Gorman 
2124ff9543fdSMichal Nazarewicz 	/* Only scan within a pageblock boundary */
212506b6640aSVlastimil Babka 	block_end_pfn = pageblock_end_pfn(low_pfn);
2126ff9543fdSMichal Nazarewicz 
2127edc2ca61SVlastimil Babka 	/*
2128edc2ca61SVlastimil Babka 	 * Iterate over whole pageblocks until we find the first suitable.
2129edc2ca61SVlastimil Babka 	 * Do not cross the free scanner.
2130edc2ca61SVlastimil Babka 	 */
2131e1409c32SJoonsoo Kim 	for (; block_end_pfn <= cc->free_pfn;
213270b44595SMel Gorman 			fast_find_block = false,
2133c2ad7a1fSOscar Salvador 			cc->migrate_pfn = low_pfn = block_end_pfn,
2134e1409c32SJoonsoo Kim 			block_start_pfn = block_end_pfn,
2135e1409c32SJoonsoo Kim 			block_end_pfn += pageblock_nr_pages) {
2136edc2ca61SVlastimil Babka 
2137edc2ca61SVlastimil Babka 		/*
2138edc2ca61SVlastimil Babka 		 * This can potentially iterate a massively long zone with
2139edc2ca61SVlastimil Babka 		 * many pageblocks unsuitable, so periodically check if we
2140cb810ad2SMel Gorman 		 * need to schedule.
2141edc2ca61SVlastimil Babka 		 */
2142c036ddffSMiaohe Lin 		if (!(low_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
2143cf66f070SMel Gorman 			cond_resched();
2144edc2ca61SVlastimil Babka 
214532aaf055SPengfei Li 		page = pageblock_pfn_to_page(block_start_pfn,
214632aaf055SPengfei Li 						block_end_pfn, cc->zone);
21479721fd82SBaolin Wang 		if (!page) {
21489721fd82SBaolin Wang 			unsigned long next_pfn;
21499721fd82SBaolin Wang 
21509721fd82SBaolin Wang 			next_pfn = skip_offline_sections(block_start_pfn);
21519721fd82SBaolin Wang 			if (next_pfn)
21529721fd82SBaolin Wang 				block_end_pfn = min(next_pfn, cc->free_pfn);
2153edc2ca61SVlastimil Babka 			continue;
21549721fd82SBaolin Wang 		}
2155edc2ca61SVlastimil Babka 
2156e380bebeSMel Gorman 		/*
2157e380bebeSMel Gorman 		 * If isolation recently failed, do not retry. Check the skip
2158e380bebeSMel Gorman 		 * hint only once per pageblock, since COMPACT_CLUSTER_MAX can
2159e380bebeSMel Gorman 		 * cause a pageblock to be visited multiple times. Assume the
2160e380bebeSMel Gorman 		 * skip hint was checked before the block was marked "skip", so
2161e380bebeSMel Gorman 		 * other compaction instances do not scan the same block.
2162e380bebeSMel Gorman 		 */
2163493614daSJohannes Weiner 		if ((pageblock_aligned(low_pfn) ||
2164493614daSJohannes Weiner 		     low_pfn == cc->zone->zone_start_pfn) &&
2165e380bebeSMel Gorman 		    !fast_find_block && !isolation_suitable(cc, page))
2166edc2ca61SVlastimil Babka 			continue;
2167edc2ca61SVlastimil Babka 
2168edc2ca61SVlastimil Babka 		/*
2169556162bfSMiaohe Lin 		 * For async direct compaction, only scan the pageblocks of the
2170556162bfSMiaohe Lin 		 * same migratetype without huge pages. Async direct compaction
2171556162bfSMiaohe Lin 		 * optimistically checks whether the minimum amount of work satisfies
2172556162bfSMiaohe Lin 		 * the allocation. The cached PFN is updated as it's possible
2173556162bfSMiaohe Lin 		 * that all remaining blocks between source and target are
2174556162bfSMiaohe Lin 		 * unsuitable and the compaction scanners fail to meet.
2175edc2ca61SVlastimil Babka 		 */
21769bebefd5SMel Gorman 		if (!suitable_migration_source(cc, page)) {
21779bebefd5SMel Gorman 			update_cached_migrate(cc, block_end_pfn);
2178edc2ca61SVlastimil Babka 			continue;
21799bebefd5SMel Gorman 		}
2180ff9543fdSMichal Nazarewicz 
2181ff9543fdSMichal Nazarewicz 		/* Perform the isolation */
2182c2ad7a1fSOscar Salvador 		if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
2183c2ad7a1fSOscar Salvador 						isolate_mode))
2184ff9543fdSMichal Nazarewicz 			return ISOLATE_ABORT;
2185ff9543fdSMichal Nazarewicz 
2186edc2ca61SVlastimil Babka 		/*
2187edc2ca61SVlastimil Babka 		 * Either we isolated something and will proceed with migration,
2188edc2ca61SVlastimil Babka 		 * or we failed and compact_zone should decide whether to
2189edc2ca61SVlastimil Babka 		 * continue or not.
2190edc2ca61SVlastimil Babka 		 */
2191edc2ca61SVlastimil Babka 		break;
2192edc2ca61SVlastimil Babka 	}
2193edc2ca61SVlastimil Babka 
2194edc2ca61SVlastimil Babka 	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
2195ff9543fdSMichal Nazarewicz }
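
/*
 * Illustrative example (not from the original source): assuming the usual
 * sysctl_compact_unevictable_allowed default of 1 and cc->mode ==
 * MIGRATE_ASYNC, isolate_mode above evaluates to
 * ISOLATE_UNEVICTABLE | ISOLATE_ASYNC_MIGRATE. Only MIGRATE_SYNC drops the
 * ISOLATE_ASYNC_MIGRATE bit; MIGRATE_SYNC_LIGHT keeps it as well.
 */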
2196ff9543fdSMichal Nazarewicz 
219721c527a3SYaowei Bai /*
2198b4a0215eSKefeng Wang  * Determine whether kswapd is (or recently was!) running on this node.
2199b4a0215eSKefeng Wang  *
2200b4a0215eSKefeng Wang  * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
2201b4a0215eSKefeng Wang  * zero it.
2202b4a0215eSKefeng Wang  */
2203facdaa91SNitin Gupta static bool kswapd_is_running(pg_data_t *pgdat)
2204facdaa91SNitin Gupta {
2205b4a0215eSKefeng Wang 	bool running;
2206b4a0215eSKefeng Wang 
2207b4a0215eSKefeng Wang 	pgdat_kswapd_lock(pgdat);
2208b4a0215eSKefeng Wang 	running = pgdat->kswapd && task_is_running(pgdat->kswapd);
2209b4a0215eSKefeng Wang 	pgdat_kswapd_unlock(pgdat);
2210b4a0215eSKefeng Wang 
2211b4a0215eSKefeng Wang 	return running;
2212facdaa91SNitin Gupta }
2213facdaa91SNitin Gupta 
2214facdaa91SNitin Gupta /*
2215facdaa91SNitin Gupta  * A zone's fragmentation score is the external fragmentation with respect to
221640d7e203SCharan Teja Reddy  * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
221740d7e203SCharan Teja Reddy  */
221840d7e203SCharan Teja Reddy static unsigned int fragmentation_score_zone(struct zone *zone)
221940d7e203SCharan Teja Reddy {
222040d7e203SCharan Teja Reddy 	return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
222140d7e203SCharan Teja Reddy }
222240d7e203SCharan Teja Reddy 
222340d7e203SCharan Teja Reddy /*
222440d7e203SCharan Teja Reddy  * A weighted zone's fragmentation score is the external fragmentation
222540d7e203SCharan Teja Reddy  * with respect to COMPACTION_HPAGE_ORDER, scaled by the zone's size. It
222640d7e203SCharan Teja Reddy  * returns a value in the range [0, 100].
2227facdaa91SNitin Gupta  *
2228facdaa91SNitin Gupta  * The scaling factor ensures that proactive compaction focuses on larger
2229facdaa91SNitin Gupta  * zones like ZONE_NORMAL, rather than smaller, specialized zones like
2230facdaa91SNitin Gupta  * ZONE_DMA32. For smaller zones, the score value remains close to zero,
2231facdaa91SNitin Gupta  * and thus never exceeds the high threshold for proactive compaction.
2232facdaa91SNitin Gupta  */
223340d7e203SCharan Teja Reddy static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
2234facdaa91SNitin Gupta {
2235facdaa91SNitin Gupta 	unsigned long score;
2236facdaa91SNitin Gupta 
223740d7e203SCharan Teja Reddy 	score = zone->present_pages * fragmentation_score_zone(zone);
2238facdaa91SNitin Gupta 	return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
2239facdaa91SNitin Gupta }
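
/*
 * Worked example with illustrative numbers (not taken from a real system):
 * on a node where node_present_pages corresponds to 4 GiB, a zone covering
 * 3 GiB with a raw fragmentation score of 60 contributes about 60 * 3/4 = 45
 * to the node score, while a 256 MiB zone with the same raw score adds only
 * about 60 / 16 ~= 3 and so never drives proactive compaction on its own.
 */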
2240facdaa91SNitin Gupta 
2241facdaa91SNitin Gupta /*
2242facdaa91SNitin Gupta  * The per-node proactive (background) compaction process is started by its
2243facdaa91SNitin Gupta  * corresponding kcompactd thread when the node's fragmentation score
2244facdaa91SNitin Gupta  * exceeds the high threshold. The compaction process remains active till
2245facdaa91SNitin Gupta  * the node's score falls below the low threshold, or one of the back-off
2246facdaa91SNitin Gupta  * conditions is met.
2247facdaa91SNitin Gupta  */
2248d34c0a75SNitin Gupta static unsigned int fragmentation_score_node(pg_data_t *pgdat)
2249facdaa91SNitin Gupta {
2250d34c0a75SNitin Gupta 	unsigned int score = 0;
2251facdaa91SNitin Gupta 	int zoneid;
2252facdaa91SNitin Gupta 
2253facdaa91SNitin Gupta 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
2254facdaa91SNitin Gupta 		struct zone *zone;
2255facdaa91SNitin Gupta 
2256facdaa91SNitin Gupta 		zone = &pgdat->node_zones[zoneid];
22579e552271SBaolin Wang 		if (!populated_zone(zone))
22589e552271SBaolin Wang 			continue;
225940d7e203SCharan Teja Reddy 		score += fragmentation_score_zone_weighted(zone);
2260facdaa91SNitin Gupta 	}
2261facdaa91SNitin Gupta 
2262facdaa91SNitin Gupta 	return score;
2263facdaa91SNitin Gupta }
2264facdaa91SNitin Gupta 
22658fbb92bdSKemeng Shi static unsigned int fragmentation_score_wmark(bool low)
2266facdaa91SNitin Gupta {
2267d34c0a75SNitin Gupta 	unsigned int wmark_low;
2268facdaa91SNitin Gupta 
2269facdaa91SNitin Gupta 	/*
2270f0953a1bSIngo Molnar 	 * Cap the low watermark to avoid excessive compaction
2271f0953a1bSIngo Molnar 	 * activity in case a user sets the proactiveness tunable
2272facdaa91SNitin Gupta 	 * close to 100 (maximum).
2273facdaa91SNitin Gupta 	 */
2274d34c0a75SNitin Gupta 	wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
2275d34c0a75SNitin Gupta 	return low ? wmark_low : min(wmark_low + 10, 100U);
2276facdaa91SNitin Gupta }
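
/*
 * Worked example (illustrative): with sysctl_compaction_proactiveness = 20,
 * wmark_low = max(100 - 20, 5) = 80 and wmark_high = min(80 + 10, 100) = 90.
 * At proactiveness = 100 the cap keeps the thresholds at 5 and 15 rather than
 * letting them fall to 0 and 10.
 */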
2277facdaa91SNitin Gupta 
2278facdaa91SNitin Gupta static bool should_proactive_compact_node(pg_data_t *pgdat)
2279facdaa91SNitin Gupta {
2280facdaa91SNitin Gupta 	int wmark_high;
2281facdaa91SNitin Gupta 
2282facdaa91SNitin Gupta 	if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
2283facdaa91SNitin Gupta 		return false;
2284facdaa91SNitin Gupta 
22858fbb92bdSKemeng Shi 	wmark_high = fragmentation_score_wmark(false);
2286facdaa91SNitin Gupta 	return fragmentation_score_node(pgdat) > wmark_high;
2287facdaa91SNitin Gupta }
2288facdaa91SNitin Gupta 
228940cacbcbSMel Gorman static enum compact_result __compact_finished(struct compact_control *cc)
2290748446bbSMel Gorman {
22918fb74b9fSMel Gorman 	unsigned int order;
2292d39773a0SVlastimil Babka 	const int migratetype = cc->migratetype;
2293cb2dcaf0SMel Gorman 	int ret;
2294748446bbSMel Gorman 
2295753341a4SMel Gorman 	/* Compaction run completes if the migrate and free scanner meet */
2296f2849aa0SVlastimil Babka 	if (compact_scanners_met(cc)) {
229755b7c4c9SVlastimil Babka 		/* Let the next compaction start anew. */
229840cacbcbSMel Gorman 		reset_cached_positions(cc->zone);
229955b7c4c9SVlastimil Babka 
230062997027SMel Gorman 		/*
230162997027SMel Gorman 		 * Mark that the PG_migrate_skip information should be cleared
2302accf6242SVlastimil Babka 		 * by kswapd when it goes to sleep. kcompactd does not set the
230362997027SMel Gorman 		 * flag itself, as the decision to clear it should be based
230462997027SMel Gorman 		 * directly on an allocation request.
230562997027SMel Gorman 		 */
2306accf6242SVlastimil Babka 		if (cc->direct_compaction)
230740cacbcbSMel Gorman 			cc->zone->compact_blockskip_flush = true;
230862997027SMel Gorman 
2309c8f7de0bSMichal Hocko 		if (cc->whole_zone)
2310748446bbSMel Gorman 			return COMPACT_COMPLETE;
2311c8f7de0bSMichal Hocko 		else
2312c8f7de0bSMichal Hocko 			return COMPACT_PARTIAL_SKIPPED;
2313bb13ffebSMel Gorman 	}
2314748446bbSMel Gorman 
2315facdaa91SNitin Gupta 	if (cc->proactive_compaction) {
2316facdaa91SNitin Gupta 		int score, wmark_low;
2317facdaa91SNitin Gupta 		pg_data_t *pgdat;
2318facdaa91SNitin Gupta 
2319facdaa91SNitin Gupta 		pgdat = cc->zone->zone_pgdat;
2320facdaa91SNitin Gupta 		if (kswapd_is_running(pgdat))
2321facdaa91SNitin Gupta 			return COMPACT_PARTIAL_SKIPPED;
2322facdaa91SNitin Gupta 
2323facdaa91SNitin Gupta 		score = fragmentation_score_zone(cc->zone);
23248fbb92bdSKemeng Shi 		wmark_low = fragmentation_score_wmark(true);
2325facdaa91SNitin Gupta 
2326facdaa91SNitin Gupta 		if (score > wmark_low)
2327facdaa91SNitin Gupta 			ret = COMPACT_CONTINUE;
2328facdaa91SNitin Gupta 		else
2329facdaa91SNitin Gupta 			ret = COMPACT_SUCCESS;
2330facdaa91SNitin Gupta 
2331facdaa91SNitin Gupta 		goto out;
2332facdaa91SNitin Gupta 	}
2333facdaa91SNitin Gupta 
233421c527a3SYaowei Bai 	if (is_via_compact_memory(cc->order))
233556de7263SMel Gorman 		return COMPACT_CONTINUE;
233656de7263SMel Gorman 
2337baf6a9a1SVlastimil Babka 	/*
2338efe771c7SMel Gorman 	 * Always finish scanning a pageblock to reduce the possibility of
2339efe771c7SMel Gorman 	 * fallbacks in the future. This is particularly important when
2340efe771c7SMel Gorman 	 * migration source is unmovable/reclaimable but it's not worth
2341efe771c7SMel Gorman 	 * special casing.
2342baf6a9a1SVlastimil Babka 	 */
2343ee0913c4SKefeng Wang 	if (!pageblock_aligned(cc->migrate_pfn))
2344baf6a9a1SVlastimil Babka 		return COMPACT_CONTINUE;
2345baf6a9a1SVlastimil Babka 
234656de7263SMel Gorman 	/* Direct compactor: Is a suitable page free? */
2347cb2dcaf0SMel Gorman 	ret = COMPACT_NO_SUITABLE_PAGE;
2348fd377218SKirill A. Shutemov 	for (order = cc->order; order < NR_PAGE_ORDERS; order++) {
234940cacbcbSMel Gorman 		struct free_area *area = &cc->zone->free_area[order];
23502149cdaeSJoonsoo Kim 		bool can_steal;
23518fb74b9fSMel Gorman 
235256de7263SMel Gorman 		/* Job done if page is free of the right migratetype */
2353b03641afSDan Williams 		if (!free_area_empty(area, migratetype))
2354cf378319SVlastimil Babka 			return COMPACT_SUCCESS;
235556de7263SMel Gorman 
23562149cdaeSJoonsoo Kim #ifdef CONFIG_CMA
23572149cdaeSJoonsoo Kim 		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
23582149cdaeSJoonsoo Kim 		if (migratetype == MIGRATE_MOVABLE &&
2359b03641afSDan Williams 			!free_area_empty(area, MIGRATE_CMA))
2360cf378319SVlastimil Babka 			return COMPACT_SUCCESS;
23612149cdaeSJoonsoo Kim #endif
23622149cdaeSJoonsoo Kim 		/*
23632149cdaeSJoonsoo Kim 		 * Job done if allocation would steal freepages from
23642149cdaeSJoonsoo Kim 		 * other migratetype buddy lists.
23652149cdaeSJoonsoo Kim 		 */
23662149cdaeSJoonsoo Kim 		if (find_suitable_fallback(area, order, migratetype,
2367fa599c44SMiaohe Lin 						true, &can_steal) != -1)
2368baf6a9a1SVlastimil Babka 			/*
2369fa599c44SMiaohe Lin 			 * Movable pages are OK in any pageblock. If we are
2370fa599c44SMiaohe Lin 			 * stealing for a non-movable allocation, make sure
2371fa599c44SMiaohe Lin 			 * we finish compacting the current pageblock first
2372fa599c44SMiaohe Lin 			 * (which is assured by the above migrate_pfn align
2373fa599c44SMiaohe Lin 			 * check) so it is as free as possible and we won't
2374fa599c44SMiaohe Lin 			 * have to steal another one soon.
2375baf6a9a1SVlastimil Babka 			 */
2376baf6a9a1SVlastimil Babka 			return COMPACT_SUCCESS;
2377baf6a9a1SVlastimil Babka 	}
2378baf6a9a1SVlastimil Babka 
2379facdaa91SNitin Gupta out:
2380cb2dcaf0SMel Gorman 	if (cc->contended || fatal_signal_pending(current))
2381cb2dcaf0SMel Gorman 		ret = COMPACT_CONTENDED;
2382cb2dcaf0SMel Gorman 
2383cb2dcaf0SMel Gorman 	return ret;
2384837d026dSJoonsoo Kim }
2385837d026dSJoonsoo Kim 
238640cacbcbSMel Gorman static enum compact_result compact_finished(struct compact_control *cc)
2387837d026dSJoonsoo Kim {
2388837d026dSJoonsoo Kim 	int ret;
2389837d026dSJoonsoo Kim 
239040cacbcbSMel Gorman 	ret = __compact_finished(cc);
239140cacbcbSMel Gorman 	trace_mm_compaction_finished(cc->zone, cc->order, ret);
2392837d026dSJoonsoo Kim 	if (ret == COMPACT_NO_SUITABLE_PAGE)
2393837d026dSJoonsoo Kim 		ret = COMPACT_CONTINUE;
2394837d026dSJoonsoo Kim 
2395837d026dSJoonsoo Kim 	return ret;
2396748446bbSMel Gorman }
2397748446bbSMel Gorman 
23983cf04937SJohannes Weiner static bool __compaction_suitable(struct zone *zone, int order,
239997a225e6SJoonsoo Kim 				  int highest_zoneidx,
240086a294a8SMichal Hocko 				  unsigned long wmark_target)
24013e7d3449SMel Gorman {
24023e7d3449SMel Gorman 	unsigned long watermark;
24033957c776SMichal Hocko 	/*
24049861a62cSVlastimil Babka 	 * Watermarks for order-0 must be met for compaction to be able to
2405984fdba6SVlastimil Babka 	 * isolate free pages for migration targets. This means that the
2406984fdba6SVlastimil Babka 	 * watermark and alloc_flags have to match, or be more pessimistic than
2407984fdba6SVlastimil Babka 	 * the check in __isolate_free_page(). We don't use the direct
2408984fdba6SVlastimil Babka 	 * compactor's alloc_flags, as they are not relevant for freepage
240997a225e6SJoonsoo Kim 	 * isolation. We however do use the direct compactor's highest_zoneidx
241097a225e6SJoonsoo Kim 	 * to skip over zones where lowmem reserves would prevent allocation
241197a225e6SJoonsoo Kim 	 * even if compaction succeeds.
24128348faf9SVlastimil Babka 	 * For costly orders, we require low watermark instead of min for
24138348faf9SVlastimil Babka 	 * compaction to proceed to increase its chances.
2414d883c6cfSJoonsoo Kim 	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
2415d883c6cfSJoonsoo Kim 	 * suitable migration targets
24163e7d3449SMel Gorman 	 */
24178348faf9SVlastimil Babka 	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
24188348faf9SVlastimil Babka 				low_wmark_pages(zone) : min_wmark_pages(zone);
24198348faf9SVlastimil Babka 	watermark += compact_gap(order);
24203cf04937SJohannes Weiner 	return __zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
24213cf04937SJohannes Weiner 				   ALLOC_CMA, wmark_target);
2422cc5c9f09SVlastimil Babka }
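
/*
 * Worked example (illustrative, assuming compact_gap(order) expands to
 * 2UL << order): for an order-9 request, which is above
 * PAGE_ALLOC_COSTLY_ORDER, the zone needs low_wmark_pages(zone) + 1024 free
 * base pages before compaction is considered suitable; an order-3 request
 * only needs min_wmark_pages(zone) + 16.
 */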
2423cc5c9f09SVlastimil Babka 
24242b1a20c3SHui Su /*
24252b1a20c3SHui Su  * compaction_suitable: Is this suitable to run compaction on this zone now?
24262b1a20c3SHui Su  */
24273cf04937SJohannes Weiner bool compaction_suitable(struct zone *zone, int order, int highest_zoneidx)
2428cc5c9f09SVlastimil Babka {
24293cf04937SJohannes Weiner 	enum compact_result compact_result;
24303cf04937SJohannes Weiner 	bool suitable;
2431cc5c9f09SVlastimil Babka 
24323cf04937SJohannes Weiner 	suitable = __compaction_suitable(zone, order, highest_zoneidx,
2433cc5c9f09SVlastimil Babka 					 zone_page_state(zone, NR_FREE_PAGES));
24343e7d3449SMel Gorman 	/*
24353e7d3449SMel Gorman 	 * fragmentation index determines if allocation failures are due to
24363e7d3449SMel Gorman 	 * low memory or external fragmentation
24373e7d3449SMel Gorman 	 *
2438ebff3980SVlastimil Babka 	 * index of -1000 would imply allocations might succeed depending on
2439ebff3980SVlastimil Babka 	 * watermarks, but we already failed the high-order watermark check
24403e7d3449SMel Gorman 	 * index towards 0 implies failure is due to lack of memory
24413e7d3449SMel Gorman 	 * index towards 1000 implies failure is due to fragmentation
24423e7d3449SMel Gorman 	 *
244320311420SVlastimil Babka 	 * Only compact if a failure would be due to fragmentation. Also
244420311420SVlastimil Babka 	 * ignore fragindex for non-costly orders where the alternative to
244520311420SVlastimil Babka 	 * a successful reclaim/compaction is OOM. Fragindex and the
244620311420SVlastimil Babka 	 * vm.extfrag_threshold sysctl are meant as a heuristic to prevent
244720311420SVlastimil Babka 	 * excessive compaction for costly orders, but it should not be at the
244820311420SVlastimil Babka 	 * expense of system stability.
24493e7d3449SMel Gorman 	 */
24503cf04937SJohannes Weiner 	if (suitable) {
24513cf04937SJohannes Weiner 		compact_result = COMPACT_CONTINUE;
24523cf04937SJohannes Weiner 		if (order > PAGE_ALLOC_COSTLY_ORDER) {
24533cf04937SJohannes Weiner 			int fragindex = fragmentation_index(zone, order);
24543cf04937SJohannes Weiner 
24553cf04937SJohannes Weiner 			if (fragindex >= 0 &&
24563cf04937SJohannes Weiner 			    fragindex <= sysctl_extfrag_threshold) {
24573cf04937SJohannes Weiner 				suitable = false;
24583cf04937SJohannes Weiner 				compact_result = COMPACT_NOT_SUITABLE_ZONE;
24593cf04937SJohannes Weiner 			}
24603cf04937SJohannes Weiner 		}
24613cf04937SJohannes Weiner 	} else {
24623cf04937SJohannes Weiner 		compact_result = COMPACT_SKIPPED;
24633e7d3449SMel Gorman 	}
24643e7d3449SMel Gorman 
24653cf04937SJohannes Weiner 	trace_mm_compaction_suitable(zone, order, compact_result);
2466837d026dSJoonsoo Kim 
24673cf04937SJohannes Weiner 	return suitable;
2468837d026dSJoonsoo Kim }
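
/*
 * Worked example (illustrative, assuming the default vm.extfrag_threshold
 * of 500): for a costly order-9 request, a fragmentation index of 400 means
 * the failure looks like genuine lack of memory, so compaction_suitable()
 * returns false (traced as COMPACT_NOT_SUITABLE_ZONE); an index of 800
 * points at fragmentation and compaction is allowed to continue.
 */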
2469837d026dSJoonsoo Kim 
247086a294a8SMichal Hocko bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
247186a294a8SMichal Hocko 		int alloc_flags)
247286a294a8SMichal Hocko {
247386a294a8SMichal Hocko 	struct zone *zone;
247486a294a8SMichal Hocko 	struct zoneref *z;
247586a294a8SMichal Hocko 
247686a294a8SMichal Hocko 	/*
247786a294a8SMichal Hocko 	 * Make sure at least one zone would pass __compaction_suitable if we continue
247886a294a8SMichal Hocko 	 * retrying the reclaim.
247986a294a8SMichal Hocko 	 */
248097a225e6SJoonsoo Kim 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
248197a225e6SJoonsoo Kim 				ac->highest_zoneidx, ac->nodemask) {
248286a294a8SMichal Hocko 		unsigned long available;
248386a294a8SMichal Hocko 
248486a294a8SMichal Hocko 		/*
248586a294a8SMichal Hocko 		 * Do not consider all the reclaimable memory because we do not
248686a294a8SMichal Hocko 		 * want to thrash just for a single high-order allocation which
248786a294a8SMichal Hocko 		 * is not even guaranteed to appear even if __compaction_suitable
248886a294a8SMichal Hocko 		 * is happy about the watermark check.
248986a294a8SMichal Hocko 		 */
24905a1c84b4SMel Gorman 		available = zone_reclaimable_pages(zone) / order;
249186a294a8SMichal Hocko 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
2492e8606320SJohannes Weiner 		if (__compaction_suitable(zone, order, ac->highest_zoneidx,
24933cf04937SJohannes Weiner 					  available))
249486a294a8SMichal Hocko 			return true;
249586a294a8SMichal Hocko 	}
249686a294a8SMichal Hocko 
249786a294a8SMichal Hocko 	return false;
249886a294a8SMichal Hocko }
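
/*
 * Worked example (illustrative numbers): for an order-9 request against a
 * zone with 90000 reclaimable pages and 2000 free pages, the watermark check
 * runs against 90000 / 9 + 2000 = 12000 pages, deliberately understating
 * reclaimable memory so that reclaim is not retried forever on behalf of a
 * single high-order allocation.
 */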
249986a294a8SMichal Hocko 
2500e19a3f59SKemeng Shi /*
2501e19a3f59SKemeng Shi  * Should we do compaction for the target allocation order?
2502e19a3f59SKemeng Shi  * Return COMPACT_SUCCESS if an allocation of the target order can already be
2503e19a3f59SKemeng Shi  * satisfied.
2504e19a3f59SKemeng Shi  * Return COMPACT_SKIPPED if compaction for the target order is likely to fail.
2505e19a3f59SKemeng Shi  * Return COMPACT_CONTINUE if compaction for the target order should be run.
2506e19a3f59SKemeng Shi  */
2507e19a3f59SKemeng Shi static enum compact_result
2508e19a3f59SKemeng Shi compaction_suit_allocation_order(struct zone *zone, unsigned int order,
2509e19a3f59SKemeng Shi 				 int highest_zoneidx, unsigned int alloc_flags)
2510e19a3f59SKemeng Shi {
2511e19a3f59SKemeng Shi 	unsigned long watermark;
2512e19a3f59SKemeng Shi 
2513e19a3f59SKemeng Shi 	watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
2514e19a3f59SKemeng Shi 	if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
2515e19a3f59SKemeng Shi 			      alloc_flags))
2516e19a3f59SKemeng Shi 		return COMPACT_SUCCESS;
2517e19a3f59SKemeng Shi 
2518e19a3f59SKemeng Shi 	if (!compaction_suitable(zone, order, highest_zoneidx))
2519e19a3f59SKemeng Shi 		return COMPACT_SKIPPED;
2520e19a3f59SKemeng Shi 
2521e19a3f59SKemeng Shi 	return COMPACT_CONTINUE;
2522e19a3f59SKemeng Shi }
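
/*
 * Minimal usage sketch mirroring the callers below (not additional kernel
 * code): direct compaction passes the caller's alloc_flags, while kcompactd
 * probes with ALLOC_WMARK_MIN. Anything other than COMPACT_CONTINUE means
 * the order is either already satisfied or compaction is unlikely to help:
 *
 *	ret = compaction_suit_allocation_order(zone, order, highest_zoneidx,
 *					       alloc_flags);
 *	if (ret != COMPACT_CONTINUE)
 *		return ret;
 */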
2523e19a3f59SKemeng Shi 
25245e1f0f09SMel Gorman static enum compact_result
25255e1f0f09SMel Gorman compact_zone(struct compact_control *cc, struct capture_control *capc)
2526748446bbSMel Gorman {
2527ea7ab982SMichal Hocko 	enum compact_result ret;
252840cacbcbSMel Gorman 	unsigned long start_pfn = cc->zone->zone_start_pfn;
252940cacbcbSMel Gorman 	unsigned long end_pfn = zone_end_pfn(cc->zone);
2530566e54e1SMel Gorman 	unsigned long last_migrated_pfn;
2531e0b9daebSDavid Rientjes 	const bool sync = cc->mode != MIGRATE_ASYNC;
25328854c55fSMel Gorman 	bool update_cached;
2533ab755bf4SBaolin Wang 	unsigned int nr_succeeded = 0, nr_migratepages;
2534733aea0bSZi Yan 	int order;
2535748446bbSMel Gorman 
2536a94b5252SYafang Shao 	/*
2537a94b5252SYafang Shao 	 * These counters track activities during zone compaction.  Initialize
2538a94b5252SYafang Shao 	 * them before compacting a new zone.
2539a94b5252SYafang Shao 	 */
2540a94b5252SYafang Shao 	cc->total_migrate_scanned = 0;
2541a94b5252SYafang Shao 	cc->total_free_scanned = 0;
2542a94b5252SYafang Shao 	cc->nr_migratepages = 0;
2543a94b5252SYafang Shao 	cc->nr_freepages = 0;
2544733aea0bSZi Yan 	for (order = 0; order < NR_PAGE_ORDERS; order++)
2545733aea0bSZi Yan 		INIT_LIST_HEAD(&cc->freepages[order]);
2546a94b5252SYafang Shao 	INIT_LIST_HEAD(&cc->migratepages);
2547a94b5252SYafang Shao 
254801c0bfe0SWei Yang 	cc->migratetype = gfp_migratetype(cc->gfp_mask);
2549e8606320SJohannes Weiner 
2550e8606320SJohannes Weiner 	if (!is_via_compact_memory(cc->order)) {
2551e19a3f59SKemeng Shi 		ret = compaction_suit_allocation_order(cc->zone, cc->order,
2552e19a3f59SKemeng Shi 						       cc->highest_zoneidx,
2553e19a3f59SKemeng Shi 						       cc->alloc_flags);
2554e19a3f59SKemeng Shi 		if (ret != COMPACT_CONTINUE)
2555e19a3f59SKemeng Shi 			return ret;
2556e8606320SJohannes Weiner 	}
2557c46649deSMichal Hocko 
2558c89511abSMel Gorman 	/*
2559d3132e4bSVlastimil Babka 	 * Clear pageblock skip if there were failures recently and compaction
2560accf6242SVlastimil Babka 	 * is about to be retried after being deferred.
2561d3132e4bSVlastimil Babka 	 */
256240cacbcbSMel Gorman 	if (compaction_restarting(cc->zone, cc->order))
256340cacbcbSMel Gorman 		__reset_isolation_suitable(cc->zone);
2564d3132e4bSVlastimil Babka 
2565d3132e4bSVlastimil Babka 	/*
2566c89511abSMel Gorman 	 * Set up to move all movable pages to the end of the zone. Use cached
256706ed2998SVlastimil Babka 	 * information on where the scanners should start (unless we explicitly
256806ed2998SVlastimil Babka 	 * want to compact the whole zone), but check that it is initialised
256906ed2998SVlastimil Babka 	 * by ensuring the values are within zone boundaries.
2570c89511abSMel Gorman 	 */
257170b44595SMel Gorman 	cc->fast_start_pfn = 0;
257206ed2998SVlastimil Babka 	if (cc->whole_zone) {
257306ed2998SVlastimil Babka 		cc->migrate_pfn = start_pfn;
257406ed2998SVlastimil Babka 		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
257506ed2998SVlastimil Babka 	} else {
257640cacbcbSMel Gorman 		cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
257740cacbcbSMel Gorman 		cc->free_pfn = cc->zone->compact_cached_free_pfn;
2578623446e4SJoonsoo Kim 		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
257906b6640aSVlastimil Babka 			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
258040cacbcbSMel Gorman 			cc->zone->compact_cached_free_pfn = cc->free_pfn;
2581c89511abSMel Gorman 		}
2582623446e4SJoonsoo Kim 		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
2583c89511abSMel Gorman 			cc->migrate_pfn = start_pfn;
258440cacbcbSMel Gorman 			cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
258540cacbcbSMel Gorman 			cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
2586c89511abSMel Gorman 		}
2587c8f7de0bSMichal Hocko 
2588e332f741SMel Gorman 		if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
2589c8f7de0bSMichal Hocko 			cc->whole_zone = true;
259006ed2998SVlastimil Babka 	}
2591c8f7de0bSMichal Hocko 
2592566e54e1SMel Gorman 	last_migrated_pfn = 0;
2593748446bbSMel Gorman 
25948854c55fSMel Gorman 	/*
25958854c55fSMel Gorman 	 * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
25968854c55fSMel Gorman 	 * the basis that some migrations will fail in ASYNC mode. However,
25978854c55fSMel Gorman 	 * if the cached PFNs match and pageblocks are skipped due to having
25988854c55fSMel Gorman 	 * no isolation candidates, then the sync state does not matter.
25998854c55fSMel Gorman 	 * Until a pageblock with isolation candidates is found, keep the
26008854c55fSMel Gorman 	 * cached PFNs in sync to avoid revisiting the same blocks.
26018854c55fSMel Gorman 	 */
26028854c55fSMel Gorman 	update_cached = !sync &&
26038854c55fSMel Gorman 		cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
26048854c55fSMel Gorman 
2605abd4349fSBaolin Wang 	trace_mm_compaction_begin(cc, start_pfn, end_pfn, sync);
26060eb927c0SMel Gorman 
2607361a2a22SMinchan Kim 	/* lru_add_drain_all could be expensive with involving other CPUs */
2608361a2a22SMinchan Kim 	lru_add_drain();
2609748446bbSMel Gorman 
261040cacbcbSMel Gorman 	while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
26119d502c1cSMinchan Kim 		int err;
261219d3cf9dSYanfei Xu 		unsigned long iteration_start_pfn = cc->migrate_pfn;
2613748446bbSMel Gorman 
2614804d3121SMel Gorman 		/*
261548731c84SMel Gorman 		 * Avoid multiple rescans of the same pageblock which can
261648731c84SMel Gorman 		 * happen if a page cannot be isolated (dirty/writeback in
261748731c84SMel Gorman 		 * async mode) or if the migrated pages are being allocated
261848731c84SMel Gorman 		 * before the pageblock is cleared.  The first rescan will
261948731c84SMel Gorman 		 * capture the entire pageblock for migration. If it fails,
262048731c84SMel Gorman 		 * it'll be marked skip and scanning will proceed as normal.
2621804d3121SMel Gorman 		 */
262248731c84SMel Gorman 		cc->finish_pageblock = false;
2623804d3121SMel Gorman 		if (pageblock_start_pfn(last_migrated_pfn) ==
262419d3cf9dSYanfei Xu 		    pageblock_start_pfn(iteration_start_pfn)) {
262548731c84SMel Gorman 			cc->finish_pageblock = true;
2626804d3121SMel Gorman 		}
2627804d3121SMel Gorman 
2628cfccd2e6SMel Gorman rescan:
262932aaf055SPengfei Li 		switch (isolate_migratepages(cc)) {
2630f9e35b3bSMel Gorman 		case ISOLATE_ABORT:
26312d1e1041SVlastimil Babka 			ret = COMPACT_CONTENDED;
26325733c7d1SRafael Aquini 			putback_movable_pages(&cc->migratepages);
2633e64c5237SShaohua Li 			cc->nr_migratepages = 0;
2634f9e35b3bSMel Gorman 			goto out;
2635f9e35b3bSMel Gorman 		case ISOLATE_NONE:
26368854c55fSMel Gorman 			if (update_cached) {
26378854c55fSMel Gorman 				cc->zone->compact_cached_migrate_pfn[1] =
26388854c55fSMel Gorman 					cc->zone->compact_cached_migrate_pfn[0];
26398854c55fSMel Gorman 			}
26408854c55fSMel Gorman 
2641fdaf7f5cSVlastimil Babka 			/*
2642fdaf7f5cSVlastimil Babka 			 * We haven't isolated and migrated anything, but
2643fdaf7f5cSVlastimil Babka 			 * there might still be unflushed migrations from
2644fdaf7f5cSVlastimil Babka 			 * previous cc->order aligned block.
2645fdaf7f5cSVlastimil Babka 			 */
2646fdaf7f5cSVlastimil Babka 			goto check_drain;
2647f9e35b3bSMel Gorman 		case ISOLATE_SUCCESS:
26488854c55fSMel Gorman 			update_cached = false;
26497c0a84bdSKemeng Shi 			last_migrated_pfn = max(cc->zone->zone_start_pfn,
26507c0a84bdSKemeng Shi 				pageblock_start_pfn(cc->migrate_pfn - 1));
2651f9e35b3bSMel Gorman 		}
2652748446bbSMel Gorman 
2653ab755bf4SBaolin Wang 		/*
2654ab755bf4SBaolin Wang 		 * Record the number of pages to migrate since the
2655ab755bf4SBaolin Wang 		 * compaction_alloc/free() will update cc->nr_migratepages
2656ab755bf4SBaolin Wang 		 * properly.
2657ab755bf4SBaolin Wang 		 */
2658ab755bf4SBaolin Wang 		nr_migratepages = cc->nr_migratepages;
2659d53aea3dSDavid Rientjes 		err = migrate_pages(&cc->migratepages, compaction_alloc,
2660e0b9daebSDavid Rientjes 				compaction_free, (unsigned long)cc, cc->mode,
266184b328aaSBaolin Wang 				MR_COMPACTION, &nr_succeeded);
2662748446bbSMel Gorman 
2663ab755bf4SBaolin Wang 		trace_mm_compaction_migratepages(nr_migratepages, nr_succeeded);
2664748446bbSMel Gorman 
2665f8c9301fSVlastimil Babka 		/* All pages were either migrated or will be released */
2666f8c9301fSVlastimil Babka 		cc->nr_migratepages = 0;
26679d502c1cSMinchan Kim 		if (err) {
26685733c7d1SRafael Aquini 			putback_movable_pages(&cc->migratepages);
26697ed695e0SVlastimil Babka 			/*
26707ed695e0SVlastimil Babka 			 * migrate_pages() may return -ENOMEM when scanners meet
26717ed695e0SVlastimil Babka 			 * and we want compact_finished() to detect it
26727ed695e0SVlastimil Babka 			 */
2673f2849aa0SVlastimil Babka 			if (err == -ENOMEM && !compact_scanners_met(cc)) {
26742d1e1041SVlastimil Babka 				ret = COMPACT_CONTENDED;
26754bf2bba3SDavid Rientjes 				goto out;
2676748446bbSMel Gorman 			}
2677fdd048e1SVlastimil Babka 			/*
2678cfccd2e6SMel Gorman 			 * If ASYNC or SYNC_LIGHT compaction fails to migrate a page
2679c3750cc7SKemeng Shi 			 * within the pageblock_order-aligned block and
26809ecc5fc5SMel Gorman 			 * fast_find_migrateblock may be used, then scan the
2681cfccd2e6SMel Gorman 			 * remainder of the pageblock. This will mark the
2682cfccd2e6SMel Gorman 			 * pageblock "skip" to avoid rescanning in the near
2683cfccd2e6SMel Gorman 			 * future. This will isolate more pages than necessary
2684cfccd2e6SMel Gorman 			 * for the request but avoid loops due to
2685cfccd2e6SMel Gorman 			 * fast_find_migrateblock revisiting blocks that were
2686cfccd2e6SMel Gorman 			 * recently partially scanned.
2687fdd048e1SVlastimil Babka 			 */
2688539aa041SMel Gorman 			if (!pageblock_aligned(cc->migrate_pfn) &&
26899ecc5fc5SMel Gorman 			    !cc->ignore_skip_hint && !cc->finish_pageblock &&
2690cfccd2e6SMel Gorman 			    (cc->mode < MIGRATE_SYNC)) {
2691cfccd2e6SMel Gorman 				cc->finish_pageblock = true;
2692cfccd2e6SMel Gorman 
2693cfccd2e6SMel Gorman 				/*
2694cfccd2e6SMel Gorman 				 * Draining pcplists does not help THP if
2695cfccd2e6SMel Gorman 				 * any page failed to migrate. Even after
2696cfccd2e6SMel Gorman 				 * drain, the pageblock will not be free.
2697cfccd2e6SMel Gorman 				 */
2698cfccd2e6SMel Gorman 				if (cc->order == COMPACTION_HPAGE_ORDER)
2699566e54e1SMel Gorman 					last_migrated_pfn = 0;
2700cfccd2e6SMel Gorman 
2701cfccd2e6SMel Gorman 				goto rescan;
2702fdd048e1SVlastimil Babka 			}
27034bf2bba3SDavid Rientjes 		}
2704fdaf7f5cSVlastimil Babka 
270516b3be40SMel Gorman 		/* Stop if a page has been captured */
270616b3be40SMel Gorman 		if (capc && capc->page) {
270716b3be40SMel Gorman 			ret = COMPACT_SUCCESS;
270816b3be40SMel Gorman 			break;
270916b3be40SMel Gorman 		}
271016b3be40SMel Gorman 
2711fdaf7f5cSVlastimil Babka check_drain:
2712fdaf7f5cSVlastimil Babka 		/*
2713fdaf7f5cSVlastimil Babka 		 * Has the migration scanner moved away from the previous
2714fdaf7f5cSVlastimil Babka 		 * cc->order aligned block where we migrated from? If yes,
2715fdaf7f5cSVlastimil Babka 		 * flush the pages that were freed, so that they can merge and
2716fdaf7f5cSVlastimil Babka 		 * compact_finished() can detect immediately if allocation
2717fdaf7f5cSVlastimil Babka 		 * would succeed.
2718fdaf7f5cSVlastimil Babka 		 */
2719566e54e1SMel Gorman 		if (cc->order > 0 && last_migrated_pfn) {
2720fdaf7f5cSVlastimil Babka 			unsigned long current_block_start =
272106b6640aSVlastimil Babka 				block_start_pfn(cc->migrate_pfn, cc->order);
2722fdaf7f5cSVlastimil Babka 
2723566e54e1SMel Gorman 			if (last_migrated_pfn < current_block_start) {
2724b01b2141SIngo Molnar 				lru_add_drain_cpu_zone(cc->zone);
2725fdaf7f5cSVlastimil Babka 				/* No more flushing until we migrate again */
2726566e54e1SMel Gorman 				last_migrated_pfn = 0;
2727fdaf7f5cSVlastimil Babka 			}
2728fdaf7f5cSVlastimil Babka 		}
2729748446bbSMel Gorman 	}
2730748446bbSMel Gorman 
2731f9e35b3bSMel Gorman out:
27326bace090SVlastimil Babka 	/*
27336bace090SVlastimil Babka 	 * Release free pages and update where the free scanner should restart,
27346bace090SVlastimil Babka 	 * so we don't leave any returned pages behind in the next attempt.
27356bace090SVlastimil Babka 	 */
27366bace090SVlastimil Babka 	if (cc->nr_freepages > 0) {
2737733aea0bSZi Yan 		unsigned long free_pfn = release_free_list(cc->freepages);
27386bace090SVlastimil Babka 
27396bace090SVlastimil Babka 		cc->nr_freepages = 0;
27406bace090SVlastimil Babka 		VM_BUG_ON(free_pfn == 0);
27416bace090SVlastimil Babka 		/* The cached pfn is always the first in a pageblock */
274206b6640aSVlastimil Babka 		free_pfn = pageblock_start_pfn(free_pfn);
27436bace090SVlastimil Babka 		/*
27446bace090SVlastimil Babka 		 * Only go back, not forward. The cached pfn might have been
27456bace090SVlastimil Babka 		 * already reset to zone end in compact_finished()
27466bace090SVlastimil Babka 		 */
274740cacbcbSMel Gorman 		if (free_pfn > cc->zone->compact_cached_free_pfn)
274840cacbcbSMel Gorman 			cc->zone->compact_cached_free_pfn = free_pfn;
27496bace090SVlastimil Babka 	}
2750748446bbSMel Gorman 
27517f354a54SDavid Rientjes 	count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
27527f354a54SDavid Rientjes 	count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
27537f354a54SDavid Rientjes 
2754abd4349fSBaolin Wang 	trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);
27550eb927c0SMel Gorman 
2756753ec50dSBaolin Wang 	VM_BUG_ON(!list_empty(&cc->migratepages));
2757753ec50dSBaolin Wang 
2758748446bbSMel Gorman 	return ret;
2759748446bbSMel Gorman }
276076ab0f53SMel Gorman 
2761ea7ab982SMichal Hocko static enum compact_result compact_zone_order(struct zone *zone, int order,
2762c3486f53SVlastimil Babka 		gfp_t gfp_mask, enum compact_priority prio,
276397a225e6SJoonsoo Kim 		unsigned int alloc_flags, int highest_zoneidx,
27645e1f0f09SMel Gorman 		struct page **capture)
276556de7263SMel Gorman {
2766ea7ab982SMichal Hocko 	enum compact_result ret;
276756de7263SMel Gorman 	struct compact_control cc = {
276856de7263SMel Gorman 		.order = order,
2769dbe2d4e4SMel Gorman 		.search_order = order,
27706d7ce559SDavid Rientjes 		.gfp_mask = gfp_mask,
277156de7263SMel Gorman 		.zone = zone,
2772a5508cd8SVlastimil Babka 		.mode = (prio == COMPACT_PRIO_ASYNC) ?
2773a5508cd8SVlastimil Babka 					MIGRATE_ASYNC :	MIGRATE_SYNC_LIGHT,
2774ebff3980SVlastimil Babka 		.alloc_flags = alloc_flags,
277597a225e6SJoonsoo Kim 		.highest_zoneidx = highest_zoneidx,
2776accf6242SVlastimil Babka 		.direct_compaction = true,
2777a8e025e5SVlastimil Babka 		.whole_zone = (prio == MIN_COMPACT_PRIORITY),
27789f7e3387SVlastimil Babka 		.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
27799f7e3387SVlastimil Babka 		.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
278056de7263SMel Gorman 	};
27815e1f0f09SMel Gorman 	struct capture_control capc = {
27825e1f0f09SMel Gorman 		.cc = &cc,
27835e1f0f09SMel Gorman 		.page = NULL,
27845e1f0f09SMel Gorman 	};
27855e1f0f09SMel Gorman 
2786b9e20f0dSVlastimil Babka 	/*
2787b9e20f0dSVlastimil Babka 	 * Make sure the structs are really initialized before we expose the
2788b9e20f0dSVlastimil Babka 	 * capture control, in case we are interrupted and the interrupt handler
2789b9e20f0dSVlastimil Babka 	 * frees a page.
2790b9e20f0dSVlastimil Babka 	 */
2791b9e20f0dSVlastimil Babka 	barrier();
2792b9e20f0dSVlastimil Babka 	WRITE_ONCE(current->capture_control, &capc);
279356de7263SMel Gorman 
27945e1f0f09SMel Gorman 	ret = compact_zone(&cc, &capc);
2795e64c5237SShaohua Li 
2796b9e20f0dSVlastimil Babka 	/*
2797b9e20f0dSVlastimil Babka 	 * Make sure we hide capture control first before we read the captured
2798b9e20f0dSVlastimil Babka 	 * page pointer, otherwise an interrupt could free and capture a page
2799b9e20f0dSVlastimil Babka 	 * and we would leak it.
2800b9e20f0dSVlastimil Babka 	 */
2801b9e20f0dSVlastimil Babka 	WRITE_ONCE(current->capture_control, NULL);
2802b9e20f0dSVlastimil Babka 	*capture = READ_ONCE(capc.page);
280306dac2f4SCharan Teja Reddy 	/*
280406dac2f4SCharan Teja Reddy 	 * Technically, it is also possible that compaction is skipped but
280506dac2f4SCharan Teja Reddy 	 * the page is still captured purely by luck (an IRQ came and freed the page).
280606dac2f4SCharan Teja Reddy 	 * Returning COMPACT_SUCCESS in such cases helps in properly accounting
280706dac2f4SCharan Teja Reddy 	 * the COMPACT[STALL|FAIL] when compaction is skipped.
280806dac2f4SCharan Teja Reddy 	 */
280906dac2f4SCharan Teja Reddy 	if (*capture)
281006dac2f4SCharan Teja Reddy 		ret = COMPACT_SUCCESS;
28115e1f0f09SMel Gorman 
2812e64c5237SShaohua Li 	return ret;
281356de7263SMel Gorman }
281456de7263SMel Gorman 
281556de7263SMel Gorman /**
281656de7263SMel Gorman  * try_to_compact_pages - Direct compact to satisfy a high-order allocation
281756de7263SMel Gorman  * @gfp_mask: The GFP mask of the current allocation
28181a6d53a1SVlastimil Babka  * @order: The order of the current allocation
28191a6d53a1SVlastimil Babka  * @alloc_flags: The allocation flags of the current allocation
28201a6d53a1SVlastimil Babka  * @ac: The context of current allocation
2821112d2d29SYang Shi  * @prio: Determines how hard direct compaction should try to succeed
28226467552cSVlastimil Babka  * @capture: Pointer to free page created by compaction will be stored here
282356de7263SMel Gorman  *
282456de7263SMel Gorman  * This is the main entry point for direct page compaction.
282556de7263SMel Gorman  */
2826ea7ab982SMichal Hocko enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
2827c603844bSMel Gorman 		unsigned int alloc_flags, const struct alloc_context *ac,
28285e1f0f09SMel Gorman 		enum compact_priority prio, struct page **capture)
282956de7263SMel Gorman {
283056de7263SMel Gorman 	struct zoneref *z;
283156de7263SMel Gorman 	struct zone *zone;
28321d4746d3SMichal Hocko 	enum compact_result rc = COMPACT_SKIPPED;
283356de7263SMel Gorman 
2834803de900SVlastimil Babka 	if (!gfp_compaction_allowed(gfp_mask))
283553853e2dSVlastimil Babka 		return COMPACT_SKIPPED;
283656de7263SMel Gorman 
2837a5508cd8SVlastimil Babka 	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
2838837d026dSJoonsoo Kim 
283956de7263SMel Gorman 	/* Compact each zone in the list */
284097a225e6SJoonsoo Kim 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
284197a225e6SJoonsoo Kim 					ac->highest_zoneidx, ac->nodemask) {
2842ea7ab982SMichal Hocko 		enum compact_result status;
284356de7263SMel Gorman 
2844a8e025e5SVlastimil Babka 		if (prio > MIN_COMPACT_PRIORITY
2845a8e025e5SVlastimil Babka 					&& compaction_deferred(zone, order)) {
28461d4746d3SMichal Hocko 			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
284753853e2dSVlastimil Babka 			continue;
28481d4746d3SMichal Hocko 		}
284953853e2dSVlastimil Babka 
2850a5508cd8SVlastimil Babka 		status = compact_zone_order(zone, order, gfp_mask, prio,
285197a225e6SJoonsoo Kim 				alloc_flags, ac->highest_zoneidx, capture);
285256de7263SMel Gorman 		rc = max(status, rc);
285356de7263SMel Gorman 
28547ceb009aSVlastimil Babka 		/* The allocation should succeed, stop compacting */
28557ceb009aSVlastimil Babka 		if (status == COMPACT_SUCCESS) {
285653853e2dSVlastimil Babka 			/*
285753853e2dSVlastimil Babka 			 * We think the allocation will succeed in this zone,
285853853e2dSVlastimil Babka 			 * but it is not certain, hence the false. The caller
285953853e2dSVlastimil Babka 			 * will repeat this with true if allocation indeed
286053853e2dSVlastimil Babka 			 * succeeds in this zone.
286153853e2dSVlastimil Babka 			 */
286253853e2dSVlastimil Babka 			compaction_defer_reset(zone, order, false);
28631f9efdefSVlastimil Babka 
2864c3486f53SVlastimil Babka 			break;
28651f9efdefSVlastimil Babka 		}
28661f9efdefSVlastimil Babka 
2867a5508cd8SVlastimil Babka 		if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
2868c3486f53SVlastimil Babka 					status == COMPACT_PARTIAL_SKIPPED))
286953853e2dSVlastimil Babka 			/*
287053853e2dSVlastimil Babka 			 * We think that allocation won't succeed in this zone
287153853e2dSVlastimil Babka 			 * so we defer compaction there. If it ends up
287253853e2dSVlastimil Babka 			 * succeeding after all, it will be reset.
287353853e2dSVlastimil Babka 			 */
287453853e2dSVlastimil Babka 			defer_compaction(zone, order);
28751f9efdefSVlastimil Babka 
28761f9efdefSVlastimil Babka 		/*
28771f9efdefSVlastimil Babka 		 * We might have stopped compacting due to need_resched() in
28781f9efdefSVlastimil Babka 		 * async compaction, or due to a fatal signal detected. In that
2879c3486f53SVlastimil Babka 		 * case do not try further zones
28801f9efdefSVlastimil Babka 		 */
2881c3486f53SVlastimil Babka 		if ((prio == COMPACT_PRIO_ASYNC && need_resched())
2882c3486f53SVlastimil Babka 					|| fatal_signal_pending(current))
28831f9efdefSVlastimil Babka 			break;
28841f9efdefSVlastimil Babka 	}
28851f9efdefSVlastimil Babka 
288656de7263SMel Gorman 	return rc;
288756de7263SMel Gorman }
288856de7263SMel Gorman 
2889facdaa91SNitin Gupta /*
28903e40b3f4SKefeng Wang  * compact_node() - compact all zones within a node
28913e40b3f4SKefeng Wang  * @pgdat: The node page data
28923e40b3f4SKefeng Wang  * @proactive: Whether the compaction is proactive
2893facdaa91SNitin Gupta  *
28943e40b3f4SKefeng Wang  * For proactive compaction, compact until each zone's fragmentation score
28953e40b3f4SKefeng Wang  * falls within the proactive compaction thresholds (as determined by the
28963e40b3f4SKefeng Wang  * proactiveness tunable). It is possible that the function returns before
28973e40b3f4SKefeng Wang  * reaching the score targets due to various back-off conditions, such as
28983e40b3f4SKefeng Wang  * contention on per-node or per-zone locks.
2899facdaa91SNitin Gupta  */
2900f6f3f275SKefeng Wang static int compact_node(pg_data_t *pgdat, bool proactive)
2901facdaa91SNitin Gupta {
2902facdaa91SNitin Gupta 	int zoneid;
2903facdaa91SNitin Gupta 	struct zone *zone;
2904facdaa91SNitin Gupta 	struct compact_control cc = {
2905facdaa91SNitin Gupta 		.order = -1,
29063e40b3f4SKefeng Wang 		.mode = proactive ? MIGRATE_SYNC_LIGHT : MIGRATE_SYNC,
2907facdaa91SNitin Gupta 		.ignore_skip_hint = true,
2908facdaa91SNitin Gupta 		.whole_zone = true,
2909facdaa91SNitin Gupta 		.gfp_mask = GFP_KERNEL,
29103e40b3f4SKefeng Wang 		.proactive_compaction = proactive,
2911facdaa91SNitin Gupta 	};
2912facdaa91SNitin Gupta 
2913facdaa91SNitin Gupta 	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
2914facdaa91SNitin Gupta 		zone = &pgdat->node_zones[zoneid];
2915facdaa91SNitin Gupta 		if (!populated_zone(zone))
2916facdaa91SNitin Gupta 			continue;
2917facdaa91SNitin Gupta 
2918f6f3f275SKefeng Wang 		if (fatal_signal_pending(current))
2919f6f3f275SKefeng Wang 			return -EINTR;
2920f6f3f275SKefeng Wang 
2921facdaa91SNitin Gupta 		cc.zone = zone;
2922facdaa91SNitin Gupta 
2923facdaa91SNitin Gupta 		compact_zone(&cc, NULL);
2924facdaa91SNitin Gupta 
29253e40b3f4SKefeng Wang 		if (proactive) {
29261bfb7684SBaolin Wang 			count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
29271bfb7684SBaolin Wang 					     cc.total_migrate_scanned);
29281bfb7684SBaolin Wang 			count_compact_events(KCOMPACTD_FREE_SCANNED,
29291bfb7684SBaolin Wang 					     cc.total_free_scanned);
2930facdaa91SNitin Gupta 		}
2931facdaa91SNitin Gupta 	}
293256de7263SMel Gorman 
2933f6f3f275SKefeng Wang 	return 0;
29347be62de9SRik van Riel }
29357be62de9SRik van Riel 
29363e40b3f4SKefeng Wang /* Compact all zones of all nodes in the system */
2937f6f3f275SKefeng Wang static int compact_nodes(void)
293876ab0f53SMel Gorman {
2939f6f3f275SKefeng Wang 	int ret, nid;
294076ab0f53SMel Gorman 
29418575ec29SHugh Dickins 	/* Flush pending updates to the LRU lists */
29428575ec29SHugh Dickins 	lru_add_drain_all();
29438575ec29SHugh Dickins 
2944f6f3f275SKefeng Wang 	for_each_online_node(nid) {
2945f6f3f275SKefeng Wang 		ret = compact_node(NODE_DATA(nid), false);
2946f6f3f275SKefeng Wang 		if (ret)
2947f6f3f275SKefeng Wang 			return ret;
2948f6f3f275SKefeng Wang 	}
2949f6f3f275SKefeng Wang 
2950f6f3f275SKefeng Wang 	return 0;
295176ab0f53SMel Gorman }
295276ab0f53SMel Gorman 
295348fe8ab8SMinghao Chi static int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write,
295465d759c8SCharan Teja Reddy 		void *buffer, size_t *length, loff_t *ppos)
295565d759c8SCharan Teja Reddy {
295665d759c8SCharan Teja Reddy 	int rc, nid;
295765d759c8SCharan Teja Reddy 
295865d759c8SCharan Teja Reddy 	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
295965d759c8SCharan Teja Reddy 	if (rc)
296065d759c8SCharan Teja Reddy 		return rc;
296165d759c8SCharan Teja Reddy 
296265d759c8SCharan Teja Reddy 	if (write && sysctl_compaction_proactiveness) {
296365d759c8SCharan Teja Reddy 		for_each_online_node(nid) {
296465d759c8SCharan Teja Reddy 			pg_data_t *pgdat = NODE_DATA(nid);
296565d759c8SCharan Teja Reddy 
296665d759c8SCharan Teja Reddy 			if (pgdat->proactive_compact_trigger)
296765d759c8SCharan Teja Reddy 				continue;
296865d759c8SCharan Teja Reddy 
296965d759c8SCharan Teja Reddy 			pgdat->proactive_compact_trigger = true;
29708fff8b6fSBaolin Wang 			trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, -1,
29718fff8b6fSBaolin Wang 							     pgdat->nr_zones - 1);
297265d759c8SCharan Teja Reddy 			wake_up_interruptible(&pgdat->kcompactd_wait);
297365d759c8SCharan Teja Reddy 		}
297465d759c8SCharan Teja Reddy 	}
297565d759c8SCharan Teja Reddy 
297665d759c8SCharan Teja Reddy 	return 0;
297765d759c8SCharan Teja Reddy }
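
/*
 * Usage sketch: raising the tunable pokes every node's kcompactd right away,
 * so the new proactiveness takes effect without waiting for the next timeout,
 * e.g. from userspace:
 *
 *	echo 30 > /proc/sys/vm/compaction_proactiveness
 */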
297865d759c8SCharan Teja Reddy 
2979facdaa91SNitin Gupta /*
2980fec4eb2cSYaowei Bai  * This is the entry point for compacting all nodes via
2981fec4eb2cSYaowei Bai  * /proc/sys/vm/compact_memory
2982fec4eb2cSYaowei Bai  */
298348fe8ab8SMinghao Chi static int sysctl_compaction_handler(struct ctl_table *table, int write,
298432927393SChristoph Hellwig 			void *buffer, size_t *length, loff_t *ppos)
298576ab0f53SMel Gorman {
29868b9167cdSWen Yang 	int ret;
29878b9167cdSWen Yang 
29888b9167cdSWen Yang 	ret = proc_dointvec(table, write, buffer, length, ppos);
29898b9167cdSWen Yang 	if (ret)
29908b9167cdSWen Yang 		return ret;
29918b9167cdSWen Yang 
29928b9167cdSWen Yang 	if (sysctl_compact_memory != 1)
29938b9167cdSWen Yang 		return -EINVAL;
29948b9167cdSWen Yang 
299576ab0f53SMel Gorman 	if (write)
2996f6f3f275SKefeng Wang 		ret = compact_nodes();
299776ab0f53SMel Gorman 
2998f6f3f275SKefeng Wang 	return ret;
299976ab0f53SMel Gorman }
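
/*
 * Usage sketch: writing 1 triggers a full, synchronous compaction of every
 * online node, e.g. from userspace:
 *
 *	echo 1 > /proc/sys/vm/compact_memory
 *
 * Any other value is rejected with -EINVAL.
 */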
3000ed4a6d7fSMel Gorman 
3001ed4a6d7fSMel Gorman #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
300217adb230SYueHaibing static ssize_t compact_store(struct device *dev,
300310fbcf4cSKay Sievers 			     struct device_attribute *attr,
3004ed4a6d7fSMel Gorman 			     const char *buf, size_t count)
3005ed4a6d7fSMel Gorman {
30068575ec29SHugh Dickins 	int nid = dev->id;
30078575ec29SHugh Dickins 
30088575ec29SHugh Dickins 	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
30098575ec29SHugh Dickins 		/* Flush pending updates to the LRU lists */
30108575ec29SHugh Dickins 		lru_add_drain_all();
30118575ec29SHugh Dickins 
30123e40b3f4SKefeng Wang 		compact_node(NODE_DATA(nid), false);
30138575ec29SHugh Dickins 	}
3014ed4a6d7fSMel Gorman 
3015ed4a6d7fSMel Gorman 	return count;
3016ed4a6d7fSMel Gorman }
301717adb230SYueHaibing static DEVICE_ATTR_WO(compact);
3018ed4a6d7fSMel Gorman 
3019ed4a6d7fSMel Gorman int compaction_register_node(struct node *node)
3020ed4a6d7fSMel Gorman {
302110fbcf4cSKay Sievers 	return device_create_file(&node->dev, &dev_attr_compact);
3022ed4a6d7fSMel Gorman }
3023ed4a6d7fSMel Gorman 
3024ed4a6d7fSMel Gorman void compaction_unregister_node(struct node *node)
3025ed4a6d7fSMel Gorman {
3026f82024cbSKemeng Shi 	device_remove_file(&node->dev, &dev_attr_compact);
3027ed4a6d7fSMel Gorman }
3028ed4a6d7fSMel Gorman #endif /* CONFIG_SYSFS && CONFIG_NUMA */
3029ff9543fdSMichal Nazarewicz 
3030698b1b30SVlastimil Babka static inline bool kcompactd_work_requested(pg_data_t *pgdat)
3031698b1b30SVlastimil Babka {
303265d759c8SCharan Teja Reddy 	return pgdat->kcompactd_max_order > 0 || kthread_should_stop() ||
303365d759c8SCharan Teja Reddy 		pgdat->proactive_compact_trigger;
3034698b1b30SVlastimil Babka }
3035698b1b30SVlastimil Babka 
3036698b1b30SVlastimil Babka static bool kcompactd_node_suitable(pg_data_t *pgdat)
3037698b1b30SVlastimil Babka {
3038698b1b30SVlastimil Babka 	int zoneid;
3039698b1b30SVlastimil Babka 	struct zone *zone;
304097a225e6SJoonsoo Kim 	enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
3041e19a3f59SKemeng Shi 	enum compact_result ret;
3042698b1b30SVlastimil Babka 
304397a225e6SJoonsoo Kim 	for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
3044698b1b30SVlastimil Babka 		zone = &pgdat->node_zones[zoneid];
3045698b1b30SVlastimil Babka 
3046698b1b30SVlastimil Babka 		if (!populated_zone(zone))
3047698b1b30SVlastimil Babka 			continue;
3048698b1b30SVlastimil Babka 
3049e19a3f59SKemeng Shi 		ret = compaction_suit_allocation_order(zone,
3050e19a3f59SKemeng Shi 				pgdat->kcompactd_max_order,
3051e19a3f59SKemeng Shi 				highest_zoneidx, ALLOC_WMARK_MIN);
3052e19a3f59SKemeng Shi 		if (ret == COMPACT_CONTINUE)
3053698b1b30SVlastimil Babka 			return true;
3054698b1b30SVlastimil Babka 	}
3055698b1b30SVlastimil Babka 
3056698b1b30SVlastimil Babka 	return false;
3057698b1b30SVlastimil Babka }
3058698b1b30SVlastimil Babka 
3059698b1b30SVlastimil Babka static void kcompactd_do_work(pg_data_t *pgdat)
3060698b1b30SVlastimil Babka {
3061698b1b30SVlastimil Babka 	/*
3062698b1b30SVlastimil Babka 	 * With no special task, compact all zones so that a page of requested
3063698b1b30SVlastimil Babka 	 * order is allocatable.
3064698b1b30SVlastimil Babka 	 */
3065698b1b30SVlastimil Babka 	int zoneid;
3066698b1b30SVlastimil Babka 	struct zone *zone;
3067698b1b30SVlastimil Babka 	struct compact_control cc = {
3068698b1b30SVlastimil Babka 		.order = pgdat->kcompactd_max_order,
3069dbe2d4e4SMel Gorman 		.search_order = pgdat->kcompactd_max_order,
307097a225e6SJoonsoo Kim 		.highest_zoneidx = pgdat->kcompactd_highest_zoneidx,
3071698b1b30SVlastimil Babka 		.mode = MIGRATE_SYNC_LIGHT,
3072a0647dc9SDavid Rientjes 		.ignore_skip_hint = false,
307373e64c51SMichal Hocko 		.gfp_mask = GFP_KERNEL,
3074698b1b30SVlastimil Babka 	};
3075e19a3f59SKemeng Shi 	enum compact_result ret;
3076e19a3f59SKemeng Shi 
3077698b1b30SVlastimil Babka 	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
307897a225e6SJoonsoo Kim 							cc.highest_zoneidx);
30797f354a54SDavid Rientjes 	count_compact_event(KCOMPACTD_WAKE);
3080698b1b30SVlastimil Babka 
308197a225e6SJoonsoo Kim 	for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) {
3082698b1b30SVlastimil Babka 		int status;
3083698b1b30SVlastimil Babka 
3084698b1b30SVlastimil Babka 		zone = &pgdat->node_zones[zoneid];
3085698b1b30SVlastimil Babka 		if (!populated_zone(zone))
3086698b1b30SVlastimil Babka 			continue;
3087698b1b30SVlastimil Babka 
3088698b1b30SVlastimil Babka 		if (compaction_deferred(zone, cc.order))
3089698b1b30SVlastimil Babka 			continue;
3090698b1b30SVlastimil Babka 
3091e19a3f59SKemeng Shi 		ret = compaction_suit_allocation_order(zone,
3092e19a3f59SKemeng Shi 				cc.order, zoneid, ALLOC_WMARK_MIN);
3093e19a3f59SKemeng Shi 		if (ret != COMPACT_CONTINUE)
3094e8606320SJohannes Weiner 			continue;
3095f98a497eSJohannes Weiner 
3096172400c6SVlastimil Babka 		if (kthread_should_stop())
3097172400c6SVlastimil Babka 			return;
3098a94b5252SYafang Shao 
3099a94b5252SYafang Shao 		cc.zone = zone;
31005e1f0f09SMel Gorman 		status = compact_zone(&cc, NULL);
3101698b1b30SVlastimil Babka 
31027ceb009aSVlastimil Babka 		if (status == COMPACT_SUCCESS) {
3103698b1b30SVlastimil Babka 			compaction_defer_reset(zone, cc.order, false);
3104c8f7de0bSMichal Hocko 		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
3105698b1b30SVlastimil Babka 			/*
3106bc3106b2SDavid Rientjes 			 * Buddy pages may become stranded on pcps that could
3107bc3106b2SDavid Rientjes 			 * otherwise coalesce on the zone's free area for
3108bc3106b2SDavid Rientjes 			 * order >= cc.order.  This is ratelimited by the
3109bc3106b2SDavid Rientjes 			 * upcoming deferral.
3110bc3106b2SDavid Rientjes 			 */
3111bc3106b2SDavid Rientjes 			drain_all_pages(zone);
3112bc3106b2SDavid Rientjes 
3113bc3106b2SDavid Rientjes 			/*
3114698b1b30SVlastimil Babka 			 * We use sync migration mode here, so we defer like
3115698b1b30SVlastimil Babka 			 * sync direct compaction does.
3116698b1b30SVlastimil Babka 			 */
3117698b1b30SVlastimil Babka 			defer_compaction(zone, cc.order);
3118698b1b30SVlastimil Babka 		}
3119698b1b30SVlastimil Babka 
31207f354a54SDavid Rientjes 		count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
31217f354a54SDavid Rientjes 				     cc.total_migrate_scanned);
31227f354a54SDavid Rientjes 		count_compact_events(KCOMPACTD_FREE_SCANNED,
31237f354a54SDavid Rientjes 				     cc.total_free_scanned);
3124698b1b30SVlastimil Babka 	}
3125698b1b30SVlastimil Babka 
3126698b1b30SVlastimil Babka 	/*
3127698b1b30SVlastimil Babka 	 * Regardless of success, we are done until woken up next. But remember
312897a225e6SJoonsoo Kim 	 * the requested order/highest_zoneidx in case it was higher/tighter
312997a225e6SJoonsoo Kim 	 * than our current ones
3130698b1b30SVlastimil Babka 	 */
3131698b1b30SVlastimil Babka 	if (pgdat->kcompactd_max_order <= cc.order)
3132698b1b30SVlastimil Babka 		pgdat->kcompactd_max_order = 0;
313397a225e6SJoonsoo Kim 	if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx)
313497a225e6SJoonsoo Kim 		pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
3135698b1b30SVlastimil Babka }
3136698b1b30SVlastimil Babka 
313797a225e6SJoonsoo Kim void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
3138698b1b30SVlastimil Babka {
3139698b1b30SVlastimil Babka 	if (!order)
3140698b1b30SVlastimil Babka 		return;
3141698b1b30SVlastimil Babka 
3142698b1b30SVlastimil Babka 	if (pgdat->kcompactd_max_order < order)
3143698b1b30SVlastimil Babka 		pgdat->kcompactd_max_order = order;
3144698b1b30SVlastimil Babka 
314597a225e6SJoonsoo Kim 	if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx)
314697a225e6SJoonsoo Kim 		pgdat->kcompactd_highest_zoneidx = highest_zoneidx;
3147698b1b30SVlastimil Babka 
31486818600fSDavidlohr Bueso 	/*
31496818600fSDavidlohr Bueso 	 * Pairs with implicit barrier in wait_event_freezable()
31506818600fSDavidlohr Bueso 	 * such that wakeups are not missed.
31516818600fSDavidlohr Bueso 	 */
31526818600fSDavidlohr Bueso 	if (!wq_has_sleeper(&pgdat->kcompactd_wait))
3153698b1b30SVlastimil Babka 		return;
3154698b1b30SVlastimil Babka 
3155698b1b30SVlastimil Babka 	if (!kcompactd_node_suitable(pgdat))
3156698b1b30SVlastimil Babka 		return;
3157698b1b30SVlastimil Babka 
3158698b1b30SVlastimil Babka 	trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
315997a225e6SJoonsoo Kim 							highest_zoneidx);
3160698b1b30SVlastimil Babka 	wake_up_interruptible(&pgdat->kcompactd_wait);
3161698b1b30SVlastimil Babka }
3162698b1b30SVlastimil Babka 
3163698b1b30SVlastimil Babka /*
3164698b1b30SVlastimil Babka  * The background compaction daemon, started as a kernel thread
3165698b1b30SVlastimil Babka  * from the init process.
3166698b1b30SVlastimil Babka  */
3167698b1b30SVlastimil Babka static int kcompactd(void *p)
3168698b1b30SVlastimil Babka {
3169698b1b30SVlastimil Babka 	pg_data_t *pgdat = (pg_data_t *)p;
3170698b1b30SVlastimil Babka 	struct task_struct *tsk = current;
3171e1e92bfaSCharan Teja Reddy 	long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
3172e1e92bfaSCharan Teja Reddy 	long timeout = default_timeout;
3173698b1b30SVlastimil Babka 
3174698b1b30SVlastimil Babka 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3175698b1b30SVlastimil Babka 
3176698b1b30SVlastimil Babka 	if (!cpumask_empty(cpumask))
3177698b1b30SVlastimil Babka 		set_cpus_allowed_ptr(tsk, cpumask);
3178698b1b30SVlastimil Babka 
3179698b1b30SVlastimil Babka 	set_freezable();
3180698b1b30SVlastimil Babka 
3181698b1b30SVlastimil Babka 	pgdat->kcompactd_max_order = 0;
318297a225e6SJoonsoo Kim 	pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
3183698b1b30SVlastimil Babka 
3184698b1b30SVlastimil Babka 	while (!kthread_should_stop()) {
3185eb414681SJohannes Weiner 		unsigned long pflags;
3186eb414681SJohannes Weiner 
318765d759c8SCharan Teja Reddy 		/*
318865d759c8SCharan Teja Reddy 		 * Avoid the unnecessary wakeup for proactive compaction
318965d759c8SCharan Teja Reddy 		 * when it is disabled.
319065d759c8SCharan Teja Reddy 		 */
319165d759c8SCharan Teja Reddy 		if (!sysctl_compaction_proactiveness)
319265d759c8SCharan Teja Reddy 			timeout = MAX_SCHEDULE_TIMEOUT;
3193698b1b30SVlastimil Babka 		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
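		/*
		 * Note: when proactive_compact_trigger is set (e.g. because
		 * the proactiveness sysctl was just updated), the wait below
		 * completes, but the extra check makes us skip
		 * kcompactd_do_work() and fall through to the proactive
		 * compaction path instead.
		 */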
3194facdaa91SNitin Gupta 		if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
319565d759c8SCharan Teja Reddy 			kcompactd_work_requested(pgdat), timeout) &&
319665d759c8SCharan Teja Reddy 			!pgdat->proactive_compact_trigger) {
3197698b1b30SVlastimil Babka 
3198eb414681SJohannes Weiner 			psi_memstall_enter(&pflags);
3199698b1b30SVlastimil Babka 			kcompactd_do_work(pgdat);
3200eb414681SJohannes Weiner 			psi_memstall_leave(&pflags);
3201e1e92bfaSCharan Teja Reddy 			/*
3202e1e92bfaSCharan Teja Reddy 			 * Reset the timeout value. The defer timeout from
3203e1e92bfaSCharan Teja Reddy 			 * proactive compaction is lost here, but that is fine:
3204e1e92bfaSCharan Teja Reddy 			 * if the zone has changed substantially, then carrying
3205e1e92bfaSCharan Teja Reddy 			 * on with the previous defer interval would not be
3206e1e92bfaSCharan Teja Reddy 			 * useful.
3207e1e92bfaSCharan Teja Reddy 			 */
3208e1e92bfaSCharan Teja Reddy 			timeout = default_timeout;
3209facdaa91SNitin Gupta 			continue;
3210facdaa91SNitin Gupta 		}
3211facdaa91SNitin Gupta 
3212e1e92bfaSCharan Teja Reddy 		/*
3213e1e92bfaSCharan Teja Reddy 		 * Start the proactive work with the default timeout; the
3214e1e92bfaSCharan Teja Reddy 		 * timeout is then updated based on the fragmentation score.
3215e1e92bfaSCharan Teja Reddy 		 */
3216e1e92bfaSCharan Teja Reddy 		timeout = default_timeout;
3217facdaa91SNitin Gupta 		if (should_proactive_compact_node(pgdat)) {
3218facdaa91SNitin Gupta 			unsigned int prev_score, score;
3219facdaa91SNitin Gupta 
3220facdaa91SNitin Gupta 			prev_score = fragmentation_score_node(pgdat);
32213e40b3f4SKefeng Wang 			compact_node(pgdat, true);
3222facdaa91SNitin Gupta 			score = fragmentation_score_node(pgdat);
3223facdaa91SNitin Gupta 			/*
3224facdaa91SNitin Gupta 			 * Defer proactive compaction if the fragmentation
3225facdaa91SNitin Gupta 			 * score did not go down, i.e. no progress was made.
3226facdaa91SNitin Gupta 			 */
3227e1e92bfaSCharan Teja Reddy 			if (unlikely(score >= prev_score))
3228e1e92bfaSCharan Teja Reddy 				timeout =
3229e1e92bfaSCharan Teja Reddy 				   default_timeout << COMPACT_MAX_DEFER_SHIFT;
3230facdaa91SNitin Gupta 		}
323165d759c8SCharan Teja Reddy 		if (unlikely(pgdat->proactive_compact_trigger))
323265d759c8SCharan Teja Reddy 			pgdat->proactive_compact_trigger = false;
3233698b1b30SVlastimil Babka 	}
3234698b1b30SVlastimil Babka 
3235698b1b30SVlastimil Babka 	return 0;
3236698b1b30SVlastimil Babka }
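/*
 * kcompactd therefore alternates between two modes: it is woken via
 * wakeup_kcompactd() to compact on behalf of a failing high-order
 * allocation (kcompactd_do_work()), and, when proactive compaction is
 * enabled, it wakes itself every HPAGE_FRAG_CHECK_INTERVAL_MSEC to
 * compact the whole node if the fragmentation score is too high,
 * backing off when a run makes no progress.
 */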
3237698b1b30SVlastimil Babka 
3238698b1b30SVlastimil Babka /*
3239698b1b30SVlastimil Babka  * This kcompactd start function is called by init and by node hot-add.
3240698b1b30SVlastimil Babka  * On node hot-add, kcompactd will be moved to the proper CPUs if CPUs are hot-added.
3241698b1b30SVlastimil Babka  */
3242833dfc00SMiaohe Lin void __meminit kcompactd_run(int nid)
3243698b1b30SVlastimil Babka {
3244698b1b30SVlastimil Babka 	pg_data_t *pgdat = NODE_DATA(nid);
3245698b1b30SVlastimil Babka 
3246698b1b30SVlastimil Babka 	if (pgdat->kcompactd)
3247024c61eaSMiaohe Lin 		return;
3248698b1b30SVlastimil Babka 
3249698b1b30SVlastimil Babka 	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
3250698b1b30SVlastimil Babka 	if (IS_ERR(pgdat->kcompactd)) {
3251698b1b30SVlastimil Babka 		pr_err("Failed to start kcompactd on node %d\n", nid);
3252698b1b30SVlastimil Babka 		pgdat->kcompactd = NULL;
3253698b1b30SVlastimil Babka 	}
3254698b1b30SVlastimil Babka }
3255698b1b30SVlastimil Babka 
3256698b1b30SVlastimil Babka /*
3257698b1b30SVlastimil Babka  * Called by memory hotplug when all memory in a node is offlined. Caller must
3258e8da368aSYun-Ze Li  * be holding mem_hotplug_begin/done().
3259698b1b30SVlastimil Babka  */
3260833dfc00SMiaohe Lin void __meminit kcompactd_stop(int nid)
3261698b1b30SVlastimil Babka {
3262698b1b30SVlastimil Babka 	struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
3263698b1b30SVlastimil Babka 
3264698b1b30SVlastimil Babka 	if (kcompactd) {
3265698b1b30SVlastimil Babka 		kthread_stop(kcompactd);
3266698b1b30SVlastimil Babka 		NODE_DATA(nid)->kcompactd = NULL;
3267698b1b30SVlastimil Babka 	}
3268698b1b30SVlastimil Babka }
3269698b1b30SVlastimil Babka 
3270698b1b30SVlastimil Babka /*
3271698b1b30SVlastimil Babka  * It's optimal to keep kcompactd threads on the same CPUs as their
3272698b1b30SVlastimil Babka  * node's memory, but that is not required for correctness. So if the
3273698b1b30SVlastimil Babka  * last CPU in a node goes away, they get changed to run anywhere; as
3274698b1b30SVlastimil Babka  * the first one comes back, restore their CPU bindings.
3275698b1b30SVlastimil Babka  */
3276e46b1db2SAnna-Maria Gleixner static int kcompactd_cpu_online(unsigned int cpu)
3277698b1b30SVlastimil Babka {
3278698b1b30SVlastimil Babka 	int nid;
3279698b1b30SVlastimil Babka 
3280698b1b30SVlastimil Babka 	for_each_node_state(nid, N_MEMORY) {
3281698b1b30SVlastimil Babka 		pg_data_t *pgdat = NODE_DATA(nid);
3282698b1b30SVlastimil Babka 		const struct cpumask *mask;
3283698b1b30SVlastimil Babka 
3284698b1b30SVlastimil Babka 		mask = cpumask_of_node(pgdat->node_id);
3285698b1b30SVlastimil Babka 
3286698b1b30SVlastimil Babka 		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3287698b1b30SVlastimil Babka 			/* One of our CPUs online: restore mask */
32883109de30SMiaohe Lin 			if (pgdat->kcompactd)
3289698b1b30SVlastimil Babka 				set_cpus_allowed_ptr(pgdat->kcompactd, mask);
3290698b1b30SVlastimil Babka 	}
3291e46b1db2SAnna-Maria Gleixner 	return 0;
3292698b1b30SVlastimil Babka }
3293698b1b30SVlastimil Babka 
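/*
 * Like proc_dointvec_minmax(), but on PREEMPT_RT kernels additionally
 * warn once when the value is actually changed, naming the task that
 * changed it. It is used below for compact_unevictable_allowed, since
 * compacting unevictable (mlocked) pages causes minor faults that
 * realtime workloads may not tolerate.
 */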
329448fe8ab8SMinghao Chi static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
329548fe8ab8SMinghao Chi 		int write, void *buffer, size_t *lenp, loff_t *ppos)
329648fe8ab8SMinghao Chi {
329748fe8ab8SMinghao Chi 	int ret, old;
329848fe8ab8SMinghao Chi 
329948fe8ab8SMinghao Chi 	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
330048fe8ab8SMinghao Chi 		return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
330148fe8ab8SMinghao Chi 
330248fe8ab8SMinghao Chi 	old = *(int *)table->data;
330348fe8ab8SMinghao Chi 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
330448fe8ab8SMinghao Chi 	if (ret)
330548fe8ab8SMinghao Chi 		return ret;
330648fe8ab8SMinghao Chi 	if (old != *(int *)table->data)
330748fe8ab8SMinghao Chi 		pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
330848fe8ab8SMinghao Chi 			     table->procname, current->comm,
330948fe8ab8SMinghao Chi 			     task_pid_nr(current));
331048fe8ab8SMinghao Chi 	return ret;
331148fe8ab8SMinghao Chi }
331248fe8ab8SMinghao Chi 
331348fe8ab8SMinghao Chi static struct ctl_table vm_compaction[] = {
331448fe8ab8SMinghao Chi 	{
331548fe8ab8SMinghao Chi 		.procname	= "compact_memory",
33168b9167cdSWen Yang 		.data		= &sysctl_compact_memory,
331748fe8ab8SMinghao Chi 		.maxlen		= sizeof(int),
331848fe8ab8SMinghao Chi 		.mode		= 0200,
331948fe8ab8SMinghao Chi 		.proc_handler	= sysctl_compaction_handler,
332048fe8ab8SMinghao Chi 	},
332148fe8ab8SMinghao Chi 	{
332248fe8ab8SMinghao Chi 		.procname	= "compaction_proactiveness",
332348fe8ab8SMinghao Chi 		.data		= &sysctl_compaction_proactiveness,
332448fe8ab8SMinghao Chi 		.maxlen		= sizeof(sysctl_compaction_proactiveness),
332548fe8ab8SMinghao Chi 		.mode		= 0644,
332648fe8ab8SMinghao Chi 		.proc_handler	= compaction_proactiveness_sysctl_handler,
332748fe8ab8SMinghao Chi 		.extra1		= SYSCTL_ZERO,
332848fe8ab8SMinghao Chi 		.extra2		= SYSCTL_ONE_HUNDRED,
332948fe8ab8SMinghao Chi 	},
333048fe8ab8SMinghao Chi 	{
333148fe8ab8SMinghao Chi 		.procname	= "extfrag_threshold",
333248fe8ab8SMinghao Chi 		.data		= &sysctl_extfrag_threshold,
333348fe8ab8SMinghao Chi 		.maxlen		= sizeof(int),
333448fe8ab8SMinghao Chi 		.mode		= 0644,
333548fe8ab8SMinghao Chi 		.proc_handler	= proc_dointvec_minmax,
333648fe8ab8SMinghao Chi 		.extra1		= SYSCTL_ZERO,
333748fe8ab8SMinghao Chi 		.extra2		= SYSCTL_ONE_THOUSAND,
333848fe8ab8SMinghao Chi 	},
333948fe8ab8SMinghao Chi 	{
334048fe8ab8SMinghao Chi 		.procname	= "compact_unevictable_allowed",
334148fe8ab8SMinghao Chi 		.data		= &sysctl_compact_unevictable_allowed,
334248fe8ab8SMinghao Chi 		.maxlen		= sizeof(int),
334348fe8ab8SMinghao Chi 		.mode		= 0644,
334448fe8ab8SMinghao Chi 		.proc_handler	= proc_dointvec_minmax_warn_RT_change,
334548fe8ab8SMinghao Chi 		.extra1		= SYSCTL_ZERO,
334648fe8ab8SMinghao Chi 		.extra2		= SYSCTL_ONE,
334748fe8ab8SMinghao Chi 	},
334848fe8ab8SMinghao Chi 	{ }
334948fe8ab8SMinghao Chi };
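/*
 * Illustration only (userspace, not kernel code): the table above is
 * registered under the "vm" directory, so the knobs appear as
 * /proc/sys/vm/compact_memory, compaction_proactiveness,
 * extfrag_threshold and compact_unevictable_allowed (assuming the
 * usual procfs mount). A minimal sketch that reads the proactiveness
 * setting and, when run as root, triggers a full compaction run by
 * writing to compact_memory:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		char buf[16];
 *		ssize_t n;
 *		int fd;
 *
 *		// Current proactive compaction setting (0-100, mode 0644).
 *		fd = open("/proc/sys/vm/compaction_proactiveness", O_RDONLY);
 *		if (fd >= 0) {
 *			n = read(fd, buf, sizeof(buf) - 1);
 *			if (n > 0) {
 *				buf[n] = '\0';
 *				printf("compaction_proactiveness: %s", buf);
 *			}
 *			close(fd);
 *		}
 *
 *		// compact_memory is write-only (mode 0200); the documented
 *		// usage is to write 1, which compacts all nodes.
 *		fd = open("/proc/sys/vm/compact_memory", O_WRONLY);
 *		if (fd >= 0) {
 *			if (write(fd, "1", 1) != 1)
 *				perror("write compact_memory");
 *			close(fd);
 *		}
 *		return 0;
 *	}
 */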
335048fe8ab8SMinghao Chi 
3351698b1b30SVlastimil Babka static int __init kcompactd_init(void)
3352698b1b30SVlastimil Babka {
3353698b1b30SVlastimil Babka 	int nid;
3354e46b1db2SAnna-Maria Gleixner 	int ret;
3355e46b1db2SAnna-Maria Gleixner 
3356e46b1db2SAnna-Maria Gleixner 	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
3357e46b1db2SAnna-Maria Gleixner 					"mm/compaction:online",
3358e46b1db2SAnna-Maria Gleixner 					kcompactd_cpu_online, NULL);
3359e46b1db2SAnna-Maria Gleixner 	if (ret < 0) {
3360e46b1db2SAnna-Maria Gleixner 		pr_err("kcompactd: failed to register hotplug callbacks.\n");
3361e46b1db2SAnna-Maria Gleixner 		return ret;
3362e46b1db2SAnna-Maria Gleixner 	}
3363698b1b30SVlastimil Babka 
3364698b1b30SVlastimil Babka 	for_each_node_state(nid, N_MEMORY)
3365698b1b30SVlastimil Babka 		kcompactd_run(nid);
336648fe8ab8SMinghao Chi 	register_sysctl_init("vm", vm_compaction);
3367698b1b30SVlastimil Babka 	return 0;
3368698b1b30SVlastimil Babka }
3369698b1b30SVlastimil Babka subsys_initcall(kcompactd_init)
3370698b1b30SVlastimil Babka 
3371ff9543fdSMichal Nazarewicz #endif /* CONFIG_COMPACTION */
3372